diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c51abf4e3..6009e0c37 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -219,7 +219,9 @@ jobs: runs-on: ubuntu-20.04 needs: build_vm strategy: + fail-fast: false matrix: + name: [regression, opencl, cache, config1, config2, debug, stress, vm] xlen: [32, 64] steps: @@ -267,4 +269,4 @@ jobs: steps: - name: Check Completion - run: echo "All matrix jobs passed" \ No newline at end of file + run: echo "All matrix jobs passed" diff --git a/Makefile.in b/Makefile.in index 7f594747a..bfe944998 100644 --- a/Makefile.in +++ b/Makefile.in @@ -44,10 +44,10 @@ clean: clean-build $(MAKE) -C $(VORTEX_HOME)/third_party clean # Install setup -KERNEL_INC_DST = $(PREFIX)/kernel/include -KERNEL_LIB_DST = $(PREFIX)/kernel/lib$(XLEN) -RUNTIME_INC_DST = $(PREFIX)/runtime/include -RUNTIME_LIB_DST = $(PREFIX)/runtime/lib +KERNEL_INC_DST = $(INSTALLDIR)/kernel/include +KERNEL_LIB_DST = $(INSTALLDIR)/kernel/lib$(XLEN) +RUNTIME_INC_DST = $(INSTALLDIR)/runtime/include +RUNTIME_LIB_DST = $(INSTALLDIR)/runtime/lib KERNEL_HEADERS = $(wildcard $(VORTEX_HOME)/kernel/include/*.h) KERNEL_LIBS = $(wildcard kernel/*.a) diff --git a/README.md b/README.md index 6cabdeee5..4322f06bc 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[![Build Status](https://travis-ci.com/vortexgpgpu/vortex.svg?branch=master)](https://travis-ci.com/vortexgpgpu/vortex) - # Vortex GPGPU Vortex is a full-stack open-source RISC-V GPGPU. @@ -47,20 +45,20 @@ More detailed build instructions can be found [here](docs/install_vortex.md). 
- [Yosys](https://github.com/YosysHQ/yosys) - [Sv2v](https://github.com/zachjs/sv2v) ### Install development tools -``` - sudo apt-get install build-essential - sudo apt-get install binutils - sudo apt-get install python - sudo apt-get install uuid-dev - sudo apt-get install git +```sh +sudo apt-get install build-essential +sudo apt-get install binutils +sudo apt-get install python +sudo apt-get install uuid-dev +sudo apt-get install git ``` ### Install Vortex codebase +```sh + git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git + cd vortex ``` - git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git -b vortex_vm - cd vortex -``` - ### Configure your build folder +```sh # # By default, the toolchain default install location is the /opt folder and can be overridden by setting --tooldir. # This is the example for volvo server @@ -72,38 +70,45 @@ More detailed build instructions can be found [here](docs/install_vortex.md). ../configure --xlen=32 --tooldir=/software/vortex-toolchain-2024-2024-08-09 --prefix=$OUT_DIR # Run the following instead to enable virtual memory feature in compilation ../configure --xlen=32 --tooldir=/software/vortex-toolchain-2024-2024-08-09 --prefix=$OUT_DIR --vm_enable=1 - +``` ### Install prebuilt toolchain # We will use the precomipled tools in volvo toolchanin directory ### set environment variables +```sh # should always run before using the toolchain! source ./ci/toolchain_env.sh +``` ### Building Vortex - make -s +```sh +make -s +``` + ### Quick demo running vecadd OpenCL kernel on 2 cores - $ ./ci/blackbox.sh --cores=2 --app=vecadd +```sh +./ci/blackbox.sh --cores=2 --app=vecadd +``` ### Common Developer Tips - Installing Vortex kernel and runtime libraries to use with external tools requires passing --prefix= to the configure script. 
- ```sh - $ ../configure --xlen=32 --tooldir=$HOME/tools --prefix= - $ make -s - $ make install - `````` +```sh +../configure --xlen=32 --tooldir=$HOME/tools --prefix= +make -s +make install +``` - Building Vortex 64-bit simply requires using --xlen=64 configure option. - ```sh - $ ../configure --xlen=32 --tooldir=$HOME/tools - ``` +```sh +../configure --xlen=64 --tooldir=$HOME/tools +``` - Sourcing "./ci/toolchain_env.sh" is required everytime you start a new terminal. we recommend adding "source /ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login. - ```sh - $ echo "source /ci/toolchain_env.sh" >> ~/.bashrc - ``` +```sh +echo "source /ci/toolchain_env.sh" >> ~/.bashrc +``` - Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get it propagated into your build folder. - ```sh - $ ../configure - ``` +```sh +../configure +``` - To debug the GPU, you can generate a "run.log" trace. see /docs/debugging.md for more information. - ```sh - $ ./ci/blackbox.sh --app=demo --debug=3 - ``` +```sh +./ci/blackbox.sh --app=demo --debug=3 +``` - For additional information, check out the /docs. 
diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 57c021a70..9ba65cfee 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -23,6 +23,8 @@ rm -f blackbox.*.cache XLEN=${XLEN:=@XLEN@} +XSIZE=$((XLEN / 8)) + echo "Vortex Regression Test: XLEN=$XLEN" unittest() @@ -99,11 +101,11 @@ regression() # test global barrier CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2 - CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tgbar" --cores=2 + CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tgbar" --cores=2 # test local barrier ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar" - ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar" + ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tbar" echo "regression tests done!" } @@ -148,32 +150,54 @@ vm(){ echo "vm tests done!" } -test_csv_trace() +cache() { - # test CSV trace generation - make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null - make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-simx-32im > run_simx.log - make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log - ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv - ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv - diff trace_rtlsim.csv trace_simx.csv - # clean build - make -C sim/simx clean - make -C sim/rtlsim clean -} + echo "begin cache tests..." -debug() -{ - echo "begin debugging tests..." 
+ # disable local memory + CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1 + CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1 - test_csv_trace + # disable L1 cache + CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" - ./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1" + # reduce l1 line size + CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr + CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr + CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx - echo "debugging tests done!" 
+ # test cache ways + CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx + + # test cache banking + CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx + CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx + + # test writeback + CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress + CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress + CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress + CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress + + # cache clustering + CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2 + + # L2/L3 + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1" + + echo "cache tests done!" 
} config1() @@ -189,10 +213,12 @@ config1() ./ci/blackbox.sh --driver=simx --warps=8 --threads=16 --app=diverge # cores clustering - ./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1" + ./ci/blackbox.sh --driver=rtlsim --cores=4 --app=diverge --args="-n1" + ./ci/blackbox.sh --driver=simx --cores=4 --app=diverge --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1" + ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1" # issue width CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge @@ -212,22 +238,19 @@ config1() CONFIGS="-DISSUE_WIDTH=2 -DNUM_FPU_BLOCK=1 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx CONFIGS="-DISSUE_WIDTH=4 -DNUM_FPU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx + # FPU's PE scaling + CONFIGS="-DFMA_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfmadd" + CONFIGS="-DFCVT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tftoi" + CONFIGS="-DFDIV_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfdiv" + CONFIGS="-DFSQRT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfsqrt" + CONFIGS="-DFNCP_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfclamp" + # LSU scaling CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx CONFIGS="-DISSUE_WIDTH=2 
-DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx - # L2/L3 - ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1" - - # multiple L1 caches per socket - CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=8 --warps=1 --threads=2 - echo "configuration-1 tests done!" } @@ -262,55 +285,63 @@ config2() # disabling ZICOND extension CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo - # disable local memory - CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1 - CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1 - # test AXI bus - AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=demo - - # disable L1 cache - CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - - # reduce l1 line size - CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=rtlsim --app=io_addr - CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=simx --app=io_addr - CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx - - # 
test cache ways - CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx - - # test cache banking - CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx - CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx + AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=mstress # test 128-bit MEM block - CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=demo + CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress + + # test XLEN-bit MEM block + CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress + CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress + + # test memory coalescing + CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8 + CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8 # test single-bank DRAM - CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=demo + CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress # test 27-bit DRAM address - CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=demo + CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=mstress echo "configuration-2 tests done!" 
} +test_csv_trace() +{ + # test CSV trace generation + make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null + make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null + make -C tests/riscv/isa run-simx-32im > run_simx.log + make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log + ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv + ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv + diff trace_rtlsim.csv trace_simx.csv + # clean build + make -C sim/simx clean + make -C sim/rtlsim clean +} + +debug() +{ + echo "begin debugging tests..." + + test_csv_trace + + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" + ./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1" + + echo "debugging tests done!" +} + stress() { echo "begin stress tests..." # test verilator reset values - CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood + CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache echo "stress tests done!" 
@@ -329,19 +360,14 @@ synthesis() show_usage() { echo "Vortex Regression Test" - echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]" + echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]" } -start=$SECONDS - declare -a tests=() clean=0 while [ "$1" != "" ]; do case $1 in - --vm ) - tests+=("vm") - ;; --clean ) clean=1 ;; @@ -360,6 +386,12 @@ while [ "$1" != "" ]; do --opencl ) tests+=("opencl") ;; + --cache ) + tests+=("cache") + ;; + --vm ) + tests+=("vm") + ;; --config1 ) tests+=("config1") ;; @@ -382,6 +414,7 @@ while [ "$1" != "" ]; do tests+=("kernel") tests+=("regression") tests+=("opencl") + tests+=("cache") tests+=("config1") tests+=("config2") tests+=("debug") @@ -405,6 +438,8 @@ then make -s fi +start=$SECONDS + for test in "${tests[@]}"; do $test done diff --git a/ci/trace_csv.py b/ci/trace_csv.py index c3113de85..4a36f5f6a 100755 --- a/ci/trace_csv.py +++ b/ci/trace_csv.py @@ -19,6 +19,8 @@ import csv import re import inspect +configs = None + def parse_args(): parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.') parser.add_argument('-t', '--type', default='simx', help='log type (rtlsim or simx)') @@ -26,6 +28,24 @@ def parse_args(): parser.add_argument('log', help='Input log file') return parser.parse_args() +def load_config(filename): + config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=0x([0-9a-fA-F]+), num_barriers=(\d+)" + with open(filename, 'r') as file: + for line in file: + config_match = re.search(config_pattern, line) + if config_match: + config = { + 'num_threads': int(config_match.group(1)), + 'num_warps': int(config_match.group(2)), + 'num_cores': int(config_match.group(3)), + 'num_clusters': 
int(config_match.group(4)), + 'socket_size': int(config_match.group(5)), + 'local_mem_base': int(config_match.group(6), 16), + 'num_barriers': int(config_match.group(7)), + } + return config + return None + def parse_simx(log_lines): pc_pattern = r"PC=(0x[0-9a-fA-F]+)" instr_pattern = r"Instr (0x[0-9a-fA-F]+):" @@ -46,10 +66,10 @@ def parse_simx(log_lines): instr_data = {} instr_data["lineno"] = lineno instr_data["PC"] = re.search(pc_pattern, line).group(1) - instr_data["core_id"] = re.search(core_id_pattern, line).group(1) - instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1) + instr_data["core_id"] = int(re.search(core_id_pattern, line).group(1)) + instr_data["warp_id"] = int(re.search(warp_id_pattern, line).group(1)) instr_data["tmask"] = re.search(tmask_pattern, line).group(1) - instr_data["uuid"] = re.search(uuid_pattern, line).group(1) + instr_data["uuid"] = int(re.search(uuid_pattern, line).group(1)) elif line.startswith("DEBUG Instr"): instr_data["instr"] = re.search(instr_pattern, line).group(1) instr_data["opcode"] = re.search(opcode_pattern, line).group(1) @@ -60,6 +80,7 @@ def parse_simx(log_lines): instr_data["destination"] = re.search(destination_pattern, line).group(1) except Exception as e: print("Error at line {}: {}".format(lineno, e)) + instr_data = None if instr_data: entries.append(instr_data) return entries @@ -95,7 +116,7 @@ def append_value(text, reg, value, tmask_arr, sep): return text, sep def parse_rtlsim(log_lines): - config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=(\d+), num_barriers=(\d+)" + global configs line_pattern = r"\d+: cluster(\d+)-socket(\d+)-core(\d+)-(decode|issue|commit)" pc_pattern = r"PC=(0x[0-9a-fA-F]+)" instr_pattern = r"instr=(0x[0-9a-fA-F]+)" @@ -117,36 +138,20 @@ def parse_rtlsim(log_lines): uuid_pattern = r"#(\d+)" entries = [] instr_data = {} - num_threads = 0 - num_warps = 0 - num_cores = 0 - num_clusters = 0 - 
socket_size = 0 - local_mem_base = 0 - num_barriers = 0 - num_sockets = 0 + num_cores = configs['num_cores'] + socket_size = configs['socket_size'] + num_sockets = (num_cores + socket_size - 1) // socket_size for lineno, line in enumerate(log_lines, start=1): try: - config_match = re.search(config_pattern, line) - if config_match: - num_threads = int(config_match.group(1)) - num_warps = int(config_match.group(2)) - num_cores = int(config_match.group(3)) - num_clusters = int(config_match.group(4)) - socket_size = int(config_match.group(5)) - local_mem_base = int(config_match.group(6)) - num_barriers = int(config_match.group(7)) - num_sockets = (num_cores + socket_size - 1) // socket_size - continue line_match = re.search(line_pattern, line) if line_match: PC = re.search(pc_pattern, line).group(1) - warp_id = re.search(warp_id_pattern, line).group(1) + warp_id = int(re.search(warp_id_pattern, line).group(1)) tmask = re.search(tmask_pattern, line).group(1) - uuid = re.search(uuid_pattern, line).group(1) - cluster_id = line_match.group(1) - socket_id = line_match.group(2) - core_id = line_match.group(3) + uuid = int(re.search(uuid_pattern, line).group(1)) + cluster_id = int(line_match.group(1)) + socket_id = int(line_match.group(2)) + core_id = int(line_match.group(3)) stage = line_match.group(4) if stage == "decode": trace = {} @@ -273,7 +278,9 @@ def split_log_file(log_filename): return sublogs def main(): + global configs args = parse_args() + configs = load_config(args.log) sublogs = split_log_file(args.log) write_csv(sublogs, args.csv, args.type) diff --git a/config.mk.in b/config.mk.in index 6b20a3050..8ec052094 100644 --- a/config.mk.in +++ b/config.mk.in @@ -19,7 +19,7 @@ TOOLDIR ?= @TOOLDIR@ OSVERSION ?= @OSVERSION@ -PREFIX ?= @PREFIX@ +INSTALLDIR ?= @INSTALLDIR@ LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex diff --git a/configure b/configure index 2c0811ec3..c8f932488 100755 --- a/configure +++ b/configure @@ -63,7 +63,7 @@ copy_files() { 
filename_no_ext="${filename%.in}" dest_file="$dest_dir/$filename_no_ext" mkdir -p "$dest_dir" - sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@PREFIX@|$PREFIX|g; s|@VM_ENABLE@|$VM_ENABLE|g" "$file" > "$dest_file" + sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@VM_ENABLE@|$VM_ENABLE|g" "$file" > "$dest_file" # apply permissions to bash scripts read -r firstline < "$dest_file" if [[ "$firstline" =~ ^#!.*bash ]]; then @@ -178,4 +178,4 @@ THIRD_PARTY_DIR=$SCRIPT_DIR/third_party copy_files "$SCRIPT_DIR" "$CURRENT_DIR" -echo "VM Enable: "$VM_ENABLE \ No newline at end of file +echo "VM Enable: "$VM_ENABLE diff --git a/docs/debugging.md b/docs/debugging.md index e0450e5e7..6e2e14890 100644 --- a/docs/debugging.md +++ b/docs/debugging.md @@ -53,9 +53,9 @@ A waveform trace `trace.vcd` will be generated in the current directory during t ## Analyzing Vortex trace log When debugging Vortex RTL or SimX Simulator, reading the trace run.log file can be overwhelming when the trace gets really large. -We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can use to convert the large trace into a CSV file containing all the instructions that executed with their source and destination operands. To increase compatibility between traces you will need to initialize RTLSIM's GPRs to zero by defining GPR_RESET. +We provide a trace sanitizer tool under ./ci/trace_csv.py that you can use to convert the large trace into a CSV file containing all the instructions that executed with their source and destination operands. 
- $ CONFIGS="-DGPR_RESET" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=3 --log=run_rtlsim.log + $ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=3 --log=run_rtlsim.log $ ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv $ ./ci/blackbox.sh --driver=simx --app=demo --debug=3 --log=run_simx.log diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index 108e95073..714e69dd4 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -96,10 +96,11 @@ module VX_cluster import VX_gpu_pkg::*; #( .CRSQ_SIZE (`L2_CRSQ_SIZE), .MSHR_SIZE (`L2_MSHR_SIZE), .MRSQ_SIZE (`L2_MRSQ_SIZE), - .MREQ_SIZE (`L2_MREQ_SIZE), + .MREQ_SIZE (`L2_WRITEBACK ? `L2_MSHR_SIZE : `L2_MREQ_SIZE), .TAG_WIDTH (L2_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L2_WRITEBACK), + .DIRTY_BYTES (`L2_WRITEBACK), .UUID_WIDTH (`UUID_WIDTH), .CORE_OUT_BUF (2), .MEM_OUT_BUF (2), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 45041ac4a..6b6a0ad86 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -217,7 +217,7 @@ `ifndef IO_COUT_ADDR `define IO_COUT_ADDR `IO_BASE_ADDR `endif -`define IO_COUT_SIZE `MEM_BLOCK_SIZE +`define IO_COUT_SIZE 64 `ifndef IO_MPM_ADDR `define IO_MPM_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE) @@ -685,7 +685,7 @@ // Number of Banks `ifndef L3_NUM_BANKS -`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS) +`define L3_NUM_BANKS `MIN(8, `NUM_CLUSTERS) `endif // Core Response Queue Size @@ -718,6 +718,15 @@ `define L3_WRITEBACK 0 `endif +`ifndef MEMORY_BANKS +`define MEMORY_BANKS 8 +`endif + +// Number of Memory Ports from LLC +`ifndef NUM_MEM_PORTS +`define NUM_MEM_PORTS `MIN(`MEMORY_BANKS, `L3_NUM_BANKS) +`endif + // ISA Extensions ///////////////////////////////////////////////////////////// `ifdef EXT_A_ENABLE diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 73a6edd78..59f5ef0f5 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -238,11 +238,11 @@ `define RESET_RELAY(dst, src) \ `RESET_RELAY_EX (dst, src, 1, 0) -// size(x): 0 
-> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2 -`define TO_OUT_BUF_SIZE(out_reg) `MIN(out_reg, 2) +// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4 -> 2, 5 -> 2 +`define TO_OUT_BUF_SIZE(s) `MIN(s, 2) -// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2 -`define TO_OUT_BUF_REG(out_reg) ((out_reg & 1) + ((out_reg >> 2) << 1)) +// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 -> 3 +`define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2)) `define REPEAT(n,f,s) `_REPEAT_``n(f,s) `define _REPEAT_0(f,s) diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index abdf67612..694edfe9c 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -145,11 +145,12 @@ module VX_socket import VX_gpu_pkg::*; #( .CRSQ_SIZE (`DCACHE_CRSQ_SIZE), .MSHR_SIZE (`DCACHE_MSHR_SIZE), .MRSQ_SIZE (`DCACHE_MRSQ_SIZE), - .MREQ_SIZE (`DCACHE_MREQ_SIZE), + .MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE), .TAG_WIDTH (DCACHE_TAG_WIDTH), .UUID_WIDTH (`UUID_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`DCACHE_WRITEBACK), + .DIRTY_BYTES (`DCACHE_WRITEBACK), .NC_ENABLE (1), .CORE_OUT_BUF (2), .MEM_OUT_BUF (2) @@ -178,8 +179,6 @@ module VX_socket import VX_gpu_pkg::*; #( `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH); `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); - `RESET_RELAY (mem_arb_reset, reset); - VX_mem_arb #( .NUM_INPUTS (2), .DATA_SIZE (`L1_LINE_SIZE), @@ -190,7 +189,7 @@ module VX_socket import VX_gpu_pkg::*; #( .RSP_OUT_BUF (2) ) mem_arb ( .clk (clk), - .reset (mem_arb_reset), + .reset (reset), .bus_in_if (l1_mem_bus_if), .bus_out_if (l1_mem_arb_bus_if) ); diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 927ffae96..2eac22a5a 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -166,6 +166,10 @@ `define VX_CSR_MPM_MEM_WRITES_H 12'hB99 `define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency `define VX_CSR_MPM_MEM_LT_H 12'hB9A +`define VX_CSR_MPM_MEM_BANK_CNTR 12'hB1E // 
memory bank requests +`define VX_CSR_MPM_MEM_BANK_CNTR_H 12'hB9E +`define VX_CSR_MPM_MEM_BANK_TICK 12'hB1F // memory ticks +`define VX_CSR_MPM_MEM_BANK_TICK_H 12'hB9F // PERF: lmem `define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads `define VX_CSR_MPM_LMEM_READS_H 12'hB9B diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index d3ef57c72..978259101 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -80,10 +80,11 @@ module Vortex import VX_gpu_pkg::*; ( .CRSQ_SIZE (`L3_CRSQ_SIZE), .MSHR_SIZE (`L3_MSHR_SIZE), .MRSQ_SIZE (`L3_MRSQ_SIZE), - .MREQ_SIZE (`L3_MREQ_SIZE), + .MREQ_SIZE (`L3_WRITEBACK ? `L3_MSHR_SIZE : `L3_MREQ_SIZE), .TAG_WIDTH (L2_MEM_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L3_WRITEBACK), + .DIRTY_BYTES (`L3_WRITEBACK), .UUID_WIDTH (`UUID_WIDTH), .CORE_OUT_BUF (2), .MEM_OUT_BUF (2), @@ -192,12 +193,12 @@ module Vortex import VX_gpu_pkg::*; ( always @(posedge clk) begin if (mem_req_fire) begin if (mem_req_rw) - `TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h data=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data)); + `TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data)); else - `TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen)); + `TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen)); end if (mem_rsp_fire) begin - `TRACE(1, ("%d: MEM Rsp: tag=0x%0h, data=0x%0h\n", $time, mem_rsp_tag, mem_rsp_data)); + `TRACE(1, ("%d: MEM Rd Rsp: tag=0x%0h, data=0x%h\n", $time, mem_rsp_tag, mem_rsp_data)); end end `endif diff --git a/hw/rtl/afu/opae/vortex_afu.sv b/hw/rtl/afu/opae/vortex_afu.sv index cd49e7ddd..93f63c48d 100644 --- a/hw/rtl/afu/opae/vortex_afu.sv +++ b/hw/rtl/afu/opae/vortex_afu.sv @@ -240,13 +240,13 @@ module vortex_afu import 
ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ MMIO_CMD_ARG0: begin cmd_args[0] <= 64'(cp2af_sRxPort.c0.data); `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data))); + `TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data))); `endif end MMIO_CMD_ARG1: begin cmd_args[1] <= 64'(cp2af_sRxPort.c0.data); `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data))); + `TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data))); `endif end MMIO_CMD_ARG2: begin @@ -263,13 +263,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ `ifdef SCOPE MMIO_SCOPE_WRITE: begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%0h\n", $time, cmd_scope_wdata)); + `TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%h\n", $time, cmd_scope_wdata)); `endif end `endif default: begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data))); + `TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data))); `endif end endcase @@ -305,14 +305,14 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ MMIO_SCOPE_READ: begin mmio_tx.data <= cmd_scope_rdata; `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%0h\n", $time, cmd_scope_rdata)); + `TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%h\n", $time, cmd_scope_rdata)); `endif end `endif MMIO_DEV_CAPS: begin mmio_tx.data <= dev_caps; `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%0h\n", $time, dev_caps)); + `TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%h\n", $time, dev_caps)); `endif end MMIO_ISA_CAPS: begin @@ -580,8 +580,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ .TAG_WIDTH (AVS_REQ_TAGW+1) ) mem_bus_if[1](); - `RESET_RELAY (mem_arb_reset, reset); - 
VX_mem_arb #( .NUM_INPUTS (2), .DATA_SIZE (LMEM_DATA_SIZE), @@ -592,7 +590,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ .RSP_OUT_BUF (0) ) mem_arb ( .clk (clk), - .reset (mem_arb_reset), + .reset (reset), .bus_in_if (cci_vx_mem_bus_if), .bus_out_if (mem_bus_if) ); @@ -760,7 +758,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE); end `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data)); + `TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data)); `endif end @@ -778,14 +776,12 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ end end - `RESET_RELAY (cci_rdq_reset, reset); - VX_fifo_queue #( .DATAW (CCI_RD_QUEUE_DATAW), .DEPTH (CCI_RD_QUEUE_SIZE) ) cci_rd_req_queue ( .clk (clk), - .reset (cci_rdq_reset), + .reset (reset), .push (cci_rdq_push), .pop (cci_rdq_pop), .data_in (cci_rdq_din), @@ -906,7 +902,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ cci_wr_req_done <= 1; end `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data)); + `TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data)); `endif end @@ -1093,13 +1089,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ always @(posedge clk) begin for (integer i = 0; i < NUM_LOCAL_MEM_BANKS; ++i) begin if (avs_write[i] && ~avs_waitrequest[i]) begin - `TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%0h\n", $time, i, 
`TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i])); + `TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i])); end if (avs_read[i] && ~avs_waitrequest[i]) begin `TRACE(2, ("%d: AVS Rd Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i])); end if (avs_readdatavalid[i]) begin - `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h\n", $time, i, avs_readdata[i])); + `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h\n", $time, i, avs_readdata[i])); end end end diff --git a/hw/rtl/afu/xrt/VX_afu_wrap.sv b/hw/rtl/afu/xrt/VX_afu_wrap.sv index 15be69007..a844802e9 100644 --- a/hw/rtl/afu/xrt/VX_afu_wrap.sv +++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv @@ -377,13 +377,13 @@ module VX_afu_wrap #( `TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i])); end if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin - `TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%0h\n", $time, i, m_axi_mem_wdata_a[i])); + `TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i])); end if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin `TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i])); end if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin - `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i])); + `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i])); end end end diff --git a/hw/rtl/cache/VX_bank_flush.sv b/hw/rtl/cache/VX_bank_flush.sv index 15d1e8379..6c02c1e13 100644 --- a/hw/rtl/cache/VX_bank_flush.sv +++ b/hw/rtl/cache/VX_bank_flush.sv @@ -14,6 +14,7 @@ `include "VX_cache_define.vh" module VX_bank_flush 
#( + parameter BANK_ID = 0, // Size of cache in bytes parameter CACHE_SIZE = 1024, // Size of line inside a bank in bytes @@ -27,33 +28,36 @@ module VX_bank_flush #( ) ( input wire clk, input wire reset, - input wire flush_in_valid, - output wire flush_in_ready, - output wire flush_out_init, - output wire flush_out_valid, - output wire [`CS_LINE_SEL_BITS-1:0] flush_out_line, - output wire [NUM_WAYS-1:0] flush_out_way, - input wire flush_out_ready, - input wire mshr_empty + input wire flush_begin, + output wire flush_end, + output wire flush_init, + output wire flush_valid, + output wire [`CS_LINE_SEL_BITS-1:0] flush_line, + output wire [NUM_WAYS-1:0] flush_way, + input wire flush_ready, + input wire mshr_empty, + input wire bank_empty ); - parameter CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0); + // ways iteration is only needed when eviction is enabled + localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0); - parameter STATE_IDLE = 2'd0; - parameter STATE_INIT = 2'd1; - parameter STATE_FLUSH = 2'd2; + localparam STATE_IDLE = 0; + localparam STATE_INIT = 1; + localparam STATE_WAIT1 = 2; + localparam STATE_FLUSH = 3; + localparam STATE_WAIT2 = 4; + localparam STATE_DONE = 5; + + reg [2:0] state_r, state_n; reg [CTR_WIDTH-1:0] counter_r; - reg [1:0] state_r, state_n; - reg flush_in_ready_r, flush_in_ready_n; always @(*) begin state_n = state_r; - flush_in_ready_n = 0; case (state_r) - // STATE_IDLE - default: begin - if (flush_in_valid && mshr_empty) begin - state_n = STATE_FLUSH; + STATE_IDLE: begin + if (flush_begin) begin + state_n = STATE_WAIT1; end end STATE_INIT: begin @@ -61,25 +65,41 @@ module VX_bank_flush #( state_n = STATE_IDLE; end end - STATE_FLUSH: begin - if (counter_r == ((2 ** CTR_WIDTH)-1)) begin - state_n = STATE_IDLE; - flush_in_ready_n = 1; + STATE_WAIT1: begin + // wait for pending requests to complete + if (mshr_empty) begin + state_n = STATE_FLUSH; end end + STATE_FLUSH: begin + if (counter_r == 
((2 ** CTR_WIDTH)-1) && flush_ready) begin + state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2; + end + end + STATE_WAIT2: begin + // ensure the bank is empty before notifying the cache flush unit, + // because the flush request to lower caches only goes through bank0 + // and it is important that request gets sent out last. + if (bank_empty) begin + state_n = STATE_DONE; + end + end + STATE_DONE: begin + // generate a completion pulse + state_n = STATE_IDLE; + end endcase end always @(posedge clk) begin if (reset) begin - state_r <= STATE_INIT; + state_r <= STATE_INIT; counter_r <= '0; - flush_in_ready_r <= '0; end else begin state_r <= state_n; - flush_in_ready_r <= flush_in_ready_n; if (state_r != STATE_IDLE) begin - if ((state_r == STATE_INIT) || flush_out_ready) begin + if ((state_r == STATE_INIT) + || ((state_r == STATE_FLUSH) && flush_ready)) begin counter_r <= counter_r + CTR_WIDTH'(1); end end else begin @@ -88,22 +108,20 @@ module VX_bank_flush #( end end - assign flush_in_ready = flush_in_ready_r; - - assign flush_out_init = (state_r == STATE_INIT); - - assign flush_out_valid = (state_r == STATE_FLUSH); - assign flush_out_line = counter_r[`CS_LINE_SEL_BITS-1:0]; + assign flush_end = (state_r == STATE_DONE); + assign flush_init = (state_r == STATE_INIT); + assign flush_valid = (state_r == STATE_FLUSH); + assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0]; if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin - reg [NUM_WAYS-1:0] flush_out_way_r; + reg [NUM_WAYS-1:0] flush_way_r; always @(*) begin - flush_out_way_r = '0; - flush_out_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1; + flush_way_r = '0; + flush_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1; end - assign flush_out_way = flush_out_way_r; + assign flush_way = flush_way_r; end else begin - assign flush_out_way = {NUM_WAYS{1'b1}}; + assign flush_way = {NUM_WAYS{1'b1}}; end endmodule diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index acaa1dac3..ae0747690 
100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -45,6 +45,9 @@ module VX_cache import VX_gpu_pkg::*; #( // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -69,8 +72,13 @@ module VX_cache import VX_gpu_pkg::*; #( VX_mem_bus_if.master mem_bus_if ); - `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter")) - `STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter")) + `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2")) + `STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable")) + `STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback")) + + // In writeback mode, memory fill response may issue a new memory request to handle evicted blocks. + // We need to ensure that the memory request queue never fills up to avoid deadlock. 
+ `STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE")) localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS); localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS); @@ -101,26 +109,23 @@ module VX_cache import VX_gpu_pkg::*; #( .TAG_WIDTH (TAG_WIDTH) ) core_bus2_if[NUM_REQS](); - wire [NUM_BANKS-1:0] per_bank_flush_valid; - wire [NUM_BANKS-1:0] per_bank_flush_ready; + wire [NUM_BANKS-1:0] per_bank_flush_begin; + wire [NUM_BANKS-1:0] per_bank_flush_end; wire [NUM_BANKS-1:0] per_bank_core_req_fire; - // this reset relay is required to sync with bank initialization - `RESET_RELAY (flush_reset, reset); - VX_cache_flush #( .NUM_REQS (NUM_REQS), .NUM_BANKS (NUM_BANKS), .BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency ) flush_unit ( .clk (clk), - .reset (flush_reset), + .reset (reset), .core_bus_in_if (core_bus_if), .core_bus_out_if (core_bus2_if), .bank_req_fire (per_bank_core_req_fire), - .flush_valid (per_bank_flush_valid), - .flush_ready (per_bank_flush_ready) + .flush_begin (per_bank_flush_begin), + .flush_end (per_bank_flush_end) ); /////////////////////////////////////////////////////////////////////////// @@ -131,9 +136,9 @@ module VX_cache import VX_gpu_pkg::*; #( wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s; wire [NUM_REQS-1:0] core_rsp_ready_s; - for (genvar i = 0; i < NUM_REQS; ++i) begin + `RESET_RELAY_EX (core_rsp_reset, reset, NUM_REQS, `MAX_FANOUT); - `RESET_RELAY (core_rsp_reset, reset); + for (genvar i = 0; i < NUM_REQS; ++i) begin VX_elastic_buffer #( .DATAW (`CS_WORD_WIDTH + TAG_WIDTH), @@ -141,7 +146,7 @@ module VX_cache import VX_gpu_pkg::*; #( .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) ) core_rsp_buf ( .clk (clk), - .reset (core_rsp_reset), + .reset (core_rsp_reset[i]), .valid_in (core_rsp_valid_s[i]), .ready_in (core_rsp_ready_s[i]), .data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}), @@ -165,15 +170,13 @@ module VX_cache import VX_gpu_pkg::*; #( wire 
mem_bus_if_flush; - `RESET_RELAY (mem_req_reset, reset); - VX_elastic_buffer #( .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1), .SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( .clk (clk), - .reset (mem_req_reset), + .reset (reset), .valid_in (mem_req_valid_s), .ready_in (mem_req_ready_s), .data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}), @@ -192,15 +195,13 @@ module VX_cache import VX_gpu_pkg::*; #( wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s; wire mem_rsp_ready_s; - `RESET_RELAY (mem_rsp_reset, reset); - VX_elastic_buffer #( .DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH), .SIZE (MRSQ_SIZE), .OUT_REG (MRSQ_SIZE > 2) ) mem_rsp_queue ( .clk (clk), - .reset (mem_rsp_reset), + .reset (reset), .valid_in (mem_bus_if.rsp_valid), .ready_in (mem_bus_if.rsp_ready), .data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}), @@ -316,6 +317,7 @@ module VX_cache import VX_gpu_pkg::*; #( .NUM_OUTPUTS (NUM_BANKS), .DATAW (CORE_REQ_DATAW), .PERF_CTR_BITS (`PERF_CTR_BITS), + .ARBITER ("F"), .OUT_BUF (REQ_XBAR_BUF) ) req_xbar ( .clk (clk), @@ -373,6 +375,7 @@ module VX_cache import VX_gpu_pkg::*; #( .MSHR_SIZE (MSHR_SIZE), .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), + .DIRTY_BYTES (DIRTY_BYTES), .WRITEBACK (WRITEBACK), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), @@ -423,8 +426,8 @@ module VX_cache import VX_gpu_pkg::*; #( .mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)), .mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]), - .flush_valid (per_bank_flush_valid[bank_id]), - .flush_ready (per_bank_flush_ready[bank_id]) + .flush_begin (per_bank_flush_begin[bank_id]), + .flush_end (per_bank_flush_end[bank_id]) ); if (NUM_BANKS == 1) begin @@ -448,7 +451,8 @@ module VX_cache import VX_gpu_pkg::*; #( VX_stream_xbar #( .NUM_INPUTS (NUM_BANKS), .NUM_OUTPUTS (NUM_REQS), - .DATAW (CORE_RSP_DATAW) + .DATAW (CORE_RSP_DATAW), + 
.ARBITER ("F") ) rsp_xbar ( .clk (clk), .reset (rsp_xbar_reset), @@ -494,15 +498,13 @@ module VX_cache import VX_gpu_pkg::*; #( }; end - `RESET_RELAY (mem_arb_reset, reset); - VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1), .ARBITER ("F") ) mem_req_arb ( .clk (clk), - .reset (mem_arb_reset), + .reset (reset), .valid_in (per_bank_mem_req_valid), .ready_in (per_bank_mem_req_ready), .data_in (data_in), diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index 03f3efd41..dbbb4aba3 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -44,6 +44,9 @@ module VX_cache_bank #( // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -105,8 +108,8 @@ module VX_cache_bank #( output wire mem_rsp_ready, // flush - input wire flush_valid, - output wire flush_ready + input wire flush_begin, + output wire flush_end ); localparam PIPELINE_STAGES = 2; @@ -117,6 +120,7 @@ module VX_cache_bank #( wire crsp_queue_stall; wire mshr_alm_full; + wire mreq_queue_empty; wire mreq_queue_alm_full; wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr; @@ -132,11 +136,12 @@ module VX_cache_bank #( wire [MSHR_ADDR_WIDTH-1:0] replay_id; wire replay_ready; - wire is_init_st0; + wire is_init_st0, is_init_st1; wire is_flush_st0, is_flush_st1; wire [NUM_WAYS-1:0] flush_way_st0; wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1; + wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1; wire rw_sel, rw_st0, rw_st1; wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1; wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1; @@ -149,7 +154,8 @@ module VX_cache_bank #( wire is_creq_st0, is_creq_st1; wire is_fill_st0, is_fill_st1; wire is_replay_st0, is_replay_st1; - wire creq_flush_st0, creq_flush_st1; + wire creq_flush_sel, creq_flush_st0, creq_flush_st1; + 
wire evict_dirty_st0, evict_dirty_st1; wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1; wire [NUM_WAYS-1:0] tag_matches_st0; wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0; @@ -157,73 +163,82 @@ module VX_cache_bank #( wire mshr_pending_st0, mshr_pending_st1; wire mshr_empty; - wire line_flush_valid; - wire line_flush_init; - wire [`CS_LINE_SEL_BITS-1:0] line_flush_sel; - wire [NUM_WAYS-1:0] line_flush_way; - wire line_flush_ready; + wire flush_valid; + wire init_valid; + wire [`CS_LINE_SEL_BITS-1:0] flush_sel; + wire [NUM_WAYS-1:0] flush_way; + wire flush_ready; + + // ensure we have no pending memory request in the bank + wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty; // flush unit VX_bank_flush #( + .BANK_ID (BANK_ID), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), .NUM_WAYS (NUM_WAYS), .WRITEBACK (WRITEBACK) ) flush_unit ( - .clk (clk), - .reset (reset), - .flush_in_valid (flush_valid), - .flush_in_ready (flush_ready), - .flush_out_init (line_flush_init), - .flush_out_valid (line_flush_valid), - .flush_out_line (line_flush_sel), - .flush_out_way (line_flush_way), - .flush_out_ready (line_flush_ready), - .mshr_empty (mshr_empty) + .clk (clk), + .reset (reset), + .flush_begin (flush_begin), + .flush_end (flush_end), + .flush_init (init_valid), + .flush_valid (flush_valid), + .flush_line (flush_sel), + .flush_way (flush_way), + .flush_ready (flush_ready), + .mshr_empty (mshr_empty), + .bank_empty (no_pending_req) ); - wire rdw_hazard_st0; - reg rdw_hazard_st1; + wire rdw_hazard1_sel; + wire rdw_hazard2_sel; + reg rdw_hazard3_st1; - wire pipe_stall = crsp_queue_stall || rdw_hazard_st1; + wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1; // inputs arbitration: // mshr replay has highest priority to maximize utilization since there is no miss. // handle memory responses next to prevent deadlock with potential memory request from a miss. 
// flush has precedence over core requests to ensure that the cache is in a consistent state. - wire replay_grant = ~line_flush_init; + wire replay_grant = ~init_valid; wire replay_enable = replay_grant && replay_valid; - wire fill_grant = ~line_flush_init && ~replay_enable; + wire fill_grant = ~init_valid && ~replay_enable; wire fill_enable = fill_grant && mem_rsp_valid; - wire flush_grant = ~line_flush_init && ~replay_enable && ~fill_enable; - wire flush_enable = flush_grant && line_flush_valid; + wire flush_grant = ~init_valid && ~replay_enable && ~fill_enable; + wire flush_enable = flush_grant && flush_valid; - wire creq_grant = ~line_flush_init && ~replay_enable && ~fill_enable && ~flush_enable; + wire creq_grant = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable; wire creq_enable = creq_grant && core_req_valid; assign replay_ready = replay_grant - && ~rdw_hazard_st0 + && ~rdw_hazard1_sel && ~pipe_stall; assign mem_rsp_ready = fill_grant + && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions + && ~rdw_hazard2_sel && ~pipe_stall; - assign line_flush_ready = flush_grant - && ~mreq_queue_alm_full - && ~pipe_stall; + assign flush_ready = flush_grant + && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions + && ~rdw_hazard2_sel + && ~pipe_stall; assign core_req_ready = creq_grant && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall; - wire init_fire = line_flush_init; + wire init_fire = init_valid; wire replay_fire = replay_valid && replay_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; - wire flush_fire = line_flush_valid && line_flush_ready; + wire flush_fire = flush_valid && flush_ready; wire core_req_fire = core_req_valid && core_req_ready; assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire; @@ -232,8 +247,9 @@ module VX_cache_bank #( assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel; assign req_idx_sel = replay_valid ? 
replay_idx : core_req_idx; assign tag_sel = replay_valid ? replay_tag : core_req_tag; + assign creq_flush_sel = core_req_valid && core_req_flush; - assign addr_sel = (line_flush_init | line_flush_valid) ? `CS_LINE_ADDR_WIDTH'(line_flush_sel) : + assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) : (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr)); if (WRITE_ENABLE) begin @@ -260,8 +276,8 @@ module VX_cache_bank #( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_sel, line_flush_init, replay_enable, fill_enable, flush_enable, creq_enable, core_req_flush, line_flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), - .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0}) + .data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), + .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0}) ); if (UUID_WIDTH != 0) begin @@ -273,18 +289,20 @@ module VX_cache_bank #( wire do_init_st0 = valid_st0 && is_init_st0; wire do_flush_st0 = valid_st0 && is_flush_st0; wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0; + wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0; wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0; + wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0; wire do_fill_st0 = valid_st0 && is_fill_st0; - wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0); - wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0; + wire do_cache_wr_st0 = do_creq_wr_st0 || 
do_replay_wr_st0; + wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0; wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0]; - wire [NUM_WAYS-1:0] repl_way_st0; - wire [`CS_TAG_SEL_BITS-1:0] repl_tag_st0; + assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0]; - `RESET_RELAY (tag_reset, reset); + wire [NUM_WAYS-1:0] evict_way_st0; + wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0; VX_cache_tags #( .INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)), @@ -294,42 +312,51 @@ module VX_cache_bank #( .NUM_BANKS (NUM_BANKS), .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), + .WRITEBACK (WRITEBACK), .UUID_WIDTH (UUID_WIDTH) ) cache_tags ( .clk (clk), - .reset (tag_reset), + .reset (reset), .req_uuid (req_uuid_st0), .stall (pipe_stall), - // init/fill/lookup/flush - .init (do_init_st0 || do_flush_st0), + // init/flush/fill/write/lookup + .init (do_init_st0), + .flush (do_flush_st0), .fill (do_fill_st0), + .write (do_cache_wr_st0), .lookup (do_lookup_st0), .line_addr (addr_st0), + .way_sel (flush_way_st0), .tag_matches(tag_matches_st0), // replacement - .repl_way (repl_way_st0), - .repl_tag (repl_tag_st0) + .evict_dirty(evict_dirty_st0), + .evict_way (evict_way_st0), + .evict_tag (evict_tag_st0) ); + wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0; + + wire is_flush2_st0 = WRITEBACK && is_flush_st0; + assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0; - assign way_sel_st0 = is_fill_st0 ? repl_way_st0 : (is_flush_st0 ? flush_way_st0 : tag_matches_st0); + assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0; - wire [`CS_LINE_ADDR_WIDTH-1:0] addr_r_st0 = (is_fill_st0 || is_flush_st0) ? {repl_tag_st0, addr_st0[`CS_LINE_SEL_BITS-1:0]} : addr_st0; + assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? 
{evict_tag_st0, line_sel_st0} : addr_st0; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1), + .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_st0, is_flush_st0, is_replay_st0, is_fill_st0, is_creq_st0, creq_flush_st0, rw_st0, addr_r_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, mshr_pending_st0}), - .data_out ({valid_st1, is_flush_st1, is_replay_st1, is_fill_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, mshr_pending_st1}) + .data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}), + .data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1}) ); // we have a tag hit @@ -343,35 +370,40 @@ module VX_cache_bank #( wire is_read_st1 = is_creq_st1 && ~rw_st1; wire is_write_st1 = is_creq_st1 && rw_st1; + + wire do_init_st1 = valid_st1 && is_init_st1; + wire do_fill_st1 = valid_st1 && is_fill_st1; + wire do_flush_st1 = valid_st1 && is_flush_st1; + wire do_creq_rd_st1 = valid_st1 && is_read_st1; wire do_creq_wr_st1 = valid_st1 && is_write_st1; - wire do_fill_st1 = valid_st1 && is_fill_st1; wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && 
~rw_st1; wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1; - wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1; - wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1; - wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1; wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1; wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1; wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1; - wire do_flush_st1 = valid_st1 && is_flush_st1; + wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1; + wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1; + + assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0]; `UNUSED_VAR (do_write_miss_st1) // ensure mshr replay always get a hit - `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay")); + `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay")); - // detect BRAM's read-during-write hazard - assign rdw_hazard_st0 = do_fill_st0; // stall cycle after a fill - wire rdw_case1 = do_cache_rd_st0 && do_cache_wr_st1 && (addr_st0 == addr_st1); // standard cache access - wire rdw_case2 = WRITEBACK && (do_flush_st0 || do_fill_st0) && do_cache_wr_st1; // a writeback can evict preceeding write - always @(posedge clk) begin // after a write to same address - rdw_hazard_st1 <= (rdw_case1 || rdw_case2) - && ~rdw_hazard_st1; // invalidate if pipeline stalled to avoid repeats + // both tag and data stores use BRAM with no read-during-write protection. + // we need to stall the pipeline to prevent read-after-write hazards. 
+ assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill + assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceding write + always @(posedge clk) begin + // stall reads following writes to same line address + rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1) + && ~rdw_hazard3_st1; // release pipeline stall end wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}}; @@ -380,7 +412,6 @@ module VX_cache_bank #( wire [`CS_LINE_WIDTH-1:0] dirty_data_st1; wire [LINE_SIZE-1:0] dirty_byteen_st1; - wire dirty_valid_st1; if (`CS_WORDS_PER_LINE > 1) begin reg [LINE_SIZE-1:0] write_byteen_r; @@ -393,8 +424,6 @@ module VX_cache_bank #( assign write_byteen_st1 = byteen_st1; end - `RESET_RELAY (data_reset, reset); - VX_cache_data #( .INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)), .BANK_ID (BANK_ID), @@ -405,17 +434,19 @@ module VX_cache_bank #( .WORD_SIZE (WORD_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH) ) cache_data ( .clk (clk), - .reset (data_reset), + .reset (reset), .req_uuid (req_uuid_st1), .stall (pipe_stall), + .init (do_init_st1), .read (do_cache_rd_st1), - .fill (do_fill_st1 && ~rdw_hazard_st1), + .fill (do_fill_st1), .flush (do_flush_st1), .write (do_cache_wr_st1), .way_sel (way_sel_st1), @@ -425,7 +456,6 @@ module VX_cache_bank #( .write_data (write_data_st1), .write_byteen(write_byteen_st1), .read_data (read_data_st1), - .dirty_valid(dirty_valid_st1), .dirty_data (dirty_data_st1), .dirty_byteen(dirty_byteen_st1) ); @@ -461,8 +491,6 @@ module VX_cache_bank #( `UNUSED_PIN (size) ); - `RESET_RELAY (mshr_reset, reset); - VX_cache_mshr #( .INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)), .BANK_ID (BANK_ID), @@ -473,7 +501,7 @@ module VX_cache_bank #( .DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH) ) cache_mshr ( .clk (clk), - 
.reset (mshr_reset), + .reset (reset), .deq_req_uuid (req_uuid_sel), .lkp_req_uuid (req_uuid_st0), @@ -536,16 +564,14 @@ module VX_cache_bank #( assign crsp_queue_data = read_data_st1; assign crsp_queue_tag = tag_st1; - `RESET_RELAY (crsp_queue_reset, reset); - VX_elastic_buffer #( .DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH), .SIZE (CRSQ_SIZE), .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) ) core_rsp_queue ( .clk (clk), - .reset (crsp_queue_reset), - .valid_in (crsp_queue_valid && ~rdw_hazard_st1), + .reset (reset), + .valid_in (crsp_queue_valid && ~rdw_hazard3_st1), .ready_in (crsp_queue_ready), .data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}), .data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}), @@ -557,7 +583,7 @@ module VX_cache_bank #( // schedule memory request - wire mreq_queue_push, mreq_queue_pop, mreq_queue_empty; + wire mreq_queue_push, mreq_queue_pop; wire [`CS_LINE_WIDTH-1:0] mreq_queue_data; wire [LINE_SIZE-1:0] mreq_queue_byteen; wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr; @@ -565,30 +591,42 @@ module VX_cache_bank #( wire mreq_queue_rw; wire mreq_queue_flush; - wire is_evict_st1 = (is_fill_st1 || is_flush_st1) && dirty_valid_st1; - wire do_writeback_st1 = valid_st1 && is_evict_st1; - `UNUSED_VAR (do_writeback_st1) + wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1; + wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1; + wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1; if (WRITEBACK) begin + if (DIRTY_BYTES) begin + // ensure dirty bytes match the tag info + wire has_dirty_bytes = (| dirty_byteen_st1); + `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID))); + end assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1) || do_writeback_st1) - && ~rdw_hazard_st1; + && ~rdw_hazard3_st1; end else 
begin - `UNUSED_VAR (dirty_valid_st1) + `UNUSED_VAR (do_writeback_st1) assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1) || do_creq_wr_st1) - && ~rdw_hazard_st1; + && ~rdw_hazard3_st1; end - assign mreq_queue_pop = mem_req_valid && mem_req_ready; - assign mreq_queue_rw = WRITE_ENABLE && (WRITEBACK ? is_evict_st1 : rw_st1); + assign mreq_queue_pop = mem_req_valid && mem_req_ready; assign mreq_queue_addr = addr_st1; - assign mreq_queue_id = mshr_id_st1; - assign mreq_queue_data = is_write_st1 ? write_data_st1 : dirty_data_st1; - assign mreq_queue_byteen = is_write_st1 ? write_byteen_st1 : dirty_byteen_st1; + assign mreq_queue_id = mshr_id_st1; assign mreq_queue_flush = creq_flush_st1; - `RESET_RELAY (mreq_queue_reset, reset); + if (WRITE_ENABLE) begin + assign mreq_queue_rw = WRITEBACK ? is_fill_or_flush_st1 : rw_st1; + assign mreq_queue_data = WRITEBACK ? dirty_data_st1 : write_data_st1; + assign mreq_queue_byteen = WRITEBACK ? dirty_byteen_st1 : write_byteen_st1; + end else begin + assign mreq_queue_rw = 0; + assign mreq_queue_data = 0; + assign mreq_queue_byteen = 0; + `UNUSED_VAR (dirty_data_st1) + `UNUSED_VAR (dirty_byteen_st1) + end VX_fifo_queue #( .DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1), @@ -597,7 +635,7 @@ module VX_cache_bank #( .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_queue ( .clk (clk), - .reset (mreq_queue_reset), + .reset (reset), .push (mreq_queue_push), .pop (mreq_queue_pop), .data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}), @@ -621,32 +659,32 @@ module VX_cache_bank #( `ifdef DBG_TRACE_CACHE wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready; - wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid || line_flush_valid) - && ~(replay_fire || mem_rsp_fire || core_req_fire || line_flush_valid); + wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid) + && ~(replay_fire 
|| mem_rsp_fire || core_req_fire || flush_fire); always @(posedge clk) begin - if (pipeline_stall) begin - `TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw_st0=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard_st0)); + if (input_stall || pipe_stall) begin + `TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1)); end if (mem_rsp_fire) begin - `TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data)); + `TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data)); end if (replay_fire) begin `TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel)); end if (core_req_fire) begin if (core_req_rw) - `TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel)); + `TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel)); else `TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel)); end if (crsp_queue_fire) begin - `TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, 
BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1)); + `TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1)); end if (mreq_queue_push) begin if (do_creq_wr_st1 && !WRITEBACK) - `TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1)); + `TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1)); else if (do_writeback_st1) - `TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%b, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data)); + `TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data)); else `TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1)); end diff --git a/hw/rtl/cache/VX_cache_cluster.sv b/hw/rtl/cache/VX_cache_cluster.sv index c567ddbc5..939768b63 100644 --- a/hw/rtl/cache/VX_cache_cluster.sv +++ b/hw/rtl/cache/VX_cache_cluster.sv @@ -49,6 +49,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -99,6 +102,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .TAG_WIDTH (ARB_TAG_WIDTH) ) arb_core_bus_if[NUM_CACHES * NUM_REQS](); + `RESET_RELAY_EX (cache_arb_reset, reset, NUM_REQS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_REQS; ++i) begin 
VX_mem_bus_if #( .DATA_SIZE (WORD_SIZE), @@ -114,8 +119,6 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( `ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]); end - `RESET_RELAY (arb_reset, reset); - VX_mem_arb #( .NUM_INPUTS (NUM_INPUTS), .NUM_OUTPUTS (NUM_CACHES), @@ -127,7 +130,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0) ) cache_arb ( .clk (clk), - .reset (arb_reset), + .reset (cache_arb_reset[i]), .bus_in_if (core_bus_tmp_if), .bus_out_if (arb_core_bus_tmp_if) ); @@ -155,6 +158,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (ARB_TAG_WIDTH), .TAG_SEL_IDX (TAG_SEL_IDX), diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index 6bf8f1c3e..a114e1689 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -30,6 +30,8 @@ module VX_cache_data #( parameter WRITE_ENABLE = 1, // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, // Request debug identifier parameter UUID_WIDTH = 0 ) ( @@ -42,6 +44,7 @@ module VX_cache_data #( input wire stall, + input wire init, input wire read, input wire fill, input wire flush, @@ -53,89 +56,88 @@ module VX_cache_data #( input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen, input wire [NUM_WAYS-1:0] way_sel, output wire [`CS_WORD_WIDTH-1:0] read_data, - output wire dirty_valid, output wire [`CS_LINE_WIDTH-1:0] dirty_data, output wire [LINE_SIZE-1:0] dirty_byteen ); `UNUSED_SPARAM (INSTANCE_ID) `UNUSED_PARAM (BANK_ID) `UNUSED_PARAM (WORD_SIZE) - `UNUSED_VAR (reset) `UNUSED_VAR (stall) `UNUSED_VAR (line_addr) + `UNUSED_VAR (init) `UNUSED_VAR (read) `UNUSED_VAR (flush) localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? 
(LINE_SIZE * NUM_WAYS) : 1; wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; + + wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata; wire [`LOG2UP(NUM_WAYS)-1:0] way_idx; if (WRITEBACK) begin - reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0][LINE_SIZE-1:0] dirty_bytes_r; - reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0] dirty_blocks_r; + if (DIRTY_BYTES) begin + wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata; + wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata; - wire [`CLOG2(`CS_LINES_PER_BANK * NUM_WAYS)-1:0] way_addr; - if (NUM_WAYS > 1) begin - assign way_addr = {line_sel, way_idx}; + for (genvar i = 0; i < NUM_WAYS; ++i) begin + wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]); + assign bs_wdata[i] = init ? '0 : (way_sel[i] ? wdata : bs_rdata[i]); + end + + VX_sp_ram #( + .DATAW (LINE_SIZE * NUM_WAYS), + .SIZE (`CS_LINES_PER_BANK) + ) byteen_store ( + .clk (clk), + .reset (reset), + .read (write || fill || flush), + .write (init || write || fill || flush), + .wren (1'b1), + .addr (line_sel), + .wdata (bs_wdata), + .rdata (bs_rdata) + ); + + assign dirty_byteen = bs_rdata[way_idx]; end else begin - assign way_addr = line_sel; + assign dirty_byteen = {LINE_SIZE{1'b1}}; end - always @(posedge clk) begin - if (fill) begin - dirty_bytes_r[way_addr] <= '0; - end else if (write) begin - dirty_bytes_r[way_addr] <= dirty_bytes_r[way_addr] | write_byteen; + wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] flipped_rdata; + for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin + for (genvar j = 0; j < NUM_WAYS; ++j) begin + assign flipped_rdata[j][i] = line_rdata[i][j]; end end - - always @(posedge clk) begin - if (reset) begin - for (integer i = 0; i < `CS_LINES_PER_BANK * NUM_WAYS; ++i) begin - dirty_blocks_r[i] <= 0; - end - end else begin - if (fill) begin - dirty_blocks_r[way_addr] <= 0; - end else if (write) begin - dirty_blocks_r[way_addr] <= 1; - end - end - end - - 
assign dirty_byteen = dirty_bytes_r[way_addr]; - assign dirty_valid = dirty_blocks_r[way_addr]; + assign dirty_data = flipped_rdata[way_idx]; end else begin assign dirty_byteen = '0; - assign dirty_valid = 0; + assign dirty_data = '0; end // order the data layout to perform ways multiplexing last. - // this allows converting way index to binary in parallel with BRAM read. + // this allows converting way index to binary in parallel with BRAM read access and way selection. - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata; - wire [BYTEENW-1:0] wren; + wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata; + wire [BYTEENW-1:0] line_wren; if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin - for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin - assign wdata[i] = (fill || !WRITE_ENABLE) ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{write_data[i]}}; - end - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w; for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin for (genvar j = 0; j < NUM_WAYS; ++j) begin + assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i]; assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? 
{WORD_SIZE{1'b1}} : write_byteen[i]) & {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}}; end end - assign wren = wren_w; + assign line_wren = wren_w; end else begin `UNUSED_VAR (write) `UNUSED_VAR (write_byteen) `UNUSED_VAR (write_data) - assign wdata = fill_data; - assign wren = fill; + assign line_wdata = fill_data; + assign line_wren = fill; end VX_onehot_encoder #( @@ -146,53 +148,50 @@ module VX_cache_data #( `UNUSED_PIN (valid_out) ); - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata; + wire line_read = (read && ~stall) + || (WRITEBACK && (fill || flush)); + + wire line_write = write || fill; VX_sp_ram #( .DATAW (`CS_LINE_WIDTH * NUM_WAYS), .SIZE (`CS_LINES_PER_BANK), .WRENW (BYTEENW), - .NO_RWCHECK (1) + .NO_RWCHECK (1), + .RW_ASSERT (1) ) data_store ( .clk (clk), - .read (1'b1), - .write (write || fill), - .wren (wren), + .reset (reset), + .read (line_read), + .write (line_write), + .wren (line_wren), .addr (line_sel), - .wdata (wdata), - .rdata (rdata) + .wdata (line_wdata), + .rdata (line_rdata) ); wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata; if (`CS_WORDS_PER_LINE > 1) begin - assign per_way_rdata = rdata[wsel]; + assign per_way_rdata = line_rdata[wsel]; end else begin `UNUSED_VAR (wsel) - assign per_way_rdata = rdata; + assign per_way_rdata = line_rdata; end assign read_data = per_way_rdata[way_idx]; - wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] dirty_data_w; - for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin - for (genvar j = 0; j < NUM_WAYS; ++j) begin - assign dirty_data_w[j][i] = rdata[i][j]; - end - end - assign dirty_data = dirty_data_w[way_idx]; - `ifdef DBG_TRACE_CACHE always @(posedge clk) begin if (fill && ~stall) begin - `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data)); + `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, 
`CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data)); end if (flush && ~stall) begin - `TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b, byteen=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_valid, dirty_byteen)); + `TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data)); end if (read && ~stall) begin - `TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid)); + `TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid)); end if (write && ~stall) begin - `TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid)); + `TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid)); end end `endif diff --git a/hw/rtl/cache/VX_cache_flush.sv b/hw/rtl/cache/VX_cache_flush.sv index 7c46a48f0..7a33565fc 100644 --- a/hw/rtl/cache/VX_cache_flush.sv +++ b/hw/rtl/cache/VX_cache_flush.sv @@ -26,13 +26,16 @@ module VX_cache_flush #( VX_mem_bus_if.slave core_bus_in_if [NUM_REQS], VX_mem_bus_if.master core_bus_out_if [NUM_REQS], input wire [NUM_BANKS-1:0] bank_req_fire, - output wire [NUM_BANKS-1:0] flush_valid, - input wire [NUM_BANKS-1:0] flush_ready + output wire [NUM_BANKS-1:0] flush_begin, + input wire [NUM_BANKS-1:0] 
flush_end ); localparam STATE_IDLE = 0; - localparam STATE_WAIT = 1; + localparam STATE_WAIT1 = 1; localparam STATE_FLUSH = 2; - localparam STATE_DONE = 3; + localparam STATE_WAIT2 = 3; + localparam STATE_DONE = 4; + + reg [2:0] state, state_n; // track in-flight core requests @@ -76,8 +79,6 @@ module VX_cache_flush #( `UNUSED_VAR (bank_req_fire) end - - reg [1:0] state, state_n; reg [NUM_BANKS-1:0] flush_done, flush_done_n; wire [NUM_REQS-1:0] flush_req_mask; @@ -113,22 +114,32 @@ module VX_cache_flush #( case (state) STATE_IDLE: begin if (flush_req_enable) begin - state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT : STATE_FLUSH; + state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH; end end - STATE_WAIT: begin + STATE_WAIT1: begin if (no_inflight_reqs) begin state_n = STATE_FLUSH; end end STATE_FLUSH: begin - flush_done_n = flush_done | flush_ready; - if (flush_done_n == 0) begin + // generate a flush request pulse + state_n = STATE_WAIT2; + end + STATE_WAIT2: begin + // wait for all banks to finish flushing + flush_done_n = flush_done | flush_end; + if (flush_done_n == {NUM_BANKS{1'b1}}) begin state_n = STATE_DONE; + flush_done_n = '0; + // only release current flush requests + // and keep normal requests locked lock_released_n = flush_req_mask; end end STATE_DONE: begin + // wait until released flush requests are issued + // when returning to IDLE state other requests will unlock lock_released_n = lock_released & ~core_bus_out_ready; if (lock_released_n == 0) begin state_n = STATE_IDLE; @@ -149,6 +160,6 @@ module VX_cache_flush #( end end - assign flush_valid = {NUM_BANKS{state == STATE_FLUSH}}; + assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}}; endmodule diff --git a/hw/rtl/cache/VX_cache_init.sv b/hw/rtl/cache/VX_cache_init.sv deleted file mode 100644 index 3cccdcdae..000000000 --- a/hw/rtl/cache/VX_cache_init.sv +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// 
you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -`include "VX_cache_define.vh" - -// cache flush unit -module VX_cache_init #( - // Size of cache in bytes - parameter CACHE_SIZE = 1024, - // Size of line inside a bank in bytes - parameter LINE_SIZE = 16, - // Number of banks - parameter NUM_BANKS = 1, - // Number of associative ways - parameter NUM_WAYS = 1 -) ( - input wire clk, - input wire reset, - output wire [`CS_LINE_SEL_BITS-1:0] addr_out, - output wire valid_out -); - reg enabled; - reg [`CS_LINE_SEL_BITS-1:0] line_ctr; - - always @(posedge clk) begin - if (reset) begin - enabled <= 1; - line_ctr <= '0; - end else begin - if (enabled) begin - if (line_ctr == ((2 ** `CS_LINE_SEL_BITS)-1)) begin - enabled <= 0; - end - line_ctr <= line_ctr + `CS_LINE_SEL_BITS'(1); - end - end - end - - assign addr_out = line_ctr; - assign valid_out = enabled; - -endmodule diff --git a/hw/rtl/cache/VX_cache_mshr.sv b/hw/rtl/cache/VX_cache_mshr.sv index b0e577283..4f8163269 100644 --- a/hw/rtl/cache/VX_cache_mshr.sv +++ b/hw/rtl/cache/VX_cache_mshr.sv @@ -232,9 +232,10 @@ module VX_cache_mshr #( .LUTRAM (1) ) entries ( .clk (clk), + .reset (reset), .read (1'b1), .write (allocate_valid), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (allocate_id_r), .wdata (allocate_data), .raddr (dequeue_id_r), diff --git a/hw/rtl/cache/VX_cache_tags.sv b/hw/rtl/cache/VX_cache_tags.sv index 4595bdbcf..7fef69be6 100644 --- a/hw/rtl/cache/VX_cache_tags.sv +++ b/hw/rtl/cache/VX_cache_tags.sv @@ -26,6 +26,8 @@ module VX_cache_tags #( parameter NUM_WAYS = 1, // Size of 
a word in bytes parameter WORD_SIZE = 1, + // Enable cache writeback + parameter WRITEBACK = 0, // Request debug identifier parameter UUID_WIDTH = 0 ) ( @@ -40,74 +42,100 @@ module VX_cache_tags #( // init/fill/lookup input wire init, + input wire flush, input wire fill, + input wire write, input wire lookup, input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr, + input wire [NUM_WAYS-1:0] way_sel, output wire [NUM_WAYS-1:0] tag_matches, - // replacement - output wire [NUM_WAYS-1:0] repl_way, - output wire [`CS_TAG_SEL_BITS-1:0] repl_tag + // eviction + output wire evict_dirty, + output wire [NUM_WAYS-1:0] evict_way, + output wire [`CS_TAG_SEL_BITS-1:0] evict_tag ); `UNUSED_SPARAM (INSTANCE_ID) `UNUSED_PARAM (BANK_ID) - `UNUSED_VAR (reset) `UNUSED_VAR (lookup) - // valid, tag - localparam TAG_WIDTH = 1 + `CS_TAG_SEL_BITS; + // valid, dirty, tag + localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS; wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr); wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag; wire [NUM_WAYS-1:0] read_valid; + wire [NUM_WAYS-1:0] read_dirty; if (NUM_WAYS > 1) begin - reg [NUM_WAYS-1:0] repl_way_r; + reg [NUM_WAYS-1:0] evict_way_r; // cyclic assignment of replacement way always @(posedge clk) begin if (reset) begin - repl_way_r <= 1; - end else if (~stall) begin // hold the value on stalls prevent filling different slots twice - repl_way_r <= {repl_way_r[NUM_WAYS-2:0], repl_way_r[NUM_WAYS-1]}; + evict_way_r <= 1; + end else if (~stall) begin // holding the value on stalls prevents filling different slots twice + evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]}; end end - assign repl_way = repl_way_r; + assign evict_way = fill ? 
evict_way_r : way_sel; VX_onehot_mux #( .DATAW (`CS_TAG_SEL_BITS), .N (NUM_WAYS) - ) repl_tag_sel ( + ) evict_tag_sel ( .data_in (read_tag), - .sel_in (repl_way_r), - .data_out (repl_tag) + .sel_in (evict_way), + .data_out (evict_tag) ); end else begin `UNUSED_VAR (stall) - assign repl_way = 1'b1; - assign repl_tag = read_tag; + assign evict_way = 1'b1; + assign evict_tag = read_tag; end + // fill and flush need to also read in writeback mode + wire fill_s = fill && (!WRITEBACK || ~stall); + wire flush_s = flush && (!WRITEBACK || ~stall); + for (genvar i = 0; i < NUM_WAYS; ++i) begin - wire do_fill = fill && repl_way[i]; - wire do_write = init || do_fill; - wire line_valid = ~init; + wire do_fill = fill_s && evict_way[i]; + wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode + wire do_write = WRITEBACK && write && tag_matches[i]; + + wire line_read = (WRITEBACK && (fill_s || flush_s)); + wire line_write = init || do_fill || do_flush || do_write; + wire line_valid = ~(init || flush); + + wire [TAG_WIDTH-1:0] line_wdata; + wire [TAG_WIDTH-1:0] line_rdata; + + if (WRITEBACK) begin + assign line_wdata = {line_valid, write, line_tag}; + assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata; + end else begin + assign line_wdata = {line_valid, line_tag}; + assign {read_valid[i], read_tag[i]} = line_rdata; + assign read_dirty[i] = 1'b0; + end VX_sp_ram #( .DATAW (TAG_WIDTH), .SIZE (`CS_LINES_PER_BANK), - .NO_RWCHECK (1) + .NO_RWCHECK (1), + .RW_ASSERT (1) ) tag_store ( .clk (clk), - .read (1'b1), - .write (do_write), - `UNUSED_PIN (wren), + .reset (reset), + .read (line_read), + .write (line_write), + .wren (1'b1), .addr (line_sel), - .wdata ({line_valid, line_tag}), - .rdata ({read_valid[i], read_tag[i]}) + .wdata (line_wdata), + .rdata (line_rdata) ); end @@ -115,19 +143,31 @@ module VX_cache_tags #( assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]); end + assign evict_dirty = | (read_dirty & 
evict_way); + `ifdef DBG_TRACE_CACHE + wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel}; always @(posedge clk) begin if (fill && ~stall) begin - `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), repl_way, line_sel, line_tag)); + `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID))); end if (init) begin `TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel)); end + if (flush && ~stall) begin + `TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty)); + end if (lookup && ~stall) begin if (tag_matches != 0) begin - `TRACE(3, ("%d: %s hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); + if (write) + `TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); + else + `TRACE(3, ("%d: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); end else begin - `TRACE(3, ("%d: %s miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)); + if (write) + `TRACE(3, ("%d: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, 
req_uuid)); + else + `TRACE(3, ("%d: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)); end end end diff --git a/hw/rtl/cache/VX_cache_top.sv b/hw/rtl/cache/VX_cache_top.sv index 59dd1c364..0959701aa 100644 --- a/hw/rtl/cache/VX_cache_top.sv +++ b/hw/rtl/cache/VX_cache_top.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,20 +20,20 @@ module VX_cache_top import VX_gpu_pkg::*; #( parameter NUM_REQS = 4, // Size of cache in bytes - parameter CACHE_SIZE = 16384, + parameter CACHE_SIZE = 16384, // Size of line inside a bank in bytes - parameter LINE_SIZE = 64, + parameter LINE_SIZE = 64, // Number of banks parameter NUM_BANKS = 4, // Number of associative ways parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = 4, + parameter WORD_SIZE = 4, // Core Response Queue Size parameter CRSQ_SIZE = 2, // Miss Reserv Queue Knob - parameter MSHR_SIZE = 16, + parameter MSHR_SIZE = 16, // Memory Response Queue Size parameter MRSQ_SIZE = 0, // Memory Request Queue Size @@ -42,6 +42,12 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Enable cache writeable parameter WRITE_ENABLE = 1, + // Enable cache writeback + parameter WRITEBACK = 0, + + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -55,7 +61,7 @@ module VX_cache_top import VX_gpu_pkg::*; #( parameter MEM_OUT_BUF = 2, parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + 
`CLOG2(NUM_BANKS) - ) ( + ) ( input wire clk, input wire reset, @@ -82,17 +88,17 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Memory request output wire mem_req_valid, - output wire mem_req_rw, + output wire mem_req_rw, output wire [LINE_SIZE-1:0] mem_req_byteen, output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr, - output wire [`CS_LINE_WIDTH-1:0] mem_req_data, - output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, + output wire [`CS_LINE_WIDTH-1:0] mem_req_data, + output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, input wire mem_req_ready, - + // Memory response - input wire mem_rsp_valid, + input wire mem_rsp_valid, input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data, - input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag, + input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag, output wire mem_rsp_ready ); VX_mem_bus_if #( @@ -127,18 +133,18 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Memory request assign mem_req_valid = mem_bus_if.req_valid; - assign mem_req_rw = mem_bus_if.req_data.rw; + assign mem_req_rw = mem_bus_if.req_data.rw; assign mem_req_byteen = mem_bus_if.req_data.byteen; assign mem_req_addr = mem_bus_if.req_data.addr; - assign mem_req_data = mem_bus_if.req_data.data; - assign mem_req_tag = mem_bus_if.req_data.tag; + assign mem_req_data = mem_bus_if.req_data.data; + assign mem_req_tag = mem_bus_if.req_data.tag; assign mem_bus_if.req_ready = mem_req_ready; `UNUSED_VAR (mem_bus_if.req_data.atype) - + // Memory response - assign mem_bus_if.rsp_valid = mem_rsp_valid; + assign mem_bus_if.rsp_valid = mem_rsp_valid; assign mem_bus_if.rsp_data.data = mem_rsp_data; - assign mem_bus_if.rsp_data.tag = mem_rsp_tag; + assign mem_bus_if.rsp_data.tag = mem_rsp_tag; assign mem_rsp_ready = mem_bus_if.rsp_ready; VX_cache #( @@ -156,6 +162,8 @@ module VX_cache_top import VX_gpu_pkg::*; #( .TAG_WIDTH (TAG_WIDTH), .UUID_WIDTH (UUID_WIDTH), .WRITE_ENABLE (WRITE_ENABLE), + .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), .CORE_OUT_BUF (CORE_OUT_BUF), .MEM_OUT_BUF (MEM_OUT_BUF) ) cache ( diff 
--git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv index 082d8b4e1..37940297f 100644 --- a/hw/rtl/cache/VX_cache_wrap.sv +++ b/hw/rtl/cache/VX_cache_wrap.sv @@ -48,6 +48,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -187,6 +190,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), .CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF), @@ -223,12 +227,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( always @(posedge clk) begin if (core_req_fire) begin if (core_bus_if[i].req_data.rw) - `TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid)); + `TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid)); else `TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid)); end if (core_rsp_fire) begin - `TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid)); + `TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid)); end end end @@ 
-250,14 +254,14 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( always @(posedge clk) begin if (mem_req_fire) begin if (mem_bus_if.req_data.rw) - `TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", + `TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid)); else `TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid)); end if (mem_rsp_fire) begin - `TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n", + `TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid)); end end diff --git a/hw/rtl/core/VX_alu_muldiv.sv b/hw/rtl/core/VX_alu_muldiv.sv index 460295463..3beb035f4 100644 --- a/hw/rtl/core/VX_alu_muldiv.sv +++ b/hw/rtl/core/VX_alu_muldiv.sv @@ -83,7 +83,7 @@ module VX_alu_muldiv #( .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( - .clk(clk), + .clk (clk), .reset (reset), .enable (mul_ready_in), .data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, mul_result_tmp}), @@ -324,6 +324,7 @@ module VX_alu_muldiv #( VX_stream_arb #( .NUM_INPUTS (2), .DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)), + .ARBITER ("F"), .OUT_BUF (1) ) rsp_buf ( .clk (clk), diff --git a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv index d8c131838..86bcaf05e 100644 --- a/hw/rtl/core/VX_alu_unit.sv +++ b/hw/rtl/core/VX_alu_unit.sv @@ -57,7 +57,7 @@ module VX_alu_unit #( for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin - `RESET_RELAY (block_reset, reset); + `RESET_RELAY_EN (block_reset, 
reset,(BLOCK_SIZE > 1)); wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV); @@ -72,15 +72,13 @@ module VX_alu_unit #( assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op; assign int_execute_if.data = per_block_execute_if[block_idx].data; - `RESET_RELAY (int_reset, block_reset); - VX_alu_int #( .INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)), .BLOCK_IDX (block_idx), .NUM_LANES (NUM_LANES) ) alu_int ( .clk (clk), - .reset (int_reset), + .reset (block_reset), .execute_if (int_execute_if), .branch_ctl_if (branch_ctl_if[block_idx]), .commit_if (int_commit_if) @@ -99,14 +97,12 @@ module VX_alu_unit #( assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op; assign muldiv_execute_if.data = per_block_execute_if[block_idx].data; - `RESET_RELAY (muldiv_reset, block_reset); - VX_alu_muldiv #( .INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)), .NUM_LANES (NUM_LANES) ) muldiv_unit ( .clk (clk), - .reset (muldiv_reset), + .reset (block_reset), .execute_if (muldiv_execute_if), .commit_if (muldiv_commit_if) ); @@ -121,15 +117,14 @@ module VX_alu_unit #( // send response - `RESET_RELAY (arb_reset, block_reset); - VX_stream_arb #( .NUM_INPUTS (RSP_ARB_SIZE), .DATAW (RSP_ARB_DATAW), - .OUT_BUF (PARTIAL_BW ? 1 : 3) + .OUT_BUF (PARTIAL_BW ? 
1 : 3), + .ARBITER ("F") ) rsp_arb ( .clk (clk), - .reset (arb_reset), + .reset (block_reset), .valid_in ({ `ifdef EXT_M_ENABLE muldiv_commit_if.valid, diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 090f47199..4c82db812 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -313,6 +313,7 @@ module VX_core import VX_gpu_pkg::*; #( .DATA_SIZE (DCACHE_WORD_SIZE), .TAG_WIDTH (DCACHE_TAG_WIDTH), .TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH), + .ARBITER ("P"), .REQ_OUT_BUF (0), .RSP_OUT_BUF (0) ) lsu_adapter ( diff --git a/hw/rtl/core/VX_dcr_data.sv b/hw/rtl/core/VX_dcr_data.sv index 58e51efc5..4ac137547 100644 --- a/hw/rtl/core/VX_dcr_data.sv +++ b/hw/rtl/core/VX_dcr_data.sv @@ -52,7 +52,7 @@ module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; ( if (dcr_bus_if.write_valid) begin `TRACE(1, ("%d: base-dcr: state=", $time)); trace_base_dcr(1, dcr_bus_if.write_addr); - `TRACE(1, (", data=0x%0h\n", dcr_bus_if.write_data)); + `TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data)); end end `endif diff --git a/hw/rtl/core/VX_dispatch_unit.sv b/hw/rtl/core/VX_dispatch_unit.sv index 4adde52ab..618ea1221 100644 --- a/hw/rtl/core/VX_dispatch_unit.sv +++ b/hw/rtl/core/VX_dispatch_unit.sv @@ -40,7 +40,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH); localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN); localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1; - localparam FANOUT_ENABLE= (`NUM_THREADS > MAX_FANOUT); + localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2)); localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS); localparam DATA_REGS_OFF = 0; @@ -85,6 +85,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( wire 
[ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx); assign issue_indices[block_idx] = issue_idx; + `RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1)); + wire valid_p, ready_p; if (`NUM_THREADS != NUM_LANES) begin @@ -100,7 +102,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( wire fire_eop = fire_p && is_last_p; always @(posedge clk) begin - if (reset) begin + if (block_reset) begin sent_mask_p <= '0; is_first_p <= 1; end else begin @@ -215,8 +217,6 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( assign isw = block_idx; end - `RESET_RELAY(buf_out_reset, reset); - wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw); VX_elastic_buffer #( @@ -225,7 +225,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) buf_out ( .clk (clk), - .reset (buf_out_reset), + .reset (block_reset), .valid_in (valid_p), .ready_in (ready_p), .data_in ({ diff --git a/hw/rtl/core/VX_fetch.sv b/hw/rtl/core/VX_fetch.sv index 59c419a83..043a87939 100644 --- a/hw/rtl/core/VX_fetch.sv +++ b/hw/rtl/core/VX_fetch.sv @@ -56,9 +56,10 @@ module VX_fetch import VX_gpu_pkg::*; #( .LUTRAM (1) ) tag_store ( .clk (clk), + .reset (reset), .read (1'b1), .write (icache_req_fire), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (req_tag), .wdata ({schedule_if.data.PC, schedule_if.data.tmask}), .raddr (rsp_tag), diff --git a/hw/rtl/core/VX_fpu_unit.sv b/hw/rtl/core/VX_fpu_unit.sv index 8622db490..496b24e29 100644 --- a/hw/rtl/core/VX_fpu_unit.sv +++ b/hw/rtl/core/VX_fpu_unit.sv @@ -57,7 +57,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( `UNUSED_VAR (per_block_execute_if[block_idx].data.tid) `UNUSED_VAR (per_block_execute_if[block_idx].data.wb) - `RESET_RELAY (block_reset, reset); + `RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1)); // Store request info wire fpu_req_valid, fpu_req_ready; @@ -84,14 +84,12 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( wire 
execute_fire = per_block_execute_if[block_idx].valid && per_block_execute_if[block_idx].ready; wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready; - `RESET_RELAY (ibuf_reset, block_reset); - VX_index_buffer #( .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + PID_WIDTH + 1 + 1), .SIZE (`FPUQ_SIZE) ) tag_store ( .clk (clk), - .reset (ibuf_reset), + .reset (block_reset), .acquire_en (execute_fire), .write_addr (fpu_req_tag), .write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}), @@ -113,8 +111,6 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( assign fpu_req_valid = per_block_execute_if[block_idx].valid && ~mdata_full; assign per_block_execute_if[block_idx].ready = fpu_req_ready && ~mdata_full; - `RESET_RELAY (fpu_reset, block_reset); - `ifdef FPU_DPI VX_fpu_dpi #( @@ -123,7 +119,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .OUT_BUF (PARTIAL_BW ? 1 : 3) ) fpu_dpi ( .clk (clk), - .reset (fpu_reset), + .reset (block_reset), .valid_in (fpu_req_valid), .mask_in (per_block_execute_if[block_idx].data.tmask), @@ -152,7 +148,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .OUT_BUF (PARTIAL_BW ? 1 : 3) ) fpu_fpnew ( .clk (clk), - .reset (fpu_reset), + .reset (block_reset), .valid_in (fpu_req_valid), .mask_in (per_block_execute_if[block_idx].data.tmask), @@ -181,7 +177,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .OUT_BUF (PARTIAL_BW ? 
1 : 3) ) fpu_dsp ( .clk (clk), - .reset (fpu_reset), + .reset (block_reset), .valid_in (fpu_req_valid), .mask_in (per_block_execute_if[block_idx].data.tmask), @@ -228,14 +224,12 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( // send response - `RESET_RELAY (rsp_reset, block_reset); - VX_elastic_buffer #( .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1), .SIZE (0) ) rsp_buf ( .clk (clk), - .reset (rsp_reset), + .reset (block_reset), .valid_in (fpu_rsp_valid), .ready_in (fpu_rsp_ready), .data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}), diff --git a/hw/rtl/core/VX_gather_unit.sv b/hw/rtl/core/VX_gather_unit.sv index 98d362056..293495eba 100644 --- a/hw/rtl/core/VX_gather_unit.sv +++ b/hw/rtl/core/VX_gather_unit.sv @@ -79,15 +79,13 @@ module VX_gather_unit import VX_gpu_pkg::*; #( .NUM_LANES (NUM_LANES) ) commit_tmp_if(); - `RESET_RELAY(commit_out_reset, reset); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (commit_out_reset), + .reset (reset), .valid_in (commit_out_valid[i]), .ready_in (commit_out_ready[i]), .data_in (commit_out_data[i]), diff --git a/hw/rtl/core/VX_ipdom_stack.sv b/hw/rtl/core/VX_ipdom_stack.sv index 01d5ec78e..0ec05cbae 100644 --- a/hw/rtl/core/VX_ipdom_stack.sv +++ b/hw/rtl/core/VX_ipdom_stack.sv @@ -72,9 +72,10 @@ module VX_ipdom_stack #( .LUTRAM (OUT_REG ? 
0 : 1) ) store ( .clk (clk), + .reset (reset), .read (1'b1), .write (push), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (wr_ptr), .wdata ({q1, q0}), .raddr (rd_ptr), diff --git a/hw/rtl/core/VX_lmem_unit.sv b/hw/rtl/core/VX_lmem_unit.sv index e896b4000..accb7a586 100644 --- a/hw/rtl/core/VX_lmem_unit.sv +++ b/hw/rtl/core/VX_lmem_unit.sv @@ -39,6 +39,8 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .TAG_WIDTH (LSU_TAG_WIDTH) ) lsu_switch_if[`NUM_LSU_BLOCKS](); + `RESET_RELAY_EX (block_reset, reset, `NUM_LSU_BLOCKS, 1); + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin wire [`NUM_LSU_LANES-1:0] is_addr_local_mask; @@ -52,15 +54,13 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( wire req_global_ready; wire req_local_ready; - `RESET_RELAY (switch_reset, reset); - VX_elastic_buffer #( .DATAW (REQ_DATAW), .SIZE (2), .OUT_REG (1) ) req_global_buf ( .clk (clk), - .reset (switch_reset), + .reset (block_reset[i]), .valid_in (lsu_mem_in_if[i].req_valid && is_addr_global), .data_in ({ lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask, @@ -91,7 +91,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .OUT_REG (0) ) req_local_buf ( .clk (clk), - .reset (switch_reset), + .reset (block_reset[i]), .valid_in (lsu_mem_in_if[i].req_valid && is_addr_local), .data_in ({ lsu_mem_in_if[i].req_data.mask & is_addr_local_mask, @@ -126,7 +126,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .OUT_BUF (1) ) rsp_arb ( .clk (clk), - .reset (switch_reset), + .reset (block_reset[i]), .valid_in ({ lsu_switch_if[i].rsp_valid, lsu_mem_out_if[i].rsp_valid @@ -157,18 +157,17 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .TAG_WIDTH (LSU_TAG_WIDTH) ) lmem_bus_tmp_if[`NUM_LSU_LANES](); - `RESET_RELAY (adapter_reset, reset); - VX_lsu_adapter #( .NUM_LANES (`NUM_LSU_LANES), .DATA_SIZE (LSU_WORD_SIZE), .TAG_WIDTH (LSU_TAG_WIDTH), .TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH), + .ARBITER ("P"), .REQ_OUT_BUF (3), .RSP_OUT_BUF (0) ) lsu_adapter ( .clk (clk), - .reset (adapter_reset), + .reset 
(block_reset[i]), .lsu_mem_if (lsu_switch_if[i]), .mem_bus_if (lmem_bus_tmp_if) ); diff --git a/hw/rtl/core/VX_lsu_slice.sv b/hw/rtl/core/VX_lsu_slice.sv index 120dc9f8e..8c685fca2 100644 --- a/hw/rtl/core/VX_lsu_slice.sv +++ b/hw/rtl/core/VX_lsu_slice.sv @@ -490,6 +490,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( VX_stream_arb #( .NUM_INPUTS (2), .DATAW (RSP_ARB_DATAW), + .ARBITER ("P"), // prioritize commit_rsp_if .OUT_BUF (3) ) rsp_arb ( .clk (clk), diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 17d8a9d0c..e3df0c1fa 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -13,6 +13,13 @@ `include "VX_define.vh" +// reset all GPRs in debug mode +`ifdef SIMULATION +`ifndef NDEBUG +`define GPR_RESET +`endif +`endif + module VX_operands import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "", parameter NUM_BANKS = 4, @@ -36,8 +43,9 @@ module VX_operands import VX_gpu_pkg::*; #( localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS); localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS; - localparam METADATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS; - localparam DATAW = `UUID_WIDTH + METADATAW + 3 * `NUM_THREADS * `XLEN; + localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH; + localparam REGS_DATAW = `XLEN * `NUM_THREADS; + localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW; localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS); localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS; localparam XLEN_SIZE = `XLEN / 8; @@ -46,30 +54,28 @@ module VX_operands import VX_gpu_pkg::*; #( `UNUSED_VAR (writeback_if.data.sop) wire [NUM_SRC_REGS-1:0] src_valid; - wire [NUM_SRC_REGS-1:0] req_in_valid; - wire [NUM_SRC_REGS-1:0] req_in_ready; + wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready; wire 
[NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data; wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx; - wire [NUM_BANKS-1:0] gpr_rd_valid_n, gpr_rd_ready; - reg [NUM_BANKS-1:0] gpr_rd_valid; - wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr_n; - reg [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr; - wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data; - wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx_n; - reg [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx; + wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready; + wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2; + wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1; + wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2; + wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2; - wire pipe_in_ready; - reg pipe_out_valid; - wire pipe_out_ready; - reg [`UUID_WIDTH-1:0] pipe_out_uuid; - reg [METADATAW-1:0] pipe_out_data; + wire pipe_valid_st1, pipe_ready_st1; + wire pipe_valid_st2, pipe_ready_st2; + wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2; - reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data, src_data_n; - reg [NUM_SRC_REGS-1:0] data_fetched; - reg has_collision, has_collision_n; + reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n; + wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2; - wire stg_in_valid, stg_in_ready; + reg [NUM_SRC_REGS-1:0] data_fetched_n; + wire [NUM_SRC_REGS-1:0] data_fetched_st1; + + reg has_collision_n; + wire has_collision_st1; wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, @@ -89,7 +95,7 @@ module VX_operands import VX_gpu_pkg::*; #( end for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin - assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched[i]; + assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i]; end assign req_in_valid = 
{NUM_SRC_REGS{scoreboard_if.valid}} & src_valid; @@ -109,13 +115,20 @@ module VX_operands import VX_gpu_pkg::*; #( .data_in (req_in_data), .sel_in (req_bank_idx), .ready_in (req_in_ready), - .valid_out (gpr_rd_valid_n), - .data_out (gpr_rd_addr_n), - .sel_out (gpr_rd_req_idx_n), + .valid_out (gpr_rd_valid), + .data_out (gpr_rd_addr), + .sel_out (gpr_rd_req_idx), .ready_out (gpr_rd_ready) ); - assign gpr_rd_ready = {NUM_BANKS{stg_in_ready}}; + wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1; + + assign gpr_rd_ready = {NUM_BANKS{pipe_in_ready}}; + + assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n; + + wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1; + wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2; always @(*) begin has_collision_n = 0; @@ -129,83 +142,82 @@ module VX_operands import VX_gpu_pkg::*; #( end always @(*) begin - src_data_n = src_data; - for (integer b = 0; b < NUM_BANKS; ++b) begin - if (gpr_rd_valid[b]) begin - src_data_n[gpr_rd_req_idx[b]] = gpr_rd_data[b]; - end - end - end - - wire pipe_stall = pipe_out_valid && ~pipe_out_ready; - assign pipe_in_ready = ~pipe_stall; - - assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n; - - wire stg_in_fire = stg_in_valid && stg_in_ready; - - always @(posedge clk) begin - if (reset) begin - pipe_out_valid <= 0; - gpr_rd_valid <= '0; - data_fetched <= '0; - src_data <= '0; + data_fetched_n = data_fetched_st1; + if (scoreboard_if.ready) begin + data_fetched_n = '0; end else begin - if (~pipe_stall) begin - pipe_out_valid <= scoreboard_if.valid; - gpr_rd_valid <= gpr_rd_valid_n; - if (scoreboard_if.ready) begin - data_fetched <= '0; - end else begin - data_fetched <= data_fetched | req_in_ready; - end - if (stg_in_fire) begin - src_data <= '0; - end else begin - src_data <= src_data_n; - end - end - end - if (~pipe_stall) begin - pipe_out_uuid <= scoreboard_if.data.uuid; - pipe_out_data <= { - scoreboard_if.data.wis, - scoreboard_if.data.tmask, - scoreboard_if.data.PC, - 
scoreboard_if.data.wb, - scoreboard_if.data.ex_type, - scoreboard_if.data.op_type, - scoreboard_if.data.op_args, - scoreboard_if.data.rd - }; - has_collision <= has_collision_n; - gpr_rd_addr <= gpr_rd_addr_n; - gpr_rd_req_idx <= gpr_rd_req_idx_n; + data_fetched_n = data_fetched_st1 | req_in_ready; end end - assign pipe_out_ready = stg_in_ready; - assign stg_in_valid = pipe_out_valid && ~has_collision; + assign pipe_data = { + scoreboard_if.data.wis, + scoreboard_if.data.tmask, + scoreboard_if.data.PC, + scoreboard_if.data.wb, + scoreboard_if.data.ex_type, + scoreboard_if.data.op_type, + scoreboard_if.data.op_args, + scoreboard_if.data.rd, + scoreboard_if.data.uuid + }; + + VX_pipe_register #( + .DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)), + .RESETW (1 + NUM_SRC_REGS) + ) pipe_reg1 ( + .clk (clk), + .reset (reset), + .enable (pipe_in_ready), + .data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}), + .data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}) + ); + + assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2; + + assign src_data_st1 = pipe_fire_st2 ? 
'0 : src_data_n; + + wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1; + + `RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW + + VX_pipe_register #( + .DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH), + .RESETW (1 + NUM_SRC_REGS * REGS_DATAW) + ) pipe_reg2 ( + .clk (clk), + .reset (pipe2_reset), + .enable (pipe_ready_st1), + .data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}), + .data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2}) + ); + + always @(*) begin + src_data_n = src_data_st2; + for (integer b = 0; b < NUM_BANKS; ++b) begin + if (gpr_rd_valid_st2[b]) begin + src_data_n[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b]; + end + end + end VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)), .LUTRAM (1) - ) out_buffer ( + ) out_buf ( .clk (clk), .reset (reset), - .valid_in (stg_in_valid), - .ready_in (stg_in_ready), + .valid_in (pipe_valid_st2), + .ready_in (pipe_ready_st2), .data_in ({ - pipe_out_uuid, - pipe_out_data, + pipe_data_st2, src_data_n[0], src_data_n[1], src_data_n[2] }), .data_out ({ - operands_if.data.uuid, operands_if.data.wis, operands_if.data.tmask, operands_if.data.PC, @@ -214,6 +226,7 @@ module VX_operands import VX_gpu_pkg::*; #( operands_if.data.op_type, operands_if.data.op_args, operands_if.data.rd, + operands_if.data.uuid, operands_if.data.rs1_data, operands_if.data.rs2_data, operands_if.data.rs3_data @@ -262,27 +275,24 @@ module VX_operands import VX_gpu_pkg::*; #( assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}}; end - `ifdef GPR_RESET - VX_dp_ram_rst #( - `else VX_dp_ram #( - `endif - .DATAW (`XLEN * `NUM_THREADS), + .DATAW (REGS_DATAW), .SIZE (PER_BANK_REGS * PER_ISSUE_WARPS), .WRENW (BYTEENW), + `ifdef GPR_RESET + .RESET_RAM (1), + 
`endif .NO_RWCHECK (1) ) gpr_ram ( .clk (clk), - `ifdef GPR_RESET .reset (reset), - `endif - .read (1'b1), + .read (pipe_fire_st1), .wren (wren), .write (gpr_wr_enabled), .waddr (gpr_wr_addr), .wdata (writeback_if.data.data), - .raddr (gpr_rd_addr[b]), - .rdata (gpr_rd_data[b]) + .raddr (gpr_rd_addr_st1[b]), + .rdata (gpr_rd_data_st1[b]) ); end diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index 6bc748745..71a74c6ac 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -383,16 +383,16 @@ module VX_schedule import VX_gpu_pkg::*; #( wire [`NUM_WARPS-1:0] pending_warp_empty; wire [`NUM_WARPS-1:0] pending_warp_alm_empty; - for (genvar i = 0; i < `NUM_WARPS; ++i) begin + `RESET_RELAY_EX (pending_instr_reset, reset, `NUM_WARPS, `MAX_FANOUT); - `RESET_RELAY (pending_instr_reset, reset); + for (genvar i = 0; i < `NUM_WARPS; ++i) begin VX_pending_size #( .SIZE (4096), .ALM_EMPTY (1) ) counter ( .clk (clk), - .reset (pending_instr_reset), + .reset (pending_instr_reset[i]), .incr (per_warp_incr[i]), .decr (commit_sched_if.committed_warps[i]), .empty (pending_warp_empty[i]), diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv index add229893..5ef4211d0 100644 --- a/hw/rtl/core/VX_sfu_unit.sv +++ b/hw/rtl/core/VX_sfu_unit.sv @@ -179,7 +179,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( VX_gather_unit #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), - .OUT_BUF (1) + .OUT_BUF (3) ) gather_unit ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_fpu_cvt.sv b/hw/rtl/fpu/VX_fpu_cvt.sv index 6d74ddcb7..37a2ab419 100644 --- a/hw/rtl/fpu/VX_fpu_cvt.sv +++ b/hw/rtl/fpu/VX_fpu_cvt.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,7 +21,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( parameter TAG_WIDTH = 1 ) ( input wire clk, - input wire reset, + input wire reset, output wire ready_in, input wire valid_in, @@ -36,7 +36,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( input wire is_signed, input wire [NUM_LANES-1:0][31:0] dataa, - output wire [NUM_LANES-1:0][31:0] result, + output wire [NUM_LANES-1:0][31:0] result, output wire has_fflags, output wire [`FP_FLAGS_BITS-1:0] fflags, @@ -45,25 +45,26 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( input wire ready_out, output wire valid_out -); +); `UNUSED_VAR (frm) - wire [NUM_LANES-1:0] mask_out; + wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; fflags_t [NUM_LANES-1:0] fflags_out; - wire pe_enable; + wire pe_enable; wire [NUM_PES-1:0][31:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; - + VX_pe_serializer #( - .NUM_LANES (NUM_LANES), - .NUM_PES (NUM_PES), + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FCVT), .DATA_IN_WIDTH(32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG (0) + .PE_REG (0), + .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 
1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), @@ -94,7 +95,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .enable (pe_enable), .frm (frm), .is_itof (is_itof), - .is_signed (is_signed), + .is_signed (is_signed), .dataa (pe_data_in[i][0 +: 32]), .result (pe_data_out[i][0 +: 32]), .fflags (pe_data_out[i][32 +: `FP_FLAGS_BITS]) diff --git a/hw/rtl/fpu/VX_fpu_div.sv b/hw/rtl/fpu/VX_fpu_div.sv index 0647a8782..81fc8f022 100644 --- a/hw/rtl/fpu/VX_fpu_div.sv +++ b/hw/rtl/fpu/VX_fpu_div.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,7 +21,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #( parameter TAG_WIDTH = 1 ) ( input wire clk, - input wire reset, + input wire reset, input wire valid_in, output wire ready_in, @@ -31,10 +31,10 @@ module VX_fpu_div import VX_fpu_pkg::*; #( input wire [TAG_WIDTH-1:0] tag_in, input wire [`INST_FRM_BITS-1:0] frm, - + input wire [NUM_LANES-1:0][31:0] dataa, input wire [NUM_LANES-1:0][31:0] datab, - output wire [NUM_LANES-1:0][31:0] result, + output wire [NUM_LANES-1:0][31:0] result, output wire has_fflags, output wire [`FP_FLAGS_BITS-1:0] fflags, @@ -47,27 +47,28 @@ module VX_fpu_div import VX_fpu_pkg::*; #( `UNUSED_VAR (frm) wire [NUM_LANES-1:0][2*32-1:0] data_in; - wire [NUM_LANES-1:0] mask_out; + wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out; - wire pe_enable; + wire pe_enable; wire [NUM_PES-1:0][2*32-1:0] pe_data_in; - wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; + wire 
[NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; for (genvar i = 0; i < NUM_LANES; ++i) begin assign data_in[i][0 +: 32] = dataa[i]; assign data_in[i][32 +: 32] = datab[i]; end - + VX_pe_serializer #( - .NUM_LANES (NUM_LANES), - .NUM_PES (NUM_PES), + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FDIV), .DATA_IN_WIDTH(2*32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG (0) + .PE_REG (0), + .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), @@ -92,7 +93,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #( fflags_t [NUM_LANES-1:0] per_lane_fflags; `ifdef QUARTUS - + for (genvar i = 0; i < NUM_PES; ++i) begin acl_fdiv fdiv ( .clk (clk), @@ -103,8 +104,8 @@ module VX_fpu_div import VX_fpu_pkg::*; #( .q (pe_data_out[i][0 +: 32]) ); assign pe_data_out[i][32 +: `FP_FLAGS_BITS] = 'x; - end - + end + assign has_fflags = 0; assign per_lane_fflags = 'x; `UNUSED_VAR (fflags_out) @@ -131,21 +132,21 @@ module VX_fpu_div import VX_fpu_pkg::*; #( assign has_fflags = 1; assign per_lane_fflags = fflags_out; -`else +`else for (genvar i = 0; i < NUM_PES; ++i) begin reg [63:0] r; - `UNUSED_VAR (r) + `UNUSED_VAR (r) fflags_t f; - always @(*) begin + always @(*) begin dpi_fdiv ( - pe_enable, - int'(0), - {32'hffffffff, pe_data_in[i][0 +: 32]}, - {32'hffffffff, pe_data_in[i][32 +: 32]}, - frm, - r, + pe_enable, + int'(0), + {32'hffffffff, pe_data_in[i][0 +: 32]}, + {32'hffffffff, pe_data_in[i][32 +: 32]}, + frm, + r, f ); end diff --git a/hw/rtl/fpu/VX_fpu_fma.sv b/hw/rtl/fpu/VX_fpu_fma.sv index 8151fbf55..3522d8a1e 100644 --- a/hw/rtl/fpu/VX_fpu_fma.sv +++ b/hw/rtl/fpu/VX_fpu_fma.sv @@ -98,7 +98,8 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( .DATA_IN_WIDTH(3*32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0) + .PE_REG ((NUM_LANES != NUM_PES) ? 
1 : 0), // must be registered for DSPs + .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_fpu_ncp.sv b/hw/rtl/fpu/VX_fpu_ncp.sv index 017738775..34b822d89 100644 --- a/hw/rtl/fpu/VX_fpu_ncp.sv +++ b/hw/rtl/fpu/VX_fpu_ncp.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -35,7 +35,7 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( input wire [NUM_LANES-1:0][31:0] dataa, input wire [NUM_LANES-1:0][31:0] datab, - output wire [NUM_LANES-1:0][31:0] result, + output wire [NUM_LANES-1:0][31:0] result, output wire has_fflags, output wire [`FP_FLAGS_BITS-1:0] fflags, @@ -44,15 +44,15 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( input wire ready_out, output wire valid_out -); +); `UNUSED_VAR (frm) wire [NUM_LANES-1:0][2*32-1:0] data_in; - wire [NUM_LANES-1:0] mask_out; + wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; fflags_t [NUM_LANES-1:0] fflags_out; - wire pe_enable; + wire pe_enable; wire [NUM_PES-1:0][2*32-1:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; @@ -60,15 +60,16 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( assign data_in[i][0 +: 32] = dataa[i]; assign data_in[i][32 +: 32] = datab[i]; end - + VX_pe_serializer #( - .NUM_LANES (NUM_LANES), - .NUM_PES (NUM_PES), + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FNCP), .DATA_IN_WIDTH(2*32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG (0) + .PE_REG (0), + .OUT_BUF (((NUM_LANES / 
NUM_PES) > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), @@ -97,8 +98,8 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( .clk (clk), .reset (reset), .enable (pe_enable), - .frm (frm), - .op_type (op_type), + .frm (frm), + .op_type (op_type), .dataa (pe_data_in[i][0 +: 32]), .datab (pe_data_in[i][32 +: 32]), .result (pe_data_out[i][0 +: 32]), diff --git a/hw/rtl/fpu/VX_fpu_sqrt.sv b/hw/rtl/fpu/VX_fpu_sqrt.sv index 03529e629..a6e6dda9a 100644 --- a/hw/rtl/fpu/VX_fpu_sqrt.sv +++ b/hw/rtl/fpu/VX_fpu_sqrt.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -18,10 +18,10 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( parameter NUM_LANES = 1, parameter NUM_PES = `UP(NUM_LANES /`FSQRT_PE_RATIO), - parameter TAG_WIDTH = 1 + parameter TAG_WIDTH = 1 ) ( input wire clk, - input wire reset, + input wire reset, output wire ready_in, input wire valid_in, @@ -29,11 +29,11 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( input wire [NUM_LANES-1:0] mask_in, input wire [TAG_WIDTH-1:0] tag_in, - + input wire [`INST_FRM_BITS-1:0] frm, input wire [NUM_LANES-1:0][31:0] dataa, - output wire [NUM_LANES-1:0][31:0] result, + output wire [NUM_LANES-1:0][31:0] result, output wire has_fflags, output wire [`FP_FLAGS_BITS-1:0] fflags, @@ -46,22 +46,23 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( `UNUSED_VAR (frm) - wire [NUM_LANES-1:0] mask_out; + wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out; - wire pe_enable; + wire pe_enable; wire [NUM_PES-1:0][31:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; - + VX_pe_serializer #( - .NUM_LANES (NUM_LANES), - .NUM_PES (NUM_PES), + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FSQRT), .DATA_IN_WIDTH(32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG (0) + .PE_REG (0), + .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 
1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), @@ -83,10 +84,10 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS]; end - fflags_t [NUM_LANES-1:0] per_lane_fflags; + fflags_t [NUM_LANES-1:0] per_lane_fflags; `ifdef QUARTUS - + for (genvar i = 0; i < NUM_PES; ++i) begin acl_fsqrt fsqrt ( .clk (clk), @@ -105,7 +106,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( `elsif VIVADO for (genvar i = 0; i < NUM_PES; ++i) begin - wire tuser; + wire tuser; xil_fsqrt fsqrt ( .aclk (clk), @@ -130,17 +131,17 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( `UNUSED_VAR (r) fflags_t f; - always @(*) begin + always @(*) begin dpi_fsqrt ( - pe_enable, - int'(0), - {32'hffffffff, pe_data_in[i]}, - frm, - r, + pe_enable, + int'(0), + {32'hffffffff, pe_data_in[i]}, + frm, + r, f ); end - + VX_shift_register #( .DATAW (32 + $bits(fflags_t)), .DEPTH (`LATENCY_FSQRT) diff --git a/hw/rtl/libs/VX_avs_adapter.sv b/hw/rtl/libs/VX_avs_adapter.sv index 6e9abf597..35d329c7b 100644 --- a/hw/rtl/libs/VX_avs_adapter.sv +++ b/hw/rtl/libs/VX_avs_adapter.sv @@ -81,12 +81,15 @@ module VX_avs_adapter #( assign req_queue_push[i] = mem_req_valid && ~mem_req_rw && bank_req_ready[i] && (req_bank_sel == i); end + `RESET_RELAY_EX (bank_reset, reset, NUM_BANKS, 1); + for (genvar i = 0; i < NUM_BANKS; ++i) begin + VX_pending_size #( .SIZE (RD_QUEUE_SIZE) ) pending_size ( .clk (clk), - .reset (reset), + .reset (bank_reset[i]), .incr (req_queue_push[i]), .decr (req_queue_pop[i]), `UNUSED_PIN (empty), @@ -102,7 +105,7 @@ module VX_avs_adapter #( .DEPTH (RD_QUEUE_SIZE) ) rd_req_queue ( .clk (clk), - .reset (reset), + .reset (bank_reset[i]), .push (req_queue_push[i]), .pop (req_queue_pop[i]), .data_in (mem_req_tag), @@ -132,7 +135,7 @@ module VX_avs_adapter #( .OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)) ) req_out_buf ( .clk (clk), - .reset (reset), + .reset (bank_reset[i]), .valid_in (valid_out_w), .ready_in (ready_out_w), .data_in ({mem_req_rw, mem_req_byteen, 
req_bank_off, mem_req_data}), @@ -168,12 +171,13 @@ module VX_avs_adapter #( wire [NUM_BANKS-1:0] rsp_queue_empty; for (genvar i = 0; i < NUM_BANKS; ++i) begin + VX_fifo_queue #( .DATAW (DATA_WIDTH), .DEPTH (RD_QUEUE_SIZE) ) rd_rsp_queue ( .clk (clk), - .reset (reset), + .reset (bank_reset[i]), .push (avs_readdatavalid[i]), .pop (req_queue_pop[i]), .data_in (avs_readdata[i]), @@ -195,7 +199,7 @@ module VX_avs_adapter #( VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (DATA_WIDTH + TAG_WIDTH), - .ARBITER ("R"), + .ARBITER ("F"), .OUT_BUF (RSP_OUT_BUF) ) rsp_arb ( .clk (clk), diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 69e3e3adc..7fffb9be2 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -15,10 +15,10 @@ `TRACING_OFF module VX_axi_adapter #( - parameter DATA_WIDTH = 512, + parameter DATA_WIDTH = 512, parameter ADDR_WIDTH = 32, parameter TAG_WIDTH = 8, - parameter NUM_BANKS = 1, + parameter NUM_BANKS = 1, parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)), parameter RSP_OUT_BUF = 0 ) ( @@ -34,13 +34,13 @@ module VX_axi_adapter #( input wire [TAG_WIDTH-1:0] mem_req_tag, output wire mem_req_ready, - // Vortex response - output wire mem_rsp_valid, + // Vortex response + output wire mem_rsp_valid, output wire [DATA_WIDTH-1:0] mem_rsp_data, output wire [TAG_WIDTH-1:0] mem_rsp_tag, input wire mem_rsp_ready, - // AXI write request address channel + // AXI write request address channel output wire m_axi_awvalid [NUM_BANKS], input wire m_axi_awready [NUM_BANKS], output wire [ADDR_WIDTH-1:0] m_axi_awaddr [NUM_BANKS], @@ -54,7 +54,7 @@ module VX_axi_adapter #( output wire [3:0] m_axi_awqos [NUM_BANKS], output wire [3:0] m_axi_awregion [NUM_BANKS], - // AXI write request data channel + // AXI write request data channel output wire m_axi_wvalid [NUM_BANKS], input wire m_axi_wready [NUM_BANKS], output wire [DATA_WIDTH-1:0] m_axi_wdata [NUM_BANKS], @@ -66,7 +66,7 @@ module VX_axi_adapter #( output wire m_axi_bready [NUM_BANKS], input wire [TAG_WIDTH-1:0] m_axi_bid [NUM_BANKS], input wire [1:0] m_axi_bresp [NUM_BANKS], - + // AXI read address channel output wire m_axi_arvalid [NUM_BANKS], input wire m_axi_arready [NUM_BANKS], @@ -74,13 +74,13 @@ module VX_axi_adapter #( output wire [TAG_WIDTH-1:0] m_axi_arid [NUM_BANKS], output wire [7:0] m_axi_arlen [NUM_BANKS], output wire [2:0] m_axi_arsize [NUM_BANKS], - output wire [1:0] m_axi_arburst [NUM_BANKS], + output wire [1:0] m_axi_arburst [NUM_BANKS], output wire [1:0] m_axi_arlock [NUM_BANKS], output wire [3:0] m_axi_arcache [NUM_BANKS], output wire [2:0] m_axi_arprot [NUM_BANKS], output wire [3:0] m_axi_arqos [NUM_BANKS], output wire [3:0] m_axi_arregion [NUM_BANKS], - + // AXI read response channel 
input wire m_axi_rvalid [NUM_BANKS], output wire m_axi_rready [NUM_BANKS], @@ -88,15 +88,15 @@ module VX_axi_adapter #( input wire m_axi_rlast [NUM_BANKS], input wire [TAG_WIDTH-1:0] m_axi_rid [NUM_BANKS], input wire [1:0] m_axi_rresp [NUM_BANKS] -); +); localparam AXSIZE = `CLOG2(DATA_WIDTH/8); - localparam BANK_ADDRW = `LOG2UP(NUM_BANKS); + localparam BANK_ADDRW = `LOG2UP(NUM_BANKS); localparam LOG2_NUM_BANKS = `CLOG2(NUM_BANKS); wire [BANK_ADDRW-1:0] req_bank_sel; if (NUM_BANKS > 1) begin - assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0]; + assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0]; end else begin assign req_bank_sel = '0; end @@ -108,12 +108,12 @@ module VX_axi_adapter #( for (genvar i = 0; i < NUM_BANKS; ++i) begin wire m_axi_aw_fire = m_axi_awvalid[i] && m_axi_awready[i]; - wire m_axi_w_fire = m_axi_wvalid[i] && m_axi_wready[i]; + wire m_axi_w_fire = m_axi_wvalid[i] && m_axi_wready[i]; always @(posedge clk) begin if (reset) begin m_axi_aw_ack[i] <= 0; m_axi_w_ack[i] <= 0; - end else begin + end else begin if (mem_req_fire && (req_bank_sel == i)) begin m_axi_aw_ack[i] <= 0; m_axi_w_ack[i] <= 0; @@ -127,10 +127,10 @@ module VX_axi_adapter #( end end - wire axi_write_ready [NUM_BANKS]; + wire axi_write_ready [NUM_BANKS]; for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign axi_write_ready[i] = (m_axi_awready[i] || m_axi_aw_ack[i]) + assign axi_write_ready[i] = (m_axi_awready[i] || m_axi_aw_ack[i]) && (m_axi_wready[i] || m_axi_w_ack[i]); end @@ -141,17 +141,17 @@ module VX_axi_adapter #( assign mem_req_ready = mem_req_rw ? 
axi_write_ready[0] : m_axi_arready[0]; end - // AXI write request address channel + // AXI write request address channel for (genvar i = 0; i < NUM_BANKS; ++i) begin assign m_axi_awvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~m_axi_aw_ack[i]; assign m_axi_awaddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE; assign m_axi_awid[i] = mem_req_tag; - assign m_axi_awlen[i] = 8'b00000000; + assign m_axi_awlen[i] = 8'b00000000; assign m_axi_awsize[i] = 3'(AXSIZE); - assign m_axi_awburst[i] = 2'b00; - assign m_axi_awlock[i] = 2'b00; + assign m_axi_awburst[i] = 2'b00; + assign m_axi_awlock[i] = 2'b00; assign m_axi_awcache[i] = 4'b0000; - assign m_axi_awprot[i] = 3'b000; + assign m_axi_awprot[i] = 3'b000; assign m_axi_awqos[i] = 4'b0000; assign m_axi_awregion[i]= 4'b0000; end @@ -170,31 +170,31 @@ module VX_axi_adapter #( `UNUSED_VAR (m_axi_bid[i]) `UNUSED_VAR (m_axi_bresp[i]) assign m_axi_bready[i] = 1'b1; - `RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time)); + `RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time)); end // AXI read request channel for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i); + assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i); assign m_axi_araddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE; assign m_axi_arid[i] = mem_req_tag; assign m_axi_arlen[i] = 8'b00000000; assign m_axi_arsize[i] = 3'(AXSIZE); - assign m_axi_arburst[i] = 2'b00; - assign m_axi_arlock[i] = 2'b00; + assign m_axi_arburst[i] = 2'b00; + assign m_axi_arlock[i] = 2'b00; assign m_axi_arcache[i] = 4'b0000; assign m_axi_arprot[i] = 3'b000; assign m_axi_arqos[i] = 4'b0000; assign m_axi_arregion[i]= 4'b0000; end - // AXI read response channel + // AXI read response channel wire [NUM_BANKS-1:0] rsp_arb_valid_in; wire 
[NUM_BANKS-1:0][DATA_WIDTH+TAG_WIDTH-1:0] rsp_arb_data_in; wire [NUM_BANKS-1:0] rsp_arb_ready_in; - `UNUSED_VAR (m_axi_rlast) + `UNUSED_VAR (m_axi_rlast) for (genvar i = 0; i < NUM_BANKS; ++i) begin assign rsp_arb_valid_in[i] = m_axi_rvalid[i]; @@ -203,11 +203,11 @@ module VX_axi_adapter #( `RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rlast[i] == 1, ("%t: *** AXI response error", $time)); `RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rresp[i] == 0, ("%t: *** AXI response error", $time)); end - + VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (DATA_WIDTH + TAG_WIDTH), - .ARBITER ("R"), + .ARBITER ("F"), .OUT_BUF (RSP_OUT_BUF) ) rsp_arb ( .clk (clk), diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index fa11a541f..6683eaecc 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -22,12 +22,16 @@ module VX_dp_ram #( parameter OUT_REG = 0, parameter NO_RWCHECK = 0, parameter LUTRAM = 0, + parameter RW_ASSERT = 0, + parameter RESET_RAM = 0, + parameter READ_ENABLE = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, parameter ADDRW = `LOG2UP(SIZE) ) ( input wire clk, + input wire reset, input wire read, input wire write, input wire [WRENW-1:0] wren, @@ -50,44 +54,44 @@ module VX_dp_ram #( end \ end + `UNUSED_PARAM (RW_ASSERT) `UNUSED_VAR (read) + if (WRENW > 1) begin + `RUNTIME_ASSERT(~write || (| wren), ("invalid write enable mask")); + end + + wire [DATAW-1:0] rdata_w; + `ifdef SYNTHESIS if (WRENW > 1) begin `ifdef QUARTUS if (LUTRAM != 0) begin - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - `USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end - end - if (read) begin - rdata_r <= ram[raddr]; + `USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge 
clk) begin + if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) + ram[waddr][i] <= wdata[i * WSELW +: WSELW]; end end - assign rdata = rdata_r; - end else begin - `USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end else begin - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; + if (NO_RWCHECK != 0) begin + `NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) + ram[waddr][i] <= wdata[i * WSELW +: WSELW]; + end + end + end + assign rdata_w = ram[raddr]; + end else begin reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin @@ -97,37 +101,8 @@ module VX_dp_ram #( ram[waddr][i] <= wdata[i * WSELW +: WSELW]; end end - if (read) begin - rdata_r <= ram[raddr]; - end - end - assign rdata = rdata_r; - end else begin - if (NO_RWCHECK != 0) begin - `NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; - end else begin - reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end end `else @@ -135,35 +110,18 @@ module VX_dp_ram #( if (LUTRAM != 0) begin `USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; 
`RAM_INITIALIZATION - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end - end - if (read) begin - rdata_r <= ram[raddr]; + always @(posedge clk) begin + if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) + ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; end end - assign rdata = rdata_r; - end else begin - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end else begin - if (OUT_REG != 0) begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - reg [DATAW-1:0] rdata_r; + if (NO_RWCHECK != 0) begin + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -172,37 +130,20 @@ module VX_dp_ram #( ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; end end - if (read) begin - rdata_r <= ram[raddr]; - end end - assign rdata = rdata_r; + assign rdata_w = ram[raddr]; end else begin - if (NO_RWCHECK != 0) begin - `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end + reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) + ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; end end - assign rdata = ram[raddr]; - end else begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) 
begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end end `endif @@ -211,64 +152,36 @@ module VX_dp_ram #( if (LUTRAM != 0) begin `USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; - end - if (read) begin - rdata_r <= ram[raddr]; - end + always @(posedge clk) begin + if (write) begin + ram[waddr] <= wdata; end - assign rdata = rdata_r; - end else begin - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end else begin - if (OUT_REG != 0) begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - reg [DATAW-1:0] rdata_r; + if (NO_RWCHECK != 0) begin + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin ram[waddr] <= wdata; end - if (read) begin - rdata_r <= ram[raddr]; - end end - assign rdata = rdata_r; + assign rdata_w = ram[raddr]; end else begin - if (NO_RWCHECK != 0) begin - `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; - end + reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + ram[waddr] <= wdata; end - assign rdata = ram[raddr]; - end else begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end end end `else - // RAM emulation + // simulation reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION @@ -277,39 +190,57 @@ module VX_dp_ram #( assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? 
wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW]; end - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_waddr; + reg prev_write; + + always @(posedge clk) begin + if (RESET_RAM && reset) begin + for (integer i = 0; i < SIZE; ++i) begin + ram[i] <= DATAW'(INIT_VALUE); + end + end else begin if (write) begin ram[waddr] <= ram_n; end - if (read) begin - rdata_r <= ram[raddr]; - end end - assign rdata = rdata_r; - end else begin - reg [DATAW-1:0] prev_data; - reg [ADDRW-1:0] prev_waddr; - reg prev_write; - always @(posedge clk) begin - if (write) begin - ram[waddr] <= ram_n; - end - prev_write <= (| wren); + if (reset) begin + prev_write <= 0; + prev_data <= '0; + prev_waddr <= '0; + end else begin + prev_write <= write; prev_data <= ram[waddr]; prev_waddr <= waddr; end - if (LUTRAM || !NO_RWCHECK) begin - `UNUSED_VAR (prev_write) - `UNUSED_VAR (prev_data) - `UNUSED_VAR (prev_waddr) - assign rdata = ram[raddr]; - end else begin - assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; + end + + if (LUTRAM || !NO_RWCHECK) begin + `UNUSED_VAR (prev_write) + `UNUSED_VAR (prev_data) + `UNUSED_VAR (prev_waddr) + assign rdata_w = ram[raddr]; + end else begin + assign rdata_w = (prev_write && (prev_waddr == raddr)) ? 
prev_data : ram[raddr]; + if (RW_ASSERT) begin + `RUNTIME_ASSERT(~read || (rdata_w == ram[raddr]), ("read after write hazard")); end end `endif + if (OUT_REG != 0) begin + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (READ_ENABLE && reset) begin + rdata_r <= '0; + end else if (!READ_ENABLE || read) begin + rdata_r <= rdata_w; + end + end + assign rdata = rdata_r; + end else begin + assign rdata = rdata_w; + end + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_dp_ram_rst.sv b/hw/rtl/libs/VX_dp_ram_rst.sv deleted file mode 100644 index e7598dbe6..000000000 --- a/hw/rtl/libs/VX_dp_ram_rst.sv +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -`include "VX_platform.vh" - -`TRACING_OFF -module VX_dp_ram_rst #( - parameter DATAW = 1, - parameter SIZE = 1, - parameter ADDR_MIN = 0, - parameter WRENW = 1, - parameter OUT_REG = 0, - parameter NO_RWCHECK = 0, - parameter LUTRAM = 0, - parameter INIT_ENABLE = 0, - parameter INIT_FILE = "", - parameter [DATAW-1:0] INIT_VALUE = 0, - parameter ADDRW = `LOG2UP(SIZE) -) ( - input wire clk, - input wire reset, - input wire read, - input wire write, - input wire [WRENW-1:0] wren, - input wire [ADDRW-1:0] waddr, - input wire [DATAW-1:0] wdata, - input wire [ADDRW-1:0] raddr, - output wire [DATAW-1:0] rdata -); - localparam WSELW = DATAW / WRENW; - `STATIC_ASSERT((WRENW * WSELW == DATAW), ("invalid parameter")) - -`define RAM_INITIALIZATION \ - if (INIT_ENABLE != 0) begin \ - if (INIT_FILE != "") begin \ - initial $readmemh(INIT_FILE, ram); \ - end else begin \ - initial \ - for (integer i = 0; i < SIZE; ++i) \ - ram[i] = INIT_VALUE; \ - end \ - end - - `UNUSED_VAR (read) - - // RAM emulation - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - - wire [DATAW-1:0] ram_n; - for (genvar i = 0; i < WRENW; ++i) begin - assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? 
wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW]; - end - - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin - if (reset) begin - for (integer i = 0; i < SIZE; ++i) begin - ram[i] <= DATAW'(INIT_VALUE); - end - rdata_r <= '0; - end else begin - if (write) begin - ram[waddr] <= ram_n; - end - if (read) begin - rdata_r <= ram[raddr]; - end - end - end - assign rdata = rdata_r; - end else begin - reg [DATAW-1:0] prev_data; - reg [ADDRW-1:0] prev_waddr; - reg prev_write; - always @(posedge clk) begin - if (reset) begin - for (integer i = 0; i < SIZE; ++i) begin - ram[i] <= DATAW'(INIT_VALUE); - end - prev_write <= 0; - prev_data <= '0; - prev_waddr <= '0; - end else begin - if (write) begin - ram[waddr] <= ram_n; - end - prev_write <= (| wren); - prev_data <= ram[waddr]; - prev_waddr <= waddr; - end - end - if (LUTRAM || !NO_RWCHECK) begin - `UNUSED_VAR (prev_write) - `UNUSED_VAR (prev_data) - `UNUSED_VAR (prev_waddr) - assign rdata = ram[raddr]; - end else begin - assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; - end - end - -endmodule -`TRACING_ON diff --git a/hw/rtl/libs/VX_elastic_buffer.sv b/hw/rtl/libs/VX_elastic_buffer.sv index 01464840c..9213572d3 100644 --- a/hw/rtl/libs/VX_elastic_buffer.sv +++ b/hw/rtl/libs/VX_elastic_buffer.sv @@ -18,7 +18,8 @@ module VX_elastic_buffer #( parameter DATAW = 1, parameter SIZE = 1, parameter OUT_REG = 0, - parameter LUTRAM = 0 + parameter LUTRAM = 0, + parameter MAX_FANOUT = 0 ) ( input wire clk, input wire reset, @@ -40,6 +41,43 @@ module VX_elastic_buffer #( assign data_out = data_in; assign ready_in = ready_out; + end else if (MAX_FANOUT != 0 && (DATAW > (MAX_FANOUT + MAX_FANOUT/2))) begin + + localparam NUM_SLICES = `CDIV(DATAW, MAX_FANOUT); + localparam N_DATAW = DATAW / NUM_SLICES; + + for (genvar i = 0; i < NUM_SLICES; ++i) begin + + localparam S_DATAW = (i == NUM_SLICES-1) ? 
(DATAW - i * N_DATAW) : N_DATAW; + + wire valid_out_t, ready_in_t; + `UNUSED_VAR (valid_out_t) + `UNUSED_VAR (ready_in_t) + + `RESET_RELAY (slice_reset, reset); + + VX_elastic_buffer #( + .DATAW (S_DATAW), + .SIZE (SIZE), + .OUT_REG (OUT_REG), + .LUTRAM (LUTRAM) + ) buffer_slice ( + .clk (clk), + .reset (slice_reset), + .valid_in (valid_in), + .data_in (data_in[i * N_DATAW +: S_DATAW]), + .ready_in (ready_in_t), + .valid_out (valid_out_t), + .data_out (data_out[i * N_DATAW +: S_DATAW]), + .ready_out (ready_out) + ); + + if (i == 0) begin + assign ready_in = ready_in_t; + assign valid_out = valid_out_t; + end + end + end else if (SIZE == 1) begin VX_pipe_buffer #( @@ -103,9 +141,9 @@ module VX_elastic_buffer #( assign ready_in = ~full; - VX_elastic_buffer #( + VX_pipe_buffer #( .DATAW (DATAW), - .SIZE ((OUT_REG == 2) ? 1 : 0) + .DEPTH ((OUT_REG > 0) ? (OUT_REG-1) : 0) ) out_buf ( .clk (clk), .reset (reset), diff --git a/hw/rtl/libs/VX_fair_arbiter.sv b/hw/rtl/libs/VX_fair_arbiter.sv index 838563dd8..82bcfc5c6 100644 --- a/hw/rtl/libs/VX_fair_arbiter.sv +++ b/hw/rtl/libs/VX_fair_arbiter.sv @@ -38,17 +38,16 @@ module VX_fair_arbiter #( end else begin - reg [NUM_REQS-1:0] grant_mask; + reg [NUM_REQS-1:0] requests_r; - wire [NUM_REQS-1:0] requests_rem = requests & ~grant_mask; - wire rem_valid = (| requests_rem); - wire [NUM_REQS-1:0] requests_qual = rem_valid ? requests_rem : requests; + wire [NUM_REQS-1:0] requests_sel = requests_r & requests; + wire [NUM_REQS-1:0] requests_qual = (| requests_sel) ? requests_sel : requests; always @(posedge clk) begin if (reset) begin - grant_mask <= '0; + requests_r <= '0; end else if (grant_ready) begin - grant_mask <= rem_valid ? 
(grant_mask | grant_onehot) : grant_onehot; + requests_r <= requests_qual & ~grant_onehot; end end diff --git a/hw/rtl/libs/VX_fifo_queue.sv b/hw/rtl/libs/VX_fifo_queue.sv index a430d32f7..ea00d67c7 100644 --- a/hw/rtl/libs/VX_fifo_queue.sv +++ b/hw/rtl/libs/VX_fifo_queue.sv @@ -177,10 +177,11 @@ module VX_fifo_queue #( .SIZE (DEPTH), .LUTRAM (LUTRAM) ) dp_ram ( - .clk(clk), + .clk (clk), + .reset (reset), .read (1'b1), .write (push), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (wr_ptr_r), .wdata (data_in), .raddr (rd_ptr_r), @@ -226,9 +227,10 @@ module VX_fifo_queue #( .LUTRAM (LUTRAM) ) dp_ram ( .clk (clk), + .reset (reset), .read (1'b1), .write (push), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (wr_ptr_r), .wdata (data_in), .raddr (rd_ptr_n_r), diff --git a/hw/rtl/libs/VX_find_first.sv b/hw/rtl/libs/VX_find_first.sv index f06971106..18f345855 100644 --- a/hw/rtl/libs/VX_find_first.sv +++ b/hw/rtl/libs/VX_find_first.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,10 +17,10 @@ module VX_find_first #( parameter N = 1, parameter DATAW = 1, - parameter REVERSE = 0 + parameter REVERSE = 0 ) ( input wire [N-1:0][DATAW-1:0] data_in, - input wire [N-1:0] valid_in, + input wire [N-1:0] valid_in, output wire [DATAW-1:0] data_out, output wire valid_out ); @@ -37,10 +37,12 @@ module VX_find_first #( assign s_n[TL+i] = REVERSE ? valid_in[N-1-i] : valid_in[i]; assign d_n[TL+i] = REVERSE ? 
data_in[N-1-i] : data_in[i]; end - - for (genvar i = TL+N; i < TN; ++i) begin - assign s_n[i] = 0; - assign d_n[i] = '0; + + if (TL < (TN-N)) begin + for (genvar i = TL+N; i < TN; ++i) begin + assign s_n[i] = 0; + assign d_n[i] = '0; + end end for (genvar j = 0; j < LOGN; ++j) begin @@ -48,10 +50,10 @@ module VX_find_first #( assign s_n[2**j-1+i] = s_n[2**(j+1)-1+i*2] | s_n[2**(j+1)-1+i*2+1]; assign d_n[2**j-1+i] = s_n[2**(j+1)-1+i*2] ? d_n[2**(j+1)-1+i*2] : d_n[2**(j+1)-1+i*2+1]; end - end - + end + assign valid_out = s_n[0]; assign data_out = d_n[0]; - + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_index_buffer.sv b/hw/rtl/libs/VX_index_buffer.sv index 9c19b9184..4e8439818 100644 --- a/hw/rtl/libs/VX_index_buffer.sv +++ b/hw/rtl/libs/VX_index_buffer.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -24,17 +24,17 @@ module VX_index_buffer #( input wire reset, output wire [ADDRW-1:0] write_addr, - input wire [DATAW-1:0] write_data, + input wire [DATAW-1:0] write_data, input wire acquire_en, input wire [ADDRW-1:0] read_addr, output wire [DATAW-1:0] read_data, input wire release_en, - + output wire empty, - output wire full + output wire full ); - + VX_allocator #( .SIZE (SIZE) ) allocator ( @@ -43,9 +43,9 @@ module VX_index_buffer #( .acquire_en (acquire_en), .acquire_addr (write_addr), .release_en (release_en), - .release_addr (read_addr), + .release_addr (read_addr), .empty (empty), - .full (full) + .full (full) ); VX_dp_ram #( @@ -54,14 +54,15 @@ module VX_index_buffer #( .LUTRAM (LUTRAM) ) data_table ( .clk (clk), + .reset (reset), .read (1'b1), .write (acquire_en), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (write_addr), .wdata (write_data), .raddr (read_addr), .rdata (read_data) ); - + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_mem_adapter.sv b/hw/rtl/libs/VX_mem_adapter.sv index b447bcc35..263df0159 100644 --- a/hw/rtl/libs/VX_mem_adapter.sv +++ b/hw/rtl/libs/VX_mem_adapter.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -15,10 +15,10 @@ `TRACING_OFF module VX_mem_adapter #( - parameter SRC_DATA_WIDTH = 1, - parameter SRC_ADDR_WIDTH = 1, - parameter DST_DATA_WIDTH = 1, - parameter DST_ADDR_WIDTH = 1, + parameter SRC_DATA_WIDTH = 1, + parameter SRC_ADDR_WIDTH = 1, + parameter DST_DATA_WIDTH = 1, + parameter DST_ADDR_WIDTH = 1, parameter SRC_TAG_WIDTH = 1, parameter DST_TAG_WIDTH = 1, parameter REQ_OUT_BUF = 0, @@ -35,9 +35,9 @@ module VX_mem_adapter #( input wire [SRC_TAG_WIDTH-1:0] mem_req_tag_in, output wire mem_req_ready_in, - output wire mem_rsp_valid_in, - output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in, - output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in, + output wire mem_rsp_valid_in, + output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in, + output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in, input wire mem_rsp_ready_in, output wire mem_req_valid_out, @@ -48,12 +48,12 @@ module VX_mem_adapter #( output wire [DST_TAG_WIDTH-1:0] mem_req_tag_out, input wire mem_req_ready_out, - input wire mem_rsp_valid_out, - input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_out, + input wire mem_rsp_valid_out, + input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_out, input wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_out, output wire mem_rsp_ready_out -); - `STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!")) +); + `STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!")) localparam DST_DATA_SIZE = (DST_DATA_WIDTH / 8); localparam DST_LDATAW = `CLOG2(DST_DATA_WIDTH); @@ -69,7 +69,7 @@ module VX_mem_adapter #( wire [DST_TAG_WIDTH-1:0] mem_req_tag_out_w; wire mem_req_ready_out_w; - wire mem_rsp_valid_in_w; + wire mem_rsp_valid_in_w; wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in_w; wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in_w; wire mem_rsp_ready_in_w; @@ -80,7 +80,7 @@ module VX_mem_adapter #( `UNUSED_VAR (clk) `UNUSED_VAR (reset) - + wire [D-1:0] req_idx = mem_req_addr_in[D-1:0]; wire [D-1:0] rsp_idx = mem_rsp_tag_out[D-1:0]; @@ -99,31 +99,31 @@ module VX_mem_adapter #( assign mem_req_valid_out_w = mem_req_valid_in; 
assign mem_req_rw_out_w = mem_req_rw_in; - assign mem_req_byteen_out_w = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3)); + assign mem_req_byteen_out_w = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3)); assign mem_req_data_out_w = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW); assign mem_req_tag_out_w = DST_TAG_WIDTH'({mem_req_tag_in, req_idx}); assign mem_req_ready_in = mem_req_ready_out_w; assign mem_rsp_valid_in_w = mem_rsp_valid_out; - assign mem_rsp_data_in_w = mem_rsp_data_out_w[rsp_idx]; + assign mem_rsp_data_in_w = mem_rsp_data_out_w[rsp_idx]; assign mem_rsp_tag_in_w = SRC_TAG_WIDTH'(mem_rsp_tag_out[SRC_TAG_WIDTH+D-1:D]); assign mem_rsp_ready_out = mem_rsp_ready_in_w; end else if (DST_LDATAW < SRC_LDATAW) begin - + reg [D-1:0] req_ctr, rsp_ctr; reg [P-1:0][DST_DATA_WIDTH-1:0] mem_rsp_data_out_r, mem_rsp_data_out_n; wire mem_req_out_fire = mem_req_valid_out && mem_req_ready_out; - wire mem_rsp_in_fire = mem_rsp_valid_out && mem_rsp_ready_out; + wire mem_rsp_in_fire = mem_rsp_valid_out && mem_rsp_ready_out; wire [P-1:0][DST_DATA_WIDTH-1:0] mem_req_data_in_w = mem_req_data_in; wire [P-1:0][DST_DATA_SIZE-1:0] mem_req_byteen_in_w = mem_req_byteen_in; always @(*) begin mem_rsp_data_out_n = mem_rsp_data_out_r; - if (mem_rsp_in_fire) begin + if (mem_rsp_in_fire) begin mem_rsp_data_out_n[rsp_ctr] = mem_rsp_data_out; end end @@ -139,24 +139,24 @@ module VX_mem_adapter #( if (mem_rsp_in_fire) begin rsp_ctr <= rsp_ctr + 1; end - end + end mem_rsp_data_out_r <= mem_rsp_data_out_n; end reg [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_r; wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_x; - + always @(posedge clk) begin if (mem_rsp_in_fire) begin mem_rsp_tag_in_r <= mem_rsp_tag_out; - end + end end assign mem_rsp_tag_in_x = (rsp_ctr != 0) ? 
mem_rsp_tag_in_r : mem_rsp_tag_out; - `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_x == mem_rsp_tag_out), + `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_x == mem_rsp_tag_out), ("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_x, mem_rsp_tag_out)) wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr}; - + if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin `UNUSED_VAR (mem_req_addr_in_qual) assign mem_req_addr_out_w = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0]; @@ -181,8 +181,8 @@ module VX_mem_adapter #( end else begin `UNUSED_VAR (clk) - `UNUSED_VAR (reset) - + `UNUSED_VAR (reset) + if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin `UNUSED_VAR (mem_req_addr_in) assign mem_req_addr_out_w = mem_req_addr_in[DST_ADDR_WIDTH-1:0]; diff --git a/hw/rtl/libs/VX_mem_coalescer.sv b/hw/rtl/libs/VX_mem_coalescer.sv index 17eb01642..d1ffde09a 100644 --- a/hw/rtl/libs/VX_mem_coalescer.sv +++ b/hw/rtl/libs/VX_mem_coalescer.sv @@ -87,16 +87,16 @@ module VX_mem_coalescer #( localparam STATE_SETUP = 0; localparam STATE_SEND = 1; - reg state_r, state_n; + logic state_r, state_n; - reg out_req_valid_r, out_req_valid_n; - reg out_req_rw_r, out_req_rw_n; - reg [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n; - reg [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n; - reg [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n; - reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n; - reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n; - reg [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n; + logic out_req_valid_r, out_req_valid_n; + logic out_req_rw_r, out_req_rw_n; + logic [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n; + logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n; + logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n; + logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, 
out_req_byteen_n; + logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n; + logic [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n; reg in_req_ready_n; @@ -135,7 +135,11 @@ module VX_mem_coalescer #( `UNUSED_PIN (onehot), .valid_out (batch_valid_n[i]) ); - assign seed_idx[i] = NUM_REQS_W'(i * DATA_RATIO) + NUM_REQS_W'(batch_idx); + if (OUT_REQS > 1) begin + assign seed_idx[i] = {(NUM_REQS_W-DATA_RATIO_W)'(i), batch_idx}; + end else begin + assign seed_idx[i] = batch_idx; + end end for (genvar i = 0; i < OUT_REQS; ++i) begin @@ -149,29 +153,6 @@ module VX_mem_coalescer #( end end - always @(posedge clk) begin - if (reset) begin - state_r <= STATE_SETUP; - processed_mask_r <= '0; - out_req_valid_r <= 0; - end else begin - state_r <= state_n; - batch_valid_r <= batch_valid_n; - seed_addr_r <= seed_addr_n; - seed_atype_r <= seed_atype_n; - addr_matches_r <= addr_matches_n; - out_req_valid_r <= out_req_valid_n; - out_req_mask_r <= out_req_mask_n; - out_req_rw_r <= out_req_rw_n; - out_req_addr_r <= out_req_addr_n; - out_req_atype_r <= out_req_atype_n; - out_req_byteen_r <= out_req_byteen_n; - out_req_data_r <= out_req_data_n; - out_req_tag_r <= out_req_tag_n; - processed_mask_r <= processed_mask_n; - end - end - wire [NUM_REQS-1:0] current_pmask = in_req_mask & addr_matches_r; reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] req_byteen_merged; @@ -248,6 +229,17 @@ module VX_mem_coalescer #( endcase end + VX_pipe_register #( + .DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + ATYPE_WIDTH + OUT_ADDR_WIDTH + ATYPE_WIDTH + DATA_OUT_SIZE + DATA_OUT_WIDTH) + OUT_TAG_WIDTH), + .RESETW (1 + NUM_REQS + 1) + ) pipe_reg ( + .clk (clk), + .reset (reset), + .enable (1'b1), + .data_in ({state_n, processed_mask_n, out_req_valid_n, out_req_rw_n, addr_matches_n, batch_valid_n, out_req_mask_n, seed_addr_n, seed_atype_n, out_req_addr_n, out_req_atype_n, out_req_byteen_n, out_req_data_n, out_req_tag_n}), + .data_out 
({state_r, processed_mask_r, out_req_valid_r, out_req_rw_r, addr_matches_r, batch_valid_r, out_req_mask_r, seed_addr_r, seed_atype_r, out_req_addr_r, out_req_atype_r, out_req_byteen_r, out_req_data_r, out_req_tag_r}) + ); + wire out_rsp_fire = out_rsp_valid && out_rsp_ready; wire out_rsp_eop; diff --git a/hw/rtl/libs/VX_onehot_encoder.sv b/hw/rtl/libs/VX_onehot_encoder.sv index 92c7d1ea1..8f7ada257 100644 --- a/hw/rtl/libs/VX_onehot_encoder.sv +++ b/hw/rtl/libs/VX_onehot_encoder.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,13 +23,13 @@ module VX_onehot_encoder #( parameter MODEL = 1, parameter LN = `LOG2UP(N) ) ( - input wire [N-1:0] data_in, + input wire [N-1:0] data_in, output wire [LN-1:0] data_out, output wire valid_out -); +); if (N == 1) begin - assign data_out = data_in; + assign data_out = 0; assign valid_out = data_in; end else if (N == 2) begin @@ -37,43 +37,43 @@ module VX_onehot_encoder #( assign data_out = data_in[!REVERSE]; assign valid_out = (| data_in); - end else if (MODEL == 1) begin - localparam M = 1 << LN; - `IGNORE_UNOPTFLAT_BEGIN + end else if (MODEL == 1) begin + localparam M = 1 << LN; + `IGNORE_UNOPTFLAT_BEGIN wire [LN-1:0][M-1:0] addr; wire [LN:0][M-1:0] v; `IGNORE_UNOPTFLAT_END - + // base case, also handle padding for non-power of two inputs assign v[0] = REVERSE ? 
(M'(data_in) << (M - N)) : M'(data_in); - + for (genvar lvl = 1; lvl < (LN+1); ++lvl) begin localparam SN = 1 << (LN - lvl); localparam SI = M / SN; localparam SW = lvl; - + for (genvar s = 0; s < SN; ++s) begin `IGNORE_UNOPTFLAT_BEGIN wire [1:0] vs = {v[lvl-1][s*SI+(SI>>1)], v[lvl-1][s*SI]}; `IGNORE_UNOPTFLAT_END - + assign v[lvl][s*SI] = (| vs); if (lvl == 1) begin - assign addr[lvl-1][s*SI +: SW] = vs[!REVERSE]; + assign addr[lvl-1][s*SI +: SW] = vs[!REVERSE]; end else begin - assign addr[lvl-1][s*SI +: SW] = { + assign addr[lvl-1][s*SI +: SW] = { vs[!REVERSE], addr[lvl-2][s*SI +: SW-1] | addr[lvl-2][s*SI+(SI>>1) +: SW-1] }; - end - end - end - + end + end + end + assign data_out = addr[LN-1][LN-1:0]; assign valid_out = v[LN][0]; - end else if (MODEL == 2 && REVERSE == 0) begin + end else if (MODEL == 2 && REVERSE == 0) begin for (genvar j = 0; j < LN; ++j) begin wire [N-1:0] mask; @@ -90,19 +90,19 @@ module VX_onehot_encoder #( reg [LN-1:0] index_r; if (REVERSE != 0) begin - always @(*) begin - index_r = 'x; + always @(*) begin + index_r = 'x; for (integer i = N-1; i >= 0; --i) begin - if (data_in[i]) begin + if (data_in[i]) begin index_r = LN'(N-1-i); end end end end else begin - always @(*) begin - index_r = 'x; + always @(*) begin + index_r = 'x; for (integer i = 0; i < N; ++i) begin - if (data_in[i]) begin + if (data_in[i]) begin index_r = LN'(i); end end diff --git a/hw/rtl/libs/VX_onehot_mux.sv b/hw/rtl/libs/VX_onehot_mux.sv index 8d9b87c8e..cc0fffaa6 100644 --- a/hw/rtl/libs/VX_onehot_mux.sv +++ b/hw/rtl/libs/VX_onehot_mux.sv @@ -17,7 +17,8 @@ module VX_onehot_mux #( parameter DATAW = 1, parameter N = 1, - parameter MODEL = 1 + parameter MODEL = 1, + parameter LUT_OPT = 0 ) ( input wire [N-1:0][DATAW-1:0] data_in, input wire [N-1:0] sel_in, @@ -26,6 +27,90 @@ module VX_onehot_mux #( if (N == 1) begin `UNUSED_VAR (sel_in) assign data_out = data_in; + end else if (LUT_OPT && N == 2) begin + `UNUSED_VAR (sel_in) + assign data_out = sel_in[0] ? 
data_in[0] : data_in[1]; + end else if (LUT_OPT && N == 3) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 3'b001: data_out_r = data_in[0]; + 3'b010: data_out_r = data_in[1]; + 3'b100: data_out_r = data_in[2]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 4) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 4'b0001: data_out_r = data_in[0]; + 4'b0010: data_out_r = data_in[1]; + 4'b0100: data_out_r = data_in[2]; + 4'b1000: data_out_r = data_in[3]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 5) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 5'b00001: data_out_r = data_in[0]; + 5'b00010: data_out_r = data_in[1]; + 5'b00100: data_out_r = data_in[2]; + 5'b01000: data_out_r = data_in[3]; + 5'b10000: data_out_r = data_in[4]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 6) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 6'b000001: data_out_r = data_in[0]; + 6'b000010: data_out_r = data_in[1]; + 6'b000100: data_out_r = data_in[2]; + 6'b001000: data_out_r = data_in[3]; + 6'b010000: data_out_r = data_in[4]; + 6'b100000: data_out_r = data_in[5]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 7) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 7'b0000001: data_out_r = data_in[0]; + 7'b0000010: data_out_r = data_in[1]; + 7'b0000100: data_out_r = data_in[2]; + 7'b0001000: data_out_r = data_in[3]; + 7'b0010000: data_out_r = data_in[4]; + 7'b0100000: data_out_r = data_in[5]; + 7'b1000000: data_out_r = data_in[6]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 8) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 8'b00000001: 
data_out_r = data_in[0]; + 8'b00000010: data_out_r = data_in[1]; + 8'b00000100: data_out_r = data_in[2]; + 8'b00001000: data_out_r = data_in[3]; + 8'b00010000: data_out_r = data_in[4]; + 8'b00100000: data_out_r = data_in[5]; + 8'b01000000: data_out_r = data_in[6]; + 8'b10000000: data_out_r = data_in[7]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; end else if (MODEL == 1) begin wire [N-1:0][DATAW-1:0] mask; for (genvar i = 0; i < N; ++i) begin diff --git a/hw/rtl/libs/VX_pe_serializer.sv b/hw/rtl/libs/VX_pe_serializer.sv index 7060c258c..eac1eddcb 100644 --- a/hw/rtl/libs/VX_pe_serializer.sv +++ b/hw/rtl/libs/VX_pe_serializer.sv @@ -21,7 +21,8 @@ module VX_pe_serializer #( parameter DATA_IN_WIDTH = 1, parameter DATA_OUT_WIDTH = 1, parameter TAG_WIDTH = 0, - parameter PE_REG = 0 + parameter PE_REG = 0, + parameter OUT_BUF = 0 ) ( input wire clk, input wire reset, @@ -43,6 +44,11 @@ module VX_pe_serializer #( output wire [TAG_WIDTH-1:0] tag_out, input wire ready_out ); + wire valid_out_u; + wire [NUM_LANES-1:0][DATA_OUT_WIDTH-1:0] data_out_u; + wire [TAG_WIDTH-1:0] tag_out_u; + wire ready_out_u; + wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_in_s; wire valid_out_s; wire [TAG_WIDTH-1:0] tag_out_s; @@ -105,7 +111,7 @@ module VX_pe_serializer #( reg [TAG_WIDTH-1:0] tag_out_r; wire valid_out_b = valid_out_s && batch_out_done; - wire ready_out_b = ready_out || ~valid_out; + wire ready_out_b = ready_out_u || ~valid_out_u; always @(posedge clk) begin if (reset) begin @@ -119,29 +125,42 @@ module VX_pe_serializer #( end end - assign enable = ready_out_b || ~valid_out_b; - assign ready_in = enable && batch_in_done; + assign enable = ready_out_b || ~valid_out_b; + assign ready_in = enable && batch_in_done; + assign pe_enable = enable; - assign pe_enable = enable; - - assign valid_out = valid_out_r; - assign data_out = data_out_r; - assign tag_out = tag_out_r; + assign valid_out_u = valid_out_r; + assign data_out_u = data_out_r; + assign 
tag_out_u = tag_out_r; end else begin assign pe_data_in_s = data_in; - assign enable = ready_out || ~valid_out; - assign ready_in = enable; + assign enable = ready_out_u || ~valid_out_u; + assign ready_in = enable; + assign pe_enable = enable; - assign pe_enable = enable; - - assign valid_out = valid_out_s; - assign data_out = pe_data_out; - assign tag_out = tag_out_s; + assign valid_out_u = valid_out_s; + assign data_out_u = pe_data_out; + assign tag_out_u = tag_out_s; end + VX_elastic_buffer #( + .DATAW (NUM_LANES * DATA_OUT_WIDTH + TAG_WIDTH), + .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), + .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) + ) out_buf ( + .clk (clk), + .reset (reset), + .valid_in (valid_out_u), + .ready_in (ready_out_u), + .data_in ({data_out_u, tag_out_u}), + .data_out ({data_out, tag_out}), + .valid_out (valid_out), + .ready_out (ready_out) + ); + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_pipe_buffer.sv b/hw/rtl/libs/VX_pipe_buffer.sv index 75a4579a0..167235c17 100644 --- a/hw/rtl/libs/VX_pipe_buffer.sv +++ b/hw/rtl/libs/VX_pipe_buffer.sv @@ -1,11 +1,11 @@ // Copyright 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -24,39 +24,53 @@ `TRACING_OFF module VX_pipe_buffer #( - parameter DATAW = 1, - parameter PASSTHRU = 0 -) ( + parameter DATAW = 1, + parameter DEPTH = 1 +) ( input wire clk, input wire reset, input wire valid_in, - output wire ready_in, + output wire ready_in, input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out, input wire ready_out, output wire valid_out -); - if (PASSTHRU != 0) begin +); + if (DEPTH == 0) begin `UNUSED_VAR (clk) `UNUSED_VAR (reset) assign ready_in = ready_out; - assign valid_out = valid_in; + assign valid_out = valid_in; assign data_out = data_in; end else begin - wire stall = valid_out && ~ready_out; + wire [DEPTH:0] valid; + `IGNORE_UNOPTFLAT_BEGIN + wire [DEPTH:0] ready; + `IGNORE_UNOPTFLAT_END + wire [DEPTH:0][DATAW-1:0] data; - VX_pipe_register #( - .DATAW (1 + DATAW), - .RESETW (1) - ) pipe_register ( - .clk (clk), - .reset (reset), - .enable (~stall), - .data_in ({valid_in, data_in}), - .data_out ({valid_out, data_out}) - ); + assign valid[0] = valid_in; + assign data[0] = data_in; + assign ready_in = ready[0]; + + for (genvar i = 0; i < DEPTH; ++i) begin + assign ready[i] = (ready[i+1] || ~valid[i+1]); + VX_pipe_register #( + .DATAW (1 + DATAW), + .RESETW (1) + ) pipe_register ( + .clk (clk), + .reset (reset), + .enable (ready[i]), + .data_in ({valid[i], data[i]}), + .data_out ({valid[i+1], data[i+1]}) + ); + end + + assign valid_out = valid[DEPTH]; + assign data_out = data[DEPTH]; + assign ready[DEPTH] = ready_out; - assign ready_in = ~stall; end endmodule diff --git a/hw/rtl/libs/VX_pipe_register.sv b/hw/rtl/libs/VX_pipe_register.sv index f8537ba78..707438abd 100644 --- a/hw/rtl/libs/VX_pipe_register.sv +++ b/hw/rtl/libs/VX_pipe_register.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,10 +14,11 @@ `include "VX_platform.vh" `TRACING_OFF -module VX_pipe_register #( - parameter DATAW = 1, - parameter RESETW = 0, - parameter DEPTH = 1 +module VX_pipe_register #( + parameter DATAW = 1, + parameter RESETW = 0, + parameter DEPTH = 1, + parameter MAX_FANOUT = 0 ) ( input wire clk, input wire reset, @@ -25,54 +26,76 @@ module VX_pipe_register #( input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out ); - if (DEPTH == 0) begin + if (DEPTH == 0) begin `UNUSED_VAR (clk) `UNUSED_VAR (reset) `UNUSED_VAR (enable) - assign data_out = data_in; - end else if (DEPTH == 1) begin - if (RESETW == 0) begin - `UNUSED_VAR (reset) - reg [DATAW-1:0] value; - - always @(posedge clk) begin - if (enable) begin - value <= data_in; - end + assign data_out = data_in; + end else if (DEPTH == 1) begin + if (MAX_FANOUT != 0 && (DATAW > (MAX_FANOUT + MAX_FANOUT/2))) begin + localparam NUM_SLICES = `CDIV(DATAW, MAX_FANOUT); + localparam N_DATAW = DATAW / NUM_SLICES; + for (genvar i = 0; i < NUM_SLICES; ++i) begin + localparam SLICE_START = i * N_DATAW; + localparam SLICE_END = SLICE_START + S_DATAW - 1; + localparam S_DATAW = (i == NUM_SLICES-1) ? (DATAW - SLICE_START) : N_DATAW; + localparam S_RESETW = (SLICE_END >= (DATAW - RESETW)) ? + ((SLICE_START >= (DATAW - RESETW)) ? 
S_DATAW : (SLICE_END - (DATAW - RESETW) + 1)) : 0; + VX_pipe_register #( + .DATAW (S_DATAW), + .RESETW (S_RESETW) + ) pipe_register_slice ( + .clk (clk), + .reset (reset), + .enable (enable), + .data_in (data_in[i * N_DATAW +: S_DATAW]), + .data_out (data_out[i * N_DATAW +: S_DATAW]) + ); end - assign data_out = value; - end else if (RESETW == DATAW) begin - reg [DATAW-1:0] value; - - always @(posedge clk) begin - if (reset) begin - value <= RESETW'(0); - end else if (enable) begin - value <= data_in; - end - end - assign data_out = value; end else begin - reg [DATAW-RESETW-1:0] value_d; - reg [RESETW-1:0] value_r; + if (RESETW == 0) begin + `UNUSED_VAR (reset) + reg [DATAW-1:0] value; - always @(posedge clk) begin - if (reset) begin - value_r <= RESETW'(0); - end else if (enable) begin - value_r <= data_in[DATAW-1:DATAW-RESETW]; + always @(posedge clk) begin + if (enable) begin + value <= data_in; + end end + assign data_out = value; + end else if (RESETW == DATAW) begin + reg [DATAW-1:0] value; + + always @(posedge clk) begin + if (reset) begin + value <= RESETW'(0); + end else if (enable) begin + value <= data_in; + end + end + assign data_out = value; + end else begin + reg [DATAW-RESETW-1:0] value_d; + reg [RESETW-1:0] value_r; + + always @(posedge clk) begin + if (reset) begin + value_r <= RESETW'(0); + end else if (enable) begin + value_r <= data_in[DATAW-1:DATAW-RESETW]; + end + end + + always @(posedge clk) begin + if (enable) begin + value_d <= data_in[DATAW-RESETW-1:0]; + end + end + assign data_out = {value_r, value_d}; end - - always @(posedge clk) begin - if (enable) begin - value_d <= data_in[DATAW-RESETW-1:0]; - end - end - assign data_out = {value_r, value_d}; end end else begin - wire [DEPTH:0][DATAW-1:0] data_delayed; + wire [DEPTH:0][DATAW-1:0] data_delayed; assign data_delayed[0] = data_in; for (genvar i = 1; i <= DEPTH; ++i) begin VX_pipe_register #( diff --git a/hw/rtl/libs/VX_reset_relay.sv b/hw/rtl/libs/VX_reset_relay.sv index 
23cc32f2f..d7e735c25 100644 --- a/hw/rtl/libs/VX_reset_relay.sv +++ b/hw/rtl/libs/VX_reset_relay.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,8 +21,8 @@ module VX_reset_relay #( input wire clk, input wire reset, output wire [N-1:0] reset_o -); - if (MAX_FANOUT >= 0 && N > MAX_FANOUT) begin +); + if (MAX_FANOUT >= 0 && N > (MAX_FANOUT + MAX_FANOUT/2)) begin localparam F = `UP(MAX_FANOUT); localparam R = N / F; `PRESERVE_NET reg [R-1:0] reset_r; @@ -38,6 +38,6 @@ module VX_reset_relay #( `UNUSED_VAR (clk) assign reset_o = {N{reset}}; end - + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_rr_arbiter.sv b/hw/rtl/libs/VX_rr_arbiter.sv index 5c5f7b3b4..52a981184 100644 --- a/hw/rtl/libs/VX_rr_arbiter.sv +++ b/hw/rtl/libs/VX_rr_arbiter.sv @@ -15,9 +15,10 @@ `TRACING_OFF module VX_rr_arbiter #( - parameter NUM_REQS = 1, - parameter MODEL = 1, - parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS) + parameter NUM_REQS = 1, + parameter MODEL = 1, + parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS), + parameter LUT_OPT = 0 ) ( input wire clk, input wire reset, @@ -37,7 +38,7 @@ module VX_rr_arbiter #( assign grant_onehot = requests; assign grant_valid = requests[0]; - end else if (NUM_REQS == 2) begin + end else if (LUT_OPT && NUM_REQS == 2) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -63,7 +64,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end /*else if (NUM_REQS == 3) begin + end else if (LUT_OPT && NUM_REQS == 3) begin reg 
[LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -93,7 +94,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end */else if (NUM_REQS == 4) begin + end else if (LUT_OPT && NUM_REQS == 4) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -129,7 +130,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end /*else if (NUM_REQS == 5) begin + end else if (LUT_OPT && NUM_REQS == 5) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -173,7 +174,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end else if (NUM_REQS == 6) begin + end else if (LUT_OPT && NUM_REQS == 6) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -227,7 +228,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end else if (NUM_REQS == 7) begin + end else if (LUT_OPT && NUM_REQS == 7) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -293,7 +294,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end */else if (NUM_REQS == 8) begin + end else if (LUT_OPT && NUM_REQS == 8) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; diff --git a/hw/rtl/libs/VX_sp_ram.sv b/hw/rtl/libs/VX_sp_ram.sv index 297a23d20..4ab2a9b7a 100644 --- a/hw/rtl/libs/VX_sp_ram.sv +++ b/hw/rtl/libs/VX_sp_ram.sv @@ -21,13 +21,16 @@ module VX_sp_ram #( parameter WRENW = 1, parameter OUT_REG = 0, parameter NO_RWCHECK = 0, + parameter RW_ASSERT = 0, parameter LUTRAM = 0, + parameter RESET_RAM = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, parameter ADDRW = `LOG2UP(SIZE) ) ( input wire clk, + input wire reset, input wire read, input wire write, input wire 
[WRENW-1:0] wren, @@ -42,13 +45,16 @@ module VX_sp_ram #( .WRENW (WRENW), .OUT_REG (OUT_REG), .NO_RWCHECK (NO_RWCHECK), + .RW_ASSERT (RW_ASSERT), .LUTRAM (LUTRAM), + .RESET_RAM (RESET_RAM), .INIT_ENABLE (INIT_ENABLE), .INIT_FILE (INIT_FILE), .INIT_VALUE (INIT_VALUE), .ADDRW (ADDRW) ) dp_ram ( .clk (clk), + .reset (reset), .read (read), .write (write), .wren (wren), diff --git a/hw/rtl/libs/VX_stream_arb.sv b/hw/rtl/libs/VX_stream_arb.sv index f9bb24f3d..98fed5859 100644 --- a/hw/rtl/libs/VX_stream_arb.sv +++ b/hw/rtl/libs/VX_stream_arb.sv @@ -18,7 +18,7 @@ module VX_stream_arb #( parameter NUM_INPUTS = 1, parameter NUM_OUTPUTS = 1, parameter DATAW = 1, - parameter `STRING ARBITER = "P", + parameter `STRING ARBITER = "R", parameter MAX_FANOUT = `MAX_FANOUT, parameter OUT_BUF = 0, parameter LUTRAM = 0, @@ -46,14 +46,14 @@ module VX_stream_arb #( for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - localparam BATCH_BEGIN = i * NUM_REQS; - localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_INPUTS); - localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN; + localparam SLICE_BEGIN = i * NUM_REQS; + localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_INPUTS); + localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; `RESET_RELAY (slice_reset, reset); VX_stream_arb #( - .NUM_INPUTS (BATCH_SIZE), + .NUM_INPUTS (SLICE_SIZE), .NUM_OUTPUTS (1), .DATAW (DATAW), .ARBITER (ARBITER), @@ -63,9 +63,9 @@ module VX_stream_arb #( ) arb_slice ( .clk (clk), .reset (slice_reset), - .valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]), - .ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]), - .data_in (data_in[BATCH_END-1: BATCH_BEGIN]), + .valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]), + .ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]), + .data_in (data_in[SLICE_END-1: SLICE_BEGIN]), .data_out (data_out[i]), .sel_out (sel_out[i]), .valid_out (valid_out[i]), @@ -73,32 +73,32 @@ module VX_stream_arb #( ); end - end else if (MAX_FANOUT != 0 && (NUM_INPUTS > MAX_FANOUT)) begin + end else if (MAX_FANOUT != 0 && 
(NUM_INPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin // (#inputs > max_fanout) and (#outputs == 1) - localparam NUM_BATCHES = `CDIV(NUM_INPUTS, MAX_FANOUT); + localparam NUM_SLICES = `CDIV(NUM_INPUTS, MAX_FANOUT); localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT); - localparam LOG_NUM_REQS3 = `CLOG2(NUM_BATCHES); + localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES); - wire [NUM_BATCHES-1:0] valid_tmp; - wire [NUM_BATCHES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp; - wire [NUM_BATCHES-1:0] ready_tmp; + wire [NUM_SLICES-1:0] valid_tmp; + wire [NUM_SLICES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp; + wire [NUM_SLICES-1:0] ready_tmp; - for (genvar i = 0; i < NUM_BATCHES; ++i) begin + for (genvar i = 0; i < NUM_SLICES; ++i) begin - localparam BATCH_BEGIN = i * MAX_FANOUT; - localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_INPUTS); - localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN; + localparam SLICE_BEGIN = i * MAX_FANOUT; + localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_INPUTS); + localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; wire [DATAW-1:0] data_tmp_u; - wire [`LOG2UP(BATCH_SIZE)-1:0] sel_tmp_u; + wire [`LOG2UP(SLICE_SIZE)-1:0] sel_tmp_u; `RESET_RELAY (slice_reset, reset); if (MAX_FANOUT != 1) begin VX_stream_arb #( - .NUM_INPUTS (BATCH_SIZE), + .NUM_INPUTS (SLICE_SIZE), .NUM_OUTPUTS (1), .DATAW (DATAW), .ARBITER (ARBITER), @@ -108,9 +108,9 @@ module VX_stream_arb #( ) fanout_slice_arb ( .clk (clk), .reset (slice_reset), - .valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]), - .data_in (data_in[BATCH_END-1: BATCH_BEGIN]), - .ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]), + .valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]), + .data_in (data_in[SLICE_END-1: SLICE_BEGIN]), + .ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]), .valid_out (valid_tmp[i]), .data_out (data_tmp_u), .sel_out (sel_tmp_u), @@ -125,7 +125,7 @@ module VX_stream_arb #( wire [LOG_NUM_REQS3-1:0] sel_out_u; VX_stream_arb #( - .NUM_INPUTS (NUM_BATCHES), + .NUM_INPUTS (NUM_SLICES), .NUM_OUTPUTS (1), .DATAW 
(DATAW + LOG_NUM_REQS2), .ARBITER (ARBITER), @@ -174,17 +174,9 @@ module VX_stream_arb #( ); assign valid_in_r = arb_valid; + assign data_in_r = data_in[arb_index]; assign arb_ready = ready_in_r; - VX_onehot_mux #( - .DATAW (DATAW), - .N (NUM_REQS) - ) onehot_mux ( - .data_in (data_in), - .sel_in (arb_onehot), - .data_out (data_in_r) - ); - for (genvar i = 0; i < NUM_REQS; ++i) begin assign ready_in[i] = ready_in_r && arb_onehot[i]; end @@ -214,15 +206,15 @@ module VX_stream_arb #( for (genvar i = 0; i < NUM_INPUTS; ++i) begin - localparam BATCH_BEGIN = i * NUM_REQS; - localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_OUTPUTS); - localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN; + localparam SLICE_BEGIN = i * NUM_REQS; + localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_OUTPUTS); + localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; `RESET_RELAY (slice_reset, reset); VX_stream_arb #( .NUM_INPUTS (1), - .NUM_OUTPUTS (BATCH_SIZE), + .NUM_OUTPUTS (SLICE_SIZE), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), @@ -234,30 +226,30 @@ module VX_stream_arb #( .valid_in (valid_in[i]), .ready_in (ready_in[i]), .data_in (data_in[i]), - .data_out (data_out[BATCH_END-1: BATCH_BEGIN]), - .valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]), - .ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]), + .data_out (data_out[SLICE_END-1: SLICE_BEGIN]), + .valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]), + .ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]), `UNUSED_PIN (sel_out) ); - for (genvar j = BATCH_BEGIN; j < BATCH_END; ++j) begin + for (genvar j = SLICE_BEGIN; j < SLICE_END; ++j) begin assign sel_out[j] = i; end end - end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > MAX_FANOUT)) begin + end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin // (#inputs == 1) and (#outputs > max_fanout) - localparam NUM_BATCHES = `CDIV(NUM_OUTPUTS, MAX_FANOUT); + localparam NUM_SLICES = `CDIV(NUM_OUTPUTS, MAX_FANOUT); - wire [NUM_BATCHES-1:0] valid_tmp; - wire 
[NUM_BATCHES-1:0][DATAW-1:0] data_tmp; - wire [NUM_BATCHES-1:0] ready_tmp; + wire [NUM_SLICES-1:0] valid_tmp; + wire [NUM_SLICES-1:0][DATAW-1:0] data_tmp; + wire [NUM_SLICES-1:0] ready_tmp; VX_stream_arb #( .NUM_INPUTS (1), - .NUM_OUTPUTS (NUM_BATCHES), + .NUM_OUTPUTS (NUM_SLICES), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), @@ -275,17 +267,17 @@ module VX_stream_arb #( `UNUSED_PIN (sel_out) ); - for (genvar i = 0; i < NUM_BATCHES; ++i) begin + for (genvar i = 0; i < NUM_SLICES; ++i) begin - localparam BATCH_BEGIN = i * MAX_FANOUT; - localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_OUTPUTS); - localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN; + localparam SLICE_BEGIN = i * MAX_FANOUT; + localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_OUTPUTS); + localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; `RESET_RELAY (slice_reset, reset); VX_stream_arb #( .NUM_INPUTS (1), - .NUM_OUTPUTS (BATCH_SIZE), + .NUM_OUTPUTS (SLICE_SIZE), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), @@ -297,9 +289,9 @@ module VX_stream_arb #( .valid_in (valid_tmp[i]), .ready_in (ready_tmp[i]), .data_in (data_tmp[i]), - .data_out (data_out[BATCH_END-1: BATCH_BEGIN]), - .valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]), - .ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]), + .data_out (data_out[SLICE_END-1: SLICE_BEGIN]), + .valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]), + .ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]), `UNUSED_PIN (sel_out) ); end @@ -357,9 +349,9 @@ module VX_stream_arb #( // #Inputs == #Outputs - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); - `RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1)); + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin VX_elastic_buffer #( .DATAW (DATAW), @@ -368,7 +360,7 @@ module VX_stream_arb #( .LUTRAM (LUTRAM) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (out_buf_reset[i]), .valid_in (valid_in[i]), .ready_in 
(ready_in[i]), .data_in (data_in[i]), diff --git a/hw/rtl/libs/VX_stream_pack.sv b/hw/rtl/libs/VX_stream_pack.sv index df0000307..7f024b184 100644 --- a/hw/rtl/libs/VX_stream_pack.sv +++ b/hw/rtl/libs/VX_stream_pack.sv @@ -39,8 +39,9 @@ module VX_stream_pack #( input wire ready_out ); if (NUM_REQS > 1) begin + localparam LOG_NUM_REQS = `CLOG2(NUM_REQS); - wire [NUM_REQS-1:0] grant_onehot; + wire [LOG_NUM_REQS-1:0] grant_index; wire grant_valid; wire grant_ready; @@ -52,21 +53,12 @@ module VX_stream_pack #( .reset (reset), .requests (valid_in), .grant_valid (grant_valid), - `UNUSED_PIN (grant_index), - .grant_onehot(grant_onehot), + .grant_index (grant_index), + `UNUSED_PIN (grant_onehot), .grant_ready (grant_ready) ); - wire [TAG_WIDTH-1:0] tag_sel; - - VX_onehot_mux #( - .DATAW (TAG_WIDTH), - .N (NUM_REQS) - ) onehot_mux ( - .data_in (tag_in), - .sel_in (grant_onehot), - .data_out (tag_sel) - ); + wire [TAG_WIDTH-1:0] tag_sel = tag_in[grant_index]; wire [NUM_REQS-1:0] tag_matches; diff --git a/hw/rtl/libs/VX_stream_switch.sv b/hw/rtl/libs/VX_stream_switch.sv index f73929071..3a905cb1d 100644 --- a/hw/rtl/libs/VX_stream_switch.sv +++ b/hw/rtl/libs/VX_stream_switch.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -33,7 +33,7 @@ module VX_stream_switch #( output wire [NUM_INPUTS-1:0] ready_in, output wire [NUM_OUTPUTS-1:0] valid_out, - output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out, + output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out, input wire [NUM_OUTPUTS-1:0] ready_out ); if (NUM_INPUTS > NUM_OUTPUTS) begin @@ -52,7 +52,7 @@ module VX_stream_switch #( assign data_in_r[i][j] = '0; end end - end + end wire [NUM_OUTPUTS-1:0] valid_out_r; wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_r; @@ -65,25 +65,24 @@ module VX_stream_switch #( for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin for (genvar j = 0; j < NUM_REQS; ++j) begin - localparam ii = i * NUM_REQS + j; - if (ii < NUM_INPUTS) begin + localparam ii = i * NUM_REQS + j; + if (ii < NUM_INPUTS) begin assign ready_in[ii] = ready_out_r[i] & (sel_in[i] == LOG_NUM_REQS'(j)); end end end + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - - `RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1)); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset), - .valid_in (valid_out_r[i]), + .reset (out_buf_reset[i]), + .valid_in (valid_out_r[i]), .ready_in (ready_out_r[i]), .data_in (data_out_r[i]), .data_out (data_out[i]), @@ -93,7 +92,7 @@ module VX_stream_switch #( end end else if (NUM_OUTPUTS > NUM_INPUTS) begin - + wire [NUM_INPUTS-1:0][NUM_REQS-1:0] valid_out_r; wire [NUM_INPUTS-1:0][NUM_REQS-1:0] ready_out_r; @@ -104,51 +103,50 @@ module VX_stream_switch #( assign ready_in[i] = ready_out_r[i][sel_in[i]]; end + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_INPUTS; ++i) begin for (genvar j = 0; j < NUM_REQS; ++j) begin localparam ii = i * NUM_REQS + j; if (ii < NUM_OUTPUTS) begin - - `RESET_RELAY (out_buf_reset, reset); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG 
(`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (out_buf_reset[ii]), .valid_in (valid_out_r[i][j]), .ready_in (ready_out_r[i][j]), - .data_in (data_in[i]), + .data_in (data_in[i]), .data_out (data_out[ii]), .valid_out (valid_out[ii]), .ready_out (ready_out[ii]) ); end else begin + `UNUSED_VAR (out_buf_reset[ii]) `UNUSED_VAR (valid_out_r[i][j]) assign ready_out_r[i][j] = '0; - end + end end end - + end else begin // #Inputs == #Outputs - + `UNUSED_VAR (sel_in) + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - - `RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1)); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (out_buf_reset[i]), .valid_in (valid_in[i]), .ready_in (ready_in[i]), .data_in (data_in[i]), @@ -159,6 +157,6 @@ module VX_stream_switch #( end end - + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_stream_xbar.sv b/hw/rtl/libs/VX_stream_xbar.sv index cb0d9a179..b7bdcbf5e 100644 --- a/hw/rtl/libs/VX_stream_xbar.sv +++ b/hw/rtl/libs/VX_stream_xbar.sv @@ -20,7 +20,7 @@ module VX_stream_xbar #( parameter DATAW = 4, parameter IN_WIDTH = `LOG2UP(NUM_INPUTS), parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS), - parameter ARBITER = "P", + parameter ARBITER = "R", parameter OUT_BUF = 0, parameter LUTRAM = 0, parameter MAX_FANOUT = `MAX_FANOUT, @@ -126,10 +126,9 @@ module VX_stream_xbar #( assign data_out_r = {NUM_OUTPUTS{data_in}}; assign ready_in = ready_out_r[sel_in]; + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - - `RESET_RELAY (out_buf_reset, reset); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), @@ -137,7 +136,7 @@ module VX_stream_xbar #( .LUTRAM (LUTRAM) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (out_buf_reset[i]), 
.valid_in (valid_out_r[i]), .ready_in (ready_out_r[i]), .data_in (data_out_r[i]), diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index f59ebae5b..3dce0ec43 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -94,7 +94,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_req_idx; wire [NUM_BANKS-1:0] per_bank_req_ready; - wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_all; + wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_aos; wire [NUM_REQS-1:0] req_valid_in; wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in; @@ -111,7 +111,8 @@ module VX_local_mem import VX_gpu_pkg::*; #( req_bank_addr[i], mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, - mem_bus_if[i].req_data.tag}; + mem_bus_if[i].req_data.tag + }; assign mem_bus_if[i].req_ready = req_ready_in[i]; end @@ -120,6 +121,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .NUM_OUTPUTS (NUM_BANKS), .DATAW (REQ_DATAW), .PERF_CTR_BITS (`PERF_CTR_BITS), + .ARBITER ("F"), .OUT_BUF (3) // output should be registered for the data_store addressing ) req_xbar ( .clk (clk), @@ -134,7 +136,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .sel_in (req_bank_idx), .ready_in (req_ready_in), .valid_out (per_bank_req_valid), - .data_out (per_bank_req_data_all), + .data_out (per_bank_req_data_aos), .sel_out (per_bank_req_idx), .ready_out (per_bank_req_ready) ); @@ -145,7 +147,8 @@ module VX_local_mem import VX_gpu_pkg::*; #( per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], - per_bank_req_tag[i]} = per_bank_req_data_all[i]; + per_bank_req_tag[i] + } = per_bank_req_data_aos[i]; end // banks access @@ -156,38 +159,55 @@ module VX_local_mem import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_rsp_tag; wire [NUM_BANKS-1:0] per_bank_rsp_ready; - `RESET_RELAY (bank_reset, reset); - for (genvar i = 0; i < NUM_BANKS; ++i) begin + wire bank_rsp_valid, bank_rsp_ready; + wire [WORD_WIDTH-1:0] 
bank_rsp_data; + + `RESET_RELAY_EN (bram_reset, reset, (NUM_BANKS > 1)); + VX_sp_ram #( .DATAW (WORD_WIDTH), .SIZE (WORDS_PER_BANK), - .WRENW (WORD_SIZE) + .WRENW (WORD_SIZE), + .NO_RWCHECK (1) ) data_store ( .clk (clk), - .read (1'b1), + .reset (bram_reset), + .read (per_bank_req_valid[i] && per_bank_req_ready[i] && ~per_bank_req_rw[i]), .write (per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]), .wren (per_bank_req_byteen[i]), .addr (per_bank_req_addr[i]), .wdata (per_bank_req_data[i]), - .rdata (per_bank_rsp_data[i]) + .rdata (bank_rsp_data) ); - // drop write response - wire per_bank_req_valid_w, per_bank_req_ready_w; - assign per_bank_req_valid_w = per_bank_req_valid[i] && ~per_bank_req_rw[i]; - assign per_bank_req_ready[i] = per_bank_req_ready_w || per_bank_req_rw[i]; + // read-during-write hazard detection + reg [BANK_ADDR_WIDTH-1:0] last_wr_addr; + reg last_wr_valid; + always @(posedge clk) begin + if (bram_reset) begin + last_wr_valid <= 0; + end else begin + last_wr_valid <= per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]; + end + last_wr_addr <= per_bank_req_addr[i]; + end + wire is_rdw_hazard = last_wr_valid && ~per_bank_req_rw[i] && (per_bank_req_addr[i] == last_wr_addr); - VX_elastic_buffer #( - .DATAW (REQ_SEL_WIDTH + TAG_WIDTH), - .SIZE (0) - ) bank_buf ( + // drop write response and stall on read-during-write hazard + assign bank_rsp_valid = per_bank_req_valid[i] && ~per_bank_req_rw[i] && ~is_rdw_hazard; + assign per_bank_req_ready[i] = (bank_rsp_ready || per_bank_req_rw[i]) && ~is_rdw_hazard; + + // register BRAM output + VX_pipe_buffer #( + .DATAW (REQ_SEL_WIDTH + WORD_WIDTH + TAG_WIDTH) + ) bram_buf ( .clk (clk), - .reset (bank_reset), - .valid_in (per_bank_req_valid_w), - .ready_in (per_bank_req_ready_w), - .data_in ({per_bank_req_idx[i], per_bank_req_tag[i]}), - .data_out ({per_bank_rsp_idx[i], per_bank_rsp_tag[i]}), + .reset (bram_reset), + .valid_in (bank_rsp_valid), + .ready_in (bank_rsp_ready), + 
.data_in ({per_bank_req_idx[i], bank_rsp_data, per_bank_req_tag[i]}), + .data_out ({per_bank_rsp_idx[i], per_bank_rsp_data[i], per_bank_rsp_tag[i]}), .valid_out (per_bank_rsp_valid[i]), .ready_out (per_bank_rsp_ready[i]) ); @@ -195,10 +215,10 @@ module VX_local_mem import VX_gpu_pkg::*; #( // bank responses gather - wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_all; + wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_aos; for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign per_bank_rsp_data_all[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]}; + assign per_bank_rsp_data_aos[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]}; end wire [NUM_REQS-1:0] rsp_valid_out; @@ -209,6 +229,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .NUM_INPUTS (NUM_BANKS), .NUM_OUTPUTS (NUM_REQS), .DATAW (RSP_DATAW), + .ARBITER ("P"), // this priority arbiter has negligeable impact om performance .OUT_BUF (OUT_BUF) ) rsp_xbar ( .clk (clk), @@ -216,7 +237,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( `UNUSED_PIN (collisions), .sel_in (per_bank_rsp_idx), .valid_in (per_bank_rsp_valid), - .data_in (per_bank_rsp_data_all), + .data_in (per_bank_rsp_data_aos), .ready_in (per_bank_rsp_ready), .valid_out (rsp_valid_out), .data_out (rsp_data_out), @@ -310,7 +331,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( always @(posedge clk) begin if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin if (mem_bus_if[i].req_data.rw) begin - `TRACE(1, ("%d: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", + `TRACE(1, ("%d: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, req_uuid[i])); end else begin `TRACE(1, ("%d: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n", @@ -318,7 +339,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( end end if (mem_bus_if[i].rsp_valid && 
mem_bus_if[i].rsp_ready) begin - `TRACE(1, ("%d: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%0h (#%0d)\n", + `TRACE(1, ("%d: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.tag, mem_bus_if[i].rsp_data.data[i], rsp_uuid[i])); end end @@ -328,7 +349,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( always @(posedge clk) begin if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin if (per_bank_req_rw[i]) begin - `TRACE(2, ("%d: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", + `TRACE(2, ("%d: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_uuid[i])); end else begin `TRACE(2, ("%d: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", @@ -336,7 +357,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( end end if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin - `TRACE(2, ("%d: %s-bank%0d rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n", + `TRACE(2, ("%d: %s-bank%0d rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, per_bank_rsp_tag[i], per_bank_rsp_data[i], per_bank_rsp_uuid[i])); end end diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile index 235c79c8d..62a9bb72c 100644 --- a/hw/syn/altera/opae/Makefile +++ b/hw/syn/altera/opae/Makefile @@ -73,12 +73,12 @@ ifneq ($(TARGET), fpga) CFLAGS += -DSIMULATION endif -# Debugigng +# Debugging ifdef DEBUG ifneq ($(TARGET), fpga) - CFLAGS += -DNDEBUG + CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS) else - CFLAGS += $(DBG_TRACE_FLAGS) + CFLAGS += -DNDEBUG endif else CFLAGS += -DNDEBUG diff --git a/hw/syn/altera/quartus/project.sdc b/hw/syn/altera/quartus/project.sdc index f6373a643..6ea508531 100644 --- a/hw/syn/altera/quartus/project.sdc +++ b/hw/syn/altera/quartus/project.sdc @@ -1 +1 @@ -create_clock -name {clk} -period "220 MHz" -waveform { 0.000 1.0 } [get_ports 
{clk}] \ No newline at end of file +create_clock -name {clk} -period "200 MHz" -waveform { 0.000 1.0 } [get_ports {clk}] \ No newline at end of file diff --git a/hw/syn/xilinx/README b/hw/syn/xilinx/README index b2218e65e..563c4c17e 100644 --- a/hw/syn/xilinx/README +++ b/hw/syn/xilinx/README @@ -45,6 +45,7 @@ FPGA_BIN_DIR= XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xr # build report logs /bin/vortex_afu.xclbin.info +/_x/logs/link/vivado.log # search for keyword "Very high fanout" /_x/reports/link/link/imp/impl_1_full_util_routed.rpt /_x/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt # search for keyword "VIOLATED" /_x/logs/link/syn/ulp_vortex_afu_1_0_synth_1_runme.log diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index f8f0f5cb0..38ae29f36 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -111,14 +111,14 @@ ifeq ($(TARGET), hw_emu) CFLAGS += -DSIMULATION endif -# Debugigng +# Debugging ifdef DEBUG VPP_FLAGS += -g --debug.protocol all ifneq ($(TARGET), hw) - CFLAGS += -DNDEBUG - else VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all - CFLAGS += $(DBG_TRACE_FLAGS) + CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS) + else + CFLAGS += -DNDEBUG endif else VPP_FLAGS += --optimize 3 diff --git a/hw/syn/yosys/Makefile b/hw/syn/yosys/Makefile index a0c4fdcc9..80bfdae02 100644 --- a/hw/syn/yosys/Makefile +++ b/hw/syn/yosys/Makefile @@ -49,7 +49,7 @@ endif RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache RTL_INCLUDE += $(FPU_INCLUDE) -# Debugigng +# Debugging ifdef DEBUG CFLAGS += $(DBG_TRACE_FLAGS) else diff --git a/hw/unittest/common.mk b/hw/unittest/common.mk index ac3e6b4ff..48aefd415 100644 --- a/hw/unittest/common.mk +++ b/hw/unittest/common.mk @@ -29,7 +29,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count() VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads 
$(THREADS) -# Debugigng +# Debugging ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S index 630856f3b..290b68058 100644 --- a/kernel/src/vx_start.S +++ b/kernel/src/vx_start.S @@ -13,6 +13,7 @@ #include #include +#include #include "common.h" .section .init, "ax" @@ -51,12 +52,10 @@ _start: # la t0, trap_entry # csrw mtvec, t0 - # register global termination functions - la a0, __libc_fini_array - call atexit - +#ifdef HAVE_INITFINI_ARRAY # run global initialization functions call __libc_init_array +#endif # call main program routine call main diff --git a/kernel/src/vx_syscalls.c b/kernel/src/vx_syscalls.c index 6ff9fbb97..6f9c829ad 100644 --- a/kernel/src/vx_syscalls.c +++ b/kernel/src/vx_syscalls.c @@ -119,70 +119,13 @@ void __libc_fini_array (void) { } #endif -/* -#define MAX_CORES 64 -volatile int g_cxa_locks[MAX_CORES] = {0}; -*/ - -void __cxa_lock() { - /*int core_id = vx_core_id(); - g_cxa_locks[core_id] = 1; - vx_fence(); - for (int i = 1; i < MAX_CORES; ++i) { - int other = (core_id + i) % MAX_CORES; - while (g_cxa_locks[other]) { - vx_fence(); // cache coherence not supported, so we need to flush the caches - } - }*/ -} - -void __cxa_unlock() { - /*vx_fence(); - int core_id = vx_core_id(); - g_cxa_locks[core_id] = 0;*/ -} - -#define MAX_FEXITS 64 - -typedef struct { - void (*f[MAX_FEXITS])(void*); - void *a[MAX_FEXITS]; -} fexit_list_t; - -static fexit_list_t g_fexit_list; -static int g_num_fexits = 0; - -void __funcs_on_exit() { - void (*func)(void *), *arg; - fexit_list_t* fexit_list = &g_fexit_list; - for (int i = 0; i < g_num_fexits; ++i) { - func = fexit_list->f[i]; - arg = fexit_list->a[i]; - func(arg); - } -} - -void __cxa_finalize(void *dso) {} - -int __cxa_atexit(void (*func)(void *), void *arg, void *dso) { - __cxa_lock(); - int num_fexits = g_num_fexits; - if (num_fexits >= MAX_FEXITS) - return -1; - fexit_list_t* fexit_list = &g_fexit_list; - 
fexit_list->f[num_fexits] = func; - fexit_list->a[num_fexits] = arg; - g_num_fexits = num_fexits + 1; - __cxa_unlock(); - return 0; -} - -static void call(void *p) { - ((void (*)(void))(uintptr_t)p)(); -} - -int atexit(void (*func)(void)) { - return __cxa_atexit(call, (void*)(uintptr_t)func, 0); +// This function will be called by LIBC at program exit. +// Since this platform only support statically linked programs, +// it is not required to support LIBC's exit functions registration via atexit(). +void __funcs_on_exit (void) { +#ifdef HAVE_INITFINI_ARRAY + __libc_fini_array(); +#endif } #ifdef __cplusplus diff --git a/runtime/common/common.h b/runtime/common/common.h index 37fec4846..62a807904 100644 --- a/runtime/common/common.h +++ b/runtime/common/common.h @@ -21,6 +21,7 @@ #include #include +#include #define CACHE_BLOCK_SIZE 64 diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h index 957e5d62a..0446e8f5d 100644 --- a/runtime/include/vortex.h +++ b/runtime/include/vortex.h @@ -34,6 +34,7 @@ typedef void* vx_buffer_h; #define VX_CAPS_GLOBAL_MEM_SIZE 0x5 #define VX_CAPS_LOCAL_MEM_SIZE 0x6 #define VX_CAPS_ISA_FLAGS 0x7 +#define VX_CAPS_NUM_MEM_BANKS 0x8 // device isa flags #define VX_ISA_STD_A (1ull << ISA_STD_A) diff --git a/runtime/opae/Makefile b/runtime/opae/Makefile index 3954d3f19..1a9810eca 100644 --- a/runtime/opae/Makefile +++ b/runtime/opae/Makefile @@ -30,7 +30,7 @@ else CXXFLAGS += -I$(SYN_DIR) endif -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/runtime/opae/vortex.cpp b/runtime/opae/vortex.cpp index 390d5acc4..06458fa1f 100755 --- a/runtime/opae/vortex.cpp +++ b/runtime/opae/vortex.cpp @@ -232,6 +232,9 @@ public: case VX_CAPS_ISA_FLAGS: _value = isa_caps_; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = MEMORY_BANKS; + break; default: fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id); std::abort(); diff --git a/runtime/rtlsim/Makefile b/runtime/rtlsim/Makefile index 4523be18d..f6adbf8c8 100644 
--- a/runtime/rtlsim/Makefile +++ b/runtime/rtlsim/Makefile @@ -19,7 +19,7 @@ LDFLAGS += -L$(DESTDIR) -lrtlsim SRCS := $(SRC_DIR)/vortex.cpp -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/runtime/rtlsim/vortex.cpp b/runtime/rtlsim/vortex.cpp index c75a6c12f..91df7f7e8 100644 --- a/runtime/rtlsim/vortex.cpp +++ b/runtime/rtlsim/vortex.cpp @@ -77,6 +77,9 @@ public: case VX_CAPS_ISA_FLAGS: _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = MEMORY_BANKS; + break; default: std::cout << "invalid caps id: " << caps_id << std::endl; std::abort(); diff --git a/runtime/simx/Makefile b/runtime/simx/Makefile index 7615f72b2..31ab483e7 100644 --- a/runtime/simx/Makefile +++ b/runtime/simx/Makefile @@ -19,7 +19,7 @@ LDFLAGS += -L$(DESTDIR) -lsimx SRCS := $(SRC_DIR)/vortex.cpp -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index e5ec36b60..1c8f47eaf 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -105,6 +105,9 @@ public: case VX_CAPS_ISA_FLAGS: _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = MEMORY_BANKS; + break; default: std::cout << "invalid caps id: " << caps_id << std::endl; std::abort(); diff --git a/runtime/stub/Makefile b/runtime/stub/Makefile index 6dc8d88f8..ae6e27ed1 100644 --- a/runtime/stub/Makefile +++ b/runtime/stub/Makefile @@ -12,7 +12,7 @@ LDFLAGS += -shared -pthread -ldl SRCS := $(SRC_DIR)/vortex.cpp $(SRC_DIR)/utils.cpp -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp index eea7691f5..c1f75f092 100644 --- a/runtime/stub/utils.cpp +++ b/runtime/stub/utils.cpp @@ -211,6 +211,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t mem_reads = 0; uint64_t mem_writes = 0; uint64_t mem_lat = 0; + 
uint64_t mem_req_counter = 0; + uint64_t mem_ticks = 0; uint64_t num_cores; CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), { @@ -221,6 +223,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), { return err; }); + + uint64_t num_mem_bank_ports; + CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_MEM_BANKS, &num_mem_bank_ports), { + return err; + }); bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE; bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE; @@ -314,7 +321,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (num_cores > 1) { uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core; int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core); - fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n" + fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n" , core_id , scrb_stalls_per_core , scrb_percent_per_core @@ -533,6 +540,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_CNTR, core_id, &mem_req_counter), { + return err; + }); + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_TICK, core_id, &mem_ticks), { + return err; + }); } } break; default: @@ -559,7 +572,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent); fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent); fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent); - fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, 
wctl=%d%%)\n" + fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n" , scrb_stalls , scrb_percent , calcAvgPercent(scrb_alu, scrb_total) @@ -599,7 +612,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads); int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes); int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls); - int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls); + int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls); fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads); fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes); fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio); @@ -609,8 +622,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } int mem_avg_lat = caclAverage(mem_lat, mem_reads); + int memory_bank_port_utilization = calcAvgPercent(mem_req_counter, (mem_ticks * num_mem_bank_ports)); fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat); + fprintf(stream, "PERF: memory bank port utilization=%d%%\n", memory_bank_port_utilization); } break; default: break; diff --git a/runtime/xrt/Makefile b/runtime/xrt/Makefile index 4a30c23cb..66d3e481b 100644 --- a/runtime/xrt/Makefile +++ b/runtime/xrt/Makefile @@ -26,7 +26,7 @@ endif PROJECT := libvortex-xrt.so -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/runtime/xrt/vortex.cpp b/runtime/xrt/vortex.cpp index 408bf23ed..5f4e27ff2 100644 --- a/runtime/xrt/vortex.cpp 
+++ b/runtime/xrt/vortex.cpp @@ -404,6 +404,9 @@ public: case VX_CAPS_ISA_FLAGS: _value = isa_caps_; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = MEMORY_BANKS; + break; default: fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id); std::abort(); diff --git a/sim/common/dram_sim.cpp b/sim/common/dram_sim.cpp index f7cfa8a32..684dd6f7d 100644 --- a/sim/common/dram_sim.cpp +++ b/sim/common/dram_sim.cpp @@ -41,11 +41,11 @@ public: dram_config["MemorySystem"]["DRAM"]["impl"] = "HBM2"; dram_config["MemorySystem"]["DRAM"]["org"]["preset"] = "HBM2_8Gb"; dram_config["MemorySystem"]["DRAM"]["org"]["density"] = 8192; + dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = 8; dram_config["MemorySystem"]["DRAM"]["timing"]["preset"] = "HBM2_2Gbps"; dram_config["MemorySystem"]["Controller"]["impl"] = "Generic"; dram_config["MemorySystem"]["Controller"]["Scheduler"]["impl"] = "FRFCFS"; dram_config["MemorySystem"]["Controller"]["RefreshManager"]["impl"] = "AllBank"; - dram_config["MemorySystem"]["Controller"]["RefreshManager"]["impl"] = "AllBank"; dram_config["MemorySystem"]["Controller"]["RowPolicy"]["impl"] = "OpenRowPolicy"; { YAML::Node draw_plugin; @@ -66,7 +66,7 @@ public: auto original_buf = std::cout.rdbuf(); std::cout.rdbuf(nullstream.rdbuf()); ramulator_frontend_->finalize(); - ramulator_memorysystem_->finalize(); + ramulator_memorysystem_->finalize(); std::cout.rdbuf(original_buf); } diff --git a/sim/common/mem.cpp b/sim/common/mem.cpp index e6e998fce..61dc38389 100644 --- a/sim/common/mem.cpp +++ b/sim/common/mem.cpp @@ -59,7 +59,7 @@ void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) { if ((addr & (wordSize_-1)) || (addr_end & (wordSize_-1)) || (addr_end <= contents_.size())) { - std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n"; + std::cout << "lookup of 0x" << std::hex << (addr_end-1) << std::dec << " failed.\n"; throw BadAddress(); } @@ -74,7 +74,7 @@ void RamMemDevice::write(const void* data, uint64_t addr, 
uint64_t size) { if ((addr & (wordSize_-1)) || (addr_end & (wordSize_-1)) || (addr_end <= contents_.size())) { - std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n"; + std::cout << "lookup of 0x" << std::hex << (addr_end-1) << std::dec << " failed.\n"; throw BadAddress(); } @@ -115,8 +115,7 @@ void MemoryUnit::ADecoder::map(uint64_t start, uint64_t end, MemDevice &md) { void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) { mem_accessor_t ma; if (!this->lookup(addr, size, &ma)) { - assert(0); - std::cout << "lookup of 0x" << std::hex << addr << " failed.\n"; + std::cout << "lookup of 0x" << std::hex << addr << std::dec << " failed.\n"; throw BadAddress(); } ma.md->read(data, ma.addr, size); @@ -125,8 +124,7 @@ void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) { void MemoryUnit::ADecoder::write(const void* data, uint64_t addr, uint64_t size) { mem_accessor_t ma; if (!this->lookup(addr, size, &ma)) { - assert(0); - std::cout << "lookup of 0x" << std::hex << addr << " failed.\n"; + std::cout << "lookup of 0x" << std::hex << addr << std::dec << " failed.\n"; throw BadAddress(); } ma.md->write(data, ma.addr, size); @@ -408,7 +406,7 @@ bool ACLManager::check(uint64_t addr, uint64_t size, int flags) const { while (it != acl_map_.end() && it->first < end) { if (it->second.end > addr) { if ((it->second.flags & flags) != flags) { - std::cout << "Memory access violation from 0x" << std::hex << addr << " to 0x" << end << ", curent flags=" << it->second.flags << ", access flags=" << flags << std::endl; + std::cout << "Memory access violation from 0x" << std::hex << addr << " to 0x" << end << ", curent flags=" << it->second.flags << ", access flags=" << flags << std::dec << std::endl; return false; // Overlapping entry is missing at least one required flag bit } addr = it->second.end; // Move to the end of the current matching range @@ -759,4 +757,4 @@ std::pair MemoryUnit::page_table_walk(uint64_t vAddr_bits, AC 
return std::make_pair(cur_base_ppn, flags); } -#endif \ No newline at end of file +#endif diff --git a/sim/common/simobject.h b/sim/common/simobject.h index f4c84e3f3..31fc4c0e6 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -168,23 +168,23 @@ public: {} void* operator new(size_t /*size*/) { - return allocator().allocate(); + return allocator_.allocate(); } void operator delete(void* ptr) { - allocator().deallocate(ptr); + allocator_.deallocate(ptr); } protected: Func func_; Pkt pkt_; - static MemoryPool>& allocator() { - static MemoryPool> instance(64); - return instance; - } + static MemoryPool> allocator_; }; +template +MemoryPool> SimCallEvent::allocator_(64); + /////////////////////////////////////////////////////////////////////////////// template @@ -201,23 +201,23 @@ public: {} void* operator new(size_t /*size*/) { - return allocator().allocate(); + return allocator_.allocate(); } void operator delete(void* ptr) { - allocator().deallocate(ptr); + allocator_.deallocate(ptr); } protected: const SimPort* port_; Pkt pkt_; - static MemoryPool>& allocator() { - static MemoryPool> instance(64); - return instance; - } + static MemoryPool> allocator_; }; +template +MemoryPool> SimPortEvent::allocator_(64); + /////////////////////////////////////////////////////////////////////////////// class SimContext; diff --git a/sim/common/util.h b/sim/common/util.h index 83fdee7df..fd234d279 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -70,4 +70,28 @@ const char* fileExtension(const char* filepath); #endif void *aligned_malloc(size_t size, size_t alignment); -void aligned_free(void *ptr); \ No newline at end of file +void aligned_free(void *ptr); + +namespace vortex { + +// Verilator data type casting +template +class VDataCast; +template +class VDataCast 8)>::type> { +public: + template + static R get(T& obj) { + return reinterpret_cast(obj.data()); + } +}; +template +class VDataCast::type> { +public: + template + static R get(T& obj) { + 
return reinterpret_cast(&obj); + } +}; + +} \ No newline at end of file diff --git a/sim/opaesim/Makefile b/sim/opaesim/Makefile index 7b0d543d2..2e549ca74 100644 --- a/sim/opaesim/Makefile +++ b/sim/opaesim/Makefile @@ -83,13 +83,13 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count() VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) -# Debugigng +# Debugging ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG - CXXFLAGS += -O3 -DNDEBUG + CXXFLAGS += -O2 -DNDEBUG endif # Enable scope analyzer @@ -123,7 +123,7 @@ $(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh $(SCRIPT_DIR)/gen_config.py -i $^ -o $@ $(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON) - verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@ + verilator --build --exe $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@ clean: rm -rf $(DESTDIR)/$(PROJECT).obj_dir diff --git a/sim/opaesim/opae_sim.cpp b/sim/opaesim/opae_sim.cpp index d6e06721d..7a1bae3e4 100644 --- a/sim/opaesim/opae_sim.cpp +++ b/sim/opaesim/opae_sim.cpp @@ -35,13 +35,13 @@ #include #include -#ifndef MEMORY_BANKS +//#ifndef MEMORY_BANKS #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS #else #define MEMORY_BANKS 2 #endif -#endif +//#endif #ifndef MEM_CLOCK_RATIO #define MEM_CLOCK_RATIO 1 @@ -380,7 +380,7 @@ private: device_->vcp2af_sRxPort_c0_hdr_resp_type = 0; memcpy(device_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE); device_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; - /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata); + /*printf("%0ld: [sim] CCI Rd Rsp: addr=0x%lx, mdata=0x%x, data=0x", timestamp, cci_rd_it->addr, 
cci_rd_it->mdata); for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]); printf("\n");*/ @@ -398,7 +398,7 @@ private: cci_req.mdata = device_->af2cp_sTxPort_c0_hdr_mdata; auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE); - //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); + //printf("%0ld: [sim] CCI Rd Req: addr=0x%lx, mdata=0x%x\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); cci_reads_.emplace_back(cci_req); } @@ -453,7 +453,7 @@ private: } } - /*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, byte_addr); + /*printf("%0ld: [sim] MEM Wr Req: bank=%d, 0x%x, data=0x", timestamp, b, byte_addr); for (int i = 0; i < MEM_BLOCK_SIZE; i++) { printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); } diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index e9487a2f4..3deffc759 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -65,7 +65,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count() VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) -# Debugigng +# Debugging ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) diff --git a/sim/rtlsim/processor.cpp b/sim/rtlsim/processor.cpp index 2c31f939b..e5e00f49e 100644 --- a/sim/rtlsim/processor.cpp +++ b/sim/rtlsim/processor.cpp @@ -39,6 +39,7 @@ typedef VVortex Device; #include #include +#include #ifndef MEMORY_BANKS #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS @@ -316,11 +317,11 @@ private: auto mem_rsp_it = pending_mem_reqs_.begin(); auto mem_rsp = *mem_rsp_it; /* - printf("%0ld: [sim] MEM Rd Rsp: addr=%0lx, data=", timestamp, mem_rsp->addr); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); + printf("%0ld: [sim] MEM Rd Rsp: 
addr=0x%0lx, data=0x", timestamp, mem_rsp->addr); + for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { + printf("%02x", mem_rsp->block[i]); + } + printf("\n"); */ device_->m_axi_rvalid[0] = 1; device_->m_axi_rid[0] = mem_rsp->tag; @@ -347,7 +348,7 @@ private: auto mem_rsp_it = pending_mem_reqs_.begin(); auto mem_rsp = *mem_rsp_it; /* - printf("%0ld: [sim] MEM Wr Rsp: addr=%0lx\n", timestamp, mem_rsp->addr); + printf("%0ld: [sim] MEM Wr Rsp: addr=0x%0lx\n", timestamp, mem_rsp->addr); */ device_->m_axi_bvalid[0] = 1; device_->m_axi_bid[0] = mem_rsp->tag; @@ -387,11 +388,15 @@ private: } else { // process writes /* - printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); + printf("%0ld: [sim] MEM Wr: addr=0x%0lx, byteen=0x", timestamp, base_addr); + for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) { + printf("%x", (int)((byteen >> (4 * i)) & 0xf)); + } + printf(", data=0x"); + for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { + printf("%02x", data[i]); + } + printf("\n"); */ for (int i = 0; i < MEM_BLOCK_SIZE; i++) { if ((byteen >> i) & 0x1) { @@ -459,13 +464,13 @@ private: auto mem_rsp_it = pending_mem_reqs_.begin(); auto mem_rsp = *mem_rsp_it; /* - printf("%0ld: [sim] MEM Rd: tag=%0lx, addr=%0lx, data=", timestamp, mem_rsp->tag, mem_rsp->addr); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); + printf("%0ld: [sim] MEM Rd Rsp: tag=0x%0lx, addr=0x%0lx, data=0x", timestamp, mem_rsp->tag, mem_rsp->addr); + for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { + printf("%02x", mem_rsp->block[i]); + } + printf("\n"); */ - memcpy(device_->mem_rsp_data.data(), mem_rsp->block.data(), MEM_BLOCK_SIZE); + memcpy(VDataCast::get(device_->mem_rsp_data), mem_rsp->block.data(), MEM_BLOCK_SIZE); device_->mem_rsp_tag = mem_rsp->tag; pending_mem_reqs_.erase(mem_rsp_it); mem_rd_rsp_active_ 
= true; @@ -480,7 +485,7 @@ private: uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE); if (device_->mem_req_rw) { auto byteen = device_->mem_req_byteen; - auto data = (uint8_t*)(device_->mem_req_data.data()); + auto data = VDataCast::get(device_->mem_req_data); if (byte_addr >= uint64_t(IO_COUT_ADDR) && byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) { @@ -499,11 +504,15 @@ private: } else { // process writes /* - printf("%0ld: [sim] MEM Wr: tag=%0lx, addr=%0x, byteen=%0lx, data=", timestamp, device_->mem_req_tag, byte_addr, byteen); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); + printf("%0ld: [sim] MEM Wr Req: tag=0x%0lx, addr=0x%0lx, byteen=0x", timestamp, device_->mem_req_tag, byte_addr); + for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) { + printf("%x", (int)((byteen >> (4 * i)) & 0xf)); + } + printf(", data=0x"); + for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { + printf("%d=%02x,", i, data[i]); + } + printf("\n"); */ for (int i = 0; i < MEM_BLOCK_SIZE; i++) { if ((byteen >> i) & 0x1) { @@ -530,7 +539,7 @@ private: ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE); pending_mem_reqs_.emplace_back(mem_req); - //printf("%0ld: [sim] MEM Rd Req: addr=%0x, tag=%0lx\n", timestamp, byte_addr, device_->mem_req_tag); + //printf("%0ld: [sim] MEM Rd Req: addr=0x%0lx, tag=0x%0lx\n", timestamp, byte_addr, device_->mem_req_tag); // send dram request dram_queue_.push(mem_req); diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 8520e5191..33120b13c 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -24,7 +24,7 @@ LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulato SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp 
$(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) #CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer diff --git a/sim/simx/cache_cluster.h b/sim/simx/cache_cluster.h index 63016577b..2ba26dc21 100644 --- a/sim/simx/cache_cluster.h +++ b/sim/simx/cache_cluster.h @@ -77,8 +77,8 @@ public: caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i)); } - caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i)); - cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort); + caches_.at(i)->MemReqPorts.at(0).bind(&cache_arb->ReqIn.at(i)); + cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPorts.at(0)); } cache_arb->ReqOut.at(0).bind(&this->MemReqPort); diff --git a/sim/simx/cache_sim.cpp b/sim/simx/cache_sim.cpp index 65a8da70b..4f357f195 100644 --- a/sim/simx/cache_sim.cpp +++ b/sim/simx/cache_sim.cpp @@ -19,6 +19,7 @@ #include #include #include +#include using namespace vortex; @@ -315,27 +316,75 @@ public: simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i)); bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i)); } - bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0)); + bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPorts.at(0)); + simobject->MemRspPorts.at(0).bind(&bypass_switch_->RspOut.at(0)); return; } - bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); - bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0)); + if (strcmp(simobject->name().c_str(), "l3cache")) { + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); + bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPorts.at(0)); + 
simobject->MemRspPorts.at(0).bind(&bypass_switch_->RspOut.at(0)); - if (config.B != 0) { - snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); - bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B)); - for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { - mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); - bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); + if (config.B != 0) { + snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); + bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B)); + for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { + mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); + bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); + } + bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0)); + } else { + mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); } - bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0)); } else { - mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); + // TODO: Change this into a crossbar + uint32_t max = MAX(2, config_.num_inputs); + //printf("%s connecting\n", simobject_->name().c_str()); + //3 + if (config.B != 0) { + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, max, max); + for (uint32_t i = 0; i < max; ++i) { + //printf("%s connecting input=%d to MemPorts\n", simobject_->name().c_str(), i); + bypass_switch_->ReqOut.at(i).bind(&simobject->MemReqPorts.at(i % (1 << config.B))); + simobject->MemRspPorts.at(i % (1 << config.B)).bind(&bypass_switch_->RspOut.at(i)); + } + } else { + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); + bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPorts.at(0)); + 
simobject->MemRspPorts.at(0).bind(&bypass_switch_->RspOut.at(0)); + } + + if (config.B != 0) + { + snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); + bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B), (1 << config.B)); + for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) + { + //1 + //printf("%s Connecting memory ports to bank=%d\n", simobject_->name().c_str(), i); + mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); + bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); + } + //2 + if (config_.num_inputs > 1) { + for (uint32_t i = 0; i < max; ++i) { + //printf("%s connecting bank and bypass port=%d\n", simobject_->name().c_str(), i); + bank_switch_->ReqOut.at(i % (1 << config.B)).bind(&bypass_switch_->ReqIn.at(i)); + bypass_switch_->RspIn.at(i).bind(&bank_switch_->RspOut.at(i % (1 << config.B))); + } + } else { + bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0)); + } + } + else + { + mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); + } } // calculate cache initialization cycles @@ -673,8 +722,8 @@ CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config : SimObject(ctx, name) , CoreReqPorts(config.num_inputs, this) , CoreRspPorts(config.num_inputs, this) - , MemReqPort(this) - , MemRspPort(this) + , MemReqPorts(NUM_MEM_PORTS, this) + , MemRspPorts(NUM_MEM_PORTS, this) , impl_(new Impl(this, config)) {} diff --git a/sim/simx/cache_sim.h b/sim/simx/cache_sim.h index df62bf854..aad489546 100644 --- a/sim/simx/cache_sim.h +++ b/sim/simx/cache_sim.h @@ -75,8 +75,8 @@ public: std::vector> CoreReqPorts; std::vector> CoreRspPorts; - SimPort MemReqPort; - SimPort MemRspPort; + std::vector> MemReqPorts; + std::vector> MemRspPorts; CacheSim(const SimContext& ctx, const char* name, const Config& config); ~CacheSim(); diff --git a/sim/simx/cluster.cpp 
b/sim/simx/cluster.cpp index 25669e26b..56e05e7a5 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -76,8 +76,8 @@ Cluster::Cluster(const SimContext& ctx, 2, // pipeline latency }); - l2cache_->MemReqPort.bind(&this->mem_req_port); - this->mem_rsp_port.bind(&l2cache_->MemRspPort); + l2cache_->MemReqPorts.at(0).bind(&this->mem_req_port); + this->mem_rsp_port.bind(&l2cache_->MemRspPorts.at(0)); icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0)); l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0)); diff --git a/sim/simx/constants.h b/sim/simx/constants.h index 09a509ce1..0c707b55c 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -21,10 +21,6 @@ #define MEM_CLOCK_RATIO 1 #endif -#ifndef MEMORY_BANKS -#define MEMORY_BANKS 2 -#endif - #define LSU_WORD_SIZE (XLEN / 8) #define LSU_CHANNELS NUM_LSU_LANES #define LSU_NUM_REQS (NUM_LSU_BLOCKS * LSU_CHANNELS) diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 6f817a3ae..82af146a3 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -215,7 +215,7 @@ void Core::fetch() { auto& mem_rsp = icache_rsp_port.front(); auto trace = pending_icache_.at(mem_rsp.tag); decode_latch_.push(trace); - DT(3, "icache-rsp: addr=0x" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); + DT(3, "icache-rsp: addr=0x" << std::hex << trace->PC << ", tag=0x" << mem_rsp.tag << std::dec << ", " << *trace); pending_icache_.release(mem_rsp.tag); icache_rsp_port.pop(); --pending_ifetches_; @@ -232,7 +232,7 @@ void Core::fetch() { mem_req.cid = trace->cid; mem_req.uuid = trace->uuid; icache_req_ports.at(0).push(mem_req, 2); - DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); + DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=0x" << mem_req.tag << std::dec << ", " << *trace); fetch_latch_.pop(); ++perf_stats_.ifetches; ++pending_ifetches_; diff --git a/sim/simx/dcrs.cpp b/sim/simx/dcrs.cpp index 
bce4639ba..242d630eb 100644 --- a/sim/simx/dcrs.cpp +++ b/sim/simx/dcrs.cpp @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,13 +16,13 @@ using namespace vortex; -void DCRS::write(uint32_t addr, uint32_t value) { +void DCRS::write(uint32_t addr, uint32_t value) { if (addr >= VX_DCR_BASE_STATE_BEGIN && addr < VX_DCR_BASE_STATE_END) { base_dcrs.write(addr, value); return; } - std::cout << std::hex << "Error: invalid global DCR addr=0x" << addr << std::endl; + std::cout << "Error: invalid global DCR addr=0x" << std::hex << addr << std::dec << std::endl; std::abort(); } \ No newline at end of file diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp index 6cefe378f..dba57c4ef 100644 --- a/sim/simx/decode.cpp +++ b/sim/simx/decode.cpp @@ -416,19 +416,19 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) { int sep = 0; if (instr.getRDType() != RegType::None) { if (sep++ != 0) { os << ", "; } else { os << " "; } - os << instr.getRDType() << std::dec << instr.getRDest(); + os << instr.getRDType() << instr.getRDest(); } for (uint32_t i = 0; i < instr.getNRSrc(); ++i) { if (sep++ != 0) { os << ", "; } else { os << " "; } if (instr.getRSType(i) != RegType::None) { - os << instr.getRSType(i) << std::dec << instr.getRSrc(i); + os << instr.getRSType(i) << instr.getRSrc(i); } else { - os << "0x" << std::hex << instr.getRSrc(0); + os << "0x" << std::hex << instr.getRSrc(0) << std::dec; } } if (instr.hasImm()) { if (sep++ != 0) { os << ", "; } else { os << " "; } - os << "0x" << std::hex << instr.getImm(); + os << 
"0x" << std::hex << instr.getImm() << std::dec; } return os; } @@ -450,7 +450,7 @@ std::shared_ptr Emulator::decode(uint32_t code) const { auto op_it = sc_instTable.find(op); if (op_it == sc_instTable.end()) { - std::cout << std::hex << "Error: invalid opcode: 0x" << static_cast(op) << std::endl; + std::cout << "Error: invalid opcode: 0x" << std::hex << static_cast(op) << std::dec << std::endl; return nullptr; } diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 503e21cd9..8a95f4dbd 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -53,15 +53,26 @@ void Emulator::warp_t::clear(uint64_t startup_addr) { this->uuid = 0; this->fcsr = 0; + std::srand(50); + for (auto& reg_file : this->ireg_file) { for (auto& reg : reg_file) { + #ifndef NDEBUG reg = 0; + #else + reg = std::rand(); + #endif } + reg_file.at(0) = 0; // r0 = 0 } for (auto& reg_file : this->freg_file) { for (auto& reg : reg_file) { + #ifndef NDEBUG reg = 0; + #else + reg = std::rand(); + #endif } } } @@ -127,7 +138,7 @@ instr_trace_t* Emulator::step() { // process pending wspawn if (wspawn_.valid && active_warps_.count() == 1) { - DP(3, "*** Activate " << (wspawn_.num_warps-1) << " warps at PC: " << std::hex << wspawn_.nextPC); + DP(3, "*** Activate " << (wspawn_.num_warps-1) << " warps at PC: " << std::hex << wspawn_.nextPC << std::dec); for (uint32_t i = 1; i < wspawn_.num_warps; ++i) { auto& warp = warps_.at(i); warp.PC = wspawn_.nextPC; @@ -174,11 +185,11 @@ instr_trace_t* Emulator::step() { // Decode auto instr = this->decode(instr_code); if (!instr) { - std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << warp.PC << " (#" << std::dec << uuid << ")" << std::endl; + std::cout << "Error: invalid instruction 0x" << std::hex << instr_code << ", at PC=0x" << warp.PC << " (#" << std::dec << uuid << ")" << std::endl; std::abort(); } - DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr); + DP(1, "Instr 0x" << std::hex << instr_code << 
": " << std::dec << *instr); // Create trace auto trace = new instr_trace_t(uuid, arch_); @@ -188,17 +199,17 @@ instr_trace_t* Emulator::step() { DP(5, "Register state:"); for (uint32_t i = 0; i < MAX_NUM_REGS; ++i) { - DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); + DPN(5, " %r" << std::setfill('0') << std::setw(2) << i << ':' << std::hex); // Integer register file for (uint32_t j = 0; j < arch_.num_threads(); ++j) { - DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << warp.ireg_file.at(j).at(i) << std::setfill(' ') << ' '); + DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << warp.ireg_file.at(j).at(i) << std::setfill(' ') << ' '); } DPN(5, '|'); // Floating point register file for (uint32_t j = 0; j < arch_.num_threads(); ++j) { - DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << warp.freg_file.at(j).at(i) << std::setfill(' ') << ' '); + DPN(5, ' ' << std::setfill('0') << std::setw(16) << warp.freg_file.at(j).at(i) << std::setfill(' ') << ' '); } - DPN(5, std::endl); + DPN(5, std::dec << std::endl); } return trace; @@ -325,7 +336,7 @@ void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) { mmu_.read(data, addr, size, 0); } - DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl); + DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl); } #endif @@ -367,7 +378,7 @@ void Emulator::dcache_write(const void* data, uint64_t addr, uint32_t size) { mmu_.write(data, addr, size, 0); } } - DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl); + DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << 
std::endl); } #endif @@ -394,7 +405,7 @@ void Emulator::writeToStdOut(const void* data, uint64_t addr, uint32_t size) { char c = *(char*)data; ss_buf << c; if (c == '\n') { - std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush; + std::cout << "#" << tid << ": " << ss_buf.str() << std::flush; ss_buf.str(""); } } @@ -516,6 +527,8 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads); CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes); CSR_READ_64(VX_CSR_MPM_MEM_LT, proc_perf.mem_latency); + CSR_READ_64(VX_CSR_MPM_MEM_BANK_CNTR, proc_perf.memsim.counter); + CSR_READ_64(VX_CSR_MPM_MEM_BANK_TICK, proc_perf.memsim.ticks); CSR_READ_64(VX_CSR_MPM_LMEM_READS, lmem_perf.reads); CSR_READ_64(VX_CSR_MPM_LMEM_WRITES, lmem_perf.writes); @@ -523,12 +536,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { } } break; default: { - std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl; + std::cout << "Error: invalid MPM CLASS: value=" << perf_class << std::endl; std::abort(); } break; } } else { - std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl; + std::cout << "Error: invalid CSR read addr=0x"<< std::hex << addr << std::dec << std::endl; std::abort(); } } @@ -569,7 +582,7 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) { case VX_CSR_MCAUSE: break; default: { - std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl; + std::cout << "Error: invalid CSR write addr=0x" << std::hex << addr << ", value=0x" << value << std::dec << std::endl; std::abort(); } } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index a037d995c..db098726b 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -102,7 +102,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { auto reg = 
instr.getRSrc(i); switch (type) { case RegType::Integer: - DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={"); + DPH(2, "Src" << i << " Reg: " << type << reg << "={"); for (uint32_t t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!warp.tmask.test(t)) { @@ -110,12 +110,12 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { continue; } rsdata[t][i].u = warp.ireg_file.at(t)[reg]; - DPN(2, "0x" << std::hex << rsdata[t][i].i); + DPN(2, "0x" << std::hex << rsdata[t][i].i << std::dec); } DPN(2, "}" << std::endl); break; case RegType::Float: - DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={"); + DPH(2, "Src" << i << " Reg: " << type << reg << "={"); for (uint32_t t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!warp.tmask.test(t)) { @@ -123,7 +123,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { continue; } rsdata[t][i].u64 = warp.freg_file.at(t)[reg]; - DPN(2, "0x" << std::hex << rsdata[t][i].f); + DPN(2, "0x" << std::hex << rsdata[t][i].f << std::dec); } DPN(2, "}" << std::endl); break; @@ -633,7 +633,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { all_taken = curr_taken; } else { if (all_taken != curr_taken) { - std::cout << "divergent branch! PC=0x" << std::hex << warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::flush; + std::cout << "divergent branch! PC=0x" << std::hex << warp.PC << std::dec << " (#" << trace->uuid << ")\n" << std::flush; std::abort(); } } @@ -1338,7 +1338,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { bool is_divergent = then_tmask.any() && else_tmask.any(); if (is_divergent) { if (stack_size == ipdom_size_) { - std::cout << "IPDOM stack is full! size=" << std::dec << stack_size << ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::flush; + std::cout << "IPDOM stack is full! 
size=" << stack_size << ", PC=0x" << std::hex << warp.PC << std::dec << " (#" << trace->uuid << ")\n" << std::flush; std::abort(); } // set new thread mask to the larger set @@ -1425,7 +1425,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { switch (type) { case RegType::Integer: if (rdest) { - DPH(2, "Dest Reg: " << type << std::dec << rdest << "={"); + DPH(2, "Dest Reg: " << type << rdest << "={"); for (uint32_t t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!warp.tmask.test(t)) { @@ -1433,7 +1433,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { continue; } warp.ireg_file.at(t)[rdest] = rddata[t].i; - DPN(2, "0x" << std::hex << rddata[t].i); + DPN(2, "0x" << std::hex << rddata[t].i << std::dec); } DPN(2, "}" << std::endl); trace->dst_reg = {type, rdest}; @@ -1444,7 +1444,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } break; case RegType::Float: - DPH(2, "Dest Reg: " << type << std::dec << rdest << "={"); + DPH(2, "Dest Reg: " << type << rdest << "={"); for (uint32_t t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!warp.tmask.test(t)) { @@ -1452,7 +1452,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { continue; } warp.freg_file.at(t)[rdest] = rddata[t].u64; - DPN(2, "0x" << std::hex << rddata[t].f); + DPN(2, "0x" << std::hex << rddata[t].f << std::dec); } DPN(2, "}" << std::endl); trace->dst_reg = {type, rdest}; diff --git a/sim/simx/instr_trace.h b/sim/simx/instr_trace.h index 7f6b37580..bbf4eab59 100644 --- a/sim/simx/instr_trace.h +++ b/sim/simx/instr_trace.h @@ -146,14 +146,14 @@ inline std::ostream &operator<<(std::ostream &os, const instr_trace_t& trace) { for (uint32_t i = 0, n = trace.arch.num_threads(); i < n; ++i) { os << trace.tmask.test(i); } - os << ", PC=0x" << std::hex << trace.PC; + os << ", PC=0x" << std::hex << trace.PC << std::dec; os << ", wb=" << trace.wb; if (trace.dst_reg.type 
!= RegType::None) { - os << ", rd=" << trace.dst_reg.type << std::dec << trace.dst_reg.idx; + os << ", rd=" << trace.dst_reg.type << trace.dst_reg.idx; } for (uint32_t i = 0; i < trace.src_regs.size(); ++i) { if (trace.src_regs[i].type != RegType::None) { - os << ", rs" << i << "=" << trace.src_regs[i].type << std::dec << trace.src_regs[i].idx; + os << ", rs" << i << "=" << trace.src_regs[i].type << trace.src_regs[i].idx; } } os << ", ex=" << trace.fu_type; @@ -162,7 +162,7 @@ inline std::ostream &operator<<(std::ostream &os, const instr_trace_t& trace) { os << ", sop=" << trace.sop; os << ", eop=" << trace.eop; } - os << " (#" << std::dec << trace.uuid << ")"; + os << " (#" << trace.uuid << ")"; return os; } diff --git a/sim/simx/local_mem.cpp b/sim/simx/local_mem.cpp index 195fe5300..1bab3fccb 100644 --- a/sim/simx/local_mem.cpp +++ b/sim/simx/local_mem.cpp @@ -52,13 +52,13 @@ public: void read(void* data, uint64_t addr, uint32_t size) { auto s_addr = to_local_addr(addr); - DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::endl); + DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl); ram_.read(data, s_addr, size); } void write(const void* data, uint64_t addr, uint32_t size) { auto s_addr = to_local_addr(addr); - DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::endl); + DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl); ram_.write(data, s_addr, size); } diff --git a/sim/simx/mem_sim.cpp b/sim/simx/mem_sim.cpp index a12713fea..a38f4c01c 100644 --- a/sim/simx/mem_sim.cpp +++ b/sim/simx/mem_sim.cpp @@ -33,6 +33,7 @@ private: struct DramCallbackArgs { MemSim* simobject; MemReq request; + uint32_t i; }; public: @@ -56,46 +57,49 @@ public: void tick() { dram_sim_.tick(); + uint32_t counter = 0; - if (simobject_->MemReqPort.empty()) - return; + for (uint32_t i = 0; i < NUM_MEM_PORTS; ++i) { + if (simobject_->MemReqPorts.at(i).empty()) + continue; - auto& mem_req = simobject_->MemReqPort.front(); + auto& 
mem_req = simobject_->MemReqPorts.at(i).front(); - // try to enqueue the request to the memory system - auto req_args = new DramCallbackArgs{simobject_, mem_req}; - auto enqueue_success = dram_sim_.send_request( - mem_req.write, - mem_req.addr, - 0, - [](void* arg) { - auto rsp_args = reinterpret_cast(arg); - // only send a response for read requests - if (!rsp_args->request.write) { - MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid}; - rsp_args->simobject->MemRspPort.push(mem_rsp, 1); - DT(3, rsp_args->simobject->name() << " mem-rsp: " << mem_rsp); - } - delete rsp_args; - }, - req_args - ); + // try to enqueue the request to the memory system + auto req_args = new DramCallbackArgs{simobject_, mem_req, i}; + auto enqueue_success = dram_sim_.send_request( + mem_req.write, + mem_req.addr, + 0, + [](void* arg) { + auto rsp_args = reinterpret_cast(arg); + // only send a response for read requests + if (!rsp_args->request.write) { + MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid}; + rsp_args->simobject->MemRspPorts.at(rsp_args->i).push(mem_rsp, 1); + DT(3, rsp_args->simobject->name() << " mem-rsp: " << mem_rsp << " bank: " << rsp_args->i); + } + delete rsp_args; + }, + req_args + ); - // check if the request was enqueued successfully - if (!enqueue_success) { - delete req_args; - return; + // check if the request was enqueued successfully + if (!enqueue_success) { + delete req_args; + continue; + } + + DT(3, simobject_->name() << " mem-req: " << mem_req << " bank: " << i); + + simobject_->MemReqPorts.at(i).pop(); + counter++; } - if (mem_req.write) { - ++perf_stats_.writes; - } else { - ++perf_stats_.reads; + perf_stats_.counter += counter; + if (counter > 0) { + ++perf_stats_.ticks; } - - DT(3, simobject_->name() << " mem-req: " << mem_req); - - simobject_->MemReqPort.pop(); } }; @@ -103,8 +107,8 @@ public: MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config) : 
SimObject(ctx, name) - , MemReqPort(this) - , MemRspPort(this) + , MemReqPorts(NUM_MEM_PORTS, this) + , MemRspPorts(NUM_MEM_PORTS, this) , impl_(new Impl(this, config)) {} @@ -118,4 +122,8 @@ void MemSim::reset() { void MemSim::tick() { impl_->tick(); +} + +const MemSim::PerfStats &MemSim::perf_stats() const { + return impl_->perf_stats(); } \ No newline at end of file diff --git a/sim/simx/mem_sim.h b/sim/simx/mem_sim.h index 3f4d9801e..2f4f96187 100644 --- a/sim/simx/mem_sim.h +++ b/sim/simx/mem_sim.h @@ -26,17 +26,23 @@ public: }; struct PerfStats { - uint64_t reads; - uint64_t writes; + uint64_t counter; + uint64_t ticks; PerfStats() - : reads(0) - , writes(0) + : counter(0) + , ticks(0) {} + + PerfStats& operator+=(const PerfStats& rhs) { + this->counter += rhs.counter; + this->ticks += rhs.ticks; + return *this; + } }; - SimPort MemReqPort; - SimPort MemRspPort; + std::vector> MemReqPorts; + std::vector> MemRspPorts; MemSim(const SimContext& ctx, const char* name, const Config& config); ~MemSim(); diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index 01023125b..20caf2b49 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -47,8 +47,10 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) ); // connect L3 memory ports - l3cache_->MemReqPort.bind(&memsim_->MemReqPort); - memsim_->MemRspPort.bind(&l3cache_->MemRspPort); + for (uint32_t i = 0; i < NUM_MEM_PORTS; ++i) { + l3cache_->MemReqPorts.at(i).bind(&memsim_->MemReqPorts.at(i)); + memsim_->MemRspPorts.at(i).bind(&l3cache_->MemRspPorts.at(i)); + } // create clusters for (uint32_t i = 0; i < arch.num_clusters(); ++i) { @@ -59,16 +61,18 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) } // set up memory profiling - memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){ - __unused (cycle); - perf_mem_reads_ += !req.write; - perf_mem_writes_ += req.write; - perf_mem_pending_reads_ += !req.write; - }); - memsim_->MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){ - 
__unused (cycle); - --perf_mem_pending_reads_; - }); + for (uint32_t i = 0; i < NUM_MEM_PORTS; ++i) { + memsim_->MemReqPorts.at(i).tx_callback([&](const MemReq& req, uint64_t cycle){ + __unused (cycle); + perf_mem_reads_ += !req.write; + perf_mem_writes_ += req.write; + perf_mem_pending_reads_ += !req.write; + }); + memsim_->MemRspPorts.at(i).tx_callback([&](const MemRsp&, uint64_t cycle){ + __unused (cycle); + --perf_mem_pending_reads_; + }); + } #ifndef NDEBUG // dump device configuration @@ -138,6 +142,7 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const { perf.mem_writes = perf_mem_writes_; perf.mem_latency = perf_mem_latency_; perf.l3cache = l3cache_->perf_stats(); + perf.memsim = memsim_->perf_stats(); return perf; } diff --git a/sim/simx/processor_impl.h b/sim/simx/processor_impl.h index 511c0cad6..fb4a37693 100644 --- a/sim/simx/processor_impl.h +++ b/sim/simx/processor_impl.h @@ -25,6 +25,7 @@ class ProcessorImpl { public: struct PerfStats { CacheSim::PerfStats l3cache; + MemSim::PerfStats memsim; uint64_t mem_reads; uint64_t mem_writes; uint64_t mem_latency; diff --git a/sim/simx/types.h b/sim/simx/types.h index 385015cc9..b452dd379 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -264,14 +264,14 @@ inline std::ostream &operator<<(std::ostream &os, const LsuReq& req) { for (size_t i = 0; i < req.mask.size(); ++i) { os << "addr" << i << "="; if (req.mask.test(i)) { - os << "0x" << std::hex << req.addrs.at(i); + os << "0x" << std::hex << req.addrs.at(i) << std::dec; } else { os << "-"; } os << ", "; } - os << std::dec << "tag=" << req.tag << ", cid=" << req.cid; - os << " (#" << std::dec << req.uuid << ")"; + os << "tag=0x" << std::hex << req.tag << std::dec << ", cid=" << req.cid; + os << " (#" << req.uuid << ")"; return os; } @@ -292,8 +292,8 @@ struct LsuRsp { }; inline std::ostream &operator<<(std::ostream &os, const LsuRsp& rsp) { - os << "mask=" << rsp.mask << ", tag=" << rsp.tag << ", cid=" << rsp.cid; - os << " (#" << std::dec << 
rsp.uuid << ")"; + os << "mask=" << rsp.mask << ", tag=0x" << std::hex << rsp.tag << std::dec << ", cid=" << rsp.cid; + os << " (#" << rsp.uuid << ")"; return os; } @@ -324,9 +324,9 @@ struct MemReq { inline std::ostream &operator<<(std::ostream &os, const MemReq& req) { os << "rw=" << req.write << ", "; - os << "addr=0x" << std::hex << req.addr << ", type=" << req.type; - os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid; - os << " (#" << std::dec << req.uuid << ")"; + os << "addr=0x" << std::hex << req.addr << std::dec << ", type=" << req.type; + os << ", tag=0x" << std::hex << req.tag << std::dec << ", cid=" << req.cid; + os << " (#" << req.uuid << ")"; return os; } @@ -345,8 +345,8 @@ struct MemRsp { }; inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) { - os << "tag=" << rsp.tag << ", cid=" << rsp.cid; - os << " (#" << std::dec << rsp.uuid << ")"; + os << "tag=0x" << std::hex << rsp.tag << std::dec << ", cid=" << rsp.cid; + os << " (#" << rsp.uuid << ")"; return os; } diff --git a/sim/xrtsim/Makefile b/sim/xrtsim/Makefile index dd11c8d64..765e3e268 100644 --- a/sim/xrtsim/Makefile +++ b/sim/xrtsim/Makefile @@ -82,13 +82,13 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count() VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) -# Debugigng +# Debugging ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG - CXXFLAGS += -O3 -DNDEBUG + CXXFLAGS += -O2 -DNDEBUG endif # Enable scope analyzer @@ -119,7 +119,7 @@ $(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml $(SCRIPT_DIR)/scope.py $^ -o $@ $(DESTDIR)/$(PROJECT): $(SRCS) $(SCOPE_JSON) - verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@ + verilator --build --exe $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@ clean: rm 
-rf $(DESTDIR)/$(PROJECT).obj_dir diff --git a/tests/kernel/conform/tests.cpp b/tests/kernel/conform/tests.cpp index f5f33a13e..6125a6911 100644 --- a/tests/kernel/conform/tests.cpp +++ b/tests/kernel/conform/tests.cpp @@ -46,13 +46,15 @@ int test_global_memory() { /////////////////////////////////////////////////////////////////////////////// -int* lmem_addr = (int*)LMEM_BASE_ADDR; +volatile int* lmem_addr = (int*)LMEM_BASE_ADDR; int lmem_buffer[8]; void __attribute__((noinline)) do_lmem_wr() { unsigned tid = vx_thread_id(); lmem_addr[tid] = 65 + tid; + int x = lmem_addr[tid]; + lmem_addr[tid] = x; } void __attribute__((noinline)) do_lmem_rd() { diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index e4be7e712..e60cd6ec7 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -8,6 +8,9 @@ all: $(MAKE) -C psort $(MAKE) -C saxpy $(MAKE) -C sfilter + $(MAKE) -C sgemm2 + $(MAKE) -C sgemm3 + $(MAKE) -C psum $(MAKE) -C oclprintf $(MAKE) -C dotproduct $(MAKE) -C transpose @@ -19,9 +22,6 @@ all: $(MAKE) -C kmeans $(MAKE) -C blackscholes $(MAKE) -C bfs - $(MAKE) -C sgemm2 - $(MAKE) -C sgemm3 - $(MAKE) -C psum run-simx: $(MAKE) -C vecadd run-simx @@ -30,6 +30,9 @@ run-simx: $(MAKE) -C psort run-simx $(MAKE) -C saxpy run-simx $(MAKE) -C sfilter run-simx + $(MAKE) -C sgemm2 run-simx + $(MAKE) -C sgemm3 run-simx + $(MAKE) -C psum run-simx $(MAKE) -C oclprintf run-simx $(MAKE) -C dotproduct run-simx $(MAKE) -C transpose run-simx @@ -40,9 +43,6 @@ run-simx: $(MAKE) -C kmeans run-simx $(MAKE) -C blackscholes run-simx $(MAKE) -C bfs run-simx - $(MAKE) -C sgemm2 run-simx - $(MAKE) -C sgemm3 run-simx - $(MAKE) -C psum run-simx run-rtlsim: $(MAKE) -C vecadd run-rtlsim @@ -51,6 +51,9 @@ run-rtlsim: $(MAKE) -C psort run-rtlsim $(MAKE) -C saxpy run-rtlsim $(MAKE) -C sfilter run-rtlsim + $(MAKE) -C sgemm2 run-rtlsim + $(MAKE) -C sgemm3 run-rtlsim + $(MAKE) -C psum run-rtlsim $(MAKE) -C oclprintf run-rtlsim $(MAKE) -C dotproduct run-rtlsim $(MAKE) -C transpose 
run-rtlsim @@ -61,9 +64,6 @@ run-rtlsim: $(MAKE) -C kmeans run-rtlsim $(MAKE) -C blackscholes run-rtlsim $(MAKE) -C bfs run-rtlsim - $(MAKE) -C sgemm2 run-rtlsim - $(MAKE) -C sgemm3 run-rtlsim - $(MAKE) -C psum run-rtlsim clean: $(MAKE) -C vecadd clean @@ -72,6 +72,9 @@ clean: $(MAKE) -C psort clean $(MAKE) -C saxpy clean $(MAKE) -C sfilter clean + $(MAKE) -C sgemm2 clean + $(MAKE) -C sgemm3 clean + $(MAKE) -C psum clean $(MAKE) -C oclprintf clean $(MAKE) -C dotproduct clean $(MAKE) -C transpose clean @@ -82,7 +85,4 @@ clean: $(MAKE) -C guassian clean $(MAKE) -C kmeans clean $(MAKE) -C blackscholes clean - $(MAKE) -C bfs clean - $(MAKE) -C sgemm2 clean - $(MAKE) -C sgemm3 clean - $(MAKE) -C psum clean \ No newline at end of file + $(MAKE) -C bfs clean \ No newline at end of file diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk index 0c559e8c5..2e287a944 100644 --- a/tests/opencl/common.mk +++ b/tests/opencl/common.mk @@ -44,7 +44,7 @@ CXXFLAGS += -I$(POCL_PATH)/include POCL_CC_FLAGS += LLVM_PREFIX=$(LLVM_VORTEX) POCL_VORTEX_BINTOOL="$(VX_BINTOOL)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 POCL_CC_FLAGS += POCL_DEBUG=all diff --git a/tests/regression/common.mk b/tests/regression/common.mk index c063fe34e..12b45e848 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -52,7 +52,7 @@ CXXFLAGS += -I$(VORTEX_RT_PATH)/include -I$(ROOT_DIR)/hw LDFLAGS += -L$(ROOT_DIR)/runtime -lvortex -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/tests/regression/stencil3d/main.cpp b/tests/regression/stencil3d/main.cpp index a47f94710..0536effc0 100644 --- a/tests/regression/stencil3d/main.cpp +++ b/tests/regression/stencil3d/main.cpp @@ -120,29 +120,19 @@ static void stencil_cpu(TYPE *out, const TYPE *in, uint32_t width, uint32_t heig // Check bounds and replicate the boundary values if (nx < 0) - { - nx = 0; - } + {nx = 0;} else if (nx >= 
(int)width) - { - nx = width - 1; - } + {nx = width - 1;} + if (ny < 0) - { - ny = 0; - } + {ny = 0;} else if (ny >= (int)height) - { - ny = height - 1; - } + {ny = height - 1;} + if (nz < 0) - { - nz = 0; - } + {nz = 0;} else if (nz >= (int)depth) - { - nz = depth - 1; - } + {nz = depth - 1;} // Sum up the values sum += in[nz * width * height + ny * width + nx]; @@ -238,8 +228,8 @@ int main(int argc, char *argv[]) uint32_t buf_size = size_cubed * sizeof(TYPE); std::cout << "data type: " << Comparator::type_str() << std::endl; - std::cout << "matrix size: " << size << "x" << size << std::endl; - std::cout << "block size: " << block_size << "x" << block_size << std::endl; + std::cout << "matrix size: " << size << "x" << size << "x" << size << std::endl; + std::cout << "block size: " << block_size << "x" << block_size << "x" << block_size << std::endl; kernel_arg.grid_dim[0] = size / block_size; kernel_arg.grid_dim[1] = size / block_size; diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp index 9fe484ccb..d80e2fdc1 100644 --- a/tests/regression/vecaddx/main.cpp +++ b/tests/regression/vecaddx/main.cpp @@ -50,7 +50,7 @@ public: static const char* type_str() { return "float"; } - static int generate() { + static float generate() { return static_cast(rand()) / RAND_MAX; } static bool compare(float a, float b, int index, int errors) { diff --git a/tests/unittest/common.mk b/tests/unittest/common.mk index 4f94afa08..a6f6b2794 100644 --- a/tests/unittest/common.mk +++ b/tests/unittest/common.mk @@ -2,7 +2,7 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -I$(VORTEX_RT_PATH)/common -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else