From 1795980a5280666c4fe828a4c2d17541a9c744bc Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 21 Nov 2020 09:47:56 -0800 Subject: [PATCH] L2 and L1 using different block size support, RTLsim fixes, dram_rsp_ready optimization --- .travis.yml | 28 +- ci/blackbox.sh | 209 ++++++----- driver/opae/Makefile | 6 +- driver/opae/vlsim/Makefile | 8 +- driver/opae/vlsim/opae_sim.cpp | 21 +- driver/opae/vortex.cpp | 6 +- driver/rtlsim/Makefile | 6 +- driver/tests/demo/demo.cpp | 2 +- driver/tests/dogfood/dogfood.cpp | 2 +- driver/tests/dogfood/testcases.h | 21 +- hw/opae/VX_avs_wrapper.v | 43 +-- hw/opae/vortex_afu.sv | 20 +- hw/rtl/VX_cluster.v | 170 ++++++--- hw/rtl/VX_config.vh | 104 ++--- hw/rtl/VX_define.vh | 438 ++++++++++++---------- hw/rtl/VX_ibuffer.v | 6 +- hw/rtl/VX_mem_unit.v | 79 +--- hw/rtl/Vortex.v | 142 +++---- hw/rtl/cache/VX_bank.v | 26 +- hw/rtl/cache/VX_cache.v | 129 ++----- hw/rtl/cache/VX_cache_config.vh | 2 +- hw/rtl/cache/VX_cache_core_req_bank_sel.v | 40 +- hw/rtl/cache/VX_cache_miss_resrv.v | 2 +- hw/rtl/cache/VX_snp_forwarder.v | 110 ++++-- hw/rtl/cache/VX_snp_rsp_arb.v | 8 +- hw/rtl/interfaces/VX_cache_core_rsp_if.v | 2 + hw/rtl/interfaces/VX_cache_dram_req_if.v | 4 +- hw/rtl/interfaces/VX_cache_dram_rsp_if.v | 2 + hw/rtl/interfaces/VX_cache_snp_req_if.v | 4 +- hw/rtl/interfaces/VX_cache_snp_rsp_if.v | 2 + hw/rtl/interfaces/VX_cmt_to_csr_if.v | 8 +- hw/rtl/interfaces/VX_csr_io_req_if.v | 2 + hw/rtl/interfaces/VX_csr_io_rsp_if.v | 2 + hw/rtl/interfaces/VX_decode_if.v | 5 +- hw/rtl/interfaces/VX_exu_to_cmt_if.v | 4 +- hw/rtl/interfaces/VX_fpu_to_cmt_if.v | 4 +- hw/rtl/interfaces/VX_fpu_to_csr_if.v | 14 +- hw/rtl/interfaces/VX_gpr_rsp_if.v | 1 - hw/rtl/interfaces/VX_ifetch_req_if.v | 2 + hw/rtl/interfaces/VX_ifetch_rsp_if.v | 4 +- hw/rtl/interfaces/VX_lsu_req_if.v | 5 +- hw/rtl/interfaces/VX_warp_ctl_if.v | 11 +- hw/rtl/interfaces/VX_writeback_if.v | 3 +- hw/rtl/interfaces/VX_wstall_if.v | 2 +- hw/rtl/libs/VX_cam_buffer.v | 4 +- 
hw/scripts/gen_config.py | 72 +--- hw/simulate/Makefile | 8 +- hw/simulate/simulator.cpp | 121 +++--- hw/simulate/simulator.h | 6 +- hw/simulate/testbench.cpp | 4 + 50 files changed, 972 insertions(+), 952 deletions(-) diff --git a/.travis.yml b/.travis.yml index 03690e9d4..9e9281c14 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,19 +20,21 @@ install: - export PATH=$VERILATOR_ROOT/bin:$PATH script: - - make -j - - ci/test_runtime.sh - - ci/test_driver.sh - - ci/test_riscv_isa.sh - - ci/test_opencl.sh - - ci/blackbox.sh -run_debug - - ci/blackbox.sh -run_scope - - ci/blackbox.sh -run_1c - - ci/blackbox.sh -run_2c - - ci/blackbox.sh -run_4c - - ci/blackbox.sh -run_4c_l2 - - travis_wait 30 ci/blackbox.sh -run_4c_2l2_l3 - - travis_wait 30 ci/blackbox.sh -run_8c_4l2_l3 + - travis_wait 45 make + - travis_wait 45 ci/test_runtime.sh + - travis_wait 45 ci/test_driver.sh + - travis_wait 45 ci/test_riscv_isa.sh + - travis_wait 45 ci/test_opencl.sh + - travis_wait 45 ci/blackbox.sh --driver=rtlsim + - travis_wait 45 ci/blackbox.sh --driver=vlsim + - travis_wait 45 ci/blackbox.sh --driver=vlsim --scope + - travis_wait 45 ci/blackbox.sh --driver=vlsim --debug + - travis_wait 45 ci/blackbox.sh --driver=vlsim --cores=1 + - travis_wait 45 ci/blackbox.sh --driver=vlsim --cores=2 + - travis_wait 45 ci/blackbox.sh --driver=vlsim --cores=4 + - travis_wait 45 ci/blackbox.sh --driver=vlsim --cores=4 --l2cache + - travis_wait 45 ci/blackbox.sh --driver=vlsim --cores=2 --l2cache --clusters=2 + - travis_wait 45 ci/blackbox.sh --driver=vlsim --cores=2 --l2cache --clusters=4 after_success: # Gather code coverage diff --git a/ci/blackbox.sh b/ci/blackbox.sh index 8b8afa2b3..cacf9d566 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -3,107 +3,130 @@ # exit when any command fails set -e -run_1c() +show_usage() { - # test single core - make -C driver/opae/vlsim clean - CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim > /dev/null 2>&1 - make -C benchmarks/opencl/sgemm 
run-vlsim + echo "Vortex BlackBox Test Driver v1.0" + echo "Usage: [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [[--driver=rtlsim|vlsim] [--debug] [--scope] [--app=vecadd|sgemm|basic|demo|dogfood][--help]]" } -run_2c() -{ - # test 2 cores - make -C driver/opae/vlsim clean - CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0" make -C driver/opae/vlsim > /dev/null 2>&1 - make -C benchmarks/opencl/sgemm run-vlsim -} +DRIVER=vlsim +APP=sgemm +CLUSTERS=1 +CORES=2 +WARPS=4 +THREADS=4 +L2=0 +DEBUG=0 +SCOPE=0 -run_4c() -{ - # test 4 cores - make -C driver/opae/vlsim clean - CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=0" make -C driver/opae/vlsim > /dev/null 2>&1 - make -C benchmarks/opencl/sgemm run-vlsim -} +for i in "$@" +do +case $i in + --driver=*) + DRIVER=${i#*=} + shift + ;; + --app=*) + APP=${i#*=} + shift + ;; + --clusters=*) + CLUSTERS=${i#*=} + shift + ;; + --cores=*) + CORES=${i#*=} + shift + ;; + --warps=*) + WARPS=${i#*=} + shift + ;; + --threads=*) + THREADS=${i#*=} + shift + ;; + --l2cache) + L2=1 + shift + ;; + --debug) + DEBUG=1 + shift + ;; + --scope) + SCOPE=1 + shift + ;; + --help) + show_usage + exit + ;; + *) + show_usage + exit + ;; +esac +done -run_4c_l2() -{ - # test 4 cores with L2 - make -C driver/opae/vlsim clean - CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1" make -C driver/opae/vlsim > /dev/null 2>&1 - make -C benchmarks/opencl/sgemm run-vlsim -} +case $DRIVER in + rtlsim) + DRIVER_PATH=driver/rtlsim + DRIVER_EXTRA= + ;; + vlsim) + DRIVER_PATH=driver/opae + DRIVER_EXTRA=vlsim + ;; + *) + echo "invalid driver: $DRIVER" + exit + ;; +esac -run_4c_2l2_l3() -{ - # test 4 cores with L2 and L3 - make -C driver/opae/vlsim clean - CONFIGS="-DNUM_CLUSTERS=2 -DNUM_CORES=2 -DL2_ENABLE=1" make -C driver/opae/vlsim > /dev/null 2>&1 - make -C benchmarks/opencl/sgemm run-vlsim -} +case $APP in + sgemm) + APP_PATH=benchmarks/opencl/sgemm + ;; + vecadd) + APP_PATH=benchmarks/opencl/vacadd + ;; + basic) + 
APP_PATH=driver/tests/basic + ;; + demo) + APP_PATH=driver/tests/demo + ;; + dogfood) + APP_PATH=driver/tests/dogfood + ;; + *) + echo "invalid app: $APP" + exit + ;; +esac -run_8c_4l2_l3() -{ - # test 8 cores with L2 and L3 - make -C driver/opae/vlsim clean - CONFIGS="-DNUM_CLUSTERS=4 -DNUM_CORES=2 -DL2_ENABLE=1" make -C driver/opae/vlsim > /dev/null 2>&1 - make -C benchmarks/opencl/sgemm run-vlsim -} +CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2" -run_debug() -{ - # test debug build - make -C driver/opae/vlsim clean - DEBUG=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim > /dev/null 2>&1 - make -C benchmarks/opencl/sgemm run-vlsim > /dev/null 2>&1 -} +echo "CONFIGS=$CONFIGS" -run_scope() -{ - # test build with scope analyzer - make -C driver/opae clean - SCOPE=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae > /dev/null 2>&1 - make -C benchmarks/opencl/sgemm run-vlsim -} +make -C $DRIVER_PATH clean -usage() -{ - echo "usage: blackbox [[-run_1c] [-run_2c] [-run_4c] [-run_4c_l2] [-run_4c_2l2_l3] [-run_8c_4l2_l3] [-run_debug] [-run_scope] [-all] [-h|--help]]" -} +if [[ $DEBUG -eq 1 ]] +then + if [[ $SCOPE -eq 1 ]] + then + DEBUG=1 SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH $DRIVER_EXTRA > build.log 2>&1 + else + DEBUG=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH $DRIVER_EXTRA > build.log 2>&1 + fi +else + if [[ $SCOPE -eq 1 ]] + then + SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH $DRIVER_EXTRA > build.log 2>&1 + else + CONFIGS="$CONFIGS" make -C $DRIVER_PATH $DRIVER_EXTRA > build.log 2>&1 + fi +fi -while [ "$1" != "" ]; do - case $1 in - -run_1c ) run_1c - ;; - -run_2c ) run_2c - ;; - -run_4c ) run_4c - ;; - -run_4c_l2 ) run_4c_l2 - ;; - -run_4c_2l2_l3 ) run_4c_2l2_l3 - ;; - -run_8c_4l2_l3 ) run_8c_4l2_l3 - ;; - -run_debug ) run_debug - ;; - -run_scope ) run_scope - ;; - -all ) run_1c - run_2c - run_4c - run_4c_l2 - run_4c_2l2_l3 - run_8c_4l2_l3 - run_debug - 
run_scope - ;; - -h | --help ) usage - exit - ;; - * ) usage - exit 1 - esac - shift -done \ No newline at end of file +make -C $APP_PATH run-$DRIVER > run.log 2>&1 \ No newline at end of file diff --git a/driver/opae/Makefile b/driver/opae/Makefile index 4750e60c9..2af345395 100644 --- a/driver/opae/Makefile +++ b/driver/opae/Makefile @@ -1,7 +1,7 @@ OPAE_HOME ?= /tools/opae/1.4.0 -CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors -#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors +#CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors +CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -I../include -I$(OPAE_HOME)/include -I../../hw @@ -60,7 +60,7 @@ json: ../../hw/opae/vortex_afu.json afu_json_mgr json-info --afu-json=$^ --c-hdr=$@ fpga: $(SRCS) - $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) $(FPGA_LIBS) -o $(PROJECT) + $(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) $(FPGA_LIBS) -o $(PROJECT) asesim: $(SRCS) $(ASE_DIR) $(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $(PROJECT_ASE) diff --git a/driver/opae/vlsim/Makefile b/driver/opae/vlsim/Makefile index 1d78f9a25..56352e661 100644 --- a/driver/opae/vlsim/Makefile +++ b/driver/opae/vlsim/Makefile @@ -1,5 +1,5 @@ -CFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -Wfatal-errors -#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors +#CFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -Wfatal-errors +CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors CFLAGS += -I../../../../hw @@ -30,7 +30,7 @@ CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 CFLAGS += -fPIC -CFLAGS += -DUSE_RTLSIM $(CONFIGS) +CFLAGS += -DUSE_VLSIM $(CONFIGS) CFLAGS += -DDUMP_PERF_STATS @@ -79,7 +79,7 @@ VL_FLAGS += -DNOPAE CFLAGS += -DNOPAE # use DPI FPU -#VL_FLAGS += -DFPU_FAST +VL_FLAGS += -DFPU_FAST RTL_INCLUDE += -I../../../hw/opae -I../../../hw/opae/ccip diff --git a/driver/opae/vlsim/opae_sim.cpp b/driver/opae/vlsim/opae_sim.cpp index 
5339de616..d591ea01d 100644 --- a/driver/opae/vlsim/opae_sim.cpp +++ b/driver/opae/vlsim/opae_sim.cpp @@ -206,11 +206,10 @@ void opae_sim::sRxPort_bus() { vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1; memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->block.data(), CACHE_BLOCK_SIZE); vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; - /*printf("*** [vlsim] read-rsp: addr=%ld, mdata=%d, data=", cci_rd_it->addr, cci_rd_it->mdata); + /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata); for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) printf("%02x", cci_rd_it->block[CACHE_BLOCK_SIZE-1-i]); printf("\n");*/ - fflush(stdout); cci_reads_.erase(cci_rd_it); } } @@ -225,8 +224,7 @@ void opae_sim::sTxPort_bus() { cci_req.mdata = vortex_afu_->af2cp_sTxPort_c0_hdr_mdata; auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); memcpy(cci_req.block.data(), host_ptr, CACHE_BLOCK_SIZE); - //printf("*** [vlsim] read-req: addr=%ld, mdata=%d\n", vortex_afu_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); - fflush(stdout); + //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vortex_afu_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); cci_reads_.emplace_back(cci_req); } @@ -265,12 +263,12 @@ void opae_sim::avs_bus() { memcpy(vortex_afu_->avs_readdata, dram_rd_it->block.data(), CACHE_BLOCK_SIZE); uint32_t tag = dram_rd_it->tag; dram_reads_.erase(dram_rd_it); - /*printf("%0ld: VLSIM: DRAM rsp: addr=%x, pending={", timestamp, tag); + /*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, tag); for (auto& req : dram_reads_) { if (req.cycles_left != 0) - printf(" !%0x", req.tag); + printf(" !%0x", req.tag); else - printf(" %0x", req.tag); + printf(" %0x", req.tag); } printf("}\n");*/ } @@ -288,7 +286,8 @@ void opae_sim::avs_bus() { // process DRAM requests if (!dram_stalled) { - if (vortex_afu_->avs_write) { + assert(!vortex_afu_->avs_read || !vortex_afu_->avs_write); 
+ if (vortex_afu_->avs_write) { assert(0 == vortex_afu_->mem_bank_select); uint64_t byteen = vortex_afu_->avs_byteenable; unsigned base_addr = (vortex_afu_->avs_address * CACHE_BLOCK_SIZE); @@ -307,12 +306,12 @@ void opae_sim::avs_bus() { ram_.read(base_addr, CACHE_BLOCK_SIZE, dram_req.block.data()); dram_req.tag = base_addr; dram_reads_.emplace_back(dram_req); - /*printf("%0ld: VLSIM: DRAM req: addr=%x, pending={", timestamp, base_addr); + /*printf("%0ld: [sim] DRAM Rd Req: addr=%x, pending={", timestamp, base_addr); for (auto& req : dram_reads_) { if (req.cycles_left != 0) - printf(" !%0x", req.tag); + printf(" !%0x", req.tag); else - printf(" %0x", req.tag); + printf(" %0x", req.tag); } printf("}\n");*/ } diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp index 3d2a388a7..318c69883 100755 --- a/driver/opae/vortex.cpp +++ b/driver/opae/vortex.cpp @@ -7,11 +7,11 @@ #include #include -#ifdef USE_VLSIM -#include "vlsim/fpga.h" -#else +#if defined(USE_FPGA) || defined(USE_ASE) #include #include +#elif defined(USE_VLSIM) +#include "vlsim/fpga.h" #endif #include diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 3ed54b960..9ccb229bc 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -1,5 +1,5 @@ -CFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -Wfatal-errors -#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors +#CFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -Wfatal-errors +CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors CFLAGS += -I../../include -I../../../hw/simulate -I../../../hw @@ -65,7 +65,7 @@ else endif # use DPI FPU -#VL_FLAGS += -DFPU_FAST +VL_FLAGS += -DFPU_FAST PROJECT = libvortex.so # PROJECT = libvortex.dylib diff --git a/driver/tests/demo/demo.cpp b/driver/tests/demo/demo.cpp index 618823bd2..962c51ee0 100644 --- a/driver/tests/demo/demo.cpp +++ b/driver/tests/demo/demo.cpp @@ -92,7 +92,7 @@ int run_test(const kernel_arg_t& kernel_arg, } } if (errors != 0) { - std::cout << "Found " << errors << " 
errors!" << std::endl; + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; std::cout << "FAILED!" << std::endl; return 1; } diff --git a/driver/tests/dogfood/dogfood.cpp b/driver/tests/dogfood/dogfood.cpp index 87985c88b..d2aee1c8e 100644 --- a/driver/tests/dogfood/dogfood.cpp +++ b/driver/tests/dogfood/dogfood.cpp @@ -260,7 +260,7 @@ int main(int argc, char *argv[]) { (void*)vx_host_ptr(src1_buf), (void*)vx_host_ptr(src2_buf)); if (errors != 0) { - std::cout << "found " << errors << " errors!" << std::endl; + std::cout << "found " << std::dec << errors << " errors!" << std::endl; std::cout << "Test" << t << "-" << name << " FAILED!" << std::endl << std::flush; if (stop_on_error) { cleanup(); diff --git a/driver/tests/dogfood/testcases.h b/driver/tests/dogfood/testcases.h index 4e1301bd9..cbae336bd 100644 --- a/driver/tests/dogfood/testcases.h +++ b/driver/tests/dogfood/testcases.h @@ -14,29 +14,36 @@ union Float_t { } parts; }; -inline float fround(float x, int32_t precision = 4) { +inline float fround(float x, int32_t precision = 8) { auto power_of_10 = std::pow(10, precision); return std::round(x * power_of_10) / power_of_10; } -inline bool almost_equal_eps(float a, float b, float eps = std::numeric_limits::epsilon()) { - auto tolerance = std::min(fabs(a), fabs(b)) * eps; - return fabs(a - b) <= tolerance; +inline bool almost_equal_eps(float a, float b, int ulp = 128) { + auto eps = std::numeric_limits::epsilon() * (std::max(fabs(a), fabs(b)) * ulp); + auto d = fabs(a - b); + if (d > eps) { + std::cout << "*** almost_equal_eps: d=" << d << ", eps=" << eps << std::endl; + return false; + } + return true; } -inline bool almost_equal_ulp(float a, float b, int32_t ulp = 4) { +inline bool almost_equal_ulp(float a, float b, int32_t ulp = 6) { Float_t fa{a}, fb{b}; auto d = std::abs(fa.i - fb.i); if (d > ulp) { - std::cout << "*** float compare: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << 
std::endl; + std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl; return false; } return true; } inline bool almost_equal(float a, float b) { - if (almost_equal_eps(a, b)) + if (a == b) return true; + /*if (almost_equal_eps(a, b)) + return true;*/ return almost_equal_ulp(a, b); } diff --git a/hw/opae/VX_avs_wrapper.v b/hw/opae/VX_avs_wrapper.v index 2888ead0c..985bec986 100644 --- a/hw/opae/VX_avs_wrapper.v +++ b/hw/opae/VX_avs_wrapper.v @@ -45,19 +45,19 @@ module VX_avs_wrapper #( reg [AVS_BANKS_BITS-1:0] avs_bankselect_r; reg [AVS_BURSTW-1:0] avs_burstcount_r; - wire avs_rtq_push = !dram_req_rw && dram_req_valid && dram_req_ready; - wire avs_rtq_pop = dram_rsp_valid && dram_rsp_ready; + wire avs_reqq_push = dram_req_valid && dram_req_ready && !dram_req_rw; + wire avs_reqq_pop = dram_rsp_valid && dram_rsp_ready; - wire avs_rdq_push = avs_readdatavalid; - wire avs_rdq_pop = avs_rtq_pop; - wire avs_rdq_empty; + wire avs_rspq_push = avs_readdatavalid; + wire avs_rspq_pop = avs_reqq_pop; + wire avs_rspq_empty; reg [RD_QUEUE_ADDRW-1:0] avs_pending_reads; wire [RD_QUEUE_ADDRW-1:0] avs_pending_reads_n; assign avs_pending_reads_n = avs_pending_reads - + RD_QUEUE_ADDRW'((avs_rtq_push && !avs_rdq_pop) ? 1 : - (avs_rdq_pop && !avs_rtq_push) ? -1 : 0); + + RD_QUEUE_ADDRW'((avs_reqq_push && !avs_rspq_pop) ? 1 : + (avs_rspq_pop && !avs_reqq_push) ? 
-1 : 0); always @(posedge clk) begin if (reset) begin @@ -75,9 +75,9 @@ module VX_avs_wrapper #( ) rd_req_queue ( .clk (clk), .reset (reset), - .push (avs_rtq_push), + .push (avs_reqq_push), + .pop (avs_reqq_pop), .data_in (dram_req_tag), - .pop (avs_rtq_pop), .data_out (dram_rsp_tag), `UNUSED_PIN (empty), `UNUSED_PIN (full), @@ -90,37 +90,38 @@ module VX_avs_wrapper #( ) rd_rsp_queue ( .clk (clk), .reset (reset), - .push (avs_rdq_push), - .data_in (avs_readdata), - .pop (avs_rdq_pop), + .push (avs_rspq_push), + .pop (avs_rspq_pop), + .data_in (avs_readdata), .data_out (dram_rsp_data), - .empty (avs_rdq_empty), + .empty (avs_rspq_empty), `UNUSED_PIN (full), `UNUSED_PIN (size) ); - assign avs_read = dram_req_valid && !dram_req_rw; - assign avs_write = dram_req_valid && dram_req_rw; + wire rsp_queue_ready = (avs_pending_reads != RD_QUEUE_SIZE); + + assign avs_read = dram_req_valid && !dram_req_rw && rsp_queue_ready; + assign avs_write = dram_req_valid && dram_req_rw && rsp_queue_ready; assign avs_address = dram_req_addr; assign avs_byteenable = dram_req_byteen; assign avs_writedata = dram_req_data; - assign dram_req_ready = !avs_waitrequest - && (avs_pending_reads < RD_QUEUE_SIZE); + assign dram_req_ready = !avs_waitrequest && rsp_queue_ready; assign avs_burstcount = avs_burstcount_r; assign avs_bankselect = avs_bankselect_r; - assign dram_rsp_valid = !avs_rdq_empty; + assign dram_rsp_valid = !avs_rspq_empty; `ifdef DBG_PRINT_AVS always @(posedge clk) begin if (dram_req_valid && dram_req_ready) begin if (dram_req_rw) - $display("%t: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h", $time, `DRAM_TO_BYTE_ADDR(avs_address), avs_byteenable, dram_req_tag, avs_writedata); + $display("%t: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h", $time, `DRAM_TO_BYTE_ADDR(dram_req_addr), dram_req_byteen, dram_req_tag, dram_req_data); else - $display("%t: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d", $time, `DRAM_TO_BYTE_ADDR(avs_address), avs_byteenable, 
dram_req_tag, avs_pending_reads_n); + $display("%t: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d", $time, `DRAM_TO_BYTE_ADDR(dram_req_addr), dram_req_byteen, dram_req_tag, avs_pending_reads_n); end if (dram_rsp_valid && dram_rsp_ready) begin - $display("%t: AVS Rd Rsp: data=%0h, pending=%0d", $time, avs_readdata, avs_pending_reads_n); + $display("%t: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d", $time, dram_rsp_tag, dram_rsp_data, avs_pending_reads_n); end end `endif diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index dde37dc0c..17142b360 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -501,7 +501,6 @@ wire [AVS_REQ_TAGW-1:0] vx_dram_rsp_tag_unqual; wire cci_dram_rd_req_valid, cci_dram_wr_req_valid; wire [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_addr, cci_dram_wr_req_addr; wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_dout; -wire [VX_DRAM_LINE_IDX-1:0] vx_dram_req_idx, vx_dram_rsp_idx; //-- @@ -526,20 +525,19 @@ assign vx_dram_req_valid_qual = vx_dram_req_valid && vx_enabled; assign vx_dram_req_addr_qual = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH]; if (`VX_DRAM_LINE_WIDTH != DRAM_LINE_WIDTH) begin - assign vx_dram_req_idx = vx_dram_req_addr[VX_DRAM_LINE_IDX-1:0]; + wire [VX_DRAM_LINE_IDX-1:0] vx_dram_req_idx = vx_dram_req_addr[VX_DRAM_LINE_IDX-1:0]; + wire [VX_DRAM_LINE_IDX-1:0] vx_dram_rsp_idx = vx_dram_rsp_tag_unqual[VX_DRAM_LINE_IDX-1:0]; assign vx_dram_req_byteen_qual = 64'(vx_dram_req_byteen) << (6'(vx_dram_req_addr[VX_DRAM_LINE_IDX-1:0]) << (VX_DRAM_LINE_LW-3)); assign vx_dram_req_data_qual = DRAM_LINE_WIDTH'(vx_dram_req_data) << ((DRAM_LINE_LW'(vx_dram_req_idx)) << VX_DRAM_LINE_LW); assign vx_dram_req_tag_qual = {vx_dram_req_tag, vx_dram_req_idx}; - assign vx_dram_rsp_data = vx_dram_rsp_data_unqual[vx_dram_rsp_idx]; + assign vx_dram_rsp_data = vx_dram_rsp_data_unqual[vx_dram_rsp_idx]; end else begin - assign vx_dram_req_idx = VX_DRAM_LINE_IDX'(0); assign vx_dram_req_byteen_qual = vx_dram_req_byteen; 
assign vx_dram_req_tag_qual = vx_dram_req_tag; assign vx_dram_req_data_qual = vx_dram_req_data; assign vx_dram_rsp_data = vx_dram_rsp_data_unqual; end -assign vx_dram_rsp_idx = vx_dram_rsp_tag_unqual[VX_DRAM_LINE_IDX-1:0]; assign vx_dram_rsp_tag = vx_dram_rsp_tag_unqual[`VX_DRAM_TAG_WIDTH+VX_DRAM_LINE_IDX-1:VX_DRAM_LINE_IDX]; //-- @@ -723,15 +721,15 @@ always @(posedge clk) begin cci_rd_req_wait <= 0; // restart new request batch end `ifdef DBG_PRINT_OPAE - $display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr); + $display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data); `endif end - if (cci_rdq_pop) begin + /*if (cci_rdq_pop) begin `ifdef DBG_PRINT_OPAE $display("%t: CCI Rd Queue Pop: pending=%0d", $time, cci_pending_reads_next); `endif - end + end*/ if (cci_dram_wr_req_fire) begin cci_dram_wr_req_addr_unqual <= cci_dram_wr_req_addr_unqual + ((CCI_RD_RQ_TAGW'(cci_dram_wr_req_ctr) == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? 
DRAM_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : DRAM_ADDR_WIDTH'(0)); @@ -836,15 +834,15 @@ begin cci_wr_req_addr <= cci_wr_req_addr + t_ccip_clAddr'(1); cci_wr_req_ctr <= cci_wr_req_ctr - DRAM_ADDR_WIDTH'(1); `ifdef DBG_PRINT_OPAE - $display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes_next); + $display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes_next, af2cp_sTxPort.c1.data); `endif end - `ifdef DBG_PRINT_OPAE + /*`ifdef DBG_PRINT_OPAE if (cci_wr_rsp_fire) begin $display("%t: CCI Wr Rsp: pending=%0d", $time, cci_pending_writes_next); end - `endif + `endif*/ if (cci_dram_rd_req_fire) begin cci_dram_rd_req_addr_unqual <= cci_dram_rd_req_addr_unqual + DRAM_ADDR_WIDTH'(1); diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index c4138aca7..30e45565e 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -304,30 +304,60 @@ module VX_cluster #( wire[`NUM_CORES-1:0][`DSNP_TAG_WIDTH-1:0] core_snp_fwdin_tag; wire[`NUM_CORES-1:0] core_snp_fwdin_ready; + wire snp_fwd_rsp_valid; + wire [`L2DRAM_ADDR_WIDTH-1:0] snp_fwd_rsp_addr; + wire snp_fwd_rsp_invalidate; + wire [`L2SNP_TAG_WIDTH-1:0] snp_fwd_rsp_tag; + wire snp_fwd_rsp_ready; + + reg [`L2NUM_REQUESTS-1:0] core_dram_rsp_ready_other; + reg core_dram_rsp_ready_all; + + always @(*) begin + core_dram_rsp_ready_other = {`L2NUM_REQUESTS{1'b1}}; + core_dram_rsp_ready_all = 1'b1; + + for (integer i = 0; i < `L2NUM_REQUESTS; i++) begin + for (integer j = 0; j < `L2NUM_REQUESTS; j++) begin + if (i != j) begin + if (0 == (j & 1)) + core_dram_rsp_ready_other[i] &= (per_core_D_dram_rsp_ready [(j/2)] | !core_dram_rsp_valid [j]); + else + core_dram_rsp_ready_other[i] &= (per_core_I_dram_rsp_ready [(j/2)] | !core_dram_rsp_valid [j]); + end + end + + if (0 == (i & 1)) + core_dram_rsp_ready_all &= (per_core_D_dram_rsp_ready [(i/2)] | !core_dram_rsp_valid [i]); + else + 
core_dram_rsp_ready_all &= (per_core_I_dram_rsp_ready [(i/2)] | !core_dram_rsp_valid [i]); + end + end + for (genvar i = 0; i < `L2NUM_REQUESTS; i = i + 2) begin - assign core_dram_req_valid [i] = per_core_D_dram_req_valid[(i/2)]; - assign core_dram_req_valid [i+1] = per_core_I_dram_req_valid[(i/2)]; + assign core_dram_req_valid [i] = per_core_D_dram_req_valid [(i/2)]; + assign core_dram_req_valid [i+1] = per_core_I_dram_req_valid [(i/2)]; - assign core_dram_req_rw [i] = per_core_D_dram_req_rw[(i/2)]; - assign core_dram_req_rw [i+1] = per_core_I_dram_req_rw[(i/2)]; + assign core_dram_req_rw [i] = per_core_D_dram_req_rw [(i/2)]; + assign core_dram_req_rw [i+1] = per_core_I_dram_req_rw [(i/2)]; - assign core_dram_req_byteen [i] = per_core_D_dram_req_byteen[(i/2)]; - assign core_dram_req_byteen [i+1] = per_core_I_dram_req_byteen[(i/2)]; + assign core_dram_req_byteen [i] = per_core_D_dram_req_byteen [(i/2)]; + assign core_dram_req_byteen [i+1] = per_core_I_dram_req_byteen [(i/2)]; - assign core_dram_req_addr [i] = per_core_D_dram_req_addr[(i/2)]; - assign core_dram_req_addr [i+1] = per_core_I_dram_req_addr[(i/2)]; + assign core_dram_req_addr [i] = per_core_D_dram_req_addr [(i/2)]; + assign core_dram_req_addr [i+1] = per_core_I_dram_req_addr [(i/2)]; - assign core_dram_req_data [i] = per_core_D_dram_req_data[(i/2)]; - assign core_dram_req_data [i+1] = per_core_I_dram_req_data[(i/2)]; + assign core_dram_req_data [i] = per_core_D_dram_req_data [(i/2)]; + assign core_dram_req_data [i+1] = per_core_I_dram_req_data [(i/2)]; - assign core_dram_req_tag [i] = per_core_D_dram_req_tag[(i/2)]; - assign core_dram_req_tag [i+1] = per_core_I_dram_req_tag[(i/2)]; + assign core_dram_req_tag [i] = per_core_D_dram_req_tag [(i/2)]; + assign core_dram_req_tag [i+1] = per_core_I_dram_req_tag [(i/2)]; assign per_core_D_dram_req_ready [(i/2)] = core_dram_req_ready; assign per_core_I_dram_req_ready [(i/2)] = core_dram_req_ready; - assign per_core_D_dram_rsp_valid [(i/2)] = 
core_dram_rsp_valid[i] && core_dram_rsp_ready; - assign per_core_I_dram_rsp_valid [(i/2)] = core_dram_rsp_valid[i+1] && core_dram_rsp_ready; + assign per_core_D_dram_rsp_valid [(i/2)] = core_dram_rsp_valid[i] & core_dram_rsp_ready_other [i]; + assign per_core_I_dram_rsp_valid [(i/2)] = core_dram_rsp_valid[i+1] & core_dram_rsp_ready_other [i+1]; assign per_core_D_dram_rsp_data [(i/2)] = core_dram_rsp_data[i]; assign per_core_I_dram_rsp_data [(i/2)] = core_dram_rsp_data[i+1]; @@ -346,32 +376,63 @@ module VX_cluster #( assign per_core_snp_rsp_ready [(i/2)] = core_snp_fwdin_ready [(i/2)]; end - assign core_dram_rsp_ready = (& per_core_D_dram_rsp_ready) && (& per_core_I_dram_rsp_ready); + assign core_dram_rsp_ready = core_dram_rsp_ready_all; + + VX_snp_forwarder #( + .CACHE_ID (`L2CACHE_ID), + .NUM_REQUESTS (`NUM_CORES), + .SRC_ADDR_WIDTH (`L2DRAM_ADDR_WIDTH), + .DST_ADDR_WIDTH (`DDRAM_ADDR_WIDTH), + .SNP_TAG_WIDTH (`L2SNP_TAG_WIDTH), + .SNRQ_SIZE (`L2SNRQ_SIZE) + ) snp_forwarder ( + .clk (clk), + .reset (reset), + + .snp_req_valid (snp_req_valid), + .snp_req_addr (snp_req_addr), + .snp_req_invalidate (snp_req_invalidate), + .snp_req_tag (snp_req_tag), + .snp_req_ready (snp_req_ready), + + .snp_rsp_valid (snp_fwd_rsp_valid), + .snp_rsp_addr (snp_fwd_rsp_addr), + .snp_rsp_invalidate (snp_fwd_rsp_invalidate), + .snp_rsp_tag (snp_fwd_rsp_tag), + .snp_rsp_ready (snp_fwd_rsp_ready), + + .snp_fwdout_valid (core_snp_fwdout_valid), + .snp_fwdout_addr (core_snp_fwdout_addr), + .snp_fwdout_invalidate(core_snp_fwdout_invalidate), + .snp_fwdout_tag (core_snp_fwdout_tag), + .snp_fwdout_ready (core_snp_fwdout_ready), + + .snp_fwdin_valid (core_snp_fwdin_valid), + .snp_fwdin_tag (core_snp_fwdin_tag), + .snp_fwdin_ready (core_snp_fwdin_ready) + ); VX_cache #( - .CACHE_ID (`L2CACHE_ID), - .CACHE_SIZE (`L2CACHE_SIZE), - .BANK_LINE_SIZE (`L2BANK_LINE_SIZE), - .NUM_BANKS (`L2NUM_BANKS), - .WORD_SIZE (`L2WORD_SIZE), - .NUM_REQUESTS (`L2NUM_REQUESTS), - .CREQ_SIZE (`L2CREQ_SIZE), - 
.MRVQ_SIZE (`L2MRVQ_SIZE), - .DRFQ_SIZE (`L2DRFQ_SIZE), - .SNRQ_SIZE (`L2SNRQ_SIZE), - .CWBQ_SIZE (`L2CWBQ_SIZE), - .DREQ_SIZE (`L2DREQ_SIZE), - .SNPQ_SIZE (`L2SNPQ_SIZE), - .DRAM_ENABLE (1), - .FLUSH_ENABLE (1), - .WRITE_ENABLE (1), - .SNOOP_FORWARDING (1), - .CORE_TAG_WIDTH (`DDRAM_TAG_WIDTH), - .CORE_TAG_ID_BITS (0), - .DRAM_TAG_WIDTH (`L2DRAM_TAG_WIDTH), - .NUM_SNP_REQUESTS (`NUM_CORES), - .SNP_REQ_TAG_WIDTH (`L2SNP_TAG_WIDTH), - .SNP_FWD_TAG_WIDTH (`DSNP_TAG_WIDTH) + .CACHE_ID (`L2CACHE_ID), + .CACHE_SIZE (`L2CACHE_SIZE), + .BANK_LINE_SIZE (`L2BANK_LINE_SIZE), + .NUM_BANKS (`L2NUM_BANKS), + .WORD_SIZE (`L2WORD_SIZE), + .NUM_REQUESTS (`L2NUM_REQUESTS), + .CREQ_SIZE (`L2CREQ_SIZE), + .MRVQ_SIZE (`L2MRVQ_SIZE), + .DRFQ_SIZE (`L2DRFQ_SIZE), + .SNRQ_SIZE (`L2SNRQ_SIZE), + .CWBQ_SIZE (`L2CWBQ_SIZE), + .DREQ_SIZE (`L2DREQ_SIZE), + .SNPQ_SIZE (`L2SNPQ_SIZE), + .DRAM_ENABLE (1), + .FLUSH_ENABLE (1), + .WRITE_ENABLE (1), + .CORE_TAG_WIDTH (`DDRAM_TAG_WIDTH), + .CORE_TAG_ID_BITS (0), + .DRAM_TAG_WIDTH (`L2DRAM_TAG_WIDTH), + .SNP_TAG_WIDTH (`L2SNP_TAG_WIDTH) ) l2cache ( `SCOPE_BIND_VX_cluster_l2cache @@ -409,29 +470,17 @@ module VX_cluster #( .dram_rsp_ready (dram_rsp_ready), // Snoop request - .snp_req_valid (snp_req_valid), - .snp_req_addr (snp_req_addr), - .snp_req_invalidate (snp_req_invalidate), - .snp_req_tag (snp_req_tag), - .snp_req_ready (snp_req_ready), + .snp_req_valid (snp_fwd_rsp_valid), + .snp_req_addr (snp_fwd_rsp_addr), + .snp_req_invalidate (snp_fwd_rsp_invalidate), + .snp_req_tag (snp_fwd_rsp_tag), + .snp_req_ready (snp_fwd_rsp_ready), // Snoop response .snp_rsp_valid (snp_rsp_valid), .snp_rsp_tag (snp_rsp_tag), .snp_rsp_ready (snp_rsp_ready), - // Snoop forwarding out - .snp_fwdout_valid (core_snp_fwdout_valid), - .snp_fwdout_addr (core_snp_fwdout_addr), - .snp_fwdout_invalidate(core_snp_fwdout_invalidate), - .snp_fwdout_tag (core_snp_fwdout_tag), - .snp_fwdout_ready (core_snp_fwdout_ready), - - // Snoop forwarding in - .snp_fwdin_valid 
(core_snp_fwdin_valid), - .snp_fwdin_tag (core_snp_fwdin_tag), - .snp_fwdin_ready (core_snp_fwdin_ready), - // Miss status `UNUSED_PIN (miss_vec) ); @@ -508,11 +557,12 @@ module VX_cluster #( if (`NUM_CORES > 1) begin VX_snp_forwarder #( - .CACHE_ID (`L2CACHE_ID), - .BANK_LINE_SIZE (`L2BANK_LINE_SIZE), - .NUM_REQUESTS (`NUM_CORES), - .SNRQ_SIZE (`L2SNRQ_SIZE), - .SNP_REQ_TAG_WIDTH (`L2SNP_TAG_WIDTH) + .CACHE_ID (`L2CACHE_ID), + .NUM_REQUESTS (`NUM_CORES), + .SRC_ADDR_WIDTH (`L2DRAM_ADDR_WIDTH), + .DST_ADDR_WIDTH (`DDRAM_ADDR_WIDTH), + .SNP_TAG_WIDTH (`L2SNP_TAG_WIDTH), + .SNRQ_SIZE (`L2SNRQ_SIZE) ) snp_forwarder ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 7359eddcf..de76845dd 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -8,7 +8,7 @@ `endif `ifndef NUM_CORES -`define NUM_CORES 2 +`define NUM_CORES 4 `endif `ifndef NUM_WARPS @@ -23,8 +23,20 @@ `define NUM_BARRIERS 4 `endif +`ifndef L2_ENABLE +`define L2_ENABLE (`NUM_CORES > 2) +`endif + +`ifndef L3_ENABLE +`define L3_ENABLE (`NUM_CLUSTERS > 1) +`endif + `ifndef GLOBAL_BLOCK_SIZE -`define GLOBAL_BLOCK_SIZE 16 +`define GLOBAL_BLOCK_SIZE 64 +`endif + +`ifndef L1_BLOCK_SIZE +`define L1_BLOCK_SIZE 16 `endif `ifndef STARTUP_ADDR @@ -57,14 +69,6 @@ `define FRAME_BUFFER_SIZE (FRAME_BUFFER_WIDTH * FRAME_BUFFER_HEIGHT) -`ifndef L2_ENABLE -`define L2_ENABLE 0 -`endif - -`ifndef L3_ENABLE -`define L3_ENABLE (`NUM_CLUSTERS > 1) -`endif - `ifndef EXT_M_DISABLE `define EXT_M_ENABLE `endif @@ -159,7 +163,7 @@ `define CSR_MIMPID 12'hF13 `define CSR_MHARTID 12'hF14 -// Pipeline Queues ============================================================ +// Pipeline Queues //////////////////////////////////////////////////////////// // Size of instruction queue `ifndef IBUF_SIZE @@ -181,28 +185,18 @@ `define FPUQ_SIZE 8 `endif -// Dcache Configurable Knobs ================================================== +// Dcache Configurable Knobs 
////////////////////////////////////////////////// // Size of cache in bytes `ifndef DCACHE_SIZE -`define DCACHE_SIZE 4096 +`define DCACHE_SIZE 8192 `endif -// Size of line inside a bank in bytes -`ifndef DBANK_LINE_SIZE -`define DBANK_LINE_SIZE `GLOBAL_BLOCK_SIZE -`endif - -// Number of banks {1, 2, 4, 8,...} +// Number of banks `ifndef DNUM_BANKS `define DNUM_BANKS 4 `endif -// Size of a word in bytes -`ifndef DWORD_SIZE -`define DWORD_SIZE 4 -`endif - // Core Request Queue Size `ifndef DCREQ_SIZE `define DCREQ_SIZE `NUM_WARPS @@ -238,21 +232,11 @@ `define DSNRQ_SIZE 8 `endif -// Icache Configurable Knobs ================================================== +// Icache Configurable Knobs ////////////////////////////////////////////////// // Size of cache in bytes `ifndef ICACHE_SIZE -`define ICACHE_SIZE 2048 -`endif - -// Size of line inside a bank in bytes -`ifndef IBANK_LINE_SIZE -`define IBANK_LINE_SIZE `GLOBAL_BLOCK_SIZE -`endif - -// Size of a word in bytes -`ifndef IWORD_SIZE -`define IWORD_SIZE 4 +`define ICACHE_SIZE 8192 `endif // Core Request Queue Size @@ -280,28 +264,18 @@ `define IDRFQ_SIZE 8 `endif -// SM Configurable Knobs ====================================================== +// SM Configurable Knobs ////////////////////////////////////////////////////// // Size of cache in bytes `ifndef SCACHE_SIZE -`define SCACHE_SIZE 1024 +`define SCACHE_SIZE 4096 `endif -// Size of line inside a bank in bytes -`ifndef SBANK_LINE_SIZE -`define SBANK_LINE_SIZE `GLOBAL_BLOCK_SIZE -`endif - -// Number of banks {1, 2, 4, 8,...} +// Number of banks `ifndef SNUM_BANKS `define SNUM_BANKS 4 `endif -// Size of a word in bytes -`ifndef SWORD_SIZE -`define SWORD_SIZE 4 -`endif - // Core Request Queue Size `ifndef SCREQ_SIZE `define SCREQ_SIZE `NUM_WARPS @@ -312,28 +286,18 @@ `define SCWBQ_SIZE `SCREQ_SIZE `endif -// L2cache Configurable Knobs ================================================= +// L2cache Configurable Knobs ///////////////////////////////////////////////// // 
Size of cache in bytes `ifndef L2CACHE_SIZE -`define L2CACHE_SIZE 4096 +`define L2CACHE_SIZE 131072 `endif -// Size of line inside a bank in bytes -`ifndef L2BANK_LINE_SIZE -`define L2BANK_LINE_SIZE `GLOBAL_BLOCK_SIZE -`endif - -// Number of banks {1, 2, 4, 8,...} +// Number of banks `ifndef L2NUM_BANKS `define L2NUM_BANKS 4 `endif -// Size of a word in bytes -`ifndef L2WORD_SIZE -`define L2WORD_SIZE `L2BANK_LINE_SIZE -`endif - // Core Request Queue Size `ifndef L2CREQ_SIZE `define L2CREQ_SIZE 8 @@ -369,28 +333,18 @@ `define L2SNPQ_SIZE 8 `endif -// L3cache Configurable Knobs ================================================= +// L3cache Configurable Knobs ///////////////////////////////////////////////// // Size of cache in bytes `ifndef L3CACHE_SIZE -`define L3CACHE_SIZE 8192 +`define L3CACHE_SIZE 262144 `endif -// Size of line inside a bank in bytes -`ifndef L3BANK_LINE_SIZE -`define L3BANK_LINE_SIZE `GLOBAL_BLOCK_SIZE -`endif - -// Number of banks {1, 2, 4, 8,...} +// Number of banks `ifndef L3NUM_BANKS `define L3NUM_BANKS 4 `endif -// Size of a word in bytes -`ifndef L3WORD_SIZE -`define L3WORD_SIZE `L3BANK_LINE_SIZE -`endif - // Core Request Queue Size `ifndef L3CREQ_SIZE `define L3CREQ_SIZE 8 diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 84b39f7d8..4f679b2a2 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -6,203 +6,203 @@ /////////////////////////////////////////////////////////////////////////////// -`define NW_BITS `LOG2UP(`NUM_WARPS) +`define NW_BITS `LOG2UP(`NUM_WARPS) -`define NT_BITS `LOG2UP(`NUM_THREADS) +`define NT_BITS `LOG2UP(`NUM_THREADS) -`define NC_BITS `LOG2UP(`NUM_CORES) +`define NC_BITS `LOG2UP(`NUM_CORES) -`define NB_BITS `LOG2UP(`NUM_BARRIERS) +`define NB_BITS `LOG2UP(`NUM_BARRIERS) -`define REQS_BITS `LOG2UP(NUM_REQUESTS) +`define REQS_BITS `LOG2UP(NUM_REQUESTS) `ifdef EXT_F_ENABLE -`define NUM_REGS 64 +`define NUM_REGS 64 `else -`define NUM_REGS 32 +`define NUM_REGS 32 `endif -`define NR_BITS `LOG2UP(`NUM_REGS) 
+`define NR_BITS `LOG2UP(`NUM_REGS) -`define CSR_ADDR_BITS 12 +`define CSR_ADDR_BITS 12 -`define CSR_WIDTH 12 +`define CSR_WIDTH 12 /////////////////////////////////////////////////////////////////////////////// -`define INST_LUI 7'b0110111 -`define INST_AUIPC 7'b0010111 -`define INST_JAL 7'b1101111 -`define INST_JALR 7'b1100111 -`define INST_B 7'b1100011 // branch instructions -`define INST_L 7'b0000011 // load instructions -`define INST_S 7'b0100011 // store instructions -`define INST_I 7'b0010011 // immediate instructions -`define INST_R 7'b0110011 // register instructions -`define INST_F 7'b0001111 // Fence instructions -`define INST_SYS 7'b1110011 // system instructions +`define INST_LUI 7'b0110111 +`define INST_AUIPC 7'b0010111 +`define INST_JAL 7'b1101111 +`define INST_JALR 7'b1100111 +`define INST_B 7'b1100011 // branch instructions +`define INST_L 7'b0000011 // load instructions +`define INST_S 7'b0100011 // store instructions +`define INST_I 7'b0010011 // immediate instructions +`define INST_R 7'b0110011 // register instructions +`define INST_F 7'b0001111 // Fence instructions +`define INST_SYS 7'b1110011 // system instructions -`define INST_FL 7'b0000111 // float load instruction -`define INST_FS 7'b0100111 // float store instruction -`define INST_FMADD 7'b1000011 -`define INST_FMSUB 7'b1000111 -`define INST_FNMSUB 7'b1001011 -`define INST_FNMADD 7'b1001111 -`define INST_FCI 7'b1010011 // float common instructions +`define INST_FL 7'b0000111 // float load instruction +`define INST_FS 7'b0100111 // float store instruction +`define INST_FMADD 7'b1000011 +`define INST_FMSUB 7'b1000111 +`define INST_FNMSUB 7'b1001011 +`define INST_FNMADD 7'b1001111 +`define INST_FCI 7'b1010011 // float common instructions -`define INST_GPU 7'b1101011 +`define INST_GPU 7'b1101011 /////////////////////////////////////////////////////////////////////////////// -`define BYTEEN_SB 3'h0 -`define BYTEEN_SH 3'h1 -`define BYTEEN_SW 3'h2 -`define BYTEEN_UB 3'h4 -`define BYTEEN_UH 3'h5 
-`define BYTEEN_BITS 3 -`define BYTEEN_TYPE(x) x[1:0] +`define BYTEEN_SB 3'h0 +`define BYTEEN_SH 3'h1 +`define BYTEEN_SW 3'h2 +`define BYTEEN_UB 3'h4 +`define BYTEEN_UH 3'h5 +`define BYTEEN_BITS 3 +`define BYTEEN_TYPE(x) x[1:0] -`define FRM_RNE 3'b000 // round to nearest even -`define FRM_RTZ 3'b001 // round to zero -`define FRM_RDN 3'b010 // round to -inf -`define FRM_RUP 3'b011 // round to +inf -`define FRM_RMM 3'b100 // round to nearest max magnitude -`define FRM_DYN 3'b111 // dynamic mode -`define FRM_BITS 3 +`define FRM_RNE 3'b000 // round to nearest even +`define FRM_RTZ 3'b001 // round to zero +`define FRM_RDN 3'b010 // round to -inf +`define FRM_RUP 3'b011 // round to +inf +`define FRM_RMM 3'b100 // round to nearest max magnitude +`define FRM_DYN 3'b111 // dynamic mode +`define FRM_BITS 3 /////////////////////////////////////////////////////////////////////////////// -`define EX_NOP 3'h0 -`define EX_ALU 3'h1 -`define EX_LSU 3'h2 -`define EX_CSR 3'h3 -`define EX_MUL 3'h4 -`define EX_FPU 3'h5 -`define EX_GPU 3'h6 -`define EX_BITS 3 +`define EX_NOP 3'h0 +`define EX_ALU 3'h1 +`define EX_LSU 3'h2 +`define EX_CSR 3'h3 +`define EX_MUL 3'h4 +`define EX_FPU 3'h5 +`define EX_GPU 3'h6 +`define EX_BITS 3 -`define NUM_EXS 6 -`define NE_BITS `LOG2UP(`NUM_EXS) +`define NUM_EXS 6 +`define NE_BITS `LOG2UP(`NUM_EXS) /////////////////////////////////////////////////////////////////////////////// -`define OP_BITS 4 -`define MOD_BITS 3 +`define OP_BITS 4 +`define MOD_BITS 3 -`define ALU_ADD 4'b0000 -`define ALU_LUI 4'b0010 -`define ALU_AUIPC 4'b0011 -`define ALU_SLTU 4'b0100 -`define ALU_SLT 4'b0101 -`define ALU_SRL 4'b1000 -`define ALU_SRA 4'b1001 -`define ALU_SUB 4'b1011 -`define ALU_AND 4'b1100 -`define ALU_OR 4'b1101 -`define ALU_XOR 4'b1110 -`define ALU_SLL 4'b1111 -`define ALU_OTHER 4'b0111 -`define ALU_BITS 4 -`define ALU_OP(x) x[`ALU_BITS-1:0] +`define ALU_ADD 4'b0000 +`define ALU_LUI 4'b0010 +`define ALU_AUIPC 4'b0011 +`define ALU_SLTU 4'b0100 +`define ALU_SLT 4'b0101 
+`define ALU_SRL 4'b1000 +`define ALU_SRA 4'b1001 +`define ALU_SUB 4'b1011 +`define ALU_AND 4'b1100 +`define ALU_OR 4'b1101 +`define ALU_XOR 4'b1110 +`define ALU_SLL 4'b1111 +`define ALU_OTHER 4'b0111 +`define ALU_BITS 4 +`define ALU_OP(x) x[`ALU_BITS-1:0] `define ALU_OP_CLASS(x) x[3:2] -`define ALU_SIGNED(x) x[0] +`define ALU_SIGNED(x) x[0] -`define BR_EQ 4'b0000 -`define BR_NE 4'b0010 -`define BR_LTU 4'b0100 -`define BR_GEU 4'b0110 -`define BR_LT 4'b0101 -`define BR_GE 4'b0111 -`define BR_JAL 4'b1000 -`define BR_JALR 4'b1001 -`define BR_ECALL 4'b1010 -`define BR_EBREAK 4'b1011 -`define BR_MRET 4'b1100 -`define BR_SRET 4'b1101 -`define BR_DRET 4'b1110 -`define BR_OTHER 4'b1111 -`define BR_BITS 4 -`define BR_OP(x) x[`BR_BITS-1:0] -`define BR_NEG(x) x[1] -`define BR_LESS(x) x[2] -`define BR_STATIC(x) x[3] -`define ALU_BR_BITS 4 -`define ALU_BR_OP(x) x[`ALU_BR_BITS-1:0] -`define IS_BR_MOD(x) x[0] +`define BR_EQ 4'b0000 +`define BR_NE 4'b0010 +`define BR_LTU 4'b0100 +`define BR_GEU 4'b0110 +`define BR_LT 4'b0101 +`define BR_GE 4'b0111 +`define BR_JAL 4'b1000 +`define BR_JALR 4'b1001 +`define BR_ECALL 4'b1010 +`define BR_EBREAK 4'b1011 +`define BR_MRET 4'b1100 +`define BR_SRET 4'b1101 +`define BR_DRET 4'b1110 +`define BR_OTHER 4'b1111 +`define BR_BITS 4 +`define BR_OP(x) x[`BR_BITS-1:0] +`define BR_NEG(x) x[1] +`define BR_LESS(x) x[2] +`define BR_STATIC(x) x[3] +`define ALU_BR_BITS 4 +`define ALU_BR_OP(x) x[`ALU_BR_BITS-1:0] +`define IS_BR_MOD(x) x[0] -`define LSU_LB {1'b0, `BYTEEN_SB} -`define LSU_LH {1'b0, `BYTEEN_SH} -`define LSU_LW {1'b0, `BYTEEN_SW} -`define LSU_LBU {1'b0, `BYTEEN_UB} -`define LSU_LHU {1'b0, `BYTEEN_UH} -`define LSU_SB {1'b1, `BYTEEN_SB} -`define LSU_SH {1'b1, `BYTEEN_SH} -`define LSU_SW {1'b1, `BYTEEN_SW} -`define LSU_SBU {1'b1, `BYTEEN_UB} -`define LSU_SHU {1'b1, `BYTEEN_UH} -`define LSU_BITS 4 -`define LSU_RW(x) x[3] -`define LSU_BE(x) x[2:0] +`define LSU_LB {1'b0, `BYTEEN_SB} +`define LSU_LH {1'b0, `BYTEEN_SH} +`define LSU_LW {1'b0, 
`BYTEEN_SW} +`define LSU_LBU {1'b0, `BYTEEN_UB} +`define LSU_LHU {1'b0, `BYTEEN_UH} +`define LSU_SB {1'b1, `BYTEEN_SB} +`define LSU_SH {1'b1, `BYTEEN_SH} +`define LSU_SW {1'b1, `BYTEEN_SW} +`define LSU_SBU {1'b1, `BYTEEN_UB} +`define LSU_SHU {1'b1, `BYTEEN_UH} +`define LSU_BITS 4 +`define LSU_RW(x) x[3] +`define LSU_BE(x) x[2:0] -`define CSR_RW 2'h0 -`define CSR_RS 2'h1 -`define CSR_RC 2'h2 -`define CSR_OTHER 2'h3 -`define CSR_BITS 2 -`define CSR_OP(x) x[`CSR_BITS-1:0] +`define CSR_RW 2'h0 +`define CSR_RS 2'h1 +`define CSR_RC 2'h2 +`define CSR_OTHER 2'h3 +`define CSR_BITS 2 +`define CSR_OP(x) x[`CSR_BITS-1:0] -`define MUL_MUL 3'h0 -`define MUL_MULH 3'h1 -`define MUL_MULHSU 3'h2 -`define MUL_MULHU 3'h3 -`define MUL_DIV 3'h4 -`define MUL_DIVU 3'h5 -`define MUL_REM 3'h6 -`define MUL_REMU 3'h7 -`define MUL_BITS 3 -`define MUL_OP(x) x[`MUL_BITS-1:0] -`define IS_DIV_OP(x) x[2] +`define MUL_MUL 3'h0 +`define MUL_MULH 3'h1 +`define MUL_MULHSU 3'h2 +`define MUL_MULHU 3'h3 +`define MUL_DIV 3'h4 +`define MUL_DIVU 3'h5 +`define MUL_REM 3'h6 +`define MUL_REMU 3'h7 +`define MUL_BITS 3 +`define MUL_OP(x) x[`MUL_BITS-1:0] +`define IS_DIV_OP(x) x[2] -`define FPU_ADD 4'h0 -`define FPU_SUB 4'h1 -`define FPU_MUL 4'h2 -`define FPU_DIV 4'h3 -`define FPU_SQRT 4'h4 -`define FPU_MADD 4'h5 -`define FPU_MSUB 4'h6 -`define FPU_NMSUB 4'h7 -`define FPU_NMADD 4'h8 -`define FPU_CVTWS 4'h9 // FCVT.W.S -`define FPU_CVTWUS 4'hA // FCVT.WU.S -`define FPU_CVTSW 4'hB // FCVT.S.W -`define FPU_CVTSWU 4'hC // FCVT.S.WU -`define FPU_CLASS 4'hD -`define FPU_CMP 4'hE -`define FPU_MISC 4'hF // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX -`define FPU_BITS 4 -`define FPU_OP(x) x[`FPU_BITS-1:0] +`define FPU_ADD 4'h0 +`define FPU_SUB 4'h1 +`define FPU_MUL 4'h2 +`define FPU_DIV 4'h3 +`define FPU_SQRT 4'h4 +`define FPU_MADD 4'h5 +`define FPU_MSUB 4'h6 +`define FPU_NMSUB 4'h7 +`define FPU_NMADD 4'h8 +`define FPU_CVTWS 4'h9 // FCVT.W.S +`define FPU_CVTWUS 4'hA // FCVT.WU.S +`define FPU_CVTSW 4'hB // FCVT.S.W +`define 
FPU_CVTSWU 4'hC // FCVT.S.WU +`define FPU_CLASS 4'hD +`define FPU_CMP 4'hE +`define FPU_MISC 4'hF // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX +`define FPU_BITS 4 +`define FPU_OP(x) x[`FPU_BITS-1:0] -`define GPU_TMC 3'h0 -`define GPU_WSPAWN 3'h1 -`define GPU_SPLIT 3'h2 -`define GPU_JOIN 3'h3 -`define GPU_BAR 3'h4 -`define GPU_OTHER 3'h7 -`define GPU_BITS 3 -`define GPU_OP(x) x[`GPU_BITS-1:0] +`define GPU_TMC 3'h0 +`define GPU_WSPAWN 3'h1 +`define GPU_SPLIT 3'h2 +`define GPU_JOIN 3'h3 +`define GPU_BAR 3'h4 +`define GPU_OTHER 3'h7 +`define GPU_BITS 3 +`define GPU_OP(x) x[`GPU_BITS-1:0] /////////////////////////////////////////////////////////////////////////////// `ifdef EXT_M_ENABLE - `define ISA_EXT_M (1 << 12) + `define ISA_EXT_M (1 << 12) `else - `define ISA_EXT_M 0 + `define ISA_EXT_M 0 `endif `ifdef EXT_F_ENABLE - `define ISA_EXT_F (1 << 5) + `define ISA_EXT_F (1 << 5) `else - `define ISA_EXT_F 0 + `define ISA_EXT_F 0 `endif `define ISA_CODE (0 << 0) // A - Atomic Instructions extension \ @@ -234,144 +234,174 @@ /////////////////////////////////////////////////////////////////////////////// -`ifdef DBG_CACHE_REQ_INFO // pc, rd, wid -`define DBG_CACHE_REQ_MDATAW (32 + `NR_BITS + `NW_BITS) +`ifdef DBG_CACHE_REQ_INFO // pc, rd, wid +`define DBG_CACHE_REQ_MDATAW (32 + `NR_BITS + `NW_BITS) `else -`define DBG_CACHE_REQ_MDATAW 0 +`define DBG_CACHE_REQ_MDATAW 0 `endif ////////////////////////// Dcache Configurable Knobs ////////////////////////// // Cache ID -`define DCACHE_ID 32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0 +`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0) + +// Block size in bytes +`define DBANK_LINE_SIZE (`L2_ENABLE ? 
`L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE) + +// Word size in bytes +`define DWORD_SIZE 4 // TAG sharing enable -`define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE) +`define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE) // Core request tag bits -`define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS) +`define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS) // DRAM request data bits -`define DDRAM_LINE_WIDTH (`DBANK_LINE_SIZE * 8) +`define DDRAM_LINE_WIDTH (`DBANK_LINE_SIZE * 8) // DRAM request address bits -`define DDRAM_ADDR_WIDTH (32 - `CLOG2(`DBANK_LINE_SIZE)) +`define DDRAM_ADDR_WIDTH (32 - `CLOG2(`DBANK_LINE_SIZE)) // DRAM byte enable bits -`define DDRAM_BYTEEN_WIDTH `DBANK_LINE_SIZE +`define DDRAM_BYTEEN_WIDTH `DBANK_LINE_SIZE // DRAM request tag bits -`define DDRAM_TAG_WIDTH `DDRAM_ADDR_WIDTH +`define DDRAM_TAG_WIDTH `DDRAM_ADDR_WIDTH -// Number of Word requests per cycle {1, 2, 4, 8, ...} -`define DNUM_REQUESTS `NUM_THREADS +// Core request size +`define DNUM_REQUESTS `NUM_THREADS // Snoop request tag bits -`define DSNP_TAG_WIDTH ((`NUM_CORES > 1) ? `LOG2UP(`L2SNRQ_SIZE) : `L2SNP_TAG_WIDTH) +`define DSNP_TAG_WIDTH ((`NUM_CORES > 1) ? `LOG2UP(`L2SNRQ_SIZE) : `L2SNP_TAG_WIDTH) ////////////////////////// Icache Configurable Knobs ////////////////////////// // Cache ID -`define ICACHE_ID 32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1 +`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1) + +// Block size in bytes +`define IBANK_LINE_SIZE (`L2_ENABLE ? 
`L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE) + +// Word size in bytes +`define IWORD_SIZE 4 // Number of banks -`define INUM_BANKS 1 +`define INUM_BANKS 1 // Core request address bits -`define ICORE_ADDR_WIDTH (32-`CLOG2(`IWORD_SIZE)) +`define ICORE_ADDR_WIDTH (32-`CLOG2(`IWORD_SIZE)) // Core request byte enable bits -`define ICORE_BYTEEN_WIDTH `DWORD_SIZE +`define ICORE_BYTEEN_WIDTH `DWORD_SIZE // TAG sharing enable -`define ICORE_TAG_ID_BITS `NW_BITS +`define ICORE_TAG_ID_BITS `NW_BITS // Core request tag bits -`define ICORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICORE_TAG_ID_BITS) +`define ICORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICORE_TAG_ID_BITS) // DRAM request data bits -`define IDRAM_LINE_WIDTH (`IBANK_LINE_SIZE * 8) +`define IDRAM_LINE_WIDTH (`IBANK_LINE_SIZE * 8) // DRAM request address bits -`define IDRAM_ADDR_WIDTH (32 - `CLOG2(`IBANK_LINE_SIZE)) +`define IDRAM_ADDR_WIDTH (32 - `CLOG2(`IBANK_LINE_SIZE)) // DRAM byte enable bits -`define IDRAM_BYTEEN_WIDTH `IBANK_LINE_SIZE +`define IDRAM_BYTEEN_WIDTH `IBANK_LINE_SIZE // DRAM request tag bits -`define IDRAM_TAG_WIDTH `IDRAM_ADDR_WIDTH +`define IDRAM_TAG_WIDTH `IDRAM_ADDR_WIDTH -// Number of Word requests per cycle {1, 2, 4, 8, ...} -`define INUM_REQUESTS 1 +// Core request size +`define INUM_REQUESTS 1 ////////////////////////// SM Configurable Knobs ////////////////////////////// // Cache ID -`define SCACHE_ID 32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2 +`define SCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2) -// Number of Word requests per cycle {1, 2, 4, 8, ...} -`define SNUM_REQUESTS `NUM_THREADS +// Block size in bytes +`define SBANK_LINE_SIZE 4 + +// Word size in bytes +`define SWORD_SIZE 4 + +// Core request size +`define SNUM_REQUESTS `NUM_THREADS // DRAM request address bits -`define SDRAM_ADDR_WIDTH (32 - `CLOG2(`SBANK_LINE_SIZE)) +`define SDRAM_ADDR_WIDTH (32 - `CLOG2(`SBANK_LINE_SIZE)) -// DRAM request tag bits -`define SDRAM_TAG_WIDTH 
`SDRAM_ADDR_WIDTH +// DRAM request tag bits +`define SDRAM_TAG_WIDTH `SDRAM_ADDR_WIDTH -// Number of Word requests per cycle {1, 2, 4, 8, ...} -`define SNUM_REQUESTS `NUM_THREADS +// Core request size +`define SNUM_REQUESTS `NUM_THREADS ////////////////////////// L2cache Configurable Knobs ///////////////////////// // Cache ID -`define L2CACHE_ID 32'(`L3_ENABLE) + CLUSTER_ID +`define L2CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID) + +// Block size in bytes +`define L2BANK_LINE_SIZE `GLOBAL_BLOCK_SIZE + +// Word size in bytes +`define L2WORD_SIZE `DBANK_LINE_SIZE // Core request tag bits -`define L2CORE_TAG_WIDTH (`DCORE_TAG_WIDTH + `CLOG2(`NUM_CORES)) +`define L2CORE_TAG_WIDTH (`DCORE_TAG_WIDTH + `CLOG2(`NUM_CORES)) // DRAM request data bits -`define L2DRAM_LINE_WIDTH (`L2_ENABLE ? (`L2BANK_LINE_SIZE * 8) : `DDRAM_LINE_WIDTH) +`define L2DRAM_LINE_WIDTH (`L2BANK_LINE_SIZE * 8) // DRAM request address bits -`define L2DRAM_ADDR_WIDTH (`L2_ENABLE ? (32 - `CLOG2(`L2BANK_LINE_SIZE)) : `DDRAM_ADDR_WIDTH) +`define L2DRAM_ADDR_WIDTH (32 - `CLOG2(`L2BANK_LINE_SIZE)) // DRAM byte enable bits -`define L2DRAM_BYTEEN_WIDTH (`L2_ENABLE ? `L2BANK_LINE_SIZE : `DDRAM_BYTEEN_WIDTH) +`define L2DRAM_BYTEEN_WIDTH `L2BANK_LINE_SIZE // DRAM request tag bits -`define L2DRAM_TAG_WIDTH (`L2_ENABLE ? `L2DRAM_ADDR_WIDTH : (`L2DRAM_ADDR_WIDTH+`CLOG2(`NUM_CORES*2))) +`define L2DRAM_TAG_WIDTH (`L2_ENABLE ? `L2DRAM_ADDR_WIDTH : (`L2DRAM_ADDR_WIDTH+`CLOG2(`NUM_CORES*2))) // Snoop request tag bits -`define L2SNP_TAG_WIDTH (`L3_ENABLE ? `LOG2UP(`L3SNRQ_SIZE) : `L3SNP_TAG_WIDTH) +`define L2SNP_TAG_WIDTH (`L3_ENABLE ? 
`LOG2UP(`L3SNRQ_SIZE) : `L3SNP_TAG_WIDTH) -// Number of Word requests per cycle {1, 2, 4, 8, ...} -`define L2NUM_REQUESTS (2 * `NUM_CORES) +// Core request size +`define L2NUM_REQUESTS (2 * `NUM_CORES) ////////////////////////// L3cache Configurable Knobs ///////////////////////// // Cache ID -`define L3CACHE_ID 0 +`define L3CACHE_ID 0 + +// Block size in bytes +`define L3BANK_LINE_SIZE `GLOBAL_BLOCK_SIZE + +// Word size in bytes +`define L3WORD_SIZE `L2BANK_LINE_SIZE // Core request tag bits -`define L3CORE_TAG_WIDTH (`L2CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS)) +`define L3CORE_TAG_WIDTH (`L2CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS)) // DRAM request data bits -`define L3DRAM_LINE_WIDTH (`L3_ENABLE ? (`L3BANK_LINE_SIZE * 8) : `L2DRAM_LINE_WIDTH) +`define L3DRAM_LINE_WIDTH (`L3BANK_LINE_SIZE * 8) // DRAM request address bits -`define L3DRAM_ADDR_WIDTH (`L3_ENABLE ? (32 - `CLOG2(`L3BANK_LINE_SIZE)) : `L2DRAM_ADDR_WIDTH) +`define L3DRAM_ADDR_WIDTH (32 - `CLOG2(`L3BANK_LINE_SIZE)) // DRAM byte enable bits -`define L3DRAM_BYTEEN_WIDTH (`L3_ENABLE ? `L3BANK_LINE_SIZE : `L2DRAM_BYTEEN_WIDTH) +`define L3DRAM_BYTEEN_WIDTH `L3BANK_LINE_SIZE // DRAM request tag bits -`define L3DRAM_TAG_WIDTH (`L3_ENABLE ? `L3DRAM_ADDR_WIDTH : `L2DRAM_TAG_WIDTH) +`define L3DRAM_TAG_WIDTH (`L3_ENABLE ? 
`L3DRAM_ADDR_WIDTH : `L2DRAM_TAG_WIDTH) // Snoop request tag bits -`define L3SNP_TAG_WIDTH 16 +`define L3SNP_TAG_WIDTH 16 -// Number of Word requests per cycle {1, 2, 4, 8, ...} -`define L3NUM_REQUESTS `NUM_CLUSTERS +// Core request size +`define L3NUM_REQUESTS `NUM_CLUSTERS /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index 1f0bcf0ec..4c4b2534a 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -168,9 +168,9 @@ module VX_ibuffer #( for (integer i = 0; i < `NUM_WARPS; i++) begin nw += 32'(q_size[i] != 0); end - assert(nw == 32'(num_warps)) else $display("%t: error: invalid num_warps: nw=%0d, ref=%0d", $time, num_warps, nw); - assert(~deq_valid || (q_size[deq_wid] != 0)) else $display("%t: error: invalid schedule: wid=%0d", $time, deq_wid); - assert(~deq_fire || (q_size[deq_wid] != 0)) else $display("%t: error: invalid dequeu: wid=%0d", $time, deq_wid); + assert(nw == 32'(num_warps)) else $error("%t: error: invalid num_warps: nw=%0d, ref=%0d", $time, num_warps, nw); + assert(~deq_valid || (q_size[deq_wid] != 0)) else $error("%t: error: invalid schedule: wid=%0d", $time, deq_wid); + assert(~deq_fire || (q_size[deq_wid] != 0)) else $error("%t: error: invalid dequeu: wid=%0d", $time, deq_wid); end `endif end diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 733c08968..3b8d67682 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -75,7 +75,6 @@ module VX_mem_unit # ( .DRAM_ENABLE (0), .FLUSH_ENABLE (0), .WRITE_ENABLE (1), - .SNOOP_FORWARDING (0), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), .DRAM_TAG_WIDTH (`SDRAM_TAG_WIDTH) @@ -127,44 +126,31 @@ module VX_mem_unit # ( `UNUSED_PIN (snp_rsp_tag), .snp_rsp_ready (1'b0), - // Snoop forward out - `UNUSED_PIN (snp_fwdout_valid), - `UNUSED_PIN (snp_fwdout_addr), - `UNUSED_PIN (snp_fwdout_invalidate), - `UNUSED_PIN (snp_fwdout_tag), - .snp_fwdout_ready (1'b0), - - // Snoop 
forward in - .snp_fwdin_valid (1'b0), - .snp_fwdin_tag (0), - `UNUSED_PIN (snp_fwdin_ready), - // Miss status `UNUSED_PIN (miss_vec) ); VX_cache #( - .CACHE_ID (`DCACHE_ID), - .CACHE_SIZE (`DCACHE_SIZE), - .BANK_LINE_SIZE (`DBANK_LINE_SIZE), - .NUM_BANKS (`DNUM_BANKS), - .WORD_SIZE (`DWORD_SIZE), - .NUM_REQUESTS (`DNUM_REQUESTS), - .CREQ_SIZE (`DCREQ_SIZE), - .MRVQ_SIZE (`DMRVQ_SIZE), - .DRFQ_SIZE (`DDRFQ_SIZE), - .SNRQ_SIZE (`DSNRQ_SIZE), - .CWBQ_SIZE (`DCWBQ_SIZE), - .DREQ_SIZE (`DDREQ_SIZE), - .SNPQ_SIZE (`DSNPQ_SIZE), - .DRAM_ENABLE (1), - .FLUSH_ENABLE (1), - .WRITE_ENABLE (1), - .SNOOP_FORWARDING (0), - .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), - .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), - .DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH), - .SNP_REQ_TAG_WIDTH (`DSNP_TAG_WIDTH) + .CACHE_ID (`DCACHE_ID), + .CACHE_SIZE (`DCACHE_SIZE), + .BANK_LINE_SIZE (`DBANK_LINE_SIZE), + .NUM_BANKS (`DNUM_BANKS), + .WORD_SIZE (`DWORD_SIZE), + .NUM_REQUESTS (`DNUM_REQUESTS), + .CREQ_SIZE (`DCREQ_SIZE), + .MRVQ_SIZE (`DMRVQ_SIZE), + .DRFQ_SIZE (`DDRFQ_SIZE), + .SNRQ_SIZE (`DSNRQ_SIZE), + .CWBQ_SIZE (`DCWBQ_SIZE), + .DREQ_SIZE (`DDREQ_SIZE), + .SNPQ_SIZE (`DSNPQ_SIZE), + .DRAM_ENABLE (1), + .FLUSH_ENABLE (1), + .WRITE_ENABLE (1), + .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), + .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), + .DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH), + .SNP_TAG_WIDTH (`DSNP_TAG_WIDTH) ) dcache ( `SCOPE_BIND_VX_mem_unit_dcache @@ -212,18 +198,6 @@ module VX_mem_unit # ( .snp_rsp_valid (dcache_snp_rsp_if.valid), .snp_rsp_tag (dcache_snp_rsp_if.tag), .snp_rsp_ready (dcache_snp_rsp_if.ready), - - // Snoop forward out - `UNUSED_PIN (snp_fwdout_valid), - `UNUSED_PIN (snp_fwdout_addr), - `UNUSED_PIN (snp_fwdout_invalidate), - `UNUSED_PIN (snp_fwdout_tag), - .snp_fwdout_ready (1'b0), - - // Snoop forward in - .snp_fwdin_valid (1'b0), - .snp_fwdin_tag (0), - `UNUSED_PIN (snp_fwdin_ready), // Miss status `UNUSED_PIN (miss_vec) @@ -246,7 +220,6 @@ module VX_mem_unit # ( .DRAM_ENABLE (1), .FLUSH_ENABLE (0), 
.WRITE_ENABLE (0), - .SNOOP_FORWARDING (0), .CORE_TAG_WIDTH (`ICORE_TAG_WIDTH), .CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS), .DRAM_TAG_WIDTH (`IDRAM_TAG_WIDTH) @@ -298,18 +271,6 @@ module VX_mem_unit # ( `UNUSED_PIN (snp_rsp_tag), .snp_rsp_ready (1'b0), - // Snoop forward out - `UNUSED_PIN (snp_fwdout_valid), - `UNUSED_PIN (snp_fwdout_addr), - `UNUSED_PIN (snp_fwdout_invalidate), - `UNUSED_PIN (snp_fwdout_tag), - .snp_fwdout_ready (1'b0), - - // Snoop forward in - .snp_fwdin_valid (1'b0), - .snp_fwdin_tag (0), - `UNUSED_PIN (snp_fwdin_ready), - // Miss status `UNUSED_PIN (miss_vec) ); diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index e0a5eb15c..f6826c43f 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -320,56 +320,70 @@ module Vortex ( // L3 Cache /////////////////////////////////////////////////////////// - wire [`L3NUM_REQUESTS-1:0] cluster_dram_req_valid; - wire [`L3NUM_REQUESTS-1:0] cluster_dram_req_rw; - wire [`L3NUM_REQUESTS-1:0][`L2DRAM_BYTEEN_WIDTH-1:0] cluster_dram_req_byteen; - wire [`L3NUM_REQUESTS-1:0][`L2DRAM_ADDR_WIDTH-1:0] cluster_dram_req_addr; - wire [`L3NUM_REQUESTS-1:0][`L2DRAM_LINE_WIDTH-1:0] cluster_dram_req_data; - wire [`L3NUM_REQUESTS-1:0][`L2DRAM_TAG_WIDTH-1:0] cluster_dram_req_tag; - wire [`L3NUM_REQUESTS-1:0] cluster_dram_rsp_valid; wire [`L3NUM_REQUESTS-1:0][`L2DRAM_LINE_WIDTH-1:0] cluster_dram_rsp_data; wire [`L3NUM_REQUESTS-1:0][`L2DRAM_TAG_WIDTH-1:0] cluster_dram_rsp_tag; wire cluster_dram_rsp_ready; - wire [`NUM_CLUSTERS-1:0] cluster_snp_fwdout_valid; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_ADDR_WIDTH-1:0] cluster_snp_fwdout_addr; - wire [`NUM_CLUSTERS-1:0] cluster_snp_fwdout_invalidate; - wire [`NUM_CLUSTERS-1:0][`L2SNP_TAG_WIDTH-1:0] cluster_snp_fwdout_tag; - wire [`NUM_CLUSTERS-1:0] cluster_snp_fwdout_ready; + wire snp_fwd_rsp_valid; + wire [`L3DRAM_ADDR_WIDTH-1:0] snp_fwd_rsp_addr; + wire snp_fwd_rsp_invalidate; + wire [`L3SNP_TAG_WIDTH-1:0] snp_fwd_rsp_tag; + wire snp_fwd_rsp_ready; - wire [`NUM_CLUSTERS-1:0] 
cluster_snp_fwdin_valid; - wire [`NUM_CLUSTERS-1:0][`L2SNP_TAG_WIDTH-1:0] cluster_snp_fwdin_tag; - wire [`NUM_CLUSTERS-1:0] cluster_snp_fwdin_ready; + reg [`L3NUM_REQUESTS-1:0] cluster_dram_rsp_ready_other; - for (genvar i = 0; i < `L3NUM_REQUESTS; i++) begin - // Core Request - assign cluster_dram_req_valid [i] = per_cluster_dram_req_valid [i]; - assign cluster_dram_req_rw [i] = per_cluster_dram_req_rw [i]; - assign cluster_dram_req_byteen [i] = per_cluster_dram_req_byteen[i]; - assign cluster_dram_req_addr [i] = per_cluster_dram_req_addr [i]; - assign cluster_dram_req_tag [i] = per_cluster_dram_req_tag [i]; - assign cluster_dram_req_data [i] = per_cluster_dram_req_data [i]; - - // Core Response - assign per_cluster_dram_rsp_valid [i] = cluster_dram_rsp_valid [i] && cluster_dram_rsp_ready; - assign per_cluster_dram_rsp_data [i] = cluster_dram_rsp_data [i]; - assign per_cluster_dram_rsp_tag [i] = cluster_dram_rsp_tag [i]; - - // Snoop Forwarding out - assign per_cluster_snp_req_valid [i] = cluster_snp_fwdout_valid[i]; - assign per_cluster_snp_req_addr [i] = cluster_snp_fwdout_addr[i]; - assign per_cluster_snp_req_invalidate [i] = cluster_snp_fwdout_invalidate[i]; - assign per_cluster_snp_req_tag [i] = cluster_snp_fwdout_tag[i]; - assign cluster_snp_fwdout_ready [i] = per_cluster_snp_req_ready[i]; - - // Snoop Forwarding in - assign cluster_snp_fwdin_valid [i] = per_cluster_snp_rsp_valid [i]; - assign cluster_snp_fwdin_tag [i] = per_cluster_snp_rsp_tag [i]; - assign per_cluster_snp_rsp_ready [i] = cluster_snp_fwdin_ready [i]; + always @(*) begin + cluster_dram_rsp_ready_other = {`L3NUM_REQUESTS{1'b1}}; + for (integer i = 0; i < `L3NUM_REQUESTS; i++) begin + for (integer j = 0; j < `L3NUM_REQUESTS; j++) begin + if (i != j) + cluster_dram_rsp_ready_other[i] &= (per_cluster_dram_rsp_ready [j] | !cluster_dram_rsp_valid [j]); + end + end end - assign cluster_dram_rsp_ready = (& per_cluster_dram_rsp_ready); + for (genvar i = 0; i < `L3NUM_REQUESTS; i++) begin + // Core 
Response + assign per_cluster_dram_rsp_valid [i] = cluster_dram_rsp_valid [i] & cluster_dram_rsp_ready_other [i]; + assign per_cluster_dram_rsp_data [i] = cluster_dram_rsp_data [i]; + assign per_cluster_dram_rsp_tag [i] = cluster_dram_rsp_tag [i]; + end + assign cluster_dram_rsp_ready = & (per_cluster_dram_rsp_ready | ~cluster_dram_rsp_valid); + + VX_snp_forwarder #( + .CACHE_ID (`L3CACHE_ID), + .NUM_REQUESTS (`NUM_CLUSTERS), + .SRC_ADDR_WIDTH (`L3DRAM_ADDR_WIDTH), + .DST_ADDR_WIDTH (`L2DRAM_ADDR_WIDTH), + .SNP_TAG_WIDTH (`L3SNP_TAG_WIDTH), + .SNRQ_SIZE (`L3SNRQ_SIZE) + ) snp_forwarder ( + .clk (clk), + .reset (reset), + + .snp_req_valid (snp_req_valid), + .snp_req_addr (snp_req_addr), + .snp_req_invalidate (snp_req_invalidate), + .snp_req_tag (snp_req_tag), + .snp_req_ready (snp_req_ready), + + .snp_rsp_valid (snp_fwd_rsp_valid), + .snp_rsp_addr (snp_fwd_rsp_addr), + .snp_rsp_invalidate (snp_fwd_rsp_invalidate), + .snp_rsp_tag (snp_fwd_rsp_tag), + .snp_rsp_ready (snp_fwd_rsp_ready), + + .snp_fwdout_valid (per_cluster_snp_req_valid), + .snp_fwdout_addr (per_cluster_snp_req_addr), + .snp_fwdout_invalidate(per_cluster_snp_req_invalidate), + .snp_fwdout_tag (per_cluster_snp_req_tag), + .snp_fwdout_ready (per_cluster_snp_req_ready), + + .snp_fwdin_valid (per_cluster_snp_rsp_valid), + .snp_fwdin_tag (per_cluster_snp_rsp_tag), + .snp_fwdin_ready (per_cluster_snp_rsp_ready) + ); VX_cache #( .CACHE_ID (`L3CACHE_ID), @@ -388,13 +402,10 @@ module Vortex ( .DRAM_ENABLE (1), .FLUSH_ENABLE (1), .WRITE_ENABLE (1), - .SNOOP_FORWARDING (1), .CORE_TAG_WIDTH (`L2DRAM_TAG_WIDTH), .CORE_TAG_ID_BITS (0), .DRAM_TAG_WIDTH (`L3DRAM_TAG_WIDTH), - .NUM_SNP_REQUESTS (`NUM_CLUSTERS), - .SNP_REQ_TAG_WIDTH (`L3SNP_TAG_WIDTH), - .SNP_FWD_TAG_WIDTH (`L2SNP_TAG_WIDTH) + .SNP_TAG_WIDTH (`L3SNP_TAG_WIDTH) ) l3cache ( `SCOPE_BIND_Vortex_l3cache @@ -402,12 +413,12 @@ module Vortex ( .reset (reset), // Core request - .core_req_valid (cluster_dram_req_valid), - .core_req_rw (cluster_dram_req_rw), - 
.core_req_byteen (cluster_dram_req_byteen), - .core_req_addr (cluster_dram_req_addr), - .core_req_data (cluster_dram_req_data), - .core_req_tag (cluster_dram_req_tag), + .core_req_valid (per_cluster_dram_req_valid), + .core_req_rw (per_cluster_dram_req_rw), + .core_req_byteen (per_cluster_dram_req_byteen), + .core_req_addr (per_cluster_dram_req_addr), + .core_req_data (per_cluster_dram_req_data), + .core_req_tag (per_cluster_dram_req_tag), .core_req_ready (cluster_dram_req_ready), // Core response @@ -432,29 +443,17 @@ module Vortex ( .dram_rsp_ready (dram_rsp_ready), // Snoop request - .snp_req_valid (snp_req_valid), - .snp_req_addr (snp_req_addr), - .snp_req_invalidate (snp_req_invalidate), - .snp_req_tag (snp_req_tag), - .snp_req_ready (snp_req_ready), + .snp_req_valid (snp_fwd_rsp_valid), + .snp_req_addr (snp_fwd_rsp_addr), + .snp_req_invalidate (snp_fwd_rsp_invalidate), + .snp_req_tag (snp_fwd_rsp_tag), + .snp_req_ready (snp_fwd_rsp_ready), // Snoop response .snp_rsp_valid (snp_rsp_valid), .snp_rsp_tag (snp_rsp_tag), .snp_rsp_ready (snp_rsp_ready), - // Snoop forwarding out - .snp_fwdout_valid (cluster_snp_fwdout_valid), - .snp_fwdout_addr (cluster_snp_fwdout_addr), - .snp_fwdout_invalidate(cluster_snp_fwdout_invalidate), - .snp_fwdout_tag (cluster_snp_fwdout_tag), - .snp_fwdout_ready (cluster_snp_fwdout_ready), - - // Snoop forwarding in - .snp_fwdin_valid (cluster_snp_fwdin_valid), - .snp_fwdin_tag (cluster_snp_fwdin_tag), - .snp_fwdin_ready (cluster_snp_fwdin_ready), - // Miss status `UNUSED_PIN (miss_vec) ); @@ -497,4 +496,11 @@ module Vortex ( end `endif + +`ifndef NDEBUG + always @(posedge clk) begin + $fflush(); // flush stdout buffer + end +`endif + endmodule \ No newline at end of file diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index f1d7ede9d..12b404a4b 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -47,7 +47,7 @@ module VX_bank #( parameter CORE_TAG_ID_BITS = 0, // Snooping request tag width - parameter 
SNP_REQ_TAG_WIDTH = 1 + parameter SNP_TAG_WIDTH = 1 ) ( `SCOPE_IO_VX_bank @@ -88,12 +88,12 @@ module VX_bank #( input wire snp_req_valid, input wire [`LINE_ADDR_WIDTH-1:0] snp_req_addr, input wire snp_req_invalidate, - input wire [SNP_REQ_TAG_WIDTH-1:0] snp_req_tag, + input wire [SNP_TAG_WIDTH-1:0] snp_req_tag, output wire snp_req_ready, // Snoop Response output wire snp_rsp_valid, - output wire [SNP_REQ_TAG_WIDTH-1:0] snp_rsp_tag, + output wire [SNP_TAG_WIDTH-1:0] snp_rsp_tag, input wire snp_rsp_ready, // Misses @@ -142,13 +142,13 @@ module VX_bank #( wire [`LINE_ADDR_WIDTH-1:0] snrq_addr_st0; wire snrq_invalidate_st0; - wire [SNP_REQ_TAG_WIDTH-1:0] snrq_tag_st0; + wire [SNP_TAG_WIDTH-1:0] snrq_tag_st0; wire snp_req_fire = snp_req_valid && snp_req_ready; assign snp_req_ready = !snrq_full; VX_generic_queue #( - .DATAW(`LINE_ADDR_WIDTH + 1 + SNP_REQ_TAG_WIDTH), + .DATAW(`LINE_ADDR_WIDTH + 1 + SNP_TAG_WIDTH), .SIZE(SNRQ_SIZE) ) snp_req_queue ( .clk (clk), @@ -352,7 +352,7 @@ module VX_bank #( || ((miss_st3 || force_miss_st3) && (addr_st3 == addr_st0)); `ifdef DBG_CACHE_REQ_INFO - if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin + if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin assign {debug_pc_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = inst_meta_st0; end else begin assign {debug_pc_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = 0; @@ -371,7 +371,7 @@ module VX_bank #( ); `ifdef DBG_CACHE_REQ_INFO - if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin + if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = inst_meta_st1; end else begin assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = 0; @@ -474,7 +474,7 @@ module VX_bank #( ); `ifdef DBG_CACHE_REQ_INFO - 
if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin + if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin assign {debug_pc_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = inst_meta_st2; end else begin assign {debug_pc_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = 0; @@ -574,7 +574,7 @@ module VX_bank #( ); `ifdef DBG_CACHE_REQ_INFO - if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin + if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin assign {debug_pc_st3, debug_rd_st3, debug_wid_st3, debug_tagid_st3, debug_rw_st3, debug_byteen_st3, debug_tid_st3} = inst_meta_st3; end else begin assign {debug_pc_st3, debug_rd_st3, debug_wid_st3, debug_tagid_st3, debug_rw_st3, debug_byteen_st3, debug_tid_st3} = 0; @@ -621,7 +621,7 @@ module VX_bank #( .NUM_REQUESTS (NUM_REQUESTS), .MRVQ_SIZE (MRVQ_SIZE), .CORE_TAG_WIDTH (CORE_TAG_WIDTH), - .SNP_REQ_TAG_WIDTH (SNP_REQ_TAG_WIDTH) + .SNP_TAG_WIDTH (SNP_TAG_WIDTH) ) cache_miss_resrv ( .clk (clk), .reset (reset), @@ -803,12 +803,12 @@ module VX_bank #( wire snpq_pop = snp_rsp_valid && snp_rsp_ready; - wire [SNP_REQ_TAG_WIDTH-1:0] snpq_tag_st3 = SNP_REQ_TAG_WIDTH'(req_tag_st3); + wire [SNP_TAG_WIDTH-1:0] snpq_tag_st3 = SNP_TAG_WIDTH'(req_tag_st3); if (FLUSH_ENABLE) begin VX_generic_queue #( - .DATAW(SNP_REQ_TAG_WIDTH), - .SIZE(SNPQ_SIZE) + .DATAW (SNP_TAG_WIDTH), + .SIZE (SNPQ_SIZE) ) snp_rsp_queue ( .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index b4efedae8..87b44f606 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -39,9 +39,6 @@ module VX_cache #( // Enable cache flush parameter FLUSH_ENABLE = 1, - // Enable snoop forwarding - parameter SNOOP_FORWARDING = 1, - // core request tag size parameter CORE_TAG_WIDTH = 4, @@ -51,14 +48,8 @@ module VX_cache #( // dram request tag size parameter DRAM_TAG_WIDTH = 28, - // Number of snoop forwarding 
requests - parameter NUM_SNP_REQUESTS = (SNOOP_FORWARDING ? 4 : 1), - // Snooping request tag width - parameter SNP_REQ_TAG_WIDTH = (SNOOP_FORWARDING ? 4 : 1), - - // Snooping forward tag width - parameter SNP_FWD_TAG_WIDTH = (SNOOP_FORWARDING ? 4 : 1) + parameter SNP_TAG_WIDTH = 1 ) ( `SCOPE_IO_VX_cache @@ -99,28 +90,14 @@ module VX_cache #( input wire snp_req_valid, input wire [`DRAM_ADDR_WIDTH-1:0] snp_req_addr, input wire snp_req_invalidate, - input wire [SNP_REQ_TAG_WIDTH-1:0] snp_req_tag, + input wire [SNP_TAG_WIDTH-1:0] snp_req_tag, output wire snp_req_ready, // Snoop response output wire snp_rsp_valid, - output wire [SNP_REQ_TAG_WIDTH-1:0] snp_rsp_tag, + output wire [SNP_TAG_WIDTH-1:0] snp_rsp_tag, input wire snp_rsp_ready, - // Snoop Forwarding out - output wire [NUM_SNP_REQUESTS-1:0] snp_fwdout_valid, - output wire [NUM_SNP_REQUESTS-1:0][`DRAM_ADDR_WIDTH-1:0] snp_fwdout_addr, - output wire [NUM_SNP_REQUESTS-1:0] snp_fwdout_invalidate, - output wire [NUM_SNP_REQUESTS-1:0][SNP_FWD_TAG_WIDTH-1:0] snp_fwdout_tag, -`IGNORE_WARNINGS_BEGIN - input wire [NUM_SNP_REQUESTS-1:0] snp_fwdout_ready, - - // Snoop forwarding in - input wire [NUM_SNP_REQUESTS-1:0] snp_fwdin_valid, - input wire [NUM_SNP_REQUESTS-1:0][SNP_FWD_TAG_WIDTH-1:0] snp_fwdin_tag, -`IGNORE_WARNINGS_END - output wire [NUM_SNP_REQUESTS-1:0] snp_fwdin_ready, - output wire [NUM_BANKS-1:0] miss_vec ); @@ -146,72 +123,16 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_snp_req_ready; wire [NUM_BANKS-1:0] per_bank_snp_rsp_valid; - wire [NUM_BANKS-1:0][SNP_REQ_TAG_WIDTH-1:0] per_bank_snp_rsp_tag; + wire [NUM_BANKS-1:0][SNP_TAG_WIDTH-1:0] per_bank_snp_rsp_tag; wire [NUM_BANKS-1:0] per_bank_snp_rsp_ready; wire [NUM_BANKS-1:0] per_bank_miss; - assign miss_vec = per_bank_miss; - - - - wire snp_req_valid_qual; - wire [`DRAM_ADDR_WIDTH-1:0] snp_req_addr_qual; - wire snp_req_invalidate_qual; - wire [SNP_REQ_TAG_WIDTH-1:0] snp_req_tag_qual; - wire snp_req_ready_qual; - - if (SNOOP_FORWARDING) begin - 
VX_snp_forwarder #( - .CACHE_ID (CACHE_ID), - .BANK_LINE_SIZE (BANK_LINE_SIZE), - .NUM_REQUESTS (NUM_SNP_REQUESTS), - .SNRQ_SIZE (SNRQ_SIZE), - .SNP_REQ_TAG_WIDTH (SNP_REQ_TAG_WIDTH) - ) snp_forwarder ( - .clk (clk), - .reset (reset), - - .snp_req_valid (snp_req_valid), - .snp_req_addr (snp_req_addr), - .snp_req_invalidate (snp_req_invalidate), - .snp_req_tag (snp_req_tag), - .snp_req_ready (snp_req_ready), - - .snp_rsp_valid (snp_req_valid_qual), - .snp_rsp_addr (snp_req_addr_qual), - .snp_rsp_invalidate (snp_req_invalidate_qual), - .snp_rsp_tag (snp_req_tag_qual), - .snp_rsp_ready (snp_req_ready_qual), - - .snp_fwdout_valid (snp_fwdout_valid), - .snp_fwdout_addr (snp_fwdout_addr), - .snp_fwdout_invalidate(snp_fwdout_invalidate), - .snp_fwdout_tag (snp_fwdout_tag), - .snp_fwdout_ready (snp_fwdout_ready), - - .snp_fwdin_valid (snp_fwdin_valid), - .snp_fwdin_tag (snp_fwdin_tag), - .snp_fwdin_ready (snp_fwdin_ready) - ); - end else begin - assign snp_fwdout_valid = 0; - assign snp_fwdout_addr = 0; - assign snp_fwdout_invalidate = 0; - assign snp_fwdout_tag = 0; - - assign snp_fwdin_ready = 0; - - assign snp_req_valid_qual = snp_req_valid; - assign snp_req_addr_qual = snp_req_addr; - assign snp_req_invalidate_qual = snp_req_invalidate; - assign snp_req_tag_qual = snp_req_tag; - assign snp_req_ready = snp_req_ready_qual; - end - + assign miss_vec = per_bank_miss; + if (NUM_BANKS == 1) begin - assign snp_req_ready_qual = per_bank_snp_req_ready; + assign snp_req_ready = per_bank_snp_req_ready; end else begin - assign snp_req_ready_qual = per_bank_snp_req_ready[`DRAM_ADDR_BANK(snp_req_addr_qual)]; + assign snp_req_ready = per_bank_snp_req_ready[`DRAM_ADDR_BANK(snp_req_addr)]; end VX_cache_core_req_bank_sel #( @@ -221,14 +142,18 @@ module VX_cache #( .NUM_REQUESTS (NUM_REQUESTS) ) cache_core_req_bank_sel ( .core_req_valid (core_req_valid), - .per_bank_ready (per_bank_core_req_ready), .core_req_addr (core_req_addr), + .core_req_ready (core_req_ready), .per_bank_valid 
(per_bank_valid), - .core_req_ready (core_req_ready) + .per_bank_ready (per_bank_core_req_ready) ); assign dram_req_tag = dram_req_addr; - assign dram_rsp_ready = (& per_bank_dram_rsp_ready); + if (NUM_BANKS == 1) begin + assign dram_rsp_ready = per_bank_dram_rsp_ready; + end else begin + assign dram_rsp_ready = per_bank_dram_rsp_ready[`DRAM_ADDR_BANK(dram_rsp_tag)]; + end for (genvar i = 0; i < NUM_BANKS; i++) begin wire [NUM_REQUESTS-1:0] curr_bank_core_req_valid; @@ -260,11 +185,11 @@ module VX_cache #( wire curr_bank_snp_req_valid; wire [`LINE_ADDR_WIDTH-1:0] curr_bank_snp_req_addr; wire curr_bank_snp_req_invalidate; - wire [SNP_REQ_TAG_WIDTH-1:0] curr_bank_snp_req_tag; + wire [SNP_TAG_WIDTH-1:0] curr_bank_snp_req_tag; wire curr_bank_snp_req_ready; wire curr_bank_snp_rsp_valid; - wire [SNP_REQ_TAG_WIDTH-1:0] curr_bank_snp_rsp_tag; + wire [SNP_TAG_WIDTH-1:0] curr_bank_snp_rsp_tag; wire curr_bank_snp_rsp_ready; wire curr_bank_miss; @@ -310,14 +235,14 @@ module VX_cache #( // Snoop request if (NUM_BANKS == 1) begin - assign curr_bank_snp_req_valid = snp_req_valid_qual; - assign curr_bank_snp_req_addr = snp_req_addr_qual; + assign curr_bank_snp_req_valid = snp_req_valid; + assign curr_bank_snp_req_addr = snp_req_addr; end else begin - assign curr_bank_snp_req_valid = snp_req_valid_qual && (`DRAM_ADDR_BANK(snp_req_addr_qual) == i); - assign curr_bank_snp_req_addr = `DRAM_TO_LINE_ADDR(snp_req_addr_qual); + assign curr_bank_snp_req_valid = snp_req_valid && (`DRAM_ADDR_BANK(snp_req_addr) == i); + assign curr_bank_snp_req_addr = `DRAM_TO_LINE_ADDR(snp_req_addr); end - assign curr_bank_snp_req_invalidate = snp_req_invalidate_qual; - assign curr_bank_snp_req_tag = snp_req_tag_qual; + assign curr_bank_snp_req_invalidate = snp_req_invalidate; + assign curr_bank_snp_req_tag = snp_req_tag; assign per_bank_snp_req_ready[i] = curr_bank_snp_req_ready; // Snoop response @@ -348,7 +273,7 @@ module VX_cache #( .WRITE_ENABLE (WRITE_ENABLE), .CORE_TAG_WIDTH (CORE_TAG_WIDTH), 
.CORE_TAG_ID_BITS (CORE_TAG_ID_BITS), - .SNP_REQ_TAG_WIDTH (SNP_REQ_TAG_WIDTH) + .SNP_TAG_WIDTH (SNP_TAG_WIDTH) ) bank ( `SCOPE_BIND_VX_cache_bank(i) @@ -459,9 +384,9 @@ module VX_cache #( if (FLUSH_ENABLE) begin VX_snp_rsp_arb #( - .NUM_BANKS (NUM_BANKS), - .BANK_LINE_SIZE (BANK_LINE_SIZE), - .SNP_REQ_TAG_WIDTH (SNP_REQ_TAG_WIDTH) + .NUM_BANKS (NUM_BANKS), + .BANK_LINE_SIZE (BANK_LINE_SIZE), + .SNP_TAG_WIDTH (SNP_TAG_WIDTH) ) snp_rsp_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_cache_config.vh b/hw/rtl/cache/VX_cache_config.vh index 050c0b323..85c57d3f0 100644 --- a/hw/rtl/cache/VX_cache_config.vh +++ b/hw/rtl/cache/VX_cache_config.vh @@ -7,7 +7,7 @@ `include "VX_define.vh" `endif -`define REQ_TAG_WIDTH `MAX(CORE_TAG_WIDTH, SNP_REQ_TAG_WIDTH) +`define REQ_TAG_WIDTH `MAX(CORE_TAG_WIDTH, SNP_TAG_WIDTH) `define REQS_BITS `LOG2UP(NUM_REQUESTS) diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index e0124c15b..65a601d16 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -11,27 +11,43 @@ module VX_cache_core_req_bank_sel #( parameter NUM_REQUESTS = 1 ) ( input wire [NUM_REQUESTS-1:0] core_req_valid, -`IGNORE_WARNINGS_BEGIN - input wire [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, -`IGNORE_WARNINGS_END - input wire [NUM_BANKS-1:0] per_bank_ready, + input wire [NUM_REQUESTS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, + output wire core_req_ready, + output wire [NUM_BANKS-1:0][NUM_REQUESTS-1:0] per_bank_valid, - output wire core_req_ready + input wire [NUM_BANKS-1:0] per_bank_ready ); if (NUM_BANKS > 1) begin - reg [NUM_BANKS-1:0][NUM_REQUESTS-1:0] per_bank_valid_r; - reg [NUM_BANKS-1:0] per_bank_ready_sel; + reg [NUM_BANKS-1:0][NUM_REQUESTS-1:0] per_bank_valid_r; + reg [NUM_BANKS-1:0] per_bank_ready_ignore; + reg [NUM_BANKS-1:0] per_bank_ready_other; + always @(*) begin - per_bank_valid_r = 0; - per_bank_ready_sel = {NUM_BANKS{1'b1}}; + 
per_bank_valid_r = 0; + per_bank_ready_other = {NUM_BANKS{1'b1}}; + per_bank_ready_ignore = {NUM_BANKS{1'b1}}; + + for (integer i = 0; i < NUM_BANKS; i++) begin + for (integer j = 0; j < NUM_BANKS; j++) begin + if (i != j) + per_bank_ready_other[i] &= (per_bank_ready[j] | per_bank_ready_ignore[j]); + end + end + for (integer i = 0; i < NUM_REQUESTS; i++) begin per_bank_valid_r[core_req_addr[i][`BANK_SELECT_ADDR_RNG]][i] = core_req_valid[i]; - per_bank_ready_sel[core_req_addr[i][`BANK_SELECT_ADDR_RNG]] = 0; + per_bank_ready_ignore[core_req_addr[i][`BANK_SELECT_ADDR_RNG]] = 1'b0; end end - assign per_bank_valid = per_bank_valid_r; - assign core_req_ready = & (per_bank_ready | per_bank_ready_sel); + + for (genvar i = 0; i < NUM_BANKS; i++) begin + for (genvar j = 0; j < NUM_REQUESTS; j++) begin + assign per_bank_valid[i][j] = per_bank_valid_r[i][j] & per_bank_ready_other[i]; + end + end + assign core_req_ready = & (per_bank_ready | per_bank_ready_ignore); end else begin + `UNUSED_VAR (core_req_addr) assign per_bank_valid = core_req_valid; assign core_req_ready = per_bank_ready; end diff --git a/hw/rtl/cache/VX_cache_miss_resrv.v b/hw/rtl/cache/VX_cache_miss_resrv.v index 87796ff60..023fcad68 100644 --- a/hw/rtl/cache/VX_cache_miss_resrv.v +++ b/hw/rtl/cache/VX_cache_miss_resrv.v @@ -17,7 +17,7 @@ module VX_cache_miss_resrv #( // core request tag size parameter CORE_TAG_WIDTH = 1, // Snooping request tag width - parameter SNP_REQ_TAG_WIDTH = 1, + parameter SNP_TAG_WIDTH = 1, // size of tag id in core request tag parameter CORE_TAG_ID_BITS = 0 ) ( diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index ed9845d8a..877480ea6 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -1,33 +1,33 @@ `include "VX_cache_config.vh" module VX_snp_forwarder #( - parameter CACHE_ID = 0, - parameter BANK_LINE_SIZE = 1, - parameter NUM_REQUESTS = 1, - parameter SNRQ_SIZE = 1, - parameter SNP_REQ_TAG_WIDTH = 1, - parameter 
SNP_FWD_TAG_WIDTH = 1 + parameter CACHE_ID = 0, + parameter SRC_ADDR_WIDTH = 1, + parameter DST_ADDR_WIDTH = 1, + parameter NUM_REQUESTS = 1, + parameter SNP_TAG_WIDTH = 1, + parameter SNRQ_SIZE = 1 ) ( input wire clk, input wire reset, // Snoop request input wire snp_req_valid, - input wire [`DRAM_ADDR_WIDTH-1:0] snp_req_addr, + input wire [SRC_ADDR_WIDTH-1:0] snp_req_addr, input wire snp_req_invalidate, - input wire [SNP_REQ_TAG_WIDTH-1:0] snp_req_tag, + input wire [SNP_TAG_WIDTH-1:0] snp_req_tag, output wire snp_req_ready, // Snoop response output wire snp_rsp_valid, - output wire [`DRAM_ADDR_WIDTH-1:0] snp_rsp_addr, + output wire [SRC_ADDR_WIDTH-1:0] snp_rsp_addr, output wire snp_rsp_invalidate, - output wire [SNP_REQ_TAG_WIDTH-1:0] snp_rsp_tag, + output wire [SNP_TAG_WIDTH-1:0] snp_rsp_tag, input wire snp_rsp_ready, // Snoop Forwarding out output wire [NUM_REQUESTS-1:0] snp_fwdout_valid, - output wire [NUM_REQUESTS-1:0][`DRAM_ADDR_WIDTH-1:0] snp_fwdout_addr, + output wire [NUM_REQUESTS-1:0][DST_ADDR_WIDTH-1:0] snp_fwdout_addr, output wire [NUM_REQUESTS-1:0] snp_fwdout_invalidate, output wire [NUM_REQUESTS-1:0][`LOG2UP(SNRQ_SIZE)-1:0] snp_fwdout_tag, input wire [NUM_REQUESTS-1:0] snp_fwdout_ready, @@ -37,30 +37,37 @@ module VX_snp_forwarder #( input wire [NUM_REQUESTS-1:0][`LOG2UP(SNRQ_SIZE)-1:0] snp_fwdin_tag, output wire [NUM_REQUESTS-1:0] snp_fwdin_ready ); + localparam ADDR_DIFF = DST_ADDR_WIDTH - SRC_ADDR_WIDTH; + localparam NUM_REQUESTS_QUAL = NUM_REQUESTS * (1 << ADDR_DIFF); + localparam REQ_QUAL_BITS = `LOG2UP(NUM_REQUESTS_QUAL); + `STATIC_ASSERT(NUM_REQUESTS > 1, ("invalid value")) - reg [`REQS_BITS:0] pending_cntrs [SNRQ_SIZE-1:0]; + reg [REQ_QUAL_BITS:0] pending_cntrs [SNRQ_SIZE-1:0]; wire [`LOG2UP(SNRQ_SIZE)-1:0] sfq_write_addr, sfq_read_addr; wire sfq_acquire, sfq_release, sfq_full; + + wire [`LOG2UP(SNRQ_SIZE)-1:0] fwdout_tag; + reg [NUM_REQUESTS-1:0] snp_fwdout_ready_other; + wire fwdout_ready; - wire fwdin_valid; wire [`LOG2UP(SNRQ_SIZE)-1:0] 
fwdin_tag; + wire fwdin_valid; wire fwdin_ready = snp_rsp_ready || (1 != pending_cntrs[sfq_read_addr]); wire fwdin_fire = fwdin_valid && fwdin_ready; - wire fwdout_ready = (& snp_fwdout_ready); - - assign snp_rsp_valid = fwdin_valid && (1 == pending_cntrs[sfq_read_addr]); // send response + assign snp_rsp_valid = fwdin_valid && (1 == pending_cntrs[sfq_read_addr]); assign sfq_read_addr = fwdin_tag; - assign sfq_acquire = snp_req_valid && !sfq_full && fwdout_ready; assign sfq_release = snp_rsp_valid && snp_rsp_ready; + wire snp_req_ready_unqual = !sfq_full && fwdout_ready; + VX_cam_buffer #( - .DATAW (`DRAM_ADDR_WIDTH + 1 + SNP_REQ_TAG_WIDTH), + .DATAW (SRC_ADDR_WIDTH + 1 + SNP_TAG_WIDTH), .SIZE (SNRQ_SIZE) ) snp_fwd_cam ( .clk (clk), @@ -75,9 +82,54 @@ module VX_snp_forwarder #( .full (sfq_full) ); + wire [DST_ADDR_WIDTH-1:0] snp_req_addr_qual; + wire dispatch_ready; + + if (ADDR_DIFF != 0) begin + reg [`LOG2UP(SNRQ_SIZE)-1:0] fwdout_tag_r; + reg [DST_ADDR_WIDTH-1:0] snp_req_addr_r; + reg dispatch_ready_r; + reg use_cter_r; + + always @(posedge clk) begin + if (reset) begin + dispatch_ready_r <= 0; + use_cter_r <= 0; + end else begin + if (snp_req_valid && snp_req_ready_unqual) begin + if (snp_req_addr_r[ADDR_DIFF-1:0] == ((1 << ADDR_DIFF)-2)) begin + dispatch_ready_r <= 1; + end + if (snp_req_addr_r[ADDR_DIFF-1:0] == ((1 << ADDR_DIFF)-1)) begin + dispatch_ready_r <= 0; + use_cter_r <= 0; + end else begin + use_cter_r <= 1; + end + end + end + + if (snp_req_valid && snp_req_ready_unqual) begin + snp_req_addr_r <= snp_req_addr_qual + DST_ADDR_WIDTH'(1'b1); + end + if (!use_cter_r) begin + fwdout_tag_r <= sfq_write_addr; + end + end + assign sfq_acquire = snp_req_valid && snp_req_ready_unqual && !use_cter_r; + assign fwdout_tag = use_cter_r ? fwdout_tag_r : sfq_write_addr; + assign snp_req_addr_qual = use_cter_r ? 
snp_req_addr_r : {snp_req_addr, ADDR_DIFF'(0)}; + assign dispatch_ready = dispatch_ready_r; + end else begin + assign sfq_acquire = snp_req_valid && snp_req_ready; + assign fwdout_tag = sfq_write_addr; + assign snp_req_addr_qual = snp_req_addr; + assign dispatch_ready = 1'b1; + end + always @(posedge clk) begin if (sfq_acquire) begin - pending_cntrs[sfq_write_addr] <= NUM_REQUESTS; + pending_cntrs[sfq_write_addr] <= NUM_REQUESTS_QUAL; end if (fwdin_fire) begin pending_cntrs[sfq_read_addr] <= pending_cntrs[sfq_read_addr] - 1; @@ -85,13 +137,25 @@ module VX_snp_forwarder #( end for (genvar i = 0; i < NUM_REQUESTS; i++) begin - assign snp_fwdout_valid[i] = snp_req_valid && snp_req_ready; - assign snp_fwdout_addr[i] = snp_req_addr; + assign snp_fwdout_valid[i] = snp_req_valid && snp_fwdout_ready_other[i] && !sfq_full; + assign snp_fwdout_addr[i] = snp_req_addr_qual; assign snp_fwdout_invalidate[i] = snp_req_invalidate; - assign snp_fwdout_tag[i] = sfq_write_addr; + assign snp_fwdout_tag[i] = fwdout_tag; end - assign snp_req_ready = !sfq_full && fwdout_ready; + always @(*) begin + snp_fwdout_ready_other = {NUM_REQUESTS{1'b1}}; + for (integer i = 0; i < NUM_REQUESTS; i++) begin + for (integer j = 0; j < NUM_REQUESTS; j++) begin + if (i != j) + snp_fwdout_ready_other[i] &= snp_fwdout_ready[j]; + end + end + end + + assign fwdout_ready = (& snp_fwdout_ready); + + assign snp_req_ready = snp_req_ready_unqual && dispatch_ready; if (NUM_REQUESTS > 1) begin wire sel_valid; diff --git a/hw/rtl/cache/VX_snp_rsp_arb.v b/hw/rtl/cache/VX_snp_rsp_arb.v index 81a7a5821..695c142ee 100644 --- a/hw/rtl/cache/VX_snp_rsp_arb.v +++ b/hw/rtl/cache/VX_snp_rsp_arb.v @@ -3,17 +3,17 @@ module VX_snp_rsp_arb #( parameter NUM_BANKS = 1, parameter BANK_LINE_SIZE = 1, - parameter SNP_REQ_TAG_WIDTH = 1 + parameter SNP_TAG_WIDTH = 1 ) ( input wire clk, input wire reset, input wire [NUM_BANKS-1:0] per_bank_snp_rsp_valid, - input wire [NUM_BANKS-1:0][SNP_REQ_TAG_WIDTH-1:0] per_bank_snp_rsp_tag, + input 
wire [NUM_BANKS-1:0][SNP_TAG_WIDTH-1:0] per_bank_snp_rsp_tag, output wire [NUM_BANKS-1:0] per_bank_snp_rsp_ready, output wire snp_rsp_valid, - output wire [SNP_REQ_TAG_WIDTH-1:0] snp_rsp_tag, + output wire [SNP_TAG_WIDTH-1:0] snp_rsp_tag, input wire snp_rsp_ready ); if (NUM_BANKS > 1) begin @@ -35,7 +35,7 @@ module VX_snp_rsp_arb #( wire stall = ~snp_rsp_ready && snp_rsp_valid; VX_generic_register #( - .N(1 + SNP_REQ_TAG_WIDTH), + .N(1 + SNP_TAG_WIDTH), .PASSTHRU(NUM_BANKS <= 2) ) pipe_reg ( .clk (clk), diff --git a/hw/rtl/interfaces/VX_cache_core_rsp_if.v b/hw/rtl/interfaces/VX_cache_core_rsp_if.v index 5b70f5ee4..abf821492 100644 --- a/hw/rtl/interfaces/VX_cache_core_rsp_if.v +++ b/hw/rtl/interfaces/VX_cache_core_rsp_if.v @@ -11,8 +11,10 @@ interface VX_cache_core_rsp_if #( ) (); wire [NUM_REQUESTS-1:0] valid; + wire [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] data; wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] tag; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cache_dram_req_if.v b/hw/rtl/interfaces/VX_cache_dram_req_if.v index d92b99124..d02c1f571 100644 --- a/hw/rtl/interfaces/VX_cache_dram_req_if.v +++ b/hw/rtl/interfaces/VX_cache_dram_req_if.v @@ -10,11 +10,13 @@ interface VX_cache_dram_req_if #( ) (); wire valid; + wire rw; wire [(DRAM_LINE_WIDTH/8)-1:0] byteen; wire [DRAM_ADDR_WIDTH-1:0] addr; wire [DRAM_LINE_WIDTH-1:0] data; - wire [DRAM_TAG_WIDTH-1:0] tag; + wire [DRAM_TAG_WIDTH-1:0] tag; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cache_dram_rsp_if.v b/hw/rtl/interfaces/VX_cache_dram_rsp_if.v index 9e994d3a8..c7b36ecaf 100644 --- a/hw/rtl/interfaces/VX_cache_dram_rsp_if.v +++ b/hw/rtl/interfaces/VX_cache_dram_rsp_if.v @@ -9,8 +9,10 @@ interface VX_cache_dram_rsp_if #( ) (); wire valid; + wire [DRAM_LINE_WIDTH-1:0] data; wire [DRAM_TAG_WIDTH-1:0] tag; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cache_snp_req_if.v b/hw/rtl/interfaces/VX_cache_snp_req_if.v index 6134b02d2..99fa0cf14 100644 --- 
a/hw/rtl/interfaces/VX_cache_snp_req_if.v +++ b/hw/rtl/interfaces/VX_cache_snp_req_if.v @@ -9,9 +9,11 @@ interface VX_cache_snp_req_if #( ) (); wire valid; + wire [DRAM_ADDR_WIDTH-1:0] addr; wire invalidate; - wire [SNP_TAG_WIDTH-1:0] tag; + wire [SNP_TAG_WIDTH-1:0] tag; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cache_snp_rsp_if.v b/hw/rtl/interfaces/VX_cache_snp_rsp_if.v index a553c48d3..d1b619ce7 100644 --- a/hw/rtl/interfaces/VX_cache_snp_rsp_if.v +++ b/hw/rtl/interfaces/VX_cache_snp_rsp_if.v @@ -8,7 +8,9 @@ interface VX_cache_snp_rsp_if #( ) (); wire valid; + wire [SNP_TAG_WIDTH-1:0] tag; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cmt_to_csr_if.v b/hw/rtl/interfaces/VX_cmt_to_csr_if.v index 563e1b286..a67a3d85d 100644 --- a/hw/rtl/interfaces/VX_cmt_to_csr_if.v +++ b/hw/rtl/interfaces/VX_cmt_to_csr_if.v @@ -5,14 +5,12 @@ interface VX_cmt_to_csr_if (); - wire valid; + wire valid; wire [`NW_BITS-1:0] wid; - wire [$clog2(`NUM_THREADS+1)-1:0] commit_size; - - wire has_fflags; - fflags_t fflags; + wire has_fflags; + fflags_t fflags; endinterface diff --git a/hw/rtl/interfaces/VX_csr_io_req_if.v b/hw/rtl/interfaces/VX_csr_io_req_if.v index a5307a08e..3b4d48060 100644 --- a/hw/rtl/interfaces/VX_csr_io_req_if.v +++ b/hw/rtl/interfaces/VX_csr_io_req_if.v @@ -6,9 +6,11 @@ interface VX_csr_io_req_if (); wire valid; + wire [`CSR_ADDR_BITS-1:0] addr; wire rw; wire [31:0] data; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_csr_io_rsp_if.v b/hw/rtl/interfaces/VX_csr_io_rsp_if.v index 7c4c8f6d7..2183edd75 100644 --- a/hw/rtl/interfaces/VX_csr_io_rsp_if.v +++ b/hw/rtl/interfaces/VX_csr_io_rsp_if.v @@ -6,7 +6,9 @@ interface VX_csr_io_rsp_if (); wire valid; + wire [31:0] data; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index b253ecbe5..692e2073d 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -10,18 +10,15 @@ interface 
VX_decode_if (); wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; - wire [`EX_BITS-1:0] ex_type; wire [`OP_BITS-1:0] op_type; wire [`MOD_BITS-1:0] op_mod; wire wb; - wire [`NR_BITS-1:0] rd; wire [`NR_BITS-1:0] rs1; wire [`NR_BITS-1:0] rs2; wire [`NR_BITS-1:0] rs3; - wire [31:0] imm; - + wire [31:0] imm; wire rs1_is_PC; wire rs2_is_imm; wire use_rs3; diff --git a/hw/rtl/interfaces/VX_exu_to_cmt_if.v b/hw/rtl/interfaces/VX_exu_to_cmt_if.v index 86763e2ac..abb8b4875 100644 --- a/hw/rtl/interfaces/VX_exu_to_cmt_if.v +++ b/hw/rtl/interfaces/VX_exu_to_cmt_if.v @@ -5,13 +5,15 @@ interface VX_exu_to_cmt_if (); - wire valid; + wire valid; + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; wire [`NUM_THREADS-1:0][31:0] data; wire [`NR_BITS-1:0] rd; wire wb; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_fpu_to_cmt_if.v b/hw/rtl/interfaces/VX_fpu_to_cmt_if.v index 066949d1e..34e269d51 100644 --- a/hw/rtl/interfaces/VX_fpu_to_cmt_if.v +++ b/hw/rtl/interfaces/VX_fpu_to_cmt_if.v @@ -5,7 +5,8 @@ interface VX_fpu_to_cmt_if (); - wire valid; + wire valid; + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -14,6 +15,7 @@ interface VX_fpu_to_cmt_if (); wire wb; wire has_fflags; fflags_t [`NUM_THREADS-1:0] fflags; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_fpu_to_csr_if.v b/hw/rtl/interfaces/VX_fpu_to_csr_if.v index 1dfae0ac4..d3bff987c 100644 --- a/hw/rtl/interfaces/VX_fpu_to_csr_if.v +++ b/hw/rtl/interfaces/VX_fpu_to_csr_if.v @@ -9,15 +9,13 @@ interface VX_fpu_to_csr_if (); - wire valid; - + wire valid; wire [`NW_BITS-1:0] wid; - - wire fflags_NV; - wire fflags_DZ; - wire fflags_OF; - wire fflags_UF; - wire fflags_NX; + wire fflags_NV; + wire fflags_DZ; + wire fflags_OF; + wire fflags_UF; + wire fflags_NX; endinterface diff --git a/hw/rtl/interfaces/VX_gpr_rsp_if.v b/hw/rtl/interfaces/VX_gpr_rsp_if.v index 4b953e067..0c86bef8a 100644 --- a/hw/rtl/interfaces/VX_gpr_rsp_if.v +++ 
b/hw/rtl/interfaces/VX_gpr_rsp_if.v @@ -9,7 +9,6 @@ interface VX_gpr_rsp_if (); wire [`NW_BITS-1:0] wid; wire [31:0] PC; `IGNORE_WARNINGS_END - wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NUM_THREADS-1:0][31:0] rs3_data; diff --git a/hw/rtl/interfaces/VX_ifetch_req_if.v b/hw/rtl/interfaces/VX_ifetch_req_if.v index c4f34bcbe..923209a25 100644 --- a/hw/rtl/interfaces/VX_ifetch_req_if.v +++ b/hw/rtl/interfaces/VX_ifetch_req_if.v @@ -6,9 +6,11 @@ interface VX_ifetch_req_if (); wire valid; + wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_ifetch_rsp_if.v b/hw/rtl/interfaces/VX_ifetch_rsp_if.v index 7a8c14fd0..8f6c38aec 100644 --- a/hw/rtl/interfaces/VX_ifetch_rsp_if.v +++ b/hw/rtl/interfaces/VX_ifetch_rsp_if.v @@ -5,11 +5,13 @@ interface VX_ifetch_rsp_if (); - wire valid; + wire valid; + wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; wire [31:0] instr; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_lsu_req_if.v b/hw/rtl/interfaces/VX_lsu_req_if.v index bff05720b..9527cc042 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.v +++ b/hw/rtl/interfaces/VX_lsu_req_if.v @@ -10,14 +10,11 @@ interface VX_lsu_req_if (); wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; - wire rw; wire [`BYTEEN_BITS-1:0] byteen; - wire [`NUM_THREADS-1:0][31:0] store_data; wire [`NUM_THREADS-1:0][31:0] base_addr; - wire [31:0] offset; - + wire [31:0] offset; wire [`NR_BITS-1:0] rd; wire wb; diff --git a/hw/rtl/interfaces/VX_warp_ctl_if.v b/hw/rtl/interfaces/VX_warp_ctl_if.v index c7eb8a4b5..2a53c7140 100644 --- a/hw/rtl/interfaces/VX_warp_ctl_if.v +++ b/hw/rtl/interfaces/VX_warp_ctl_if.v @@ -5,13 +5,12 @@ interface VX_warp_ctl_if (); - wire valid; + wire valid; wire [`NW_BITS-1:0] wid; - - gpu_tmc_t tmc; - gpu_wspawn_t wspawn; - gpu_barrier_t barrier; - gpu_split_t split; + gpu_tmc_t tmc; + gpu_wspawn_t wspawn; + 
gpu_barrier_t barrier; + gpu_split_t split; endinterface diff --git a/hw/rtl/interfaces/VX_writeback_if.v b/hw/rtl/interfaces/VX_writeback_if.v index 5cb412bed..325b6bdcb 100644 --- a/hw/rtl/interfaces/VX_writeback_if.v +++ b/hw/rtl/interfaces/VX_writeback_if.v @@ -6,13 +6,12 @@ interface VX_writeback_if (); wire valid; + wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; - `IGNORE_WARNINGS_BEGIN wire [31:0] PC; `IGNORE_WARNINGS_END - wire [`NR_BITS-1:0] rd; wire [`NUM_THREADS-1:0][31:0] data; diff --git a/hw/rtl/interfaces/VX_wstall_if.v b/hw/rtl/interfaces/VX_wstall_if.v index 2e9840851..36260870a 100644 --- a/hw/rtl/interfaces/VX_wstall_if.v +++ b/hw/rtl/interfaces/VX_wstall_if.v @@ -5,7 +5,7 @@ interface VX_wstall_if(); - wire valid; + wire valid; wire [`NW_BITS-1:0] wid; endinterface diff --git a/hw/rtl/libs/VX_cam_buffer.v b/hw/rtl/libs/VX_cam_buffer.v index f3ca5cfa5..e93612d80 100644 --- a/hw/rtl/libs/VX_cam_buffer.v +++ b/hw/rtl/libs/VX_cam_buffer.v @@ -54,7 +54,7 @@ module VX_cam_buffer #( end else begin for (integer i = 0; i < CPORTS; i++) begin if (release_slot[i]) begin - assert(0 == free_slots[release_addr[i]]) else $display("%t: freed slot at port %d", $time, release_addr[i]); + assert(0 == free_slots[release_addr[i]]) else $error("%t: releasing invalid slot at port %d", $time, release_addr[i]); end end free_slots <= free_slots_n; @@ -63,7 +63,7 @@ module VX_cam_buffer #( end if (acquire_slot) begin - assert(1 == free_slots[write_addr]) else $display("%t: inused slot at port %d", $time, write_addr); + assert(1 == free_slots[write_addr]) else $error("%t: acquiring used slot at port %d", $time, write_addr); entries[write_addr] <= write_data; end end diff --git a/hw/scripts/gen_config.py b/hw/scripts/gen_config.py index 05ecb8bb6..9f33c01ff 100755 --- a/hw/scripts/gen_config.py +++ b/hw/scripts/gen_config.py @@ -57,34 +57,24 @@ if args.outc != 'none': print('\n#endif', file=f) translation_rules = [ - (re.compile(r'^$'), r''), - 
(re.compile(r'^(\s*)`ifndef\s+([^ ]+)'), r'\1#ifndef \2'), - (re.compile(r'^(\s*)`define\s+([^ ]+)'), r'\1#define \2'), - (re.compile(r'^(\s*)`include "VX_user_config\.vh"'), r''), - (re.compile(r'^(\s*)`define\s+([^ ]+) (.+)'), r'\1#define \2 \3'), - (re.compile(r'^(\s*)`endif\s+'), r'\1#endif'), - (re.compile(r'^(\s*)//(.*)'), r'\1// \2'), -] + # preprocessor directives + (re.compile(r'^\s*`include .*$'), r''), + (re.compile(r'`ifdef'), r'#ifdef'), + (re.compile(r'`ifndef'), r'#ifndef'), + (re.compile(r'`elif'), r'#elif'), + (re.compile(r'`else'), r'#else'), + (re.compile(r'`define'), r'#define'), + (re.compile(r'`endif'), r'#endif'), -post_rules = [ - (re.compile(r"\d+'d(\d+)"), r'\1'), - - # non-standard C but supported by GCC and Clang - (re.compile(r"\d+'b([01]+)"), r'0b\1'), - (re.compile(r"\d+'h([\da-fA-F]+)"), r'0x\1'), - - # fix macro references (does not support escaped identifiers §5.6.1) + # macro expansion (re.compile(r"`([A-Za-z_][$_0-9A-Za-z]*)"), r'\1'), + + # literals + (re.compile(r"\d+'d(\d+)"), r'\1'), + (re.compile(r"\d+'b([01]+)"), r'0b\1'), + (re.compile(r"\d+'h([\da-fA-F]+)"), r'0x\1') ] -def post_process_line(line): - for pat, repl in post_rules: - line = pat.sub(repl, line) - return line - - -in_expansion = False - if args.outc != 'none': with open(args.outc, 'a') as f: print(''' @@ -96,36 +86,14 @@ if args.outc != 'none': with open(path.join(script_dir, '../rtl/VX_config.vh'), 'r') as r: lineno = 0 for line in r: - if in_expansion: - f.write(post_process_line(line)) - if not line.strip().endswith('\\'): - in_expansion = False - else: - for pat, repl in translation_rules: - if pat.match(line): - if line.strip().endswith('\\'): - in_expansion = True - f.write(post_process_line(pat.sub(repl, line))) - break - else: - raise ValueError('failed to find rule for: "' + line + '" (' + str(lineno) + ')') + for pat, repl in translation_rules: + match = pat.search(line) + if match: + line = re.sub(pat, repl, line) + #print("*** match @" +
str(lineno) + ": " + match.group() + " => " + line) + f.write(line) lineno = lineno + 1 - print(''' -// Misc - -#define THREADS_PER_WARP NUM_THREADS -#define WARPS_PER_CORE NUM_WARPS -#define NUM_WI (NUM_WARPS * NUM_THREADS * NUM_CORES_PER_CLUSTER * NUM_CLUSTERS) - -// legacy -#define TOTAL_THREADS NUM_WI -#define TOTAL_WARPS (NUM_WARPS * NUM_CORES_PER_CLUSTER * NUM_CLUSTERS) - -// COLORS -#define GREEN "\\033[32m" -#define RED "\\033[31m" -#define DEFAULT "\\033[39m" '''[1:], file=f) diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index 841a56a26..d88bef751 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -1,19 +1,19 @@ -SINGLECORE += -DNUM_CLUSTERS=1 -DNUM_CORES=1 +SINGLECORE += -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 #MULTICORE ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 #MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 # control RTL debug print states +DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_SNP DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSRQ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA -DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM -DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE +DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_SNP +DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_PRINT_FLAGS += -DDBG_PRINT_AVS DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 895ea1da2..206e0eff3 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -57,11 +57,7 @@ void Simulator::attach_ram(RAM* ram) { dram_rsp_vec_.clear(); } -void Simulator::reset() { -#ifndef NDEBUG - std::cout << timestamp << ": [sim] reset()" << std::endl; -#endif - +void Simulator::reset() { print_bufs_.clear(); 
dram_rsp_vec_.clear(); @@ -96,15 +92,25 @@ void Simulator::reset() { } void Simulator::step() { + vortex_->clk = 0; this->eval(); + + dram_rsp_ready_ = vortex_->dram_rsp_ready; + snp_req_ready_ = vortex_->snp_req_ready; + csr_io_req_ready_ = vortex_->csr_io_req_ready; + vortex_->clk = 1; this->eval(); - + this->eval_dram_bus(); this->eval_io_bus(); this->eval_csr_bus(); this->eval_snp_bus(); + +#ifndef NDEBUG + fflush(stdout); +#endif } void Simulator::eval() { @@ -134,8 +140,7 @@ void Simulator::eval_dram_bus() { // send DRAM response if (dram_rsp_active_ - && vortex_->dram_rsp_valid - && vortex_->dram_rsp_ready) { + && vortex_->dram_rsp_valid && dram_rsp_ready_) { dram_rsp_active_ = false; } if (!dram_rsp_active_) { @@ -183,7 +188,7 @@ void Simulator::eval_dram_bus() { } } - vortex_->dram_req_ready = ~dram_stalled; + vortex_->dram_req_ready = !dram_stalled; } void Simulator::eval_io_bus() { @@ -207,31 +212,32 @@ void Simulator::eval_io_bus() { } void Simulator::eval_snp_bus() { - if (snp_req_active_) { - if (vortex_->snp_rsp_valid) { - assert(pending_snp_reqs_ > 0); - --pending_snp_reqs_; + if (snp_req_active_) { + if (vortex_->snp_req_valid && snp_req_ready_) { + assert(snp_req_size_); #ifdef DBG_PRINT_CACHE_SNP - std::cout << timestamp << ": [sim] snp rsp: tag=" << vortex_->snp_rsp_tag << " pending=" << pending_snp_reqs_ << std::endl; + std::cout << std::dec << timestamp << ": [sim] SNP Req: addr=" << std::hex << vortex_->snp_req_addr << " tag=" << vortex_->snp_req_tag << " remain=" << (snp_req_size_-1) << std::endl; #endif - } - if (vortex_->snp_req_valid && vortex_->snp_req_ready) { - if (snp_req_size_ != 0) { - vortex_->snp_req_addr += 1; - vortex_->snp_req_tag += 1; - --snp_req_size_; - ++pending_snp_reqs_; - #ifdef DBG_PRINT_CACHE_SNP - std::cout << timestamp << ": [sim] snp req: addr=" << std::hex << vortex_->snp_req_addr << " tag=" << vortex_->snp_req_tag << " remain=" << snp_req_size_ << std::endl; - #endif - } else { - vortex_->snp_req_valid = 0; + 
++vortex_->snp_req_addr; + ++vortex_->snp_req_tag; + ++pending_snp_reqs_; + --snp_req_size_; + if (0 == snp_req_size_) { + vortex_->snp_req_valid = false; } } - if (!vortex_->snp_req_valid - && 0 == pending_snp_reqs_) { - snp_req_active_ = false; - } + + if (vortex_->snp_rsp_valid && vortex_->snp_rsp_ready) { + assert(pending_snp_reqs_ > 0); + --pending_snp_reqs_; + if (!vortex_->snp_req_valid && 0 == pending_snp_reqs_) { + vortex_->snp_rsp_ready = false; + snp_req_active_ = false; + } + #ifdef DBG_PRINT_CACHE_SNP + std::cout << std::dec << timestamp << ": [sim] SNP Rsp: tag=" << std::hex << vortex_->snp_rsp_tag << " pending=" << pending_snp_reqs_ << std::endl; + #endif + } } else { vortex_->snp_req_valid = 0; vortex_->snp_rsp_ready = 0; @@ -240,18 +246,24 @@ void Simulator::eval_snp_bus() { void Simulator::eval_csr_bus() { if (csr_req_active_) { - if (vortex_->csr_io_req_rw) { - if (vortex_->csr_io_req_ready) { - vortex_->snp_req_valid = 0; - csr_req_active_ = false; - } - } else { - if (vortex_->csr_io_rsp_valid) { - *csr_rsp_value_ = vortex_->csr_io_rsp_data; - vortex_->snp_req_valid = 0; - vortex_->csr_io_rsp_ready = 0; - csr_req_active_ = false; - } + if (vortex_->csr_io_req_valid && csr_io_req_ready_) { + #ifndef NDEBUG + if (vortex_->csr_io_req_rw) + std::cout << std::dec << timestamp << ": [sim] CSR Wr Req: core=" << (int)vortex_->csr_io_req_coreid << ", addr=" << std::hex << vortex_->csr_io_req_addr << ", value=" << vortex_->csr_io_req_data << std::endl; + else + std::cout << std::dec << timestamp << ": [sim] CSR Rd Req: core=" << (int)vortex_->csr_io_req_coreid << ", addr=" << std::hex << vortex_->csr_io_req_addr << std::endl; + #endif + vortex_->csr_io_req_valid = 0; + if (vortex_->csr_io_req_rw) + csr_req_active_ = false; + } + if (vortex_->csr_io_rsp_valid && vortex_->csr_io_rsp_ready) { + *csr_rsp_value_ = vortex_->csr_io_rsp_data; + vortex_->csr_io_rsp_ready = 0; + csr_req_active_ = false; + #ifndef NDEBUG + std::cout << std::dec << timestamp << ": 
[sim] CSR Rsp: value=" << vortex_->csr_io_rsp_data << std::endl; + #endif } } else { vortex_->csr_io_req_valid = 0; @@ -278,33 +290,23 @@ bool Simulator::csr_req_active() const { } void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) { -#ifndef NDEBUG - std::cout << timestamp << ": [sim] flush_caches()" << std::endl; -#endif if (0 == size) return; + assert(!vortex_->snp_rsp_valid); + vortex_->snp_req_addr = mem_addr / GLOBAL_BLOCK_SIZE; vortex_->snp_req_tag = 0; vortex_->snp_req_valid = 1; vortex_->snp_rsp_ready = 1; - snp_req_size_ = (size + GLOBAL_BLOCK_SIZE - 1) / GLOBAL_BLOCK_SIZE; - --snp_req_size_; - pending_snp_reqs_ = 1; + snp_req_size_ = (size + GLOBAL_BLOCK_SIZE - 1) / GLOBAL_BLOCK_SIZE; + pending_snp_reqs_ = 0; snp_req_active_ = true; - - #ifdef DBG_PRINT_CACHE_SNP - std::cout << timestamp << ": [sim] snp req: addr=" << std::hex << vortex_->snp_req_addr << " tag=" << vortex_->snp_req_tag << " remain=" << snp_req_size_ << std::endl; - #endif } void Simulator::set_csr(int core_id, int addr, unsigned value) { -#ifndef NDEBUG - std::cout << timestamp << ": [sim] set_csr()" << std::endl; -#endif - vortex_->csr_io_req_valid = 1; vortex_->csr_io_req_coreid = core_id; vortex_->csr_io_req_addr = addr; @@ -316,10 +318,6 @@ void Simulator::set_csr(int core_id, int addr, unsigned value) { } void Simulator::get_csr(int core_id, int addr, unsigned *value) { -#ifndef NDEBUG - std::cout << timestamp << ": [sim] get_csr()" << std::endl; -#endif - vortex_->csr_io_req_valid = 1; vortex_->csr_io_req_coreid = core_id; vortex_->csr_io_req_addr = addr; @@ -327,12 +325,13 @@ void Simulator::get_csr(int core_id, int addr, unsigned *value) { vortex_->csr_io_rsp_ready = 1; csr_rsp_value_ = value; + csr_req_active_ = true; } void Simulator::run() { #ifndef NDEBUG - std::cout << timestamp << ": [sim] run()" << std::endl; + std::cout << std::dec << timestamp << ": [sim] run()" << std::endl; #endif // execute program diff --git a/hw/simulate/simulator.h 
b/hw/simulate/simulator.h index ba9fee7ce..d688ef885 100644 --- a/hw/simulate/simulator.h +++ b/hw/simulate/simulator.h @@ -65,7 +65,11 @@ private: std::list dram_rsp_vec_; bool dram_rsp_active_; - + + bool dram_rsp_ready_; + bool snp_req_ready_; + bool csr_io_req_ready_; + bool snp_req_active_; bool csr_req_active_; diff --git a/hw/simulate/testbench.cpp b/hw/simulate/testbench.cpp index ec990759d..45f787984 100644 --- a/hw/simulate/testbench.cpp +++ b/hw/simulate/testbench.cpp @@ -3,6 +3,10 @@ #include #include +#define GREEN "\\033[32m" +#define RED "\\033[31m" +#define DEFAULT "\\033[39m" + #define ALL_TESTS int main(int argc, char **argv) {