continuous integration updates, cache bank incoming_fill optimization

This commit is contained in:
Blaise Tine 2020-11-15 23:01:24 -08:00
parent 2904f6441d
commit 1bc4b8e7a8
3 changed files with 135 additions and 51 deletions

View file

@ -1,7 +1,7 @@
language: cpp
dist: bionic
os: linux
compiler: clang
compiler: gcc
addons:
apt:
sources:
@ -20,12 +20,19 @@ install:
- export PATH=$VERILATOR_ROOT/bin:$PATH
script:
- make -j > /dev/null 2>&1
- make -j
- ci/test_runtime.sh
- ci/test_driver.sh
- ci/test_riscv_isa.sh
- ci/test_opencl.sh
- ci/blackbox.sh
- ci/blackbox.sh -run_1c
- ci/blackbox.sh -run_2c
- ci/blackbox.sh -run_4c
- ci/blackbox.sh -run_4c_l2
- ci/blackbox.sh -run_8c_2l2
- ci/blackbox.sh -run_16c_4l2_l3
- ci/blackbox.sh -run_debug
- ci/blackbox.sh -run_scope
after_success:
# Gather code coverage

View file

@ -1,41 +1,112 @@
#!/bin/sh
# test single core
make -C driver/opae/vlsim clean
CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim > /dev/null 2>&1
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
run_1c()
{
    # Single-core test: clean and rebuild driver/opae/vlsim with
    # NUM_CLUSTERS=1 / NUM_CORES=1, then run the dogfood and sgemm
    # run-vlsim targets.
    # Returns the status of the first failing make step (0 if all pass),
    # so a broken build is not followed by further test runs.
    make -C driver/opae/vlsim clean || return $?
    CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim || return $?
    make -C driver/tests/dogfood run-vlsim || return $?
    make -C benchmarks/opencl/sgemm run-vlsim
}
# test 2 cores
make -C driver/opae/vlsim clean
CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0" make -C driver/opae/vlsim > /dev/null 2>&1
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
run_2c()
{
    # Two-core test: clean and rebuild driver/opae/vlsim with
    # NUM_CLUSTERS=1 / NUM_CORES=2 / L2_ENABLE=0, then run the dogfood and
    # sgemm run-vlsim targets.
    # Returns the status of the first failing make step (0 if all pass),
    # so a broken build is not followed by further test runs.
    make -C driver/opae/vlsim clean || return $?
    CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0" make -C driver/opae/vlsim || return $?
    make -C driver/tests/dogfood run-vlsim || return $?
    make -C benchmarks/opencl/sgemm run-vlsim
}
# test 4 cores with L2
make -C driver/opae/vlsim clean
CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1" make -C driver/opae/vlsim > /dev/null 2>&1
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
run_4c()
{
    # Four-core test without L2: clean and rebuild driver/opae/vlsim with
    # NUM_CLUSTERS=1 / NUM_CORES=4 / L2_ENABLE=0, then run the dogfood and
    # sgemm run-vlsim targets.
    # Returns the status of the first failing make step (0 if all pass),
    # so a broken build is not followed by further test runs.
    make -C driver/opae/vlsim clean || return $?
    CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=0" make -C driver/opae/vlsim || return $?
    make -C driver/tests/dogfood run-vlsim || return $?
    make -C benchmarks/opencl/sgemm run-vlsim
}
# test 8 cores with L2
make -C driver/opae/vlsim clean
CONFIGS="-DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1" make -C driver/opae/vlsim > /dev/null 2>&1
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
run_4c_l2()
{
    # Four-core test with L2: clean and rebuild driver/opae/vlsim with
    # NUM_CLUSTERS=1 / NUM_CORES=4 / L2_ENABLE=1, then run the dogfood and
    # sgemm run-vlsim targets.
    # Returns the status of the first failing make step (0 if all pass),
    # so a broken build is not followed by further test runs.
    make -C driver/opae/vlsim clean || return $?
    CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1" make -C driver/opae/vlsim || return $?
    make -C driver/tests/dogfood run-vlsim || return $?
    make -C benchmarks/opencl/sgemm run-vlsim
}
# test 16 cores with L2 and L3
make -C driver/opae/vlsim clean
CONFIGS="-DNUM_CLUSTERS=4 -DNUM_CORES=4 -DL2_ENABLE=1 -DL3_ENABLE=1" make -C driver/opae/vlsim > /dev/null 2>&1
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
run_8c_2l2()
{
    # Eight-core test with 2xL2: clean and rebuild driver/opae/vlsim with
    # NUM_CLUSTERS=2 / NUM_CORES=4 / L2_ENABLE=1, then run the dogfood and
    # sgemm run-vlsim targets.
    # Returns the status of the first failing make step (0 if all pass),
    # so a broken build is not followed by further test runs.
    make -C driver/opae/vlsim clean || return $?
    CONFIGS="-DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1" make -C driver/opae/vlsim || return $?
    make -C driver/tests/dogfood run-vlsim || return $?
    make -C benchmarks/opencl/sgemm run-vlsim
}
# test debug build
make -C driver/opae/vlsim clean
DEBUG=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim > /dev/null 2>&1
make -C driver/tests/demo run-vlsim
run_16c_4l2_l3()
{
    # Sixteen-core test with L2 and L3: clean and rebuild driver/opae/vlsim
    # with NUM_CLUSTERS=4 / NUM_CORES=4 / L2_ENABLE=1 / L3_ENABLE=1, then run
    # the dogfood and sgemm run-vlsim targets.
    # Returns the status of the first failing make step (0 if all pass),
    # so a broken build is not followed by further test runs.
    make -C driver/opae/vlsim clean || return $?
    CONFIGS="-DNUM_CLUSTERS=4 -DNUM_CORES=4 -DL2_ENABLE=1 -DL3_ENABLE=1" make -C driver/opae/vlsim || return $?
    make -C driver/tests/dogfood run-vlsim || return $?
    make -C benchmarks/opencl/sgemm run-vlsim
}
# test build with scope analyzer
make -C driver/opae clean
SCOPE=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae > /dev/null 2>&1
make -C driver/tests/demo run-vlsim
run_debug()
{
    # Debug-build test: clean and rebuild driver/opae/vlsim with DEBUG=1 and
    # NUM_CLUSTERS=1 / NUM_CORES=1, then run the demo run-vlsim target.
    # Returns the status of the first failing make step (0 if all pass),
    # so a broken build is not followed by a test run.
    make -C driver/opae/vlsim clean || return $?
    DEBUG=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim || return $?
    make -C driver/tests/demo run-vlsim
}
run_scope()
{
    # Scope-analyzer build test: clean and rebuild driver/opae with SCOPE=1
    # and NUM_CLUSTERS=1 / NUM_CORES=1, then run the demo run-vlsim target.
    # Returns the status of the first failing make step (0 if all pass),
    # so a broken build is not followed by a test run.
    make -C driver/opae clean || return $?
    SCOPE=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae || return $?
    make -C driver/tests/demo run-vlsim
}
usage()
{
    # Print the one-line synopsis of the supported command-line options
    # to standard output.
    printf '%s\n' "usage: blackbox [[-run_1c] [-run_2c] [-run_4c] [-run_4c_l2] [-run_8c_2l2] [-run_16c_4l2_l3] [-run_debug] [-run_scope] [-all] [-h|--help]]"
}
# Process the command-line options left to right, running the test
# configuration selected by each one.
# If a run function reports a non-zero status the script exits with that
# status immediately; otherwise the trailing `shift` would make the script
# exit 0 and the CI job could never observe a failure.
while [ "$1" != "" ]; do
    case $1 in
        -run_1c )           run_1c || exit $?
                            ;;
        -run_2c )           run_2c || exit $?
                            ;;
        -run_4c )           run_4c || exit $?
                            ;;
        -run_4c_l2 )        run_4c_l2 || exit $?
                            ;;
        -run_8c_2l2 )       run_8c_2l2 || exit $?
                            ;;
        -run_16c_4l2_l3 )   run_16c_4l2_l3 || exit $?
                            ;;
        -run_debug )        run_debug || exit $?
                            ;;
        -run_scope )        run_scope || exit $?
                            ;;
        -all )              # run every configuration, stopping at the first failure
                            run_1c || exit $?
                            run_2c || exit $?
                            run_4c || exit $?
                            run_4c_l2 || exit $?
                            run_8c_2l2 || exit $?
                            run_16c_4l2_l3 || exit $?
                            run_debug || exit $?
                            run_scope || exit $?
                            ;;
        -h | --help )       usage
                            exit
                            ;;
        * )                 # unrecognized option: show the synopsis and fail
                            usage
                            exit 1
                            ;;
    esac
    shift
done

View file

@ -168,11 +168,10 @@ module VX_bank #(
wire [`LINE_ADDR_WIDTH-1:0] dfpq_addr_st0;
wire [`BANK_LINE_WIDTH-1:0] dfpq_filldata_st0;
wire dram_rsp_fire = dram_rsp_valid && dram_rsp_ready;
assign dram_rsp_ready = !dfpq_full;
if (DRAM_ENABLE) begin
wire dram_rsp_fire = dram_rsp_valid && dram_rsp_ready;
VX_generic_queue #(
.DATAW(`LINE_ADDR_WIDTH + $bits(dram_rsp_data)),
.SIZE(DRFQ_SIZE)
@ -535,9 +534,10 @@ module VX_bank #(
wire snp_invalidate_st3;
wire is_msrq_st3;
wire send_core_rsp_st3;
wire send_fill_req_st3;
wire send_dwb_req_st3;
wire do_writeback_st3;
wire send_snp_rsp_st3;
wire incoming_fill_st3;
wire send_core_rsp_st2 = valid_st2 && !is_fill_st2 && !is_snp_st2 && !miss_st2 && !force_miss_st2;
@ -549,17 +549,27 @@ module VX_bank #(
&& (is_fill_st2
|| (!force_miss_st2 && is_snp_st2));
wire send_dwb_req_st2 = send_fill_req_st2 || do_writeback_st2;
wire send_snp_rsp_st2 = valid_st2 && is_snp_st2 && !force_miss_st2;
// check if a matching fill request is coming
wire incoming_fill_dfp_st2 = dram_rsp_fire && (addr_st2 == dram_rsp_addr);
wire incoming_fill_st0_st2 = !dfpq_empty && (addr_st2 == dfpq_addr_st0);
wire incoming_fill_st1_st2 = is_fill_st1 && (addr_st2 == addr_st1);
wire incoming_fill_st2 = incoming_fill_dfp_st2
|| incoming_fill_st0_st2
|| incoming_fill_st1_st2;
VX_generic_register #(
.N(1+ 1+ 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `WORD_WIDTH + `BANK_LINE_WIDTH + `TAG_SELECT_BITS + 1 + 1 + BANK_LINE_SIZE + `REQ_INST_META_WIDTH)
.N(1+ 1+ 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `WORD_WIDTH + `BANK_LINE_WIDTH + `TAG_SELECT_BITS + 1 + 1 + BANK_LINE_SIZE + `REQ_INST_META_WIDTH)
) pipe_reg2 (
.clk (clk),
.reset (reset),
.stall (pipeline_stall),
.flush (1'b0),
.in ({is_msrq_st2, send_core_rsp_st2, send_fill_req_st2, do_writeback_st2, send_snp_rsp_st2, force_miss_st2, is_snp_st2, snp_invalidate_st2, valid_st2, addr_st2, wsel_st2, writeword_st2, readword_st2, readdata_st2, readtag_st2, miss_st2, dirtyb_st2, inst_meta_st2}),
.out ({is_msrq_st3, send_core_rsp_st3, send_fill_req_st3, do_writeback_st3, send_snp_rsp_st3, force_miss_st3, is_snp_st3, snp_invalidate_st3, valid_st3, addr_st3, wsel_st3, writeword_st3, readword_st3, readdata_st3, readtag_st3, miss_st3, dirtyb_st3, inst_meta_st3})
.in ({is_msrq_st2, incoming_fill_st2, send_core_rsp_st2, send_dwb_req_st2, do_writeback_st2, send_snp_rsp_st2, force_miss_st2, is_snp_st2, snp_invalidate_st2, valid_st2, addr_st2, wsel_st2, writeword_st2, readword_st2, readdata_st2, readtag_st2, miss_st2, dirtyb_st2, inst_meta_st2}),
.out ({is_msrq_st3, incoming_fill_st3, send_core_rsp_st3, send_dwb_req_st3, do_writeback_st3, send_snp_rsp_st3, force_miss_st3, is_snp_st3, snp_invalidate_st3, valid_st3, addr_st3, wsel_st3, writeword_st3, readword_st3, readdata_st3, readtag_st3, miss_st3, dirtyb_st3, inst_meta_st3})
);
`ifdef DBG_CACHE_REQ_INFO
@ -590,13 +600,9 @@ module VX_bank #(
assign {req_tag_st3, req_rw_st3, req_byteen_st3, req_tid_st3} = inst_meta_st3;
// check if a matching fill request is coming
wire incoming_st0_fill_st3 = is_fill_st0 && (addr_st3 == dfpq_addr_st0);
wire incoming_st1_fill_st3 = is_fill_st1 && (addr_st3 == addr_st1);
wire incoming_st2_fill_st3 = is_fill_st2 && (addr_st3 == addr_st2);
wire incoming_fill = incoming_st2_fill_st3
|| incoming_st1_fill_st3
|| incoming_st0_fill_st3;
wire incoming_fill_dfp_st3 = dram_rsp_fire && (addr_st3 == dram_rsp_addr);
wire incoming_fill = incoming_fill_dfp_st3
|| incoming_fill_st3;
if (DRAM_ENABLE) begin
wire msrq_dequeue_st3 = valid_st3 && is_msrq_st3 && !msrq_push_unqual && !pipeline_stall;
@ -732,12 +738,12 @@ module VX_bank #(
wire dwbq_empty, dwbq_full;
wire dwbq_push_unqual = send_fill_req_st3 || do_writeback_st3;
wire dwbq_push_unqual = send_dwb_req_st3;
assign dwbq_push_stall = dwbq_push_unqual && dwbq_full;
wire dwbq_push = dwbq_push_unqual
&& !(send_fill_req_st3 && incoming_fill) // not in 'dwbq_push_stall' to reduce clock delay
&& (do_writeback_st3 || !incoming_fill) // not in 'dwbq_push_stall' to reduce clock delay
&& !dwbq_full
&& !msrq_push_stall
&& !cwbq_push_stall