Merge branch 'develop'

This commit is contained in:
Blaise Tine 2024-08-04 14:17:08 -07:00
commit e663db9b5a
63 changed files with 1016 additions and 939 deletions

View file

@ -117,7 +117,7 @@ jobs:
strategy:
fail-fast: false
matrix:
name: [regression, opencl, config1, config2, debug, stress]
name: [regression, opencl, cache, config1, config2, debug, stress]
xlen: [32, 64]
steps:

View file

@ -122,32 +122,54 @@ opencl()
echo "opencl tests done!"
}
test_csv_trace()
cache()
{
# test CSV trace generation
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
diff trace_rtlsim.csv trace_simx.csv
# clean build
make -C sim/simx clean
make -C sim/rtlsim clean
}
echo "begin cache tests..."
debug()
{
echo "begin debugging tests..."
# disable local memory
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1
test_csv_trace
# disable L1 cache
CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
echo "debugging tests done!"
# test cache ways
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache banking
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test writeback
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
# cache clustering
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2
# L2/L3
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"
# report completion of the cache suite; the opening banner of cache() already
# prints "begin cache tests...", and every other suite ends with "... tests done!"
echo "cache tests done!"
}
config1()
@ -163,10 +185,12 @@ config1()
./ci/blackbox.sh --driver=simx --warps=8 --threads=16 --app=diverge
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
# issue width
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
@ -186,22 +210,19 @@ config1()
CONFIGS="-DISSUE_WIDTH=2 -DNUM_FPU_BLOCK=1 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_FPU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
# FPU's PE scaling
CONFIGS="-DFMA_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfmadd"
CONFIGS="-DFCVT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tftoi"
CONFIGS="-DFDIV_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfdiv"
CONFIGS="-DFSQRT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfsqrt"
CONFIGS="-DFNCP_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfclamp"
# LSU scaling
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
# multiple L1 caches per socket
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=8 --warps=1 --threads=2
echo "configuration-1 tests done!"
}
@ -232,37 +253,9 @@ config2()
# disabling ZICOND extension
CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo
# disable local memory
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=demo
# disable L1 cache
CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache ways
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache banking
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=demo
@ -275,12 +268,40 @@ config2()
echo "configuration-2 tests done!"
}
# test_csv_trace: cross-checks the instruction traces produced by the simx and
# rtlsim simulators. Both are rebuilt with DEBUG=3, the 32im ISA tests are run
# on each, the resulting logs are converted to CSV with ci/trace_csv.py, and
# the two CSV files are compared with diff.
# Takes no arguments. Writes run_simx.log, run_rtlsim.log, trace_simx.csv and
# trace_rtlsim.csv into the current directory.
# NOTE(review): whether a non-empty diff aborts the run depends on the shell's
# errexit setting, which is not visible in this function - confirm.
test_csv_trace()
{
# test CSV trace generation
# rebuild both simulators from scratch with DEBUG=3 (build output discarded);
# rtlsim is additionally built with -DGPR_RESET
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
# run the 32im ISA tests on each simulator and capture the logs
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
# convert each log to CSV (-t selects the log type, -o the output file)
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
# the two traces are expected to be identical
diff trace_rtlsim.csv trace_simx.csv
# clean build
# (removes the DEBUG=3 builds made above)
make -C sim/simx clean
make -C sim/rtlsim clean
}
# debug: the "debug" test suite. Runs the CSV trace comparison first, then the
# demo app through ci/blackbox.sh with debug and perf options enabled, and
# finally an opae run with --scope.
# Takes no arguments; prints a begin/done banner around the suite.
debug()
{
echo "begin debugging tests..."
# compare simx and rtlsim instruction traces
test_csv_trace
# 2 cores x 2 clusters with L2 cache, --debug=1 and --perf=1, built with
# -DSOCKET_SIZE=1, on both the opae and simx drivers
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
# single-core opae run with the --scope option
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
echo "debugging tests done!"
}
stress()
{
echo "begin stress tests..."
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache
echo "stress tests done!"
@ -299,11 +320,9 @@ synthesis()
show_usage()
{
echo "Vortex Regression Test"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]"
}
start=$SECONDS
declare -a tests=()
clean=0
@ -327,6 +346,9 @@ while [ "$1" != "" ]; do
--opencl )
tests+=("opencl")
;;
--cache )
tests+=("cache")
;;
--config1 )
tests+=("config1")
;;
@ -349,6 +371,7 @@ while [ "$1" != "" ]; do
tests+=("kernel")
tests+=("regression")
tests+=("opencl")
tests+=("cache")
tests+=("config1")
tests+=("config2")
tests+=("debug")
@ -372,6 +395,8 @@ then
make -s
fi
start=$SECONDS
for test in "${tests[@]}"; do
$test
done

View file

@ -19,6 +19,8 @@ import csv
import re
import inspect
configs = None
def parse_args():
parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.')
parser.add_argument('-t', '--type', default='simx', help='log type (rtlsim or simx)')
@ -26,6 +28,24 @@ def parse_args():
parser.add_argument('log', help='Input log file')
return parser.parse_args()
# load_config: scan the log file `filename` for the first line containing a
# "CONFIGS: ..." record and return its fields as a dict of ints with keys
# num_threads, num_warps, num_cores, num_clusters, socket_size,
# local_mem_base and num_barriers.
# Returns None when no line in the file matches the pattern.
# NOTE(review): parse_rtlsim indexes the global `configs` directly
# (configs['num_cores']), so a None result would raise there - confirm that
# every rtlsim log is guaranteed to contain a CONFIGS line.
def load_config(filename):
# local_mem_base is matched as a 0x-prefixed hexadecimal value; all other
# fields are decimal
config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=0x([0-9a-fA-F]+), num_barriers=(\d+)"
with open(filename, 'r') as file:
for line in file:
config_match = re.search(config_pattern, line)
if config_match:
config = {
'num_threads': int(config_match.group(1)),
'num_warps': int(config_match.group(2)),
'num_cores': int(config_match.group(3)),
'num_clusters': int(config_match.group(4)),
'socket_size': int(config_match.group(5)),
# hex digits captured without the "0x" prefix, hence base 16
'local_mem_base': int(config_match.group(6), 16),
'num_barriers': int(config_match.group(7)),
}
# only the first CONFIGS line is used
return config
# no CONFIGS line found
return None
def parse_simx(log_lines):
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"Instr (0x[0-9a-fA-F]+):"
@ -46,10 +66,10 @@ def parse_simx(log_lines):
instr_data = {}
instr_data["lineno"] = lineno
instr_data["PC"] = re.search(pc_pattern, line).group(1)
instr_data["core_id"] = re.search(core_id_pattern, line).group(1)
instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1)
instr_data["core_id"] = int(re.search(core_id_pattern, line).group(1))
instr_data["warp_id"] = int(re.search(warp_id_pattern, line).group(1))
instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
instr_data["uuid"] = re.search(uuid_pattern, line).group(1)
instr_data["uuid"] = int(re.search(uuid_pattern, line).group(1))
elif line.startswith("DEBUG Instr"):
instr_data["instr"] = re.search(instr_pattern, line).group(1)
instr_data["opcode"] = re.search(opcode_pattern, line).group(1)
@ -96,7 +116,7 @@ def append_value(text, reg, value, tmask_arr, sep):
return text, sep
def parse_rtlsim(log_lines):
config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=(\d+), num_barriers=(\d+)"
global configs
line_pattern = r"\d+: cluster(\d+)-socket(\d+)-core(\d+)-(decode|issue|commit)"
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"instr=(0x[0-9a-fA-F]+)"
@ -118,36 +138,20 @@ def parse_rtlsim(log_lines):
uuid_pattern = r"#(\d+)"
entries = []
instr_data = {}
num_threads = 0
num_warps = 0
num_cores = 0
num_clusters = 0
socket_size = 0
local_mem_base = 0
num_barriers = 0
num_sockets = 0
num_cores = configs['num_cores']
socket_size = configs['socket_size']
num_sockets = (num_cores + socket_size - 1) // socket_size
for lineno, line in enumerate(log_lines, start=1):
try:
config_match = re.search(config_pattern, line)
if config_match:
num_threads = int(config_match.group(1))
num_warps = int(config_match.group(2))
num_cores = int(config_match.group(3))
num_clusters = int(config_match.group(4))
socket_size = int(config_match.group(5))
local_mem_base = int(config_match.group(6))
num_barriers = int(config_match.group(7))
num_sockets = (num_cores + socket_size - 1) // socket_size
continue
line_match = re.search(line_pattern, line)
if line_match:
PC = re.search(pc_pattern, line).group(1)
warp_id = re.search(warp_id_pattern, line).group(1)
warp_id = int(re.search(warp_id_pattern, line).group(1))
tmask = re.search(tmask_pattern, line).group(1)
uuid = re.search(uuid_pattern, line).group(1)
cluster_id = line_match.group(1)
socket_id = line_match.group(2)
core_id = line_match.group(3)
uuid = int(re.search(uuid_pattern, line).group(1))
cluster_id = int(line_match.group(1))
socket_id = int(line_match.group(2))
core_id = int(line_match.group(3))
stage = line_match.group(4)
if stage == "decode":
trace = {}
@ -274,7 +278,9 @@ def split_log_file(log_filename):
return sublogs
def main():
global configs
args = parse_args()
configs = load_config(args.log)
sublogs = split_log_file(args.log)
write_csv(sublogs, args.csv, args.type)

View file

@ -238,11 +238,11 @@
`define RESET_RELAY(dst, src) \
`RESET_RELAY_EX (dst, src, 1, 0)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2
`define TO_OUT_BUF_SIZE(out_reg) `MIN(out_reg, 2)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2, 5 -> 2
`define TO_OUT_BUF_SIZE(s) `MIN(s, 2)
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2
`define TO_OUT_BUF_REG(out_reg) ((out_reg & 1) + ((out_reg >> 2) << 1))
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 -> 3
`define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2))
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
`define _REPEAT_0(f,s)

View file

@ -14,6 +14,7 @@
`include "VX_cache_define.vh"
module VX_bank_flush #(
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
@ -27,34 +28,36 @@ module VX_bank_flush #(
) (
input wire clk,
input wire reset,
input wire flush_in_valid,
output wire flush_in_ready,
output wire flush_out_init,
output wire flush_out_valid,
output wire [`CS_LINE_SEL_BITS-1:0] flush_out_line,
output wire [NUM_WAYS-1:0] flush_out_way,
input wire flush_out_ready,
input wire mshr_empty
input wire flush_begin,
output wire flush_end,
output wire flush_init,
output wire flush_valid,
output wire [`CS_LINE_SEL_BITS-1:0] flush_line,
output wire [NUM_WAYS-1:0] flush_way,
input wire flush_ready,
input wire mshr_empty,
input wire bank_empty
);
// ways iteration is only needed when eviction is enabled
localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0);
localparam STATE_IDLE = 2'd0;
localparam STATE_INIT = 2'd1;
localparam STATE_FLUSH = 2'd2;
localparam STATE_IDLE = 0;
localparam STATE_INIT = 1;
localparam STATE_WAIT1 = 2;
localparam STATE_FLUSH = 3;
localparam STATE_WAIT2 = 4;
localparam STATE_DONE = 5;
reg [2:0] state_r, state_n;
reg [CTR_WIDTH-1:0] counter_r;
reg [1:0] state_r, state_n;
reg flush_in_ready_r, flush_in_ready_n;
always @(*) begin
state_n = state_r;
flush_in_ready_n = 0;
case (state_r)
// STATE_IDLE
default: begin
if (flush_in_valid && mshr_empty) begin
state_n = STATE_FLUSH;
STATE_IDLE: begin
if (flush_begin) begin
state_n = STATE_WAIT1;
end
end
STATE_INIT: begin
@ -62,25 +65,41 @@ module VX_bank_flush #(
state_n = STATE_IDLE;
end
end
STATE_FLUSH: begin
if (counter_r == ((2 ** CTR_WIDTH)-1)) begin
state_n = STATE_IDLE;
flush_in_ready_n = 1;
STATE_WAIT1: begin
// wait for pending requests to complete
if (mshr_empty) begin
state_n = STATE_FLUSH;
end
end
STATE_FLUSH: begin
if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2;
end
end
STATE_WAIT2: begin
// ensure the bank is empty before notifying the cache flush unit,
// because the flush request to lower caches only goes through bank0
// and it is important that this request gets sent out last.
if (bank_empty) begin
state_n = STATE_DONE;
end
end
STATE_DONE: begin
// generate a completion pulse
state_n = STATE_IDLE;
end
endcase
end
always @(posedge clk) begin
if (reset) begin
state_r <= STATE_INIT;
state_r <= STATE_INIT;
counter_r <= '0;
flush_in_ready_r <= '0;
end else begin
state_r <= state_n;
flush_in_ready_r <= flush_in_ready_n;
if (state_r != STATE_IDLE) begin
if ((state_r == STATE_INIT) || flush_out_ready) begin
if ((state_r == STATE_INIT)
|| ((state_r == STATE_FLUSH) && flush_ready)) begin
counter_r <= counter_r + CTR_WIDTH'(1);
end
end else begin
@ -89,20 +108,20 @@ module VX_bank_flush #(
end
end
assign flush_in_ready = flush_in_ready_r;
assign flush_out_init = (state_r == STATE_INIT);
assign flush_out_valid = (state_r == STATE_FLUSH);
assign flush_out_line = counter_r[`CS_LINE_SEL_BITS-1:0];
assign flush_end = (state_r == STATE_DONE);
assign flush_init = (state_r == STATE_INIT);
assign flush_valid = (state_r == STATE_FLUSH);
assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0];
if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin
reg [NUM_WAYS-1:0] flush_out_way_r;
reg [NUM_WAYS-1:0] flush_way_r;
always @(*) begin
flush_out_way_r = '0;
flush_out_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
flush_way_r = '0;
flush_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
end
assign flush_out_way = flush_out_way_r;
assign flush_way = flush_way_r;
end else begin
assign flush_out_way = {NUM_WAYS{1'b1}};
assign flush_way = {NUM_WAYS{1'b1}};
end
endmodule

View file

@ -109,26 +109,23 @@ module VX_cache import VX_gpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH)
) core_bus2_if[NUM_REQS]();
wire [NUM_BANKS-1:0] per_bank_flush_valid;
wire [NUM_BANKS-1:0] per_bank_flush_ready;
wire [NUM_BANKS-1:0] per_bank_flush_begin;
wire [NUM_BANKS-1:0] per_bank_flush_end;
wire [NUM_BANKS-1:0] per_bank_core_req_fire;
// this reset relay is required to sync with bank initialization
`RESET_RELAY (flush_reset, reset);
VX_cache_flush #(
.NUM_REQS (NUM_REQS),
.NUM_BANKS (NUM_BANKS),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
) flush_unit (
.clk (clk),
.reset (flush_reset),
.reset (reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if (core_bus2_if),
.bank_req_fire (per_bank_core_req_fire),
.flush_valid (per_bank_flush_valid),
.flush_ready (per_bank_flush_ready)
.flush_begin (per_bank_flush_begin),
.flush_end (per_bank_flush_end)
);
///////////////////////////////////////////////////////////////////////////
@ -324,6 +321,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.ARBITER ("F"),
.OUT_BUF (REQ_XBAR_BUF)
) req_xbar (
.clk (clk),
@ -432,8 +430,8 @@ module VX_cache import VX_gpu_pkg::*; #(
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
.flush_valid (per_bank_flush_valid[bank_id]),
.flush_ready (per_bank_flush_ready[bank_id])
.flush_begin (per_bank_flush_begin[bank_id]),
.flush_end (per_bank_flush_end[bank_id])
);
if (NUM_BANKS == 1) begin
@ -457,7 +455,8 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_stream_xbar #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (CORE_RSP_DATAW)
.DATAW (CORE_RSP_DATAW),
.ARBITER ("F")
) rsp_xbar (
.clk (clk),
.reset (rsp_xbar_reset),

View file

@ -108,8 +108,8 @@ module VX_cache_bank #(
output wire mem_rsp_ready,
// flush
input wire flush_valid,
output wire flush_ready
input wire flush_begin,
output wire flush_end
);
localparam PIPELINE_STAGES = 2;
@ -120,6 +120,7 @@ module VX_cache_bank #(
wire crsp_queue_stall;
wire mshr_alm_full;
wire mreq_queue_empty;
wire mreq_queue_alm_full;
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
@ -162,30 +163,38 @@ module VX_cache_bank #(
wire mshr_pending_st0, mshr_pending_st1;
wire mshr_empty;
wire line_flush_valid;
wire line_flush_init;
wire [`CS_LINE_SEL_BITS-1:0] line_flush_sel;
wire [NUM_WAYS-1:0] line_flush_way;
wire line_flush_ready;
wire flush_valid;
wire init_valid;
wire [`CS_LINE_SEL_BITS-1:0] flush_sel;
wire [NUM_WAYS-1:0] flush_way;
wire flush_ready;
// ensure we have no pending memory request in the bank
wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty;
// this reset relay should match pipeline during tags initialization
`RESET_RELAY (flush_reset, reset);
// flush unit
VX_bank_flush #(
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WRITEBACK (WRITEBACK)
) flush_unit (
.clk (clk),
.reset (reset),
.flush_in_valid (flush_valid),
.flush_in_ready (flush_ready),
.flush_out_init (line_flush_init),
.flush_out_valid (line_flush_valid),
.flush_out_line (line_flush_sel),
.flush_out_way (line_flush_way),
.flush_out_ready (line_flush_ready),
.mshr_empty (mshr_empty)
.clk (clk),
.reset (flush_reset),
.flush_begin (flush_begin),
.flush_end (flush_end),
.flush_init (init_valid),
.flush_valid (flush_valid),
.flush_line (flush_sel),
.flush_way (flush_way),
.flush_ready (flush_ready),
.mshr_empty (mshr_empty),
.bank_empty (no_pending_req)
);
wire rdw_hazard1_sel;
@ -198,16 +207,16 @@ module VX_cache_bank #(
// mshr replay has highest priority to maximize utilization since there is no miss.
// handle memory responses next to prevent deadlock with potential memory request from a miss.
// flush has precedence over core requests to ensure that the cache is in a consistent state.
wire replay_grant = ~line_flush_init;
wire replay_grant = ~init_valid;
wire replay_enable = replay_grant && replay_valid;
wire fill_grant = ~line_flush_init && ~replay_enable;
wire fill_grant = ~init_valid && ~replay_enable;
wire fill_enable = fill_grant && mem_rsp_valid;
wire flush_grant = ~line_flush_init && ~replay_enable && ~fill_enable;
wire flush_enable = flush_grant && line_flush_valid;
wire flush_grant = ~init_valid && ~replay_enable && ~fill_enable;
wire flush_enable = flush_grant && flush_valid;
wire creq_grant = ~line_flush_init && ~replay_enable && ~fill_enable && ~flush_enable;
wire creq_grant = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable;
wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant
@ -219,23 +228,23 @@ module VX_cache_bank #(
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign line_flush_ready = flush_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign flush_ready = flush_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
wire init_fire = line_flush_init;
wire init_fire = init_valid;
wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire line_flush_fire = line_flush_valid && line_flush_ready;
wire flush_fire = flush_valid && flush_ready;
wire core_req_fire = core_req_valid && core_req_ready;
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || line_flush_fire || core_req_fire;
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
assign rw_sel = replay_valid ? replay_rw : core_req_rw;
assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen;
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
@ -243,7 +252,7 @@ module VX_cache_bank #(
assign tag_sel = replay_valid ? replay_tag : core_req_tag;
assign creq_flush_sel = core_req_valid && core_req_flush;
assign addr_sel = (line_flush_init | line_flush_valid) ? `CS_LINE_ADDR_WIDTH'(line_flush_sel) :
assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
if (WRITE_ENABLE) begin
@ -263,15 +272,17 @@ module VX_cache_bank #(
assign req_uuid_sel = 0;
end
`RESET_RELAY (pipe0_reset, reset);
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.reset (pipe0_reset),
.enable (~pipe_stall),
.data_in ({valid_sel, line_flush_init, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, line_flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
.data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
);
if (UUID_WIDTH != 0) begin
@ -298,7 +309,7 @@ module VX_cache_bank #(
wire [NUM_WAYS-1:0] evict_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0;
`RESET_RELAY (tag_reset, reset);
`RESET_RELAY (tags_reset, reset);
VX_cache_tags #(
.INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)),
@ -312,7 +323,7 @@ module VX_cache_bank #(
.UUID_WIDTH (UUID_WIDTH)
) cache_tags (
.clk (clk),
.reset (tag_reset),
.reset (tags_reset),
.req_uuid (req_uuid_st0),
@ -344,12 +355,14 @@ module VX_cache_bank #(
assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0;
`RESET_RELAY (pipe1_reset, reset);
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.reset (pipe1_reset),
.enable (~pipe_stall),
.data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1})
@ -585,7 +598,7 @@ module VX_cache_bank #(
// schedule memory request
wire mreq_queue_push, mreq_queue_pop, mreq_queue_empty;
wire mreq_queue_push, mreq_queue_pop;
wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
wire [LINE_SIZE-1:0] mreq_queue_byteen;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
@ -663,8 +676,8 @@ module VX_cache_bank #(
`ifdef DBG_TRACE_CACHE
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || line_flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || line_flush_fire);
wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire);
always @(posedge clk) begin
if (input_stall || pipe_stall) begin
`TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1));

View file

@ -217,13 +217,15 @@ module VX_cache_bypass #(
assign mem_bus_in_if.req_ready = mem_req_out_ready;
`RESET_RELAY (mem_req_reset, reset);
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.reset (mem_req_reset),
.valid_in (mem_req_out_valid),
.ready_in (mem_req_out_ready),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}),
@ -309,13 +311,16 @@ module VX_cache_bypass #(
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
`RESET_RELAY (core_rsp_reset, reset);
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.reset (core_rsp_reset),
.valid_in (core_rsp_in_valid[i]),
.ready_in (core_rsp_in_ready[i]),
.data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}),

View file

@ -117,7 +117,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
end
`RESET_RELAY (arb_reset, reset);
`RESET_RELAY (cache_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (NUM_INPUTS),
@ -130,7 +130,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
) cache_arb (
.clk (clk),
.reset (arb_reset),
.reset (cache_arb_reset),
.bus_in_if (core_bus_tmp_if),
.bus_out_if (arb_core_bus_tmp_if)
);
@ -182,6 +182,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
) mem_bus_tmp_if[1]();
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (NUM_CACHES),
.DATA_SIZE (LINE_SIZE),
@ -192,7 +194,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
.reset (reset),
.reset (mem_arb_reset),
.bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if)
);

View file

@ -62,7 +62,6 @@ module VX_cache_data #(
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset)
`UNUSED_VAR (stall)
`UNUSED_VAR (line_addr)
`UNUSED_VAR (init)
@ -91,9 +90,10 @@ module VX_cache_data #(
.SIZE (`CS_LINES_PER_BANK)
) byteen_store (
.clk (clk),
.reset (reset),
.read (write || fill || flush),
.write (init || write || fill || flush),
`UNUSED_PIN (wren),
.wren (1'b1),
.addr (line_sel),
.wdata (bs_wdata),
.rdata (bs_rdata)
@ -117,7 +117,7 @@ module VX_cache_data #(
end
// order the data layout to perform ways multiplexing last.
// this allows converting way index to binary in parallel with BRAM read.
// this allows converting way index to binary in parallel with BRAM read access and way selection.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
wire [BYTEENW-1:0] line_wren;
@ -161,6 +161,7 @@ module VX_cache_data #(
.RW_ASSERT (1)
) data_store (
.clk (clk),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (line_wren),

View file

@ -26,13 +26,16 @@ module VX_cache_flush #(
VX_mem_bus_if.slave core_bus_in_if [NUM_REQS],
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
input wire [NUM_BANKS-1:0] bank_req_fire,
output wire [NUM_BANKS-1:0] flush_valid,
input wire [NUM_BANKS-1:0] flush_ready
output wire [NUM_BANKS-1:0] flush_begin,
input wire [NUM_BANKS-1:0] flush_end
);
localparam STATE_IDLE = 0;
localparam STATE_WAIT = 1;
localparam STATE_WAIT1 = 1;
localparam STATE_FLUSH = 2;
localparam STATE_DONE = 3;
localparam STATE_WAIT2 = 3;
localparam STATE_DONE = 4;
reg [2:0] state, state_n;
// track in-flight core requests
@ -76,7 +79,6 @@ module VX_cache_flush #(
`UNUSED_VAR (bank_req_fire)
end
reg [1:0] state, state_n;
reg [NUM_BANKS-1:0] flush_done, flush_done_n;
wire [NUM_REQS-1:0] flush_req_mask;
@ -112,23 +114,32 @@ module VX_cache_flush #(
case (state)
STATE_IDLE: begin
if (flush_req_enable) begin
state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT : STATE_FLUSH;
state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH;
end
end
STATE_WAIT: begin
STATE_WAIT1: begin
if (no_inflight_reqs) begin
state_n = STATE_FLUSH;
flush_done_n = '0;
end
end
STATE_FLUSH: begin
flush_done_n = flush_done | flush_ready;
// generate a flush request pulse
state_n = STATE_WAIT2;
end
STATE_WAIT2: begin
// wait for all banks to finish flushing
flush_done_n = flush_done | flush_end;
if (flush_done_n == {NUM_BANKS{1'b1}}) begin
state_n = STATE_DONE;
flush_done_n = '0;
// only release current flush requests
// and keep normal requests locked
lock_released_n = flush_req_mask;
end
end
STATE_DONE: begin
// wait until released flush requests are issued
// when returning to IDLE state other requests will unlock
lock_released_n = lock_released & ~core_bus_out_ready;
if (lock_released_n == 0) begin
state_n = STATE_IDLE;
@ -149,6 +160,6 @@ module VX_cache_flush #(
end
end
assign flush_valid = {NUM_BANKS{state == STATE_FLUSH}};
assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}};
endmodule

View file

@ -232,9 +232,10 @@ module VX_cache_mshr #(
.LUTRAM (1)
) entries (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (allocate_valid),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (allocate_id_r),
.wdata (allocate_data),
.raddr (dequeue_id_r),

View file

@ -57,7 +57,6 @@ module VX_cache_tags #(
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (reset)
`UNUSED_VAR (lookup)
// valid, dirty, tag
@ -130,9 +129,10 @@ module VX_cache_tags #(
.RW_ASSERT (1)
) tag_store (
.clk (clk),
.reset (reset),
.read (line_read),
.write (line_write),
`UNUSED_PIN (wren),
.wren (1'b1),
.addr (line_sel),
.wdata (line_wdata),
.rdata (line_rdata)

View file

@ -83,7 +83,7 @@ module VX_alu_muldiv #(
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.clk (clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, mul_result_tmp}),
@ -324,6 +324,7 @@ module VX_alu_muldiv #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)),
.ARBITER ("F"),
.OUT_BUF (1)
) rsp_buf (
.clk (clk),

View file

@ -126,7 +126,8 @@ module VX_alu_unit #(
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.OUT_BUF (PARTIAL_BW ? 1 : 3)
.OUT_BUF (PARTIAL_BW ? 1 : 3),
.ARBITER ("F")
) rsp_arb (
.clk (clk),
.reset (arb_reset),

View file

@ -56,9 +56,10 @@ module VX_fetch import VX_gpu_pkg::*; #(
.LUTRAM (1)
) tag_store (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (icache_req_fire),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (req_tag),
.wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
.raddr (rsp_tag),

View file

@ -72,9 +72,10 @@ module VX_ipdom_stack #(
.LUTRAM (OUT_REG ? 0 : 1)
) store (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (wr_ptr),
.wdata ({q1, q0}),
.raddr (rd_ptr),

View file

@ -490,6 +490,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("P"), // prioritize commit_rsp_if
.OUT_BUF (3)
) rsp_arb (
.clk (clk),

View file

@ -43,8 +43,9 @@ module VX_operands import VX_gpu_pkg::*; #(
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam METADATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS;
localparam DATAW = `UUID_WIDTH + METADATAW + 3 * `NUM_THREADS * `XLEN;
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
localparam REGS_DATAW = NUM_SRC_REGS * `NUM_THREADS * `XLEN;
localparam DATAW = META_DATAW + REGS_DATAW;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
@ -53,30 +54,28 @@ module VX_operands import VX_gpu_pkg::*; #(
`UNUSED_VAR (writeback_if.data.sop)
wire [NUM_SRC_REGS-1:0] src_valid;
wire [NUM_SRC_REGS-1:0] req_in_valid;
wire [NUM_SRC_REGS-1:0] req_in_ready;
wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready;
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid_n, gpr_rd_ready;
reg [NUM_BANKS-1:0] gpr_rd_valid;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr_n;
reg [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx_n;
reg [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2;
wire pipe_in_ready;
reg pipe_out_valid;
wire pipe_out_ready;
reg [`UUID_WIDTH-1:0] pipe_out_uuid;
reg [METADATAW-1:0] pipe_out_data;
wire pipe_valid_st1, pipe_ready_st1;
wire pipe_valid_st2, pipe_ready_st2;
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data, src_data_n;
reg [NUM_SRC_REGS-1:0] data_fetched;
reg has_collision, has_collision_n;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
wire stg_in_valid, stg_in_ready;
reg [NUM_SRC_REGS-1:0] data_fetched_n;
wire [NUM_SRC_REGS-1:0] data_fetched_st1;
reg has_collision_n;
wire has_collision_st1;
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
scoreboard_if.data.rs2,
@ -96,11 +95,13 @@ module VX_operands import VX_gpu_pkg::*; #(
end
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched[i];
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i];
end
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
`RESET_RELAY (req_xbar_reset, reset);
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_REGS),
.NUM_OUTPUTS (NUM_BANKS),
@ -110,19 +111,26 @@ module VX_operands import VX_gpu_pkg::*; #(
.OUT_BUF (0) // no output buffering
) req_xbar (
.clk (clk),
.reset (reset),
.reset (req_xbar_reset),
`UNUSED_PIN(collisions),
.valid_in (req_in_valid),
.data_in (req_in_data),
.sel_in (req_bank_idx),
.ready_in (req_in_ready),
.valid_out (gpr_rd_valid_n),
.data_out (gpr_rd_addr_n),
.sel_out (gpr_rd_req_idx_n),
.valid_out (gpr_rd_valid),
.data_out (gpr_rd_addr),
.sel_out (gpr_rd_req_idx),
.ready_out (gpr_rd_ready)
);
assign gpr_rd_ready = {NUM_BANKS{stg_in_ready}};
wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1;
assign gpr_rd_ready = {NUM_BANKS{pipe_in_ready}};
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
always @(*) begin
has_collision_n = 0;
@ -136,83 +144,86 @@ module VX_operands import VX_gpu_pkg::*; #(
end
always @(*) begin
src_data_n = src_data;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid[b]) begin
src_data_n[gpr_rd_req_idx[b]] = gpr_rd_data[b];
end
end
end
wire pipe_stall = pipe_out_valid && ~pipe_out_ready;
assign pipe_in_ready = ~pipe_stall;
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
wire stg_in_fire = stg_in_valid && stg_in_ready;
always @(posedge clk) begin
if (reset) begin
pipe_out_valid <= 0;
gpr_rd_valid <= '0;
data_fetched <= '0;
src_data <= '0;
data_fetched_n = data_fetched_st1;
if (scoreboard_if.ready) begin
data_fetched_n = '0;
end else begin
if (~pipe_stall) begin
pipe_out_valid <= scoreboard_if.valid;
gpr_rd_valid <= gpr_rd_valid_n;
if (scoreboard_if.ready) begin
data_fetched <= '0;
end else begin
data_fetched <= data_fetched | req_in_ready;
end
if (stg_in_fire) begin
src_data <= '0;
end else begin
src_data <= src_data_n;
end
end
end
if (~pipe_stall) begin
pipe_out_uuid <= scoreboard_if.data.uuid;
pipe_out_data <= {
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd
};
has_collision <= has_collision_n;
gpr_rd_addr <= gpr_rd_addr_n;
gpr_rd_req_idx <= gpr_rd_req_idx_n;
data_fetched_n = data_fetched_st1 | req_in_ready;
end
end
assign pipe_out_ready = stg_in_ready;
assign stg_in_valid = pipe_out_valid && ~has_collision;
assign pipe_data = {
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd,
scoreboard_if.data.uuid
};
`RESET_RELAY (pipe1_reset, reset);
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
.RESETW (1 + NUM_SRC_REGS)
) pipe_reg1 (
.clk (clk),
.reset (pipe1_reset),
.enable (pipe_in_ready),
.data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
.data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1})
);
assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2;
assign src_data_st1 = pipe_fire_st2 ? '0 : src_data_n;
wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1;
`RESET_RELAY (pipe2_reset, reset);
VX_pipe_register #(
.DATAW (1 + REGS_DATAW + NUM_BANKS + (NUM_BANKS * `XLEN * `NUM_THREADS) + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
.RESETW (1 + REGS_DATAW)
) pipe_reg2 (
.clk (clk),
.reset (pipe2_reset),
.enable (pipe_ready_st1),
.data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}),
.data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2})
);
always @(*) begin
src_data_n = src_data_st2;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid_st2[b]) begin
src_data_n[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
end
end
end
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (1)
) out_buffer (
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (stg_in_valid),
.ready_in (stg_in_ready),
.reset (out_buf_reset),
.valid_in (pipe_valid_st2),
.ready_in (pipe_ready_st2),
.data_in ({
pipe_out_uuid,
pipe_out_data,
pipe_data_st2,
src_data_n[0],
src_data_n[1],
src_data_n[2]
}),
.data_out ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
@ -221,6 +232,7 @@ module VX_operands import VX_gpu_pkg::*; #(
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.rd,
operands_if.data.uuid,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
@ -269,27 +281,26 @@ module VX_operands import VX_gpu_pkg::*; #(
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
end
`ifdef GPR_RESET
VX_dp_ram_rst #(
`else
`RESET_RELAY (bram_reset, reset);
VX_dp_ram #(
`endif
.DATAW (`XLEN * `NUM_THREADS),
.SIZE (PER_BANK_REGS * PER_ISSUE_WARPS),
.WRENW (BYTEENW),
`ifdef GPR_RESET
.RESET_RAM (1),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
`ifdef GPR_RESET
.reset (reset),
`endif
.read (1'b1),
.reset (bram_reset),
.read (pipe_fire_st1),
.wren (wren),
.write (gpr_wr_enabled),
.waddr (gpr_wr_addr),
.wdata (writeback_if.data.data),
.raddr (gpr_rd_addr[b]),
.rdata (gpr_rd_data[b])
.raddr (gpr_rd_addr_st1[b]),
.rdata (gpr_rd_data_st1[b])
);
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,7 +21,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
parameter TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
input wire reset,
output wire ready_in,
input wire valid_in,
@ -36,7 +36,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
input wire is_signed,
input wire [NUM_LANES-1:0][31:0] dataa,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -45,25 +45,26 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
);
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
fflags_t [NUM_LANES-1:0] fflags_out;
wire pe_enable;
wire pe_enable;
wire [NUM_PES-1:0][31:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FCVT),
.DATA_IN_WIDTH(32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -94,7 +95,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.enable (pe_enable),
.frm (frm),
.is_itof (is_itof),
.is_signed (is_signed),
.is_signed (is_signed),
.dataa (pe_data_in[i][0 +: 32]),
.result (pe_data_out[i][0 +: 32]),
.fflags (pe_data_out[i][32 +: `FP_FLAGS_BITS])

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,7 +21,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
parameter TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
input wire reset,
input wire valid_in,
output wire ready_in,
@ -31,10 +31,10 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
input wire [TAG_WIDTH-1:0] tag_in,
input wire [`INST_FRM_BITS-1:0] frm,
input wire [NUM_LANES-1:0][31:0] dataa,
input wire [NUM_LANES-1:0][31:0] datab,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -47,27 +47,28 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0][2*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
wire pe_enable;
wire pe_enable;
wire [NUM_PES-1:0][2*32-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: 32] = datab[i];
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FDIV),
.DATA_IN_WIDTH(2*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -92,7 +93,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
fflags_t [NUM_LANES-1:0] per_lane_fflags;
`ifdef QUARTUS
for (genvar i = 0; i < NUM_PES; ++i) begin
acl_fdiv fdiv (
.clk (clk),
@ -103,8 +104,8 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
.q (pe_data_out[i][0 +: 32])
);
assign pe_data_out[i][32 +: `FP_FLAGS_BITS] = 'x;
end
end
assign has_fflags = 0;
assign per_lane_fflags = 'x;
`UNUSED_VAR (fflags_out)
@ -131,21 +132,21 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
assign has_fflags = 1;
assign per_lane_fflags = fflags_out;
`else
`else
for (genvar i = 0; i < NUM_PES; ++i) begin
reg [63:0] r;
`UNUSED_VAR (r)
`UNUSED_VAR (r)
fflags_t f;
always @(*) begin
always @(*) begin
dpi_fdiv (
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]},
{32'hffffffff, pe_data_in[i][32 +: 32]},
frm,
r,
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]},
{32'hffffffff, pe_data_in[i][32 +: 32]},
frm,
r,
f
);
end

View file

@ -98,7 +98,8 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
.DATA_IN_WIDTH(3*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0)
.PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0), // must be registered for DSPs
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -35,7 +35,7 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
input wire [NUM_LANES-1:0][31:0] dataa,
input wire [NUM_LANES-1:0][31:0] datab,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -44,15 +44,15 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
);
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0][2*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
fflags_t [NUM_LANES-1:0] fflags_out;
wire pe_enable;
wire pe_enable;
wire [NUM_PES-1:0][2*32-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
@ -60,15 +60,16 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: 32] = datab[i];
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FNCP),
.DATA_IN_WIDTH(2*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -97,8 +98,8 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (pe_enable),
.frm (frm),
.op_type (op_type),
.frm (frm),
.op_type (op_type),
.dataa (pe_data_in[i][0 +: 32]),
.datab (pe_data_in[i][32 +: 32]),
.result (pe_data_out[i][0 +: 32]),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -18,10 +18,10 @@
module VX_fpu_sqrt import VX_fpu_pkg::*; #(
parameter NUM_LANES = 1,
parameter NUM_PES = `UP(NUM_LANES /`FSQRT_PE_RATIO),
parameter TAG_WIDTH = 1
parameter TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
input wire reset,
output wire ready_in,
input wire valid_in,
@ -29,11 +29,11 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
input wire [NUM_LANES-1:0] mask_in,
input wire [TAG_WIDTH-1:0] tag_in,
input wire [`INST_FRM_BITS-1:0] frm,
input wire [NUM_LANES-1:0][31:0] dataa,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -46,22 +46,23 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
wire pe_enable;
wire pe_enable;
wire [NUM_PES-1:0][31:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FSQRT),
.DATA_IN_WIDTH(32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -83,10 +84,10 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
fflags_t [NUM_LANES-1:0] per_lane_fflags;
fflags_t [NUM_LANES-1:0] per_lane_fflags;
`ifdef QUARTUS
for (genvar i = 0; i < NUM_PES; ++i) begin
acl_fsqrt fsqrt (
.clk (clk),
@ -105,7 +106,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
`elsif VIVADO
for (genvar i = 0; i < NUM_PES; ++i) begin
wire tuser;
wire tuser;
xil_fsqrt fsqrt (
.aclk (clk),
@ -130,17 +131,17 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
`UNUSED_VAR (r)
fflags_t f;
always @(*) begin
always @(*) begin
dpi_fsqrt (
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i]},
frm,
r,
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i]},
frm,
r,
f
);
end
VX_shift_register #(
.DATAW (32 + $bits(fflags_t)),
.DEPTH (`LATENCY_FSQRT)

View file

@ -82,11 +82,14 @@ module VX_avs_adapter #(
end
for (genvar i = 0; i < NUM_BANKS; ++i) begin
`RESET_RELAY (rd_req_reset, reset);
VX_pending_size #(
.SIZE (RD_QUEUE_SIZE)
) pending_size (
.clk (clk),
.reset (reset),
.reset (rd_req_reset),
.incr (req_queue_push[i]),
.decr (req_queue_pop[i]),
`UNUSED_PIN (empty),
@ -102,7 +105,7 @@ module VX_avs_adapter #(
.DEPTH (RD_QUEUE_SIZE)
) rd_req_queue (
.clk (clk),
.reset (reset),
.reset (rd_req_reset),
.push (req_queue_push[i]),
.pop (req_queue_pop[i]),
.data_in (mem_req_tag),
@ -126,13 +129,15 @@ module VX_avs_adapter #(
wire valid_out_w = mem_req_valid && ~req_queue_going_full[i] && (req_bank_sel == i);
wire ready_out_w;
`RESET_RELAY (req_out_reset, reset);
VX_elastic_buffer #(
.DATAW (1 + DATA_SIZE + BANK_OFFSETW + DATA_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF))
) req_out_buf (
.clk (clk),
.reset (reset),
.reset (req_out_reset),
.valid_in (valid_out_w),
.ready_in (ready_out_w),
.data_in ({mem_req_rw, mem_req_byteen, req_bank_off, mem_req_data}),
@ -168,12 +173,15 @@ module VX_avs_adapter #(
wire [NUM_BANKS-1:0] rsp_queue_empty;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
`RESET_RELAY (rd_rsp_reset, reset);
VX_fifo_queue #(
.DATAW (DATA_WIDTH),
.DEPTH (RD_QUEUE_SIZE)
) rd_rsp_queue (
.clk (clk),
.reset (reset),
.reset (rd_rsp_reset),
.push (avs_readdatavalid[i]),
.pop (req_queue_pop[i]),
.data_in (avs_readdata[i]),
@ -192,14 +200,16 @@ module VX_avs_adapter #(
assign req_queue_pop[i] = rsp_arb_valid_in[i] && rsp_arb_ready_in[i];
end
`RESET_RELAY (rsp_arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (DATA_WIDTH + TAG_WIDTH),
.ARBITER ("R"),
.ARBITER ("F"),
.OUT_BUF (RSP_OUT_BUF)
) rsp_arb (
.clk (clk),
.reset (reset),
.reset (rsp_arb_reset),
.valid_in (rsp_arb_valid_in),
.data_in (rsp_arb_data_in),
.ready_in (rsp_arb_ready_in),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,10 +15,10 @@
`TRACING_OFF
module VX_axi_adapter #(
parameter DATA_WIDTH = 512,
parameter DATA_WIDTH = 512,
parameter ADDR_WIDTH = 32,
parameter TAG_WIDTH = 8,
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 1,
parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)),
parameter RSP_OUT_BUF = 0
) (
@ -34,13 +34,13 @@ module VX_axi_adapter #(
input wire [TAG_WIDTH-1:0] mem_req_tag,
output wire mem_req_ready,
// Vortex response
output wire mem_rsp_valid,
// Vortex response
output wire mem_rsp_valid,
output wire [DATA_WIDTH-1:0] mem_rsp_data,
output wire [TAG_WIDTH-1:0] mem_rsp_tag,
input wire mem_rsp_ready,
// AXI write request address channel
// AXI write request address channel
output wire m_axi_awvalid [NUM_BANKS],
input wire m_axi_awready [NUM_BANKS],
output wire [ADDR_WIDTH-1:0] m_axi_awaddr [NUM_BANKS],
@ -54,7 +54,7 @@ module VX_axi_adapter #(
output wire [3:0] m_axi_awqos [NUM_BANKS],
output wire [3:0] m_axi_awregion [NUM_BANKS],
// AXI write request data channel
// AXI write request data channel
output wire m_axi_wvalid [NUM_BANKS],
input wire m_axi_wready [NUM_BANKS],
output wire [DATA_WIDTH-1:0] m_axi_wdata [NUM_BANKS],
@ -66,7 +66,7 @@ module VX_axi_adapter #(
output wire m_axi_bready [NUM_BANKS],
input wire [TAG_WIDTH-1:0] m_axi_bid [NUM_BANKS],
input wire [1:0] m_axi_bresp [NUM_BANKS],
// AXI read address channel
output wire m_axi_arvalid [NUM_BANKS],
input wire m_axi_arready [NUM_BANKS],
@ -74,13 +74,13 @@ module VX_axi_adapter #(
output wire [TAG_WIDTH-1:0] m_axi_arid [NUM_BANKS],
output wire [7:0] m_axi_arlen [NUM_BANKS],
output wire [2:0] m_axi_arsize [NUM_BANKS],
output wire [1:0] m_axi_arburst [NUM_BANKS],
output wire [1:0] m_axi_arburst [NUM_BANKS],
output wire [1:0] m_axi_arlock [NUM_BANKS],
output wire [3:0] m_axi_arcache [NUM_BANKS],
output wire [2:0] m_axi_arprot [NUM_BANKS],
output wire [3:0] m_axi_arqos [NUM_BANKS],
output wire [3:0] m_axi_arregion [NUM_BANKS],
// AXI read response channel
input wire m_axi_rvalid [NUM_BANKS],
output wire m_axi_rready [NUM_BANKS],
@ -88,15 +88,15 @@ module VX_axi_adapter #(
input wire m_axi_rlast [NUM_BANKS],
input wire [TAG_WIDTH-1:0] m_axi_rid [NUM_BANKS],
input wire [1:0] m_axi_rresp [NUM_BANKS]
);
);
localparam AXSIZE = `CLOG2(DATA_WIDTH/8);
localparam BANK_ADDRW = `LOG2UP(NUM_BANKS);
localparam BANK_ADDRW = `LOG2UP(NUM_BANKS);
localparam LOG2_NUM_BANKS = `CLOG2(NUM_BANKS);
wire [BANK_ADDRW-1:0] req_bank_sel;
if (NUM_BANKS > 1) begin
assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0];
assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0];
end else begin
assign req_bank_sel = '0;
end
@ -108,12 +108,12 @@ module VX_axi_adapter #(
for (genvar i = 0; i < NUM_BANKS; ++i) begin
wire m_axi_aw_fire = m_axi_awvalid[i] && m_axi_awready[i];
wire m_axi_w_fire = m_axi_wvalid[i] && m_axi_wready[i];
wire m_axi_w_fire = m_axi_wvalid[i] && m_axi_wready[i];
always @(posedge clk) begin
if (reset) begin
m_axi_aw_ack[i] <= 0;
m_axi_w_ack[i] <= 0;
end else begin
end else begin
if (mem_req_fire && (req_bank_sel == i)) begin
m_axi_aw_ack[i] <= 0;
m_axi_w_ack[i] <= 0;
@ -127,10 +127,10 @@ module VX_axi_adapter #(
end
end
wire axi_write_ready [NUM_BANKS];
wire axi_write_ready [NUM_BANKS];
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign axi_write_ready[i] = (m_axi_awready[i] || m_axi_aw_ack[i])
assign axi_write_ready[i] = (m_axi_awready[i] || m_axi_aw_ack[i])
&& (m_axi_wready[i] || m_axi_w_ack[i]);
end
@ -141,17 +141,17 @@ module VX_axi_adapter #(
assign mem_req_ready = mem_req_rw ? axi_write_ready[0] : m_axi_arready[0];
end
// AXI write request address channel
// AXI write request address channel
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign m_axi_awvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~m_axi_aw_ack[i];
assign m_axi_awaddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE;
assign m_axi_awid[i] = mem_req_tag;
assign m_axi_awlen[i] = 8'b00000000;
assign m_axi_awlen[i] = 8'b00000000;
assign m_axi_awsize[i] = 3'(AXSIZE);
assign m_axi_awburst[i] = 2'b00;
assign m_axi_awlock[i] = 2'b00;
assign m_axi_awburst[i] = 2'b00;
assign m_axi_awlock[i] = 2'b00;
assign m_axi_awcache[i] = 4'b0000;
assign m_axi_awprot[i] = 3'b000;
assign m_axi_awprot[i] = 3'b000;
assign m_axi_awqos[i] = 4'b0000;
assign m_axi_awregion[i]= 4'b0000;
end
@ -170,31 +170,31 @@ module VX_axi_adapter #(
`UNUSED_VAR (m_axi_bid[i])
`UNUSED_VAR (m_axi_bresp[i])
assign m_axi_bready[i] = 1'b1;
`RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time));
`RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time));
end
// AXI read request channel
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i);
assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i);
assign m_axi_araddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE;
assign m_axi_arid[i] = mem_req_tag;
assign m_axi_arlen[i] = 8'b00000000;
assign m_axi_arsize[i] = 3'(AXSIZE);
assign m_axi_arburst[i] = 2'b00;
assign m_axi_arlock[i] = 2'b00;
assign m_axi_arburst[i] = 2'b00;
assign m_axi_arlock[i] = 2'b00;
assign m_axi_arcache[i] = 4'b0000;
assign m_axi_arprot[i] = 3'b000;
assign m_axi_arqos[i] = 4'b0000;
assign m_axi_arregion[i]= 4'b0000;
end
// AXI read response channel
// AXI read response channel
wire [NUM_BANKS-1:0] rsp_arb_valid_in;
wire [NUM_BANKS-1:0][DATA_WIDTH+TAG_WIDTH-1:0] rsp_arb_data_in;
wire [NUM_BANKS-1:0] rsp_arb_ready_in;
`UNUSED_VAR (m_axi_rlast)
`UNUSED_VAR (m_axi_rlast)
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign rsp_arb_valid_in[i] = m_axi_rvalid[i];
@ -204,14 +204,16 @@ module VX_axi_adapter #(
`RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rresp[i] == 0, ("%t: *** AXI response error", $time));
end
`RESET_RELAY (rsp_arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (DATA_WIDTH + TAG_WIDTH),
.ARBITER ("R"),
.ARBITER ("F"),
.OUT_BUF (RSP_OUT_BUF)
) rsp_arb (
.clk (clk),
.reset (reset),
.reset (rsp_arb_reset),
.valid_in (rsp_arb_valid_in),
.data_in (rsp_arb_data_in),
.ready_in (rsp_arb_ready_in),

View file

@ -23,12 +23,15 @@ module VX_dp_ram #(
parameter NO_RWCHECK = 0,
parameter LUTRAM = 0,
parameter RW_ASSERT = 0,
parameter RESET_RAM = 0,
parameter READ_ENABLE = 0,
parameter INIT_ENABLE = 0,
parameter INIT_FILE = "",
parameter [DATAW-1:0] INIT_VALUE = 0,
parameter ADDRW = `LOG2UP(SIZE)
) (
input wire clk,
input wire reset,
input wire read,
input wire write,
input wire [WRENW-1:0] wren,
@ -58,42 +61,37 @@ module VX_dp_ram #(
`RUNTIME_ASSERT(~write || (| wren), ("invalid write enable mask"));
end
wire [DATAW-1:0] rdata_w;
`ifdef SYNTHESIS
if (WRENW > 1) begin
`ifdef QUARTUS
if (LUTRAM != 0) begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
if (read) begin
rdata_r <= ram[raddr];
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
assign rdata = rdata_r;
end else begin
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata_w = ram[raddr];
end else begin
reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
@ -103,37 +101,8 @@ module VX_dp_ram #(
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
if (read) begin
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
end else begin
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end else begin
reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end
end
`else
@ -141,35 +110,18 @@ module VX_dp_ram #(
if (LUTRAM != 0) begin
`USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
if (read) begin
rdata_r <= ram[raddr];
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
assign rdata = rdata_r;
end else begin
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
reg [DATAW-1:0] rdata_r;
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -178,37 +130,20 @@ module VX_dp_ram #(
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
if (read) begin
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
assign rdata_w = ram[raddr];
end else begin
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
assign rdata = ram[raddr];
end else begin
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end
end
`endif
@ -217,64 +152,36 @@ module VX_dp_ram #(
if (LUTRAM != 0) begin
`USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
if (read) begin
rdata_r <= ram[raddr];
end
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
assign rdata = rdata_r;
end else begin
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
reg [DATAW-1:0] rdata_r;
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
if (read) begin
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
assign rdata_w = ram[raddr];
end else begin
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
assign rdata = ram[raddr];
end else begin
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end
end
end
`else
// RAM emulation
// simulation
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
@ -283,42 +190,57 @@ module VX_dp_ram #(
assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW];
end
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
reg [DATAW-1:0] prev_data;
reg [ADDRW-1:0] prev_waddr;
reg prev_write;
always @(posedge clk) begin
if (RESET_RAM && reset) begin
for (integer i = 0; i < SIZE; ++i) begin
ram[i] <= DATAW'(INIT_VALUE);
end
end else begin
if (write) begin
ram[waddr] <= ram_n;
end
if (read) begin
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
end else begin
reg [DATAW-1:0] prev_data;
reg [ADDRW-1:0] prev_waddr;
reg prev_write;
always @(posedge clk) begin
if (write) begin
ram[waddr] <= ram_n;
end
if (reset) begin
prev_write <= 0;
prev_data <= '0;
prev_waddr <= '0;
end else begin
prev_write <= write;
prev_data <= ram[waddr];
prev_waddr <= waddr;
end
if (LUTRAM || !NO_RWCHECK) begin
`UNUSED_VAR (prev_write)
`UNUSED_VAR (prev_data)
`UNUSED_VAR (prev_waddr)
assign rdata = ram[raddr];
end else begin
assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr];
if (RW_ASSERT) begin
`RUNTIME_ASSERT(~read || (rdata == ram[raddr]), ("read after write hazard"));
end
end
if (LUTRAM || !NO_RWCHECK) begin
`UNUSED_VAR (prev_write)
`UNUSED_VAR (prev_data)
`UNUSED_VAR (prev_waddr)
assign rdata_w = ram[raddr];
end else begin
assign rdata_w = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr];
if (RW_ASSERT) begin
`RUNTIME_ASSERT(~read || (rdata_w == ram[raddr]), ("read after write hazard"));
end
end
`endif
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (READ_ENABLE && reset) begin
rdata_r <= '0;
end else if (!READ_ENABLE || read) begin
rdata_r <= rdata_w;
end
end
assign rdata = rdata_r;
end else begin
assign rdata = rdata_w;
end
endmodule
`TRACING_ON

View file

@ -1,115 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`TRACING_OFF
// VX_dp_ram_rst: behavioral RAM model with one write port (waddr/wdata/wren)
// and one read port (raddr/rdata) whose whole storage array is rewritten with
// INIT_VALUE on every clock edge where `reset` is asserted.
//
// Parameters (as used by the code below):
//   DATAW       - width in bits of one RAM word.
//   SIZE        - upper bound of the storage array, declared [ADDR_MIN:SIZE-1].
//   ADDR_MIN    - lowest index of the storage array.
//   WRENW       - number of write-enable bits; each one guards a slice of
//                 WSELW = DATAW / WRENW bits of the word.
//   OUT_REG     - non-zero: rdata comes from a register loaded when `read` is set;
//                 zero: rdata is driven combinationally.
//   NO_RWCHECK,
//   LUTRAM      - only consulted when OUT_REG == 0; together they select between
//                 a plain ram[raddr] read and the "previous write" bypass below.
//   INIT_ENABLE,
//   INIT_FILE,
//   INIT_VALUE  - power-up initialization controls consumed by `RAM_INITIALIZATION;
//                 INIT_VALUE is also the value written to every word during reset.
//   ADDRW       - address width, defaults to `LOG2UP(SIZE).
module VX_dp_ram_rst #(
parameter DATAW = 1,
parameter SIZE = 1,
parameter ADDR_MIN = 0,
parameter WRENW = 1,
parameter OUT_REG = 0,
parameter NO_RWCHECK = 0,
parameter LUTRAM = 0,
parameter INIT_ENABLE = 0,
parameter INIT_FILE = "",
parameter [DATAW-1:0] INIT_VALUE = 0,
parameter ADDRW = `LOG2UP(SIZE)
) (
input wire clk,
input wire reset,
input wire read,
input wire write,
input wire [WRENW-1:0] wren,
input wire [ADDRW-1:0] waddr,
input wire [DATAW-1:0] wdata,
input wire [ADDRW-1:0] raddr,
output wire [DATAW-1:0] rdata
);
// Width of the word slice controlled by a single wren bit.
localparam WSELW = DATAW / WRENW;
// DATAW must be an exact multiple of WRENW.
`STATIC_ASSERT((WRENW * WSELW == DATAW), ("invalid parameter"))
// Power-up initialization: when INIT_ENABLE is set, load `ram` from INIT_FILE
// if one is given, otherwise fill indices 0..SIZE-1 with INIT_VALUE.
`define RAM_INITIALIZATION \
if (INIT_ENABLE != 0) begin \
if (INIT_FILE != "") begin \
initial $readmemh(INIT_FILE, ram); \
end else begin \
initial \
for (integer i = 0; i < SIZE; ++i) \
ram[i] = INIT_VALUE; \
end \
end
// NOTE(review): `read` is marked unused here although the OUT_REG branch below
// does read it - presumably this silences lint for the OUT_REG == 0 case; confirm.
`UNUSED_VAR (read)
// RAM emulation
// NOTE(review): the array starts at ADDR_MIN, but the init macro above and the
// reset loops below iterate from index 0 - confirm ADDR_MIN > 0 is never used
// with this module.
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
// Next value for ram[waddr]: for each write-enable slice take the new wdata bits
// when that slice is enabled (or unconditionally when WRENW == 1), otherwise
// keep the bits currently stored at ram[waddr].
wire [DATAW-1:0] ram_n;
for (genvar i = 0; i < WRENW; ++i) begin
assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW];
end
if (OUT_REG != 0) begin
// Registered read: rdata is the value captured in rdata_r.
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (reset) begin
// Reset overwrites every word with INIT_VALUE and clears the output register.
for (integer i = 0; i < SIZE; ++i) begin
ram[i] <= DATAW'(INIT_VALUE);
end
rdata_r <= '0;
end else begin
if (write) begin
ram[waddr] <= ram_n;
end
// The output register only updates on cycles where `read` is asserted.
if (read) begin
rdata_r <= ram[raddr];
end
end
end
assign rdata = rdata_r;
end else begin
// Combinational read. The prev_* registers remember what happened on the
// write port one cycle earlier so the read path can return the old word.
reg [DATAW-1:0] prev_data;
reg [ADDRW-1:0] prev_waddr;
reg prev_write;
always @(posedge clk) begin
if (reset) begin
// Reset overwrites every word with INIT_VALUE and clears the write history.
for (integer i = 0; i < SIZE; ++i) begin
ram[i] <= DATAW'(INIT_VALUE);
end
prev_write <= 0;
prev_data <= '0;
prev_waddr <= '0;
end else begin
if (write) begin
ram[waddr] <= ram_n;
end
// NOTE(review): prev_write follows the OR of wren and is not gated by
// `write` - confirm this is intended.
prev_write <= (| wren);
// Non-blocking sample: captures the word stored at waddr before this
// edge's write lands.
prev_data <= ram[waddr];
prev_waddr <= waddr;
end
end
if (LUTRAM || !NO_RWCHECK) begin
// Plain read of the array; the write history is not used.
`UNUSED_VAR (prev_write)
`UNUSED_VAR (prev_data)
`UNUSED_VAR (prev_waddr)
assign rdata = ram[raddr];
end else begin
// NO_RWCHECK without LUTRAM: when raddr equals the address seen on the write
// port in the previous cycle (and prev_write is set), return the word that
// was stored there before that cycle's update instead of ram[raddr].
assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr];
end
end
endmodule
`TRACING_ON

View file

@ -103,9 +103,9 @@ module VX_elastic_buffer #(
assign ready_in = ~full;
VX_elastic_buffer #(
VX_pipe_buffer #(
.DATAW (DATAW),
.SIZE ((OUT_REG == 2) ? 1 : 0)
.DEPTH ((OUT_REG > 0) ? (OUT_REG-1) : 0)
) out_buf (
.clk (clk),
.reset (reset),

View file

@ -177,10 +177,11 @@ module VX_fifo_queue #(
.SIZE (DEPTH),
.LUTRAM (LUTRAM)
) dp_ram (
.clk(clk),
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (wr_ptr_r),
.wdata (data_in),
.raddr (rd_ptr_r),
@ -226,9 +227,10 @@ module VX_fifo_queue #(
.LUTRAM (LUTRAM)
) dp_ram (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (wr_ptr_r),
.wdata (data_in),
.raddr (rd_ptr_n_r),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -24,17 +24,17 @@ module VX_index_buffer #(
input wire reset,
output wire [ADDRW-1:0] write_addr,
input wire [DATAW-1:0] write_data,
input wire [DATAW-1:0] write_data,
input wire acquire_en,
input wire [ADDRW-1:0] read_addr,
output wire [DATAW-1:0] read_data,
input wire release_en,
output wire empty,
output wire full
output wire full
);
VX_allocator #(
.SIZE (SIZE)
) allocator (
@ -43,9 +43,9 @@ module VX_index_buffer #(
.acquire_en (acquire_en),
.acquire_addr (write_addr),
.release_en (release_en),
.release_addr (read_addr),
.release_addr (read_addr),
.empty (empty),
.full (full)
.full (full)
);
VX_dp_ram #(
@ -54,14 +54,15 @@ module VX_index_buffer #(
.LUTRAM (LUTRAM)
) data_table (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (acquire_en),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (write_addr),
.wdata (write_data),
.raddr (read_addr),
.rdata (read_data)
);
endmodule
`TRACING_ON

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,10 +15,10 @@
`TRACING_OFF
module VX_mem_adapter #(
parameter SRC_DATA_WIDTH = 1,
parameter SRC_ADDR_WIDTH = 1,
parameter DST_DATA_WIDTH = 1,
parameter DST_ADDR_WIDTH = 1,
parameter SRC_DATA_WIDTH = 1,
parameter SRC_ADDR_WIDTH = 1,
parameter DST_DATA_WIDTH = 1,
parameter DST_ADDR_WIDTH = 1,
parameter SRC_TAG_WIDTH = 1,
parameter DST_TAG_WIDTH = 1,
parameter REQ_OUT_BUF = 0,
@ -35,9 +35,9 @@ module VX_mem_adapter #(
input wire [SRC_TAG_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
output wire mem_rsp_valid_in,
output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in,
output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in,
output wire mem_rsp_valid_in,
output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in,
output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in,
input wire mem_rsp_ready_in,
output wire mem_req_valid_out,
@ -48,12 +48,12 @@ module VX_mem_adapter #(
output wire [DST_TAG_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
input wire mem_rsp_valid_out,
input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_out,
input wire mem_rsp_valid_out,
input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_out,
input wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_out,
output wire mem_rsp_ready_out
);
`STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!"))
);
`STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!"))
localparam DST_DATA_SIZE = (DST_DATA_WIDTH / 8);
localparam DST_LDATAW = `CLOG2(DST_DATA_WIDTH);
@ -69,7 +69,7 @@ module VX_mem_adapter #(
wire [DST_TAG_WIDTH-1:0] mem_req_tag_out_w;
wire mem_req_ready_out_w;
wire mem_rsp_valid_in_w;
wire mem_rsp_valid_in_w;
wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in_w;
wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in_w;
wire mem_rsp_ready_in_w;
@ -80,7 +80,7 @@ module VX_mem_adapter #(
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
wire [D-1:0] req_idx = mem_req_addr_in[D-1:0];
wire [D-1:0] rsp_idx = mem_rsp_tag_out[D-1:0];
@ -99,31 +99,31 @@ module VX_mem_adapter #(
assign mem_req_valid_out_w = mem_req_valid_in;
assign mem_req_rw_out_w = mem_req_rw_in;
assign mem_req_byteen_out_w = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3));
assign mem_req_byteen_out_w = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3));
assign mem_req_data_out_w = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW);
assign mem_req_tag_out_w = DST_TAG_WIDTH'({mem_req_tag_in, req_idx});
assign mem_req_ready_in = mem_req_ready_out_w;
assign mem_rsp_valid_in_w = mem_rsp_valid_out;
assign mem_rsp_data_in_w = mem_rsp_data_out_w[rsp_idx];
assign mem_rsp_data_in_w = mem_rsp_data_out_w[rsp_idx];
assign mem_rsp_tag_in_w = SRC_TAG_WIDTH'(mem_rsp_tag_out[SRC_TAG_WIDTH+D-1:D]);
assign mem_rsp_ready_out = mem_rsp_ready_in_w;
end else if (DST_LDATAW < SRC_LDATAW) begin
reg [D-1:0] req_ctr, rsp_ctr;
reg [P-1:0][DST_DATA_WIDTH-1:0] mem_rsp_data_out_r, mem_rsp_data_out_n;
wire mem_req_out_fire = mem_req_valid_out && mem_req_ready_out;
wire mem_rsp_in_fire = mem_rsp_valid_out && mem_rsp_ready_out;
wire mem_rsp_in_fire = mem_rsp_valid_out && mem_rsp_ready_out;
wire [P-1:0][DST_DATA_WIDTH-1:0] mem_req_data_in_w = mem_req_data_in;
wire [P-1:0][DST_DATA_SIZE-1:0] mem_req_byteen_in_w = mem_req_byteen_in;
always @(*) begin
mem_rsp_data_out_n = mem_rsp_data_out_r;
if (mem_rsp_in_fire) begin
if (mem_rsp_in_fire) begin
mem_rsp_data_out_n[rsp_ctr] = mem_rsp_data_out;
end
end
@ -139,24 +139,24 @@ module VX_mem_adapter #(
if (mem_rsp_in_fire) begin
rsp_ctr <= rsp_ctr + 1;
end
end
end
mem_rsp_data_out_r <= mem_rsp_data_out_n;
end
reg [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_r;
wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_x;
always @(posedge clk) begin
if (mem_rsp_in_fire) begin
mem_rsp_tag_in_r <= mem_rsp_tag_out;
end
end
end
assign mem_rsp_tag_in_x = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_out;
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_x == mem_rsp_tag_out),
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_x == mem_rsp_tag_out),
("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_x, mem_rsp_tag_out))
wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr};
if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin
`UNUSED_VAR (mem_req_addr_in_qual)
assign mem_req_addr_out_w = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
@ -181,8 +181,8 @@ module VX_mem_adapter #(
end else begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (reset)
if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin
`UNUSED_VAR (mem_req_addr_in)
assign mem_req_addr_out_w = mem_req_addr_in[DST_ADDR_WIDTH-1:0];
@ -206,13 +206,15 @@ module VX_mem_adapter #(
end
`RESET_RELAY (req_out_reset, reset);
VX_elastic_buffer #(
.DATAW (1 + DST_DATA_SIZE + DST_ADDR_WIDTH + DST_DATA_WIDTH + DST_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF))
) req_out_buf (
.clk (clk),
.reset (reset),
.reset (req_out_reset),
.valid_in (mem_req_valid_out_w),
.ready_in (mem_req_ready_out_w),
.data_in ({mem_req_rw_out_w, mem_req_byteen_out_w, mem_req_addr_out_w, mem_req_data_out_w, mem_req_tag_out_w}),
@ -221,13 +223,15 @@ module VX_mem_adapter #(
.ready_out (mem_req_ready_out)
);
`RESET_RELAY (rsp_in_reset, reset);
VX_elastic_buffer #(
.DATAW (SRC_DATA_WIDTH + SRC_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(RSP_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(RSP_OUT_BUF))
) rsp_in_buf (
.clk (clk),
.reset (reset),
.reset (rsp_in_reset),
.valid_in (mem_rsp_valid_in_w),
.ready_in (mem_rsp_ready_in_w),
.data_in ({mem_rsp_data_in_w, mem_rsp_tag_in_w}),

View file

@ -87,16 +87,16 @@ module VX_mem_coalescer #(
localparam STATE_SETUP = 0;
localparam STATE_SEND = 1;
reg state_r, state_n;
logic state_r, state_n;
reg out_req_valid_r, out_req_valid_n;
reg out_req_rw_r, out_req_rw_n;
reg [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n;
reg [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n;
reg [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n;
reg [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n;
logic out_req_valid_r, out_req_valid_n;
logic out_req_rw_r, out_req_rw_n;
logic [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n;
logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n;
logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n;
logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n;
logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n;
logic [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n;
reg in_req_ready_n;
@ -149,29 +149,6 @@ module VX_mem_coalescer #(
end
end
always @(posedge clk) begin
if (reset) begin
state_r <= STATE_SETUP;
processed_mask_r <= '0;
out_req_valid_r <= 0;
end else begin
state_r <= state_n;
batch_valid_r <= batch_valid_n;
seed_addr_r <= seed_addr_n;
seed_atype_r <= seed_atype_n;
addr_matches_r <= addr_matches_n;
out_req_valid_r <= out_req_valid_n;
out_req_mask_r <= out_req_mask_n;
out_req_rw_r <= out_req_rw_n;
out_req_addr_r <= out_req_addr_n;
out_req_atype_r <= out_req_atype_n;
out_req_byteen_r <= out_req_byteen_n;
out_req_data_r <= out_req_data_n;
out_req_tag_r <= out_req_tag_n;
processed_mask_r <= processed_mask_n;
end
end
wire [NUM_REQS-1:0] current_pmask = in_req_mask & addr_matches_r;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] req_byteen_merged;
@ -248,6 +225,19 @@ module VX_mem_coalescer #(
endcase
end
`RESET_RELAY (pipe_reset, reset);
VX_pipe_register #(
.DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + ATYPE_WIDTH + OUT_ADDR_WIDTH + ATYPE_WIDTH + DATA_OUT_SIZE + DATA_OUT_WIDTH) + OUT_TAG_WIDTH),
.RESETW (1 + NUM_REQS + 1)
) pipe_reg (
.clk (clk),
.reset (pipe_reset),
.enable (1'b1),
.data_in ({state_n, processed_mask_n, out_req_valid_n, out_req_rw_n, addr_matches_n, batch_valid_n, out_req_mask_n, seed_addr_n, seed_atype_n, out_req_addr_n, out_req_atype_n, out_req_byteen_n, out_req_data_n, out_req_tag_n}),
.data_out ({state_r, processed_mask_r, out_req_valid_r, out_req_rw_r, addr_matches_r, batch_valid_r, out_req_mask_r, seed_addr_r, seed_atype_r, out_req_addr_r, out_req_atype_r, out_req_byteen_r, out_req_data_r, out_req_tag_r})
);
wire out_rsp_fire = out_rsp_valid && out_rsp_ready;
wire out_rsp_eop;

View file

@ -167,13 +167,15 @@ module VX_mem_scheduler #(
assign reqq_tag_u = ibuf_waddr;
end
`RESET_RELAY (reqq_reset, reset);
VX_elastic_buffer #(
.DATAW (1 + CORE_REQS * (1 + WORD_SIZE + ADDR_WIDTH + ATYPE_WIDTH + WORD_WIDTH) + REQQ_TAG_WIDTH),
.SIZE (CORE_QUEUE_SIZE),
.OUT_REG (1)
) req_queue (
.clk (clk),
.reset (reset),
.reset (reqq_reset),
.valid_in (reqq_valid_in),
.ready_in (reqq_ready_in),
.data_in ({core_req_rw, core_req_mask, core_req_byteen, core_req_addr, core_req_atype, core_req_data, reqq_tag_u}),
@ -389,13 +391,15 @@ module VX_mem_scheduler #(
assign reqq_ready_s = req_sent_all;
`RESET_RELAY (mem_req_reset, reset);
VX_elastic_buffer #(
.DATAW (MEM_CHANNELS + 1 + MEM_CHANNELS * (LINE_SIZE + MEM_ADDR_WIDTH + ATYPE_WIDTH + LINE_WIDTH) + MEM_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.reset (mem_req_reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_mask_s, mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_atype_s, mem_req_data_s, mem_req_tag_s}),
@ -509,13 +513,15 @@ module VX_mem_scheduler #(
// Send response to caller
`RESET_RELAY (crsp_reset, reset);
VX_elastic_buffer #(
.DATAW (CORE_REQS + 1 + 1 + (CORE_REQS * WORD_WIDTH) + TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(CORE_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) rsp_buf (
.clk (clk),
.reset (reset),
.reset (crsp_reset),
.valid_in (crsp_valid),
.ready_in (crsp_ready),
.data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag}),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -23,13 +23,13 @@ module VX_onehot_encoder #(
parameter MODEL = 1,
parameter LN = `LOG2UP(N)
) (
input wire [N-1:0] data_in,
input wire [N-1:0] data_in,
output wire [LN-1:0] data_out,
output wire valid_out
);
);
if (N == 1) begin
assign data_out = data_in;
assign data_out = 0;
assign valid_out = data_in;
end else if (N == 2) begin
@ -37,43 +37,43 @@ module VX_onehot_encoder #(
assign data_out = data_in[!REVERSE];
assign valid_out = (| data_in);
end else if (MODEL == 1) begin
localparam M = 1 << LN;
`IGNORE_UNOPTFLAT_BEGIN
end else if (MODEL == 1) begin
localparam M = 1 << LN;
`IGNORE_UNOPTFLAT_BEGIN
wire [LN-1:0][M-1:0] addr;
wire [LN:0][M-1:0] v;
`IGNORE_UNOPTFLAT_END
// base case, also handle padding for non-power of two inputs
assign v[0] = REVERSE ? (M'(data_in) << (M - N)) : M'(data_in);
for (genvar lvl = 1; lvl < (LN+1); ++lvl) begin
localparam SN = 1 << (LN - lvl);
localparam SI = M / SN;
localparam SW = lvl;
for (genvar s = 0; s < SN; ++s) begin
`IGNORE_UNOPTFLAT_BEGIN
wire [1:0] vs = {v[lvl-1][s*SI+(SI>>1)], v[lvl-1][s*SI]};
`IGNORE_UNOPTFLAT_END
assign v[lvl][s*SI] = (| vs);
if (lvl == 1) begin
assign addr[lvl-1][s*SI +: SW] = vs[!REVERSE];
assign addr[lvl-1][s*SI +: SW] = vs[!REVERSE];
end else begin
assign addr[lvl-1][s*SI +: SW] = {
assign addr[lvl-1][s*SI +: SW] = {
vs[!REVERSE],
addr[lvl-2][s*SI +: SW-1] | addr[lvl-2][s*SI+(SI>>1) +: SW-1]
};
end
end
end
end
end
end
assign data_out = addr[LN-1][LN-1:0];
assign valid_out = v[LN][0];
end else if (MODEL == 2 && REVERSE == 0) begin
end else if (MODEL == 2 && REVERSE == 0) begin
for (genvar j = 0; j < LN; ++j) begin
wire [N-1:0] mask;
@ -90,19 +90,19 @@ module VX_onehot_encoder #(
reg [LN-1:0] index_r;
if (REVERSE != 0) begin
always @(*) begin
index_r = 'x;
always @(*) begin
index_r = 'x;
for (integer i = N-1; i >= 0; --i) begin
if (data_in[i]) begin
if (data_in[i]) begin
index_r = LN'(N-1-i);
end
end
end
end else begin
always @(*) begin
index_r = 'x;
always @(*) begin
index_r = 'x;
for (integer i = 0; i < N; ++i) begin
if (data_in[i]) begin
if (data_in[i]) begin
index_r = LN'(i);
end
end

View file

@ -17,7 +17,8 @@
module VX_onehot_mux #(
parameter DATAW = 1,
parameter N = 1,
parameter MODEL = 1
parameter MODEL = 1,
parameter LUT_OPT = 1
) (
input wire [N-1:0][DATAW-1:0] data_in,
input wire [N-1:0] sel_in,
@ -26,6 +27,90 @@ module VX_onehot_mux #(
if (N == 1) begin
`UNUSED_VAR (sel_in)
assign data_out = data_in;
end else if (LUT_OPT && N == 2) begin
`UNUSED_VAR (sel_in)
assign data_out = sel_in[0] ? data_in[0] : data_in[1];
end else if (LUT_OPT && N == 3) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
3'b001: data_out_r = data_in[0];
3'b010: data_out_r = data_in[1];
3'b100: data_out_r = data_in[2];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 4) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
4'b0001: data_out_r = data_in[0];
4'b0010: data_out_r = data_in[1];
4'b0100: data_out_r = data_in[2];
4'b1000: data_out_r = data_in[3];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 5) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
5'b00001: data_out_r = data_in[0];
5'b00010: data_out_r = data_in[1];
5'b00100: data_out_r = data_in[2];
5'b01000: data_out_r = data_in[3];
5'b10000: data_out_r = data_in[4];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 6) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
6'b000001: data_out_r = data_in[0];
6'b000010: data_out_r = data_in[1];
6'b000100: data_out_r = data_in[2];
6'b001000: data_out_r = data_in[3];
6'b010000: data_out_r = data_in[4];
6'b100000: data_out_r = data_in[5];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 7) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
7'b0000001: data_out_r = data_in[0];
7'b0000010: data_out_r = data_in[1];
7'b0000100: data_out_r = data_in[2];
7'b0001000: data_out_r = data_in[3];
7'b0010000: data_out_r = data_in[4];
7'b0100000: data_out_r = data_in[5];
7'b1000000: data_out_r = data_in[6];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 8) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
8'b00000001: data_out_r = data_in[0];
8'b00000010: data_out_r = data_in[1];
8'b00000100: data_out_r = data_in[2];
8'b00001000: data_out_r = data_in[3];
8'b00010000: data_out_r = data_in[4];
8'b00100000: data_out_r = data_in[5];
8'b01000000: data_out_r = data_in[6];
8'b10000000: data_out_r = data_in[7];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (MODEL == 1) begin
wire [N-1:0][DATAW-1:0] mask;
for (genvar i = 0; i < N; ++i) begin

View file

@ -21,7 +21,8 @@ module VX_pe_serializer #(
parameter DATA_IN_WIDTH = 1,
parameter DATA_OUT_WIDTH = 1,
parameter TAG_WIDTH = 0,
parameter PE_REG = 0
parameter PE_REG = 0,
parameter OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -43,6 +44,11 @@ module VX_pe_serializer #(
output wire [TAG_WIDTH-1:0] tag_out,
input wire ready_out
);
wire valid_out_u;
wire [NUM_LANES-1:0][DATA_OUT_WIDTH-1:0] data_out_u;
wire [TAG_WIDTH-1:0] tag_out_u;
wire ready_out_u;
wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_in_s;
wire valid_out_s;
wire [TAG_WIDTH-1:0] tag_out_s;
@ -105,7 +111,7 @@ module VX_pe_serializer #(
reg [TAG_WIDTH-1:0] tag_out_r;
wire valid_out_b = valid_out_s && batch_out_done;
wire ready_out_b = ready_out || ~valid_out;
wire ready_out_b = ready_out_u || ~valid_out_u;
always @(posedge clk) begin
if (reset) begin
@ -119,29 +125,44 @@ module VX_pe_serializer #(
end
end
assign enable = ready_out_b || ~valid_out_b;
assign ready_in = enable && batch_in_done;
assign enable = ready_out_b || ~valid_out_b;
assign ready_in = enable && batch_in_done;
assign pe_enable = enable;
assign pe_enable = enable;
assign valid_out = valid_out_r;
assign data_out = data_out_r;
assign tag_out = tag_out_r;
assign valid_out_u = valid_out_r;
assign data_out_u = data_out_r;
assign tag_out_u = tag_out_r;
end else begin
assign pe_data_in_s = data_in;
assign enable = ready_out || ~valid_out;
assign ready_in = enable;
assign enable = ready_out_u || ~valid_out_u;
assign ready_in = enable;
assign pe_enable = enable;
assign pe_enable = enable;
assign valid_out = valid_out_s;
assign data_out = pe_data_out;
assign tag_out = tag_out_s;
assign valid_out_u = valid_out_s;
assign data_out_u = pe_data_out;
assign tag_out_u = tag_out_s;
end
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (NUM_LANES * DATA_OUT_WIDTH + TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_out_u),
.ready_in (ready_out_u),
.data_in ({data_out_u, tag_out_u}),
.data_out ({data_out, tag_out}),
.valid_out (valid_out),
.ready_out (ready_out)
);
endmodule
`TRACING_ON

View file

@ -1,11 +1,11 @@
// Copyright 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -24,39 +24,53 @@
`TRACING_OFF
// VX_pipe_buffer: valid/ready pipeline buffer between an input stream
// (valid_in/ready_in/data_in) and an output stream (valid_out/ready_out/data_out).
// NOTE(review): this span is a rendered diff hunk, so lines from two versions of
// the module appear side by side (one selected by PASSTHRU, one by DEPTH);
// confirm against the checked-in file which lines are current.
module VX_pipe_buffer #(
parameter DATAW = 1,
parameter PASSTHRU = 0
) (
parameter DATAW = 1,
parameter DEPTH = 1
) (
input wire clk,
input wire reset,
input wire valid_in,
output wire ready_in,
output wire ready_in,
input wire [DATAW-1:0] data_in,
output wire [DATAW-1:0] data_out,
input wire ready_out,
output wire valid_out
);
if (PASSTHRU != 0) begin
);
// DEPTH == 0: no register stage; the input handshake and data are wired
// directly to the output and clk/reset are unused
if (DEPTH == 0) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign ready_in = ready_out;
assign valid_out = valid_in;
assign valid_out = valid_in;
assign data_out = data_in;
end else begin
// single-register form: stall while the output holds valid data that the
// consumer has not accepted
wire stall = valid_out && ~ready_out;
// multi-stage form: index 0 is the input side, index DEPTH is the output side
wire [DEPTH:0] valid;
`IGNORE_UNOPTFLAT_BEGIN
wire [DEPTH:0] ready;
`IGNORE_UNOPTFLAT_END
wire [DEPTH:0][DATAW-1:0] data;
VX_pipe_register #(
.DATAW (1 + DATAW),
.RESETW (1)
) pipe_register (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in, data_in}),
.data_out ({valid_out, data_out})
);
assign valid[0] = valid_in;
assign data[0] = data_in;
assign ready_in = ready[0];
// one register per stage; stage i is enabled when stage i+1 is ready
// or currently holds no valid data
for (genvar i = 0; i < DEPTH; ++i) begin
assign ready[i] = (ready[i+1] || ~valid[i+1]);
VX_pipe_register #(
.DATAW (1 + DATAW),
.RESETW (1)
) pipe_register (
.clk (clk),
.reset (reset),
.enable (ready[i]),
.data_in ({valid[i], data[i]}),
.data_out ({valid[i+1], data[i+1]})
);
end
// last stage drives the module outputs; downstream ready feeds the chain
assign valid_out = valid[DEPTH];
assign data_out = data[DEPTH];
assign ready[DEPTH] = ready_out;
assign ready_in = ~stall;
end
endmodule

View file

@ -23,12 +23,14 @@ module VX_sp_ram #(
parameter NO_RWCHECK = 0,
parameter RW_ASSERT = 0,
parameter LUTRAM = 0,
parameter RESET_RAM = 0,
parameter INIT_ENABLE = 0,
parameter INIT_FILE = "",
parameter [DATAW-1:0] INIT_VALUE = 0,
parameter ADDRW = `LOG2UP(SIZE)
) (
input wire clk,
input wire reset,
input wire read,
input wire write,
input wire [WRENW-1:0] wren,
@ -45,12 +47,14 @@ module VX_sp_ram #(
.NO_RWCHECK (NO_RWCHECK),
.RW_ASSERT (RW_ASSERT),
.LUTRAM (LUTRAM),
.RESET_RAM (RESET_RAM),
.INIT_ENABLE (INIT_ENABLE),
.INIT_FILE (INIT_FILE),
.INIT_VALUE (INIT_VALUE),
.ADDRW (ADDRW)
) dp_ram (
.clk (clk),
.reset (reset),
.read (read),
.write (write),
.wren (wren),

View file

@ -18,7 +18,7 @@ module VX_stream_arb #(
parameter NUM_INPUTS = 1,
parameter NUM_OUTPUTS = 1,
parameter DATAW = 1,
parameter `STRING ARBITER = "P",
parameter `STRING ARBITER = "R",
parameter MAX_FANOUT = `MAX_FANOUT,
parameter OUT_BUF = 0,
parameter LUTRAM = 0,
@ -46,14 +46,14 @@ module VX_stream_arb #(
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
localparam BATCH_BEGIN = i * NUM_REQS;
localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_INPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
localparam SLICE_BEGIN = i * NUM_REQS;
localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_INPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (BATCH_SIZE),
.NUM_INPUTS (SLICE_SIZE),
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
@ -63,9 +63,9 @@ module VX_stream_arb #(
) arb_slice (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]),
.ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]),
.data_in (data_in[BATCH_END-1: BATCH_BEGIN]),
.valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]),
.ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]),
.data_in (data_in[SLICE_END-1: SLICE_BEGIN]),
.data_out (data_out[i]),
.sel_out (sel_out[i]),
.valid_out (valid_out[i]),
@ -77,28 +77,28 @@ module VX_stream_arb #(
// (#inputs > max_fanout) and (#outputs == 1)
localparam NUM_BATCHES = `CDIV(NUM_INPUTS, MAX_FANOUT);
localparam NUM_SLICES = `CDIV(NUM_INPUTS, MAX_FANOUT);
localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT);
localparam LOG_NUM_REQS3 = `CLOG2(NUM_BATCHES);
localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES);
wire [NUM_BATCHES-1:0] valid_tmp;
wire [NUM_BATCHES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp;
wire [NUM_BATCHES-1:0] ready_tmp;
wire [NUM_SLICES-1:0] valid_tmp;
wire [NUM_SLICES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp;
wire [NUM_SLICES-1:0] ready_tmp;
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
for (genvar i = 0; i < NUM_SLICES; ++i) begin
localparam BATCH_BEGIN = i * MAX_FANOUT;
localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_INPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
localparam SLICE_BEGIN = i * MAX_FANOUT;
localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_INPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
wire [DATAW-1:0] data_tmp_u;
wire [`LOG2UP(BATCH_SIZE)-1:0] sel_tmp_u;
wire [`LOG2UP(SLICE_SIZE)-1:0] sel_tmp_u;
`RESET_RELAY (slice_reset, reset);
if (MAX_FANOUT != 1) begin
VX_stream_arb #(
.NUM_INPUTS (BATCH_SIZE),
.NUM_INPUTS (SLICE_SIZE),
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
@ -108,9 +108,9 @@ module VX_stream_arb #(
) fanout_slice_arb (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]),
.data_in (data_in[BATCH_END-1: BATCH_BEGIN]),
.ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]),
.valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]),
.data_in (data_in[SLICE_END-1: SLICE_BEGIN]),
.ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]),
.valid_out (valid_tmp[i]),
.data_out (data_tmp_u),
.sel_out (sel_tmp_u),
@ -125,7 +125,7 @@ module VX_stream_arb #(
wire [LOG_NUM_REQS3-1:0] sel_out_u;
VX_stream_arb #(
.NUM_INPUTS (NUM_BATCHES),
.NUM_INPUTS (NUM_SLICES),
.NUM_OUTPUTS (1),
.DATAW (DATAW + LOG_NUM_REQS2),
.ARBITER (ARBITER),
@ -214,15 +214,15 @@ module VX_stream_arb #(
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
localparam BATCH_BEGIN = i * NUM_REQS;
localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_OUTPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
localparam SLICE_BEGIN = i * NUM_REQS;
localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_OUTPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (BATCH_SIZE),
.NUM_OUTPUTS (SLICE_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
@ -234,13 +234,13 @@ module VX_stream_arb #(
.valid_in (valid_in[i]),
.ready_in (ready_in[i]),
.data_in (data_in[i]),
.data_out (data_out[BATCH_END-1: BATCH_BEGIN]),
.valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]),
.ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]),
.data_out (data_out[SLICE_END-1: SLICE_BEGIN]),
.valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]),
.ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]),
`UNUSED_PIN (sel_out)
);
for (genvar j = BATCH_BEGIN; j < BATCH_END; ++j) begin
for (genvar j = SLICE_BEGIN; j < SLICE_END; ++j) begin
assign sel_out[j] = i;
end
end
@ -249,15 +249,15 @@ module VX_stream_arb #(
// (#inputs == 1) and (#outputs > max_fanout)
localparam NUM_BATCHES = `CDIV(NUM_OUTPUTS, MAX_FANOUT);
localparam NUM_SLICES = `CDIV(NUM_OUTPUTS, MAX_FANOUT);
wire [NUM_BATCHES-1:0] valid_tmp;
wire [NUM_BATCHES-1:0][DATAW-1:0] data_tmp;
wire [NUM_BATCHES-1:0] ready_tmp;
wire [NUM_SLICES-1:0] valid_tmp;
wire [NUM_SLICES-1:0][DATAW-1:0] data_tmp;
wire [NUM_SLICES-1:0] ready_tmp;
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (NUM_BATCHES),
.NUM_OUTPUTS (NUM_SLICES),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
@ -275,17 +275,17 @@ module VX_stream_arb #(
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
for (genvar i = 0; i < NUM_SLICES; ++i) begin
localparam BATCH_BEGIN = i * MAX_FANOUT;
localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_OUTPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
localparam SLICE_BEGIN = i * MAX_FANOUT;
localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_OUTPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (BATCH_SIZE),
.NUM_OUTPUTS (SLICE_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
@ -297,9 +297,9 @@ module VX_stream_arb #(
.valid_in (valid_tmp[i]),
.ready_in (ready_tmp[i]),
.data_in (data_tmp[i]),
.data_out (data_out[BATCH_END-1: BATCH_BEGIN]),
.valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]),
.ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]),
.data_out (data_out[SLICE_END-1: SLICE_BEGIN]),
.valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]),
.ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]),
`UNUSED_PIN (sel_out)
);
end

View file

@ -20,7 +20,7 @@ module VX_stream_xbar #(
parameter DATAW = 4,
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS),
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
parameter ARBITER = "P",
parameter ARBITER = "R",
parameter OUT_BUF = 0,
parameter LUTRAM = 0,
parameter MAX_FANOUT = `MAX_FANOUT,

View file

@ -94,7 +94,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_req_idx;
wire [NUM_BANKS-1:0] per_bank_req_ready;
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_all;
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_aos;
wire [NUM_REQS-1:0] req_valid_in;
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in;
@ -111,7 +111,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
req_bank_addr[i],
mem_bus_if[i].req_data.byteen,
mem_bus_if[i].req_data.data,
mem_bus_if[i].req_data.tag};
mem_bus_if[i].req_data.tag
};
assign mem_bus_if[i].req_ready = req_ready_in[i];
end
@ -120,6 +121,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.ARBITER ("F"),
.OUT_BUF (3) // output should be registered for the data_store addressing
) req_xbar (
.clk (clk),
@ -134,7 +136,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
.sel_in (req_bank_idx),
.ready_in (req_ready_in),
.valid_out (per_bank_req_valid),
.data_out (per_bank_req_data_all),
.data_out (per_bank_req_data_aos),
.sel_out (per_bank_req_idx),
.ready_out (per_bank_req_ready)
);
@ -145,7 +147,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
per_bank_req_addr[i],
per_bank_req_byteen[i],
per_bank_req_data[i],
per_bank_req_tag[i]} = per_bank_req_data_all[i];
per_bank_req_tag[i]
} = per_bank_req_data_aos[i];
end
// banks access
@ -156,38 +159,55 @@ module VX_local_mem import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_rsp_tag;
wire [NUM_BANKS-1:0] per_bank_rsp_ready;
`RESET_RELAY (bank_reset, reset);
for (genvar i = 0; i < NUM_BANKS; ++i) begin
wire bank_rsp_valid, bank_rsp_ready;
wire [WORD_WIDTH-1:0] bank_rsp_data;
`RESET_RELAY (bram_reset, reset);
VX_sp_ram #(
.DATAW (WORD_WIDTH),
.SIZE (WORDS_PER_BANK),
.WRENW (WORD_SIZE)
.WRENW (WORD_SIZE),
.NO_RWCHECK (1)
) data_store (
.clk (clk),
.read (1'b1),
.reset (bram_reset),
.read (per_bank_req_valid[i] && per_bank_req_ready[i] && ~per_bank_req_rw[i]),
.write (per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]),
.wren (per_bank_req_byteen[i]),
.addr (per_bank_req_addr[i]),
.wdata (per_bank_req_data[i]),
.rdata (per_bank_rsp_data[i])
.rdata (bank_rsp_data)
);
// drop write response
wire per_bank_req_valid_w, per_bank_req_ready_w;
assign per_bank_req_valid_w = per_bank_req_valid[i] && ~per_bank_req_rw[i];
assign per_bank_req_ready[i] = per_bank_req_ready_w || per_bank_req_rw[i];
// read-during-write hazard detection
reg [BANK_ADDR_WIDTH-1:0] last_wr_addr;
reg last_wr_valid;
always @(posedge clk) begin
if (bram_reset) begin
last_wr_valid <= 0;
end else begin
last_wr_valid <= per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i];
end
last_wr_addr <= per_bank_req_addr[i];
end
wire is_rdw_hazard = last_wr_valid && ~per_bank_req_rw[i] && (per_bank_req_addr[i] == last_wr_addr);
VX_elastic_buffer #(
.DATAW (REQ_SEL_WIDTH + TAG_WIDTH),
.SIZE (0)
) bank_buf (
// drop write response and stall on read-during-write hazard
assign bank_rsp_valid = per_bank_req_valid[i] && ~per_bank_req_rw[i] && ~is_rdw_hazard;
assign per_bank_req_ready[i] = (bank_rsp_ready || per_bank_req_rw[i]) && ~is_rdw_hazard;
// register BRAM output
VX_pipe_buffer #(
.DATAW (REQ_SEL_WIDTH + WORD_WIDTH + TAG_WIDTH)
) bram_buf (
.clk (clk),
.reset (bank_reset),
.valid_in (per_bank_req_valid_w),
.ready_in (per_bank_req_ready_w),
.data_in ({per_bank_req_idx[i], per_bank_req_tag[i]}),
.data_out ({per_bank_rsp_idx[i], per_bank_rsp_tag[i]}),
.reset (bram_reset),
.valid_in (bank_rsp_valid),
.ready_in (bank_rsp_ready),
.data_in ({per_bank_req_idx[i], bank_rsp_data, per_bank_req_tag[i]}),
.data_out ({per_bank_rsp_idx[i], per_bank_rsp_data[i], per_bank_rsp_tag[i]}),
.valid_out (per_bank_rsp_valid[i]),
.ready_out (per_bank_rsp_ready[i])
);
@ -195,10 +215,10 @@ module VX_local_mem import VX_gpu_pkg::*; #(
// bank responses gather
wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_all;
wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_aos;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign per_bank_rsp_data_all[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]};
assign per_bank_rsp_data_aos[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]};
end
wire [NUM_REQS-1:0] rsp_valid_out;
@ -209,6 +229,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (RSP_DATAW),
.ARBITER ("P"), // this priority arbiter has negligible impact on performance
.OUT_BUF (OUT_BUF)
) rsp_xbar (
.clk (clk),
@ -216,7 +237,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
`UNUSED_PIN (collisions),
.sel_in (per_bank_rsp_idx),
.valid_in (per_bank_rsp_valid),
.data_in (per_bank_rsp_data_all),
.data_in (per_bank_rsp_data_aos),
.ready_in (per_bank_rsp_ready),
.valid_out (rsp_valid_out),
.data_out (rsp_data_out),

View file

@ -73,10 +73,10 @@ ifneq ($(TARGET), fpga)
CFLAGS += -DSIMULATION
endif
# Debugigng
# Debugging
ifdef DEBUG
ifneq ($(TARGET), fpga)
CFLAGS += $(DBG_TRACE_FLAGS)
CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS)
else
CFLAGS += -DNDEBUG
endif

View file

@ -45,6 +45,7 @@ FPGA_BIN_DIR=<bin_dir> XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xr
# build report logs
<build_dir>/bin/vortex_afu.xclbin.info
<build_dir>/_x/logs/link/vivado.log # search for keyword "Very high fanout"
<build_dir>/_x/reports/link/link/imp/impl_1_full_util_routed.rpt
<build_dir>/_x/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt # search for keyword "VIOLATED"
<build_dir>/_x/logs/link/syn/ulp_vortex_afu_1_0_synth_1_runme.log

View file

@ -111,12 +111,12 @@ ifeq ($(TARGET), hw_emu)
CFLAGS += -DSIMULATION
endif
# Debugigng
# Debugging
ifdef DEBUG
VPP_FLAGS += -g --debug.protocol all
ifneq ($(TARGET), hw)
VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all
CFLAGS += $(DBG_TRACE_FLAGS)
CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS)
else
CFLAGS += -DNDEBUG
endif

View file

@ -49,7 +49,7 @@ endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache
RTL_INCLUDE += $(FPU_INCLUDE)
# Debugigng
# Debugging
ifdef DEBUG
CFLAGS += $(DBG_TRACE_FLAGS)
else

View file

@ -29,7 +29,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count()
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)

View file

@ -51,8 +51,10 @@ _start:
# la t0, trap_entry
# csrw mtvec, t0
#ifdef HAVE_INITFINI_ARRAY
# run global initialization functions
call __libc_init_array
#endif
# call main program routine
call main

View file

@ -122,8 +122,10 @@ void __libc_fini_array (void) {
// This function will be called by LIBC at program exit.
// Since this platform only support statically linked programs,
// it is not required to support LIBC's exit functions registration via atexit().
void __funcs_on_exit() {
void __funcs_on_exit (void) {
#ifdef HAVE_INITFINI_ARRAY
__libc_fini_array();
#endif
}
#ifdef __cplusplus

View file

@ -30,7 +30,7 @@ else
CXXFLAGS += -I$(SYN_DIR)
endif
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -19,7 +19,7 @@ LDFLAGS += -L$(DESTDIR) -lrtlsim
SRCS := $(SRC_DIR)/vortex.cpp
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -15,7 +15,7 @@ LDFLAGS += -L$(DESTDIR) -lsimx
SRCS := $(SRC_DIR)/vortex.cpp
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -12,7 +12,7 @@ LDFLAGS += -shared -pthread -ldl
SRCS := $(SRC_DIR)/vortex.cpp $(SRC_DIR)/utils.cpp
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -314,7 +314,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
if (num_cores > 1) {
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core;
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n"
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n"
, core_id
, scrb_stalls_per_core
, scrb_percent_per_core
@ -559,7 +559,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n"
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n"
, scrb_stalls
, scrb_percent
, calcAvgPercent(scrb_alu, scrb_total)

View file

@ -26,7 +26,7 @@ endif
PROJECT := libvortex-xrt.so
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -83,7 +83,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count()
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)

View file

@ -65,7 +65,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count()
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)

View file

@ -20,7 +20,7 @@ LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulato
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer

View file

@ -82,7 +82,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count()
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)

View file

@ -46,13 +46,15 @@ int test_global_memory() {
///////////////////////////////////////////////////////////////////////////////
int* lmem_addr = (int*)LMEM_BASE_ADDR;
volatile int* lmem_addr = (int*)LMEM_BASE_ADDR;
int lmem_buffer[8];
void __attribute__((noinline)) do_lmem_wr() {
unsigned tid = vx_thread_id();
lmem_addr[tid] = 65 + tid;
int x = lmem_addr[tid];
lmem_addr[tid] = x;
}
void __attribute__((noinline)) do_lmem_rd() {

View file

@ -44,7 +44,7 @@ CXXFLAGS += -I$(POCL_PATH)/include
POCL_CC_FLAGS += LLVM_PREFIX=$(LLVM_VORTEX) POCL_VORTEX_BINTOOL="$(VX_BINTOOL)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)"
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
POCL_CC_FLAGS += POCL_DEBUG=all

View file

@ -52,7 +52,7 @@ CXXFLAGS += -I$(VORTEX_RT_PATH)/include -I$(ROOT_DIR)/hw
LDFLAGS += -L$(ROOT_DIR)/runtime -lvortex
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -2,7 +2,7 @@
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/common
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else