mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Merge branch 'develop'
This commit is contained in:
commit
267521a1cb
1284 changed files with 5519 additions and 165618 deletions
|
@ -25,12 +25,14 @@ cache:
|
|||
- $HOME/build64
|
||||
|
||||
before_install:
|
||||
- if [ ! -d "$TOOLDIR" ] || [ -z "$(ls -A $TOOLDIR)" ] || [ "$(cat "$TOOLDIR/version.txt")" != "v0.3" ]; then
|
||||
- if [ ! -d "$TOOLDIR" ] || [ -z "$(ls -A $TOOLDIR)" ] || [ "$(cat "$TOOLDIR/version.txt")" != "v0.4" ]; then
|
||||
rm -rf $TOOLDIR;
|
||||
mkdir -p $TRAVIS_BUILD_DIR/build && cd $TRAVIS_BUILD_DIR/build;
|
||||
../configure --tooldir=$TOOLDIR;
|
||||
ci/toolchain_install.sh --all;
|
||||
echo "v0.3" > "$TOOLDIR/version.txt";
|
||||
else
|
||||
echo "using existing tooldir build";
|
||||
fi
|
||||
- if [ ! -d "$HOME/third_party" ] || [ -z "$(ls -A $HOME/third_party)" ] || [ "$(cat "$HOME/third_party/version.txt")" != "v0.2" ]; then
|
||||
cd $TRAVIS_BUILD_DIR;
|
||||
|
@ -38,6 +40,7 @@ before_install:
|
|||
echo "v0.2" > "third_party/version.txt";
|
||||
cp -rf third_party $HOME;
|
||||
else
|
||||
echo "using existing third_party build";
|
||||
cp -rf $HOME/third_party $TRAVIS_BUILD_DIR;
|
||||
fi
|
||||
|
||||
|
@ -50,6 +53,7 @@ install:
|
|||
echo "$TRAVIS_COMMIT" > version.txt;
|
||||
cp -rf $TRAVIS_BUILD_DIR/build$XLEN $HOME;
|
||||
else
|
||||
echo "using existing build for commit $TRAVIS_COMMIT";
|
||||
cp -rf $HOME/build$XLEN $TRAVIS_BUILD_DIR;
|
||||
fi
|
||||
|
||||
|
|
27
Makefile.in
27
Makefile.in
|
@ -22,41 +22,40 @@ clean:
|
|||
$(MAKE) -C runtime clean
|
||||
$(MAKE) -C tests clean
|
||||
|
||||
clean-all:
|
||||
$(MAKE) -C hw clean
|
||||
$(MAKE) -C sim clean
|
||||
$(MAKE) -C kernel clean
|
||||
$(MAKE) -C runtime clean
|
||||
$(MAKE) -C tests clean-all
|
||||
|
||||
# Install setup
|
||||
KERNEL_LIB_DST = $(PREFIX)/kernel/lib$(XLEN)
|
||||
RUNTIME_LIB_DST = $(PREFIX)/runtime/lib
|
||||
KERNEL_INC_DST = $(PREFIX)/kernel/include
|
||||
KERNEL_LIB_DST = $(PREFIX)/kernel/lib$(XLEN)
|
||||
RUNTIME_INC_DST = $(PREFIX)/runtime/include
|
||||
RUNTIME_LIB_DST = $(PREFIX)/runtime/lib
|
||||
|
||||
KERNEL_HEADERS = $(wildcard $(VORTEX_HOME)/kernel/include/*.h)
|
||||
KERNEL_LIBS = $(wildcard kernel/*.a)
|
||||
RUNTIME_HEADERS = $(wildcard $(VORTEX_HOME)/runtime/include/*.h)
|
||||
RUNTIME_LIBS = $(wildcard runtime/*.so)
|
||||
|
||||
INSTALL_DIRS = $(KERNEL_LIB_DST) $(RUNTIME_LIB_DST) $(KERNEL_INC_DST) $(RUNTIME_INC_DST)
|
||||
|
||||
$(INSTALL_DIRS):
|
||||
mkdir -p $@
|
||||
|
||||
$(KERNEL_INC_DST)/VX_types.h: hw/VX_types.h | $(KERNEL_INC_DST)
|
||||
cp $< $@
|
||||
|
||||
$(KERNEL_INC_DST)/%.h: $(VORTEX_HOME)/kernel/include/%.h | $(KERNEL_INC_DST)
|
||||
cp $< $@
|
||||
|
||||
$(RUNTIME_INC_DST)/%.h: $(VORTEX_HOME)/runtime/include/%.h | $(RUNTIME_INC_DST)
|
||||
cp $< $@
|
||||
|
||||
$(KERNEL_LIB_DST)/libvortex.a: kernel/libvortexrt.a | $(KERNEL_LIB_DST)
|
||||
$(KERNEL_LIB_DST)/%.a: kernel/%.a | $(KERNEL_LIB_DST)
|
||||
cp $< $@
|
||||
|
||||
$(RUNTIME_LIB_DST)/libvortex.so: runtime/stub/libvortex.so | $(RUNTIME_LIB_DST)
|
||||
$(RUNTIME_LIB_DST)/%.so: runtime/%.so | $(RUNTIME_LIB_DST)
|
||||
cp $< $@
|
||||
|
||||
install: $(INSTALL_DIRS) \
|
||||
$(KERNEL_HEADERS:$(VORTEX_HOME)/kernel/include/%=$(KERNEL_INC_DST)/%) \
|
||||
$(KERNEL_INC_DST)/VX_types.h \
|
||||
$(KERNEL_HEADERS:$(VORTEX_HOME)/kernel/include/%=$(KERNEL_INC_DST)/%) \
|
||||
$(RUNTIME_HEADERS:$(VORTEX_HOME)/runtime/include/%=$(RUNTIME_INC_DST)/%) \
|
||||
$(KERNEL_LIB_DST)/libvortex.a \
|
||||
$(RUNTIME_LIB_DST)/libvortex.so
|
||||
$(KERNEL_LIBS:kernel/%=$(KERNEL_LIB_DST)/%) \
|
||||
$(RUNTIME_LIBS:runtime/%=$(RUNTIME_LIB_DST)/%)
|
||||
|
|
|
@ -193,7 +193,7 @@ then
|
|||
|
||||
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
|
||||
then
|
||||
make -C $DRIVER_PATH clean > /dev/null
|
||||
make -C $DRIVER_PATH clean-driver > /dev/null
|
||||
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
|
||||
fi
|
||||
fi
|
||||
|
@ -232,11 +232,11 @@ then
|
|||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
else
|
||||
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
fi
|
||||
|
||||
|
@ -257,11 +257,11 @@ then
|
|||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
else
|
||||
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
fi
|
||||
fi
|
||||
|
|
|
@ -19,6 +19,37 @@ set -e
|
|||
# clear blackbox cache
|
||||
rm -f blackbox.*.cache
|
||||
|
||||
# Split a text file into numbered parts, one part per section.
#
# A new part starts at every line that begins with the literal prefix
# <start_with>. That line and all following lines, up to (but excluding) the
# next matching line, are written to "<filename>.part<N>", with N counting
# from 1. Lines that precede the first matching line are discarded.
#
# Arguments:
#   $1  path of the file to split (must be readable)
#   $2  literal prefix that marks the first line of each part
# Returns:
#   0 on success (also when no line matches; a notice is printed),
#   1 on wrong argument count or unreadable input file.
split_file() {
    if [[ $# -ne 2 ]]; then
        # Inside a function $0 is the calling script, so name the function itself.
        echo "Usage: ${FUNCNAME[0]} <filename> <start_with>"
        return 1
    fi

    # Keep all working state local so callers' variables are not clobbered.
    local input_file="$1"
    local start_with="$2"
    local count=0
    local output_file=""
    local line

    if [[ ! -r "$input_file" ]]; then
        echo "Error: File '$input_file' is not readable or does not exist."
        return 1
    fi

    # Remove parts left over from an earlier run; callers glob "<file>.part*"
    # and would otherwise pick up stale sections.
    rm -f -- "$input_file".part[0-9]*

    # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        # Quote the prefix so it is matched literally, not as a glob pattern.
        if [[ "$line" == "$start_with"* ]]; then
            count=$((count + 1))
            output_file="$input_file.part$count"
            : > "$output_file" # ensure empty
        fi
        if [[ -n "$output_file" ]]; then
            # printf instead of echo: lines such as "-n" or "-e" are copied verbatim.
            printf '%s\n' "$line" >> "$output_file"
        fi
    done < "$input_file"

    if [[ $count -eq 0 ]]; then
        echo "No lines starting with '$start_with' were found in '$input_file'."
    fi
}
|
||||
|
||||
###############################################################################
|
||||
|
||||
unittest()
|
||||
{
|
||||
make -C tests/unittest run
|
||||
|
@ -102,6 +133,9 @@ opencl()
|
|||
make -C tests/opencl run-simx
|
||||
make -C tests/opencl run-rtlsim
|
||||
|
||||
./ci/blackbox.sh --driver=simx --app=lbm --warps=8
|
||||
./ci/blackbox.sh --driver=rtlsim --app=lbm --warps=8
|
||||
|
||||
echo "opencl tests done!"
|
||||
}
|
||||
|
||||
|
@ -125,25 +159,39 @@ cluster()
|
|||
echo "clustering tests done!"
|
||||
}
|
||||
|
||||
debug()
|
||||
test_csv_trace()
|
||||
{
|
||||
echo "begin debugging tests..."
|
||||
|
||||
# test CSV trace generation
|
||||
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
|
||||
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-simx-32im > run_simx.log
|
||||
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
|
||||
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
|
||||
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
|
||||
diff trace_rtlsim.csv trace_simx.csv
|
||||
split_file run_simx.log "Running "
|
||||
split_file run_rtlsim.log "Running "
|
||||
for file in ./run_simx.log.part*; do
|
||||
if [[ -f "$file" ]]; then
|
||||
file2="${file//simx/rtlsim}"
|
||||
if [[ -f "$file2" ]]; then
|
||||
./ci/trace_csv.py -tsimx $file -otrace_simx.csv
|
||||
./ci/trace_csv.py -trtlsim $file2 -otrace_rtlsim.csv
|
||||
diff trace_rtlsim.csv trace_simx.csv
|
||||
else
|
||||
echo "File $file2 not found."
|
||||
fi
|
||||
fi
|
||||
done
|
||||
# restore default prebuilt configuration
|
||||
make -C sim/simx clean && make -C sim/simx > /dev/null
|
||||
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
|
||||
}
|
||||
|
||||
debug()
|
||||
{
|
||||
echo "begin debugging tests..."
|
||||
test_csv_trace
|
||||
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=basic --args="-t0 -n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
|
||||
|
||||
echo "debugging tests done!"
|
||||
}
|
||||
|
@ -189,12 +237,10 @@ config()
|
|||
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
|
||||
|
||||
# custom program startup address
|
||||
make -C tests/regression/dogfood clean-all
|
||||
make -C tests/regression/dogfood clean-kernel
|
||||
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
|
||||
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=simx --app=dogfood
|
||||
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
make -C tests/regression/dogfood clean-all
|
||||
make -C tests/regression/dogfood
|
||||
./ci/blackbox.sh --driver=simx --app=dogfood
|
||||
./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
|
||||
# disabling M & F extensions
|
||||
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null
|
||||
|
|
199
ci/trace_csv.py
199
ci/trace_csv.py
|
@ -17,6 +17,7 @@ import sys
|
|||
import argparse
|
||||
import csv
|
||||
import re
|
||||
import inspect
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.')
|
||||
|
@ -39,24 +40,27 @@ def parse_simx(log_filename):
|
|||
with open(log_filename, 'r') as log_file:
|
||||
instr_data = None
|
||||
for lineno, line in enumerate(log_file, start=1):
|
||||
if line.startswith("DEBUG Fetch:"):
|
||||
if instr_data:
|
||||
entries.append(instr_data)
|
||||
instr_data = {}
|
||||
instr_data["lineno"] = lineno
|
||||
instr_data["PC"] = re.search(pc_pattern, line).group(1)
|
||||
instr_data["core_id"] = re.search(core_id_pattern, line).group(1)
|
||||
instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1)
|
||||
instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
|
||||
instr_data["uuid"] = re.search(uuid_pattern, line).group(1)
|
||||
elif line.startswith("DEBUG Instr"):
|
||||
instr_data["instr"] = re.search(instr_pattern, line).group(1)
|
||||
instr_data["opcode"] = re.search(opcode_pattern, line).group(1)
|
||||
elif line.startswith("DEBUG Src"):
|
||||
src_reg = re.search(operands_pattern, line).group(1)
|
||||
instr_data["operands"] = (instr_data["operands"] + ', ' + src_reg) if 'operands' in instr_data else src_reg
|
||||
elif line.startswith("DEBUG Dest"):
|
||||
instr_data["destination"] = re.search(destination_pattern, line).group(1)
|
||||
try:
|
||||
if line.startswith("DEBUG Fetch:"):
|
||||
if instr_data:
|
||||
entries.append(instr_data)
|
||||
instr_data = {}
|
||||
instr_data["lineno"] = lineno
|
||||
instr_data["PC"] = re.search(pc_pattern, line).group(1)
|
||||
instr_data["core_id"] = re.search(core_id_pattern, line).group(1)
|
||||
instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1)
|
||||
instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
|
||||
instr_data["uuid"] = re.search(uuid_pattern, line).group(1)
|
||||
elif line.startswith("DEBUG Instr"):
|
||||
instr_data["instr"] = re.search(instr_pattern, line).group(1)
|
||||
instr_data["opcode"] = re.search(opcode_pattern, line).group(1)
|
||||
elif line.startswith("DEBUG Src"):
|
||||
src_reg = re.search(operands_pattern, line).group(1)
|
||||
instr_data["operands"] = (instr_data["operands"] + ', ' + src_reg) if 'operands' in instr_data else src_reg
|
||||
elif line.startswith("DEBUG Dest"):
|
||||
instr_data["destination"] = re.search(destination_pattern, line).group(1)
|
||||
except Exception as e:
|
||||
print("Error at line {}: {}".format(lineno, e))
|
||||
if instr_data:
|
||||
entries.append(instr_data)
|
||||
return entries
|
||||
|
@ -115,88 +119,91 @@ def parse_rtlsim(log_filename):
|
|||
with open(log_filename, 'r') as log_file:
|
||||
instr_data = {}
|
||||
for lineno, line in enumerate(log_file, start=1):
|
||||
line_match = re.search(line_pattern, line)
|
||||
if line_match:
|
||||
PC = re.search(pc_pattern, line).group(1)
|
||||
warp_id = re.search(warp_id_pattern, line).group(1)
|
||||
tmask = re.search(tmask_pattern, line).group(1)
|
||||
uuid = re.search(uuid_pattern, line).group(1)
|
||||
core_id = line_match.group(1)
|
||||
stage = line_match.group(2)
|
||||
if stage == "decode":
|
||||
trace = {}
|
||||
trace["uuid"] = uuid
|
||||
trace["PC"] = PC
|
||||
trace["core_id"] = core_id
|
||||
trace["warp_id"] = warp_id
|
||||
trace["tmask"] = reverse_binary(tmask)
|
||||
trace["instr"] = re.search(instr_pattern, line).group(1)
|
||||
trace["opcode"] = re.search(op_pattern, line).group(1)
|
||||
trace["opds"] = bin_to_array(re.search(opds_pattern, line).group(1))
|
||||
trace["rd"] = re.search(rd_pattern, line).group(1)
|
||||
trace["rs1"] = re.search(rs1_pattern, line).group(1)
|
||||
trace["rs2"] = re.search(rs2_pattern, line).group(1)
|
||||
trace["rs3"] = re.search(rs3_pattern, line).group(1)
|
||||
instr_data[uuid] = trace
|
||||
elif stage == "issue":
|
||||
if uuid in instr_data:
|
||||
trace = instr_data[uuid]
|
||||
trace["lineno"] = lineno
|
||||
opds = trace["opds"]
|
||||
if opds[1]:
|
||||
trace["rs1_data"] = re.search(rs1_data_pattern, line).group(1).split(', ')[::-1]
|
||||
if opds[2]:
|
||||
trace["rs2_data"] = re.search(rs2_data_pattern, line).group(1).split(', ')[::-1]
|
||||
if opds[3]:
|
||||
trace["rs3_data"] = re.search(rs3_data_pattern, line).group(1).split(', ')[::-1]
|
||||
trace["issued"] = True
|
||||
try:
|
||||
line_match = re.search(line_pattern, line)
|
||||
if line_match:
|
||||
PC = re.search(pc_pattern, line).group(1)
|
||||
warp_id = re.search(warp_id_pattern, line).group(1)
|
||||
tmask = re.search(tmask_pattern, line).group(1)
|
||||
uuid = re.search(uuid_pattern, line).group(1)
|
||||
core_id = line_match.group(1)
|
||||
stage = line_match.group(2)
|
||||
if stage == "decode":
|
||||
trace = {}
|
||||
trace["uuid"] = uuid
|
||||
trace["PC"] = PC
|
||||
trace["core_id"] = core_id
|
||||
trace["warp_id"] = warp_id
|
||||
trace["tmask"] = reverse_binary(tmask)
|
||||
trace["instr"] = re.search(instr_pattern, line).group(1)
|
||||
trace["opcode"] = re.search(op_pattern, line).group(1)
|
||||
trace["opds"] = bin_to_array(re.search(opds_pattern, line).group(1))
|
||||
trace["rd"] = re.search(rd_pattern, line).group(1)
|
||||
trace["rs1"] = re.search(rs1_pattern, line).group(1)
|
||||
trace["rs2"] = re.search(rs2_pattern, line).group(1)
|
||||
trace["rs3"] = re.search(rs3_pattern, line).group(1)
|
||||
instr_data[uuid] = trace
|
||||
elif stage == "commit":
|
||||
if uuid in instr_data:
|
||||
trace = instr_data[uuid]
|
||||
if "issued" in trace:
|
||||
elif stage == "issue":
|
||||
if uuid in instr_data:
|
||||
trace = instr_data[uuid]
|
||||
trace["lineno"] = lineno
|
||||
opds = trace["opds"]
|
||||
dst_tmask_arr = bin_to_array(tmask)[::-1]
|
||||
wb = re.search(wb_pattern, line).group(1) == "1"
|
||||
if wb:
|
||||
rd_data = re.search(rd_data_pattern, line).group(1).split(', ')[::-1]
|
||||
if 'rd_data' in trace:
|
||||
merged_rd_data = trace['rd_data']
|
||||
for i in range(len(dst_tmask_arr)):
|
||||
if dst_tmask_arr[i] == 1:
|
||||
merged_rd_data[i] = rd_data[i]
|
||||
trace['rd_data'] = merged_rd_data
|
||||
else:
|
||||
trace['rd_data'] = rd_data
|
||||
if opds[1]:
|
||||
trace["rs1_data"] = re.search(rs1_data_pattern, line).group(1).split(', ')[::-1]
|
||||
if opds[2]:
|
||||
trace["rs2_data"] = re.search(rs2_data_pattern, line).group(1).split(', ')[::-1]
|
||||
if opds[3]:
|
||||
trace["rs3_data"] = re.search(rs3_data_pattern, line).group(1).split(', ')[::-1]
|
||||
trace["issued"] = True
|
||||
instr_data[uuid] = trace
|
||||
eop = re.search(eop_pattern, line).group(1) == "1"
|
||||
if eop:
|
||||
tmask_arr = bin_to_array(trace["tmask"])
|
||||
destination = ''
|
||||
elif stage == "commit":
|
||||
if uuid in instr_data:
|
||||
trace = instr_data[uuid]
|
||||
if "issued" in trace:
|
||||
opds = trace["opds"]
|
||||
dst_tmask_arr = bin_to_array(tmask)[::-1]
|
||||
wb = re.search(wb_pattern, line).group(1) == "1"
|
||||
if wb:
|
||||
destination, sep = append_value(destination, trace["rd"], trace['rd_data'], tmask_arr, False)
|
||||
del trace['rd_data']
|
||||
trace["destination"] = destination
|
||||
operands = ''
|
||||
sep = False
|
||||
if opds[1]:
|
||||
operands, sep = append_value(operands, trace["rs1"], trace["rs1_data"], tmask_arr, sep)
|
||||
del trace["rs1_data"]
|
||||
if opds[2]:
|
||||
operands, sep = append_value(operands, trace["rs2"], trace["rs2_data"], tmask_arr, sep)
|
||||
del trace["rs2_data"]
|
||||
if opds[3]:
|
||||
operands, sep = append_value(operands, trace["rs3"], trace["rs3_data"], tmask_arr, sep)
|
||||
del trace["rs3_data"]
|
||||
trace["operands"] = operands
|
||||
del trace["opds"]
|
||||
del trace["rd"]
|
||||
del trace["rs1"]
|
||||
del trace["rs2"]
|
||||
del trace["rs3"]
|
||||
del trace["issued"]
|
||||
del instr_data[uuid]
|
||||
entries.append(trace)
|
||||
rd_data = re.search(rd_data_pattern, line).group(1).split(', ')[::-1]
|
||||
if 'rd_data' in trace:
|
||||
merged_rd_data = trace['rd_data']
|
||||
for i in range(len(dst_tmask_arr)):
|
||||
if dst_tmask_arr[i] == 1:
|
||||
merged_rd_data[i] = rd_data[i]
|
||||
trace['rd_data'] = merged_rd_data
|
||||
else:
|
||||
trace['rd_data'] = rd_data
|
||||
instr_data[uuid] = trace
|
||||
eop = re.search(eop_pattern, line).group(1) == "1"
|
||||
if eop:
|
||||
tmask_arr = bin_to_array(trace["tmask"])
|
||||
destination = ''
|
||||
if wb:
|
||||
destination, sep = append_value(destination, trace["rd"], trace['rd_data'], tmask_arr, False)
|
||||
del trace['rd_data']
|
||||
trace["destination"] = destination
|
||||
operands = ''
|
||||
sep = False
|
||||
if opds[1]:
|
||||
operands, sep = append_value(operands, trace["rs1"], trace["rs1_data"], tmask_arr, sep)
|
||||
del trace["rs1_data"]
|
||||
if opds[2]:
|
||||
operands, sep = append_value(operands, trace["rs2"], trace["rs2_data"], tmask_arr, sep)
|
||||
del trace["rs2_data"]
|
||||
if opds[3]:
|
||||
operands, sep = append_value(operands, trace["rs3"], trace["rs3_data"], tmask_arr, sep)
|
||||
del trace["rs3_data"]
|
||||
trace["operands"] = operands
|
||||
del trace["opds"]
|
||||
del trace["rd"]
|
||||
del trace["rs1"]
|
||||
del trace["rs2"]
|
||||
del trace["rs3"]
|
||||
del trace["issued"]
|
||||
del instr_data[uuid]
|
||||
entries.append(trace)
|
||||
except Exception as e:
|
||||
print("Error at line {}: {}".format(lineno, e))
|
||||
return entries
|
||||
|
||||
def write_csv(log_filename, csv_filename, log_type):
|
||||
|
|
|
@ -64,6 +64,7 @@ def main(argv):
|
|||
|
||||
# execute command
|
||||
exitcode = execute(argv)
|
||||
print(" + exitcode="+str(exitcode))
|
||||
|
||||
# terminate monitoring thread
|
||||
stop_event.set()
|
||||
|
|
|
@ -3,14 +3,14 @@
|
|||
## Testing changes to the RTL or simulator GPU driver.
|
||||
|
||||
The Blackbox utility script will not pick up your changes if the h/w configuration is the same as during the last run.
|
||||
To force the utility to build the driver, you need to pass the --rebuild=1 option when running tests.
|
||||
To force the utility to build the driver, you need to pass the --rebuild=1 option when running tests.
|
||||
Using --rebuild=0 will prevent the rebuild even if the h/w configuration is different from last run.
|
||||
|
||||
$ ./ci/blackbox.sh --driver=simx --app=demo --rebuild=1
|
||||
|
||||
## SimX Debugging
|
||||
|
||||
SimX cycle-approximate simulator allows faster debugging of Vortex kernels' execution.
|
||||
SimX cycle-approximate simulator allows faster debugging of Vortex kernels' execution.
|
||||
The recommended method to enable debugging is to pass the `--debug=<level>` flag to `blackbox` tool when running a program.
|
||||
|
||||
// Running demo program on SimX in debug mode
|
||||
|
@ -61,5 +61,8 @@ We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can u
|
|||
$ ./ci/blackbox.sh --driver=simx --app=demo --debug=3 --log=run_simx.log
|
||||
$ ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
|
||||
|
||||
The first column in the CSV trace is UUID (universal unique identifier) of the instruction and the content is sorted by the UUID. You can use the UUID to trace the same instruction running on either the RTL hw or SimX simulator.
|
||||
$ diff trace_rtlsim.csv trace_simx.csv
|
||||
|
||||
The first column in the CSV trace is UUID (universal unique identifier) of the instruction and the content is sorted by the UUID.
|
||||
You can use the UUID to trace the same instruction running on either the RTL hw or SimX simulator.
|
||||
This can be very effective if you want to use SimX to debug your RTL hardware by comparing CSV traces.
|
|
@ -15,7 +15,7 @@ You can execute the same application of a GPU architecture with 2 cores:
|
|||
|
||||
$ ./ci/blackbox.sh --core=2 --driver=simx --app=sgemm --args="-n10"
|
||||
|
||||
When executing, Blackbox needs to recompile the driver if the desired architecture changes.
|
||||
When executing, Blackbox needs to recompile the driver if the desired architecture changes.
|
||||
It tracks the latest configuration in a file under the current directory blackbox.<driver>.cache.
|
||||
To avoid having to rebuild the driver all the time, Blackbox checks if the latest cached configuration matches the current.
|
||||
|
||||
|
@ -24,24 +24,29 @@ To avoid having to rebuild the driver all the time, Blackbox checks if the lates
|
|||
The Vortex test suite is located under the tests/ folder
|
||||
You can execute the default regression suite by running the following commands at the root folder.
|
||||
|
||||
$ make -C tests/regression run-simx
|
||||
$ make -C tests/regression run-simx
|
||||
$ make -C tests/regression run-rtlsim
|
||||
|
||||
You can execute the default OpenCL suite by running the following commands at the root folder.
|
||||
|
||||
$ make -C tests/opencl run-simx
|
||||
$ make -C tests/opencl run-simx
|
||||
$ make -C tests/opencl run-rtlsim
|
||||
|
||||
## Creating Your Own Regression Tests
|
||||
- Inside `test/` you will find a series of folders which are named based on what they test
|
||||
- You can view the tests to see which ones have tests similar to what you are trying to create new tests for
|
||||
- once you have found a similar baseline, you can copy the folder and rename it to what you are planning to test
|
||||
- `testcases.h` contains each of the test case templates
|
||||
- `main.cpp` contains the implementation of each of the test cases and builds a test suite of all the tests cases you want
|
||||
## Creating Your Own Regression Test
|
||||
|
||||
Compile the test case: `make -C tests/regression/<testcase-name>/ clean-all && make -C tests/regression/<testcase-name>/`
|
||||
Inside `tests/regression` you will find a series of folders which are named based on what they test.
|
||||
You can view the tests to see which ones have tests similar to what you are trying to create new tests for.
|
||||
Once you have found a similar baseline, you can copy the folder and rename it to what you are planning to test.
|
||||
A regression test typically implements the following files:
|
||||
- ***kernel.cpp*** contains the GPU kernel code.
|
||||
- ***main.cpp*** contains the host CPU code.
|
||||
- ***Makefile*** defines the compiler build commands for the CPU and GPU binaries.
|
||||
|
||||
Run the test case: `./ci/blackbox.sh --driver=simx --cores=4 --app=<testcase-name> --debug`
|
||||
Sync your build folder: `$ ../configure`
|
||||
|
||||
Compile your test: `$ make -C tests/regression/<test-name>`
|
||||
|
||||
Run your test: `$ ./ci/blackbox.sh --driver=simx --app=<test-name> --debug`
|
||||
|
||||
## Adding Your Tests to the CI Pipeline
|
||||
see `continuous_integration.md`
|
||||
See `continuous_integration.md`
|
|
@ -21,8 +21,6 @@
|
|||
#include "svdpi.h"
|
||||
#include "verilated_vpi.h"
|
||||
|
||||
#include "uuid_gen.h"
|
||||
|
||||
#ifdef XLEN_64
|
||||
#define iword_t int64_t
|
||||
#define uword_t uint64_t
|
||||
|
@ -50,7 +48,7 @@ extern "C" {
|
|||
void dpi_trace_start();
|
||||
void dpi_trace_stop();
|
||||
|
||||
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC);
|
||||
uint64_t dpi_uuid_gen(bool reset, int wid);
|
||||
}
|
||||
|
||||
bool sim_trace_enabled();
|
||||
|
@ -209,22 +207,14 @@ void dpi_trace_stop() {
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::unordered_map<uint32_t, std::shared_ptr<vortex::UUIDGenerator>> g_uuid_gens;
|
||||
std::unordered_map<uint32_t, uint32_t> g_uuid_gens;
|
||||
|
||||
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC) {
|
||||
uint64_t dpi_uuid_gen(bool reset, int wid) {
|
||||
if (reset) {
|
||||
g_uuid_gens.clear();
|
||||
return 0;
|
||||
}
|
||||
std::shared_ptr<vortex::UUIDGenerator> uuid_gen;
|
||||
auto it = g_uuid_gens.find(wid);
|
||||
if (it == g_uuid_gens.end()) {
|
||||
uuid_gen = std::make_shared<vortex::UUIDGenerator>();
|
||||
g_uuid_gens.emplace(wid, uuid_gen);
|
||||
} else {
|
||||
uuid_gen = it->second;
|
||||
}
|
||||
uint32_t instr_uuid = uuid_gen->get_uuid(PC);
|
||||
uint32_t instr_uuid = g_uuid_gens[wid]++;
|
||||
uint64_t uuid = (uint64_t(wid) << 32) | instr_uuid;
|
||||
return uuid;
|
||||
}
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -32,6 +32,6 @@ import "DPI-C" function void dpi_trace(input int level, input string format /*ve
|
|||
import "DPI-C" function void dpi_trace_start();
|
||||
import "DPI-C" function void dpi_trace_stop();
|
||||
|
||||
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid, input longint PC);
|
||||
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid);
|
||||
|
||||
`endif
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -15,7 +15,7 @@
|
|||
|
||||
module VX_cluster import VX_gpu_pkg::*; #(
|
||||
parameter CLUSTER_ID = 0
|
||||
) (
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
|
@ -32,10 +32,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
// Memory
|
||||
VX_mem_bus_if.master mem_bus_if,
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
|
@ -43,16 +39,16 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
`ifdef SCOPE
|
||||
localparam scope_socket = 0;
|
||||
`SCOPE_IO_SWITCH (`NUM_SOCKETS);
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
assign mem_perf_tmp_if.icache = 'x;
|
||||
assign mem_perf_tmp_if.dcache = 'x;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.lmem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
|
||||
|
@ -102,7 +98,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.MREQ_SIZE (`L2_MREQ_SIZE),
|
||||
.TAG_WIDTH (L2_TAG_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.CORE_OUT_BUF (2),
|
||||
.MEM_OUT_BUF (2),
|
||||
.NC_ENABLE (1),
|
||||
|
@ -119,13 +115,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
|
||||
wire [`NUM_SOCKETS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_socket_sim_wb_value;
|
||||
assign sim_ebreak = per_socket_sim_ebreak[0];
|
||||
assign sim_wb_value = per_socket_sim_wb_value[0];
|
||||
`UNUSED_VAR (per_socket_sim_ebreak)
|
||||
`UNUSED_VAR (per_socket_sim_wb_value)
|
||||
|
||||
VX_dcr_bus_if socket_dcr_bus_tmp_if();
|
||||
assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
|
||||
assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr;
|
||||
|
@ -151,7 +140,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
`endif
|
||||
|
||||
|
||||
.dcr_bus_if (socket_dcr_bus_if),
|
||||
|
||||
.mem_bus_if (per_socket_mem_bus_if[i]),
|
||||
|
@ -160,8 +149,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.gbar_bus_if (per_socket_gbar_bus_if[i]),
|
||||
`endif
|
||||
|
||||
.sim_ebreak (per_socket_sim_ebreak[i]),
|
||||
.sim_wb_value (per_socket_sim_wb_value[i]),
|
||||
.busy (per_socket_busy[i])
|
||||
);
|
||||
end
|
||||
|
|
|
@ -146,57 +146,71 @@
|
|||
|
||||
`ifdef XLEN_64
|
||||
|
||||
`ifndef STARTUP_ADDR
|
||||
`define STARTUP_ADDR 64'h180000000
|
||||
`ifndef STACK_BASE_ADDR
|
||||
`define STACK_BASE_ADDR 64'h1FFFF0000
|
||||
`endif
|
||||
|
||||
`ifndef STACK_BASE_ADDR
|
||||
`define STACK_BASE_ADDR 64'h1FF000000
|
||||
`ifndef STARTUP_ADDR
|
||||
`define STARTUP_ADDR 64'h080000000
|
||||
`endif
|
||||
|
||||
`ifndef USER_BASE_ADDR
|
||||
`define USER_BASE_ADDR 64'h000010000
|
||||
`endif
|
||||
|
||||
`ifndef IO_BASE_ADDR
|
||||
`define IO_BASE_ADDR 64'h000000040
|
||||
`endif
|
||||
|
||||
`else
|
||||
|
||||
`ifndef STARTUP_ADDR
|
||||
`define STARTUP_ADDR 32'h80000000
|
||||
`endif
|
||||
|
||||
`ifndef STACK_BASE_ADDR
|
||||
`define STACK_BASE_ADDR 32'hFF000000
|
||||
`define STACK_BASE_ADDR 32'hFFFF0000
|
||||
`endif
|
||||
|
||||
`ifndef STARTUP_ADDR
|
||||
`define STARTUP_ADDR 32'h80000000
|
||||
`endif
|
||||
|
||||
`ifndef USER_BASE_ADDR
|
||||
`define USER_BASE_ADDR 32'h00010000
|
||||
`endif
|
||||
|
||||
`ifndef IO_BASE_ADDR
|
||||
`define IO_BASE_ADDR 32'h00000040
|
||||
`endif
|
||||
|
||||
`endif
|
||||
|
||||
`ifndef LMEM_BASE_ADDR
|
||||
`define LMEM_BASE_ADDR `STACK_BASE_ADDR
|
||||
`endif
|
||||
`define IO_END_ADDR `USER_BASE_ADDR
|
||||
|
||||
`ifndef LMEM_LOG_SIZE
|
||||
`define LMEM_LOG_SIZE 14
|
||||
`endif
|
||||
|
||||
`ifndef IO_BASE_ADDR
|
||||
`define IO_BASE_ADDR (`LMEM_BASE_ADDR + (1 << `LMEM_LOG_SIZE))
|
||||
`ifndef LMEM_BASE_ADDR
|
||||
`define LMEM_BASE_ADDR `STACK_BASE_ADDR
|
||||
`endif
|
||||
|
||||
`ifndef IO_COUT_ADDR
|
||||
`define IO_COUT_ADDR `IO_BASE_ADDR
|
||||
`define IO_COUT_ADDR `IO_BASE_ADDR
|
||||
`endif
|
||||
`define IO_COUT_SIZE `MEM_BLOCK_SIZE
|
||||
`define IO_COUT_SIZE `MEM_BLOCK_SIZE
|
||||
|
||||
`ifndef IO_MPM_ADDR
|
||||
`define IO_MPM_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
|
||||
`define IO_MPM_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
|
||||
`endif
|
||||
`define IO_CSR_SIZE (4 * 64 * `NUM_CORES * `NUM_CLUSTERS)
|
||||
`define IO_MPM_SIZE (8 * 32 * `NUM_CORES * `NUM_CLUSTERS)
|
||||
|
||||
`ifndef STACK_LOG2_SIZE
|
||||
`define STACK_LOG2_SIZE 13
|
||||
`endif
|
||||
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
|
||||
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
|
||||
|
||||
`define RESET_DELAY 8
|
||||
|
||||
`ifndef STALL_TIMEOUT
|
||||
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
|
||||
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
|
||||
`endif
|
||||
|
||||
`ifndef SV_DPI
|
||||
|
|
|
@ -126,27 +126,27 @@
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_OP_BITS 4
|
||||
`define INST_MOD_BITS $bits(op_mod_t)
|
||||
`define INST_ARGS_BITS $bits(op_args_t)
|
||||
`define INST_FMT_BITS 2
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_ALU_ADD 4'b0000
|
||||
//`define INST_ALU_UNUSED 4'b0001
|
||||
`define INST_ALU_LUI 4'b0010
|
||||
`define INST_ALU_AUIPC 4'b0011
|
||||
`define INST_ALU_SLTU 4'b0100
|
||||
`define INST_ALU_SLT 4'b0101
|
||||
//`define INST_ALU_UNUSED 4'b0110
|
||||
`define INST_ALU_SUB 4'b0111
|
||||
`define INST_ALU_SRL 4'b1000
|
||||
`define INST_ALU_SRA 4'b1001
|
||||
`define INST_ALU_CZEQ 4'b1010
|
||||
`define INST_ALU_CZNE 4'b1011
|
||||
`define INST_ALU_AND 4'b1100
|
||||
`define INST_ALU_OR 4'b1101
|
||||
`define INST_ALU_XOR 4'b1110
|
||||
`define INST_ALU_SLL 4'b1111
|
||||
`define INST_ALU_CZEQ 4'b1010
|
||||
`define INST_ALU_CZNE 4'b1011
|
||||
//`define INST_ALU_UNUSED 4'b0001
|
||||
//`define INST_ALU_UNUSED 4'b0110
|
||||
|
||||
|
||||
`define ALU_TYPE_BITS 2
|
||||
|
@ -300,9 +300,10 @@
|
|||
`define L1_ENABLE
|
||||
`endif
|
||||
|
||||
`define ADDR_TYPE_IO 0
|
||||
`define ADDR_TYPE_LOCAL 1
|
||||
`define ADDR_TYPE_WIDTH (`LMEM_ENABLED + 1)
|
||||
`define ADDR_TYPE_FLUSH 0
|
||||
`define ADDR_TYPE_IO 1
|
||||
`define ADDR_TYPE_LOCAL 2 // should be last since optional
|
||||
`define ADDR_TYPE_WIDTH (`ADDR_TYPE_LOCAL + `LMEM_ENABLED)
|
||||
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
||||
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
||||
|
@ -431,7 +432,7 @@
|
|||
data.tmask, \
|
||||
data.PC, \
|
||||
data.op_type, \
|
||||
data.op_mod, \
|
||||
data.op_args, \
|
||||
data.wb, \
|
||||
data.rd, \
|
||||
tid, \
|
||||
|
|
|
@ -92,7 +92,8 @@ package VX_gpu_pkg;
|
|||
} fpu_mod_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [($bits(alu_mod_t)-1-`OFFSET_BITS)-1:0] __padding;
|
||||
logic [($bits(alu_mod_t)-1-1-`OFFSET_BITS)-1:0] __padding;
|
||||
logic is_store;
|
||||
logic is_float;
|
||||
logic [`OFFSET_BITS-1:0] offset;
|
||||
} lsu_mod_t;
|
||||
|
@ -115,7 +116,7 @@ package VX_gpu_pkg;
|
|||
lsu_mod_t lsu;
|
||||
csr_mod_t csr;
|
||||
wctl_mod_t wctl;
|
||||
} op_mod_t;
|
||||
} op_args_t;
|
||||
|
||||
/* verilator lint_off UNUSED */
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -13,11 +13,11 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_socket import VX_gpu_pkg::*; #(
|
||||
module VX_socket import VX_gpu_pkg::*; #(
|
||||
parameter SOCKET_ID = 0
|
||||
) (
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -36,11 +36,6 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
// Barrier
|
||||
VX_gbar_bus_if.master gbar_bus_if,
|
||||
`endif
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
|
@ -86,7 +81,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
`RESET_RELAY (icache_reset, reset);
|
||||
|
||||
VX_cache_cluster #(
|
||||
.INSTANCE_ID ($sformatf("socket%0d-icache", SOCKET_ID)),
|
||||
.INSTANCE_ID ($sformatf("socket%0d-icache", SOCKET_ID)),
|
||||
.NUM_UNITS (`NUM_ICACHES),
|
||||
.NUM_INPUTS (`SOCKET_SIZE),
|
||||
.TAG_SEL_IDX (0),
|
||||
|
@ -102,7 +97,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.MREQ_SIZE (`ICACHE_MREQ_SIZE),
|
||||
.TAG_WIDTH (ICACHE_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.WRITE_ENABLE (0),
|
||||
.WRITE_ENABLE (0),
|
||||
.NC_ENABLE (0),
|
||||
.CORE_OUT_BUF (2),
|
||||
.MEM_OUT_BUF (2)
|
||||
|
@ -122,7 +117,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
|
||||
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||
|
@ -131,7 +126,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
`RESET_RELAY (dcache_reset, reset);
|
||||
|
||||
VX_cache_cluster #(
|
||||
.INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)),
|
||||
.INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)),
|
||||
.NUM_UNITS (`NUM_DCACHES),
|
||||
.NUM_INPUTS (`SOCKET_SIZE),
|
||||
.TAG_SEL_IDX (0),
|
||||
|
@ -147,21 +142,21 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.WRITE_ENABLE (1),
|
||||
.NC_ENABLE (1),
|
||||
.CORE_OUT_BUF (`LMEM_ENABLED ? 2 : 1),
|
||||
.MEM_OUT_BUF (2)
|
||||
) dcache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.dcache),
|
||||
`endif
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (dcache_reset),
|
||||
.reset (dcache_reset),
|
||||
.core_bus_if (per_core_dcache_bus_if),
|
||||
.mem_bus_if (dcache_mem_bus_if)
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
|
@ -197,13 +192,6 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
|
||||
wire [`SOCKET_SIZE-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_core_sim_wb_value;
|
||||
assign sim_ebreak = per_core_sim_ebreak[0];
|
||||
assign sim_wb_value = per_core_sim_wb_value[0];
|
||||
`UNUSED_VAR (per_core_sim_ebreak)
|
||||
`UNUSED_VAR (per_core_sim_wb_value)
|
||||
|
||||
wire [`SOCKET_SIZE-1:0] per_core_busy;
|
||||
|
||||
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
|
||||
|
@ -226,7 +214,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
`endif
|
||||
|
||||
|
||||
.dcr_bus_if (core_dcr_bus_if),
|
||||
|
||||
.dcache_bus_if (per_core_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
|
||||
|
@ -237,12 +225,10 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.gbar_bus_if (per_core_gbar_bus_if[i]),
|
||||
`endif
|
||||
|
||||
.sim_ebreak (per_core_sim_ebreak[i]),
|
||||
.sim_wb_value (per_core_sim_wb_value[i]),
|
||||
.busy (per_core_busy[i])
|
||||
);
|
||||
end
|
||||
|
||||
`BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
|
||||
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -53,9 +53,10 @@
|
|||
`define VX_CSR_MIDELEG 12'h303
|
||||
`define VX_CSR_MIE 12'h304
|
||||
`define VX_CSR_MTVEC 12'h305
|
||||
`define VX_CSR_MSCRATCH 12'h340
|
||||
|
||||
`define VX_CSR_MSCRATCH 12'h340
|
||||
`define VX_CSR_MEPC 12'h341
|
||||
`define VX_CSR_MCAUSE 12'h342
|
||||
|
||||
`define VX_CSR_MNSTATUS 12'h744
|
||||
|
||||
|
@ -193,6 +194,6 @@
|
|||
`define VX_CSR_NUM_THREADS 12'hFC0
|
||||
`define VX_CSR_NUM_WARPS 12'hFC1
|
||||
`define VX_CSR_NUM_CORES 12'hFC2
|
||||
`define VX_CSR_NUM_BARRIERS 12'hFC3
|
||||
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
|
||||
|
||||
`endif // VX_TYPES_VH
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -29,8 +29,8 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
|
@ -45,7 +45,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = 'x;
|
||||
assign mem_perf_if.dcache = 'x;
|
||||
assign mem_perf_if.l2cache = 'x;
|
||||
|
@ -78,7 +78,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.MREQ_SIZE (`L3_MREQ_SIZE),
|
||||
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.CORE_OUT_BUF (2),
|
||||
.MEM_OUT_BUF (2),
|
||||
.NC_ENABLE (1),
|
||||
|
@ -114,15 +114,6 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
`UNUSED_VAR (mem_req_fire)
|
||||
`UNUSED_VAR (mem_rsp_fire)
|
||||
|
||||
wire sim_ebreak /* verilator public */;
|
||||
wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value /* verilator public */;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_sim_ebreak;
|
||||
wire [`NUM_CLUSTERS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_cluster_sim_wb_value;
|
||||
assign sim_ebreak = per_cluster_sim_ebreak[0];
|
||||
assign sim_wb_value = per_cluster_sim_wb_value[0];
|
||||
`UNUSED_VAR (per_cluster_sim_ebreak)
|
||||
`UNUSED_VAR (per_cluster_sim_wb_value)
|
||||
|
||||
VX_dcr_bus_if dcr_bus_if();
|
||||
assign dcr_bus_if.write_valid = dcr_wr_valid;
|
||||
assign dcr_bus_if.write_addr = dcr_wr_addr;
|
||||
|
@ -150,14 +141,11 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
`endif
|
||||
|
||||
|
||||
.dcr_bus_if (cluster_dcr_bus_if),
|
||||
|
||||
.mem_bus_if (per_cluster_mem_bus_if[i]),
|
||||
|
||||
.sim_ebreak (per_cluster_sim_ebreak[i]),
|
||||
.sim_wb_value (per_cluster_sim_wb_value[i]),
|
||||
|
||||
.busy (per_cluster_busy[i])
|
||||
);
|
||||
end
|
||||
|
@ -166,14 +154,14 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_pending_reads <= '0;
|
||||
end else begin
|
||||
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
|
||||
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
|
||||
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
|
||||
end
|
||||
end
|
||||
|
@ -182,7 +170,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
if (reset) begin
|
||||
mem_perf <= '0;
|
||||
end else begin
|
||||
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
|
||||
|
@ -191,7 +179,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
end
|
||||
end
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
|
||||
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
|
|
|
@ -96,9 +96,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
|
||||
wire [127:0] afu_id = `AFU_ACCEL_UUID;
|
||||
|
||||
wire [63:0] dev_caps = {8'b0,
|
||||
wire [63:0] dev_caps = {16'b0,
|
||||
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
|
||||
8'(`NUM_BARRIERS),
|
||||
16'(`NUM_CORES * `NUM_CLUSTERS),
|
||||
8'(`NUM_WARPS),
|
||||
8'(`NUM_THREADS),
|
||||
|
|
|
@ -142,9 +142,8 @@ module VX_afu_ctrl #(
|
|||
RSTATE_DATA = 2'd1;
|
||||
|
||||
// device caps
|
||||
wire [63:0] dev_caps = {8'b0,
|
||||
wire [63:0] dev_caps = {16'b0,
|
||||
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
|
||||
8'(`NUM_BARRIERS),
|
||||
16'(`NUM_CORES * `NUM_CLUSTERS),
|
||||
8'(`NUM_WARPS),
|
||||
8'(`NUM_THREADS),
|
||||
|
|
104
hw/rtl/cache/VX_cache_bank.sv
vendored
104
hw/rtl/cache/VX_cache_bank.sv
vendored
|
@ -105,13 +105,15 @@ module VX_cache_bank #(
|
|||
input wire [`CS_LINE_SEL_BITS-1:0] init_line_sel
|
||||
);
|
||||
|
||||
localparam PIPELINE_STAGES = 2;
|
||||
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1;
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
wire crsq_stall;
|
||||
wire crsp_queue_stall;
|
||||
wire mshr_alm_full;
|
||||
wire mreq_alm_full;
|
||||
wire mreq_queue_alm_full;
|
||||
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
|
||||
|
||||
|
@ -147,7 +149,7 @@ module VX_cache_bank #(
|
|||
wire rdw_hazard_st0;
|
||||
reg rdw_hazard_st1;
|
||||
|
||||
wire pipe_stall = crsq_stall || rdw_hazard_st1;
|
||||
wire pipe_stall = crsp_queue_stall || rdw_hazard_st1;
|
||||
|
||||
// inputs arbitration:
|
||||
// mshr replay has highest priority to maximize utilization since there is no miss.
|
||||
|
@ -169,12 +171,12 @@ module VX_cache_bank #(
|
|||
&& ~pipe_stall;
|
||||
|
||||
assign core_req_ready = creq_grant
|
||||
&& ~mreq_alm_full
|
||||
&& ~mreq_queue_alm_full
|
||||
&& ~mshr_alm_full
|
||||
&& ~pipe_stall;
|
||||
|
||||
wire init_fire = init_enable;
|
||||
wire replay_fire = replay_valid && replay_ready;
|
||||
wire replay_fire = replay_valid && replay_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
wire core_req_fire = core_req_valid && core_req_ready;
|
||||
|
||||
|
@ -429,17 +431,17 @@ module VX_cache_bank #(
|
|||
|
||||
// schedule core response
|
||||
|
||||
wire crsq_valid, crsq_ready;
|
||||
wire [`CS_WORD_WIDTH-1:0] crsq_data;
|
||||
wire [REQ_SEL_WIDTH-1:0] crsq_idx;
|
||||
wire [TAG_WIDTH-1:0] crsq_tag;
|
||||
wire crsp_queue_valid, crsp_queue_ready;
|
||||
wire [`CS_WORD_WIDTH-1:0] crsp_queue_data;
|
||||
wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
|
||||
wire [TAG_WIDTH-1:0] crsp_queue_tag;
|
||||
|
||||
assign crsq_valid = do_read_hit_st1 || do_replay_rd_st1;
|
||||
assign crsq_idx = req_idx_st1;
|
||||
assign crsq_data = read_data_st1;
|
||||
assign crsq_tag = tag_st1;
|
||||
assign crsp_queue_valid = do_read_hit_st1 || do_replay_rd_st1;
|
||||
assign crsp_queue_idx = req_idx_st1;
|
||||
assign crsp_queue_data = read_data_st1;
|
||||
assign crsp_queue_tag = tag_st1;
|
||||
|
||||
`RESET_RELAY (crsp_reset, reset);
|
||||
`RESET_RELAY (crsp_queue_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
|
||||
|
@ -447,61 +449,61 @@ module VX_cache_bank #(
|
|||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (crsp_reset),
|
||||
.valid_in (crsq_valid && ~rdw_hazard_st1),
|
||||
.ready_in (crsq_ready),
|
||||
.data_in ({crsq_tag, crsq_data, crsq_idx}),
|
||||
.reset (crsp_queue_reset),
|
||||
.valid_in (crsp_queue_valid && ~rdw_hazard_st1),
|
||||
.ready_in (crsp_queue_ready),
|
||||
.data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}),
|
||||
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
|
||||
.valid_out (core_rsp_valid),
|
||||
.ready_out (core_rsp_ready)
|
||||
);
|
||||
|
||||
assign crsq_stall = crsq_valid && ~crsq_ready;
|
||||
assign crsp_queue_stall = crsp_queue_valid && ~crsp_queue_ready;
|
||||
|
||||
// schedule memory request
|
||||
|
||||
wire mreq_push, mreq_pop, mreq_empty;
|
||||
wire [`CS_WORD_WIDTH-1:0] mreq_data;
|
||||
wire [WORD_SIZE-1:0] mreq_byteen;
|
||||
wire [WORD_SEL_WIDTH-1:0] mreq_wsel;
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_addr;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mreq_id;
|
||||
wire mreq_rw;
|
||||
wire mreq_queue_push, mreq_queue_pop, mreq_queue_empty;
|
||||
wire [`CS_WORD_WIDTH-1:0] mreq_queue_data;
|
||||
wire [WORD_SIZE-1:0] mreq_queue_byteen;
|
||||
wire [WORD_SEL_WIDTH-1:0] mreq_queue_wsel;
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id;
|
||||
wire mreq_queue_rw;
|
||||
|
||||
assign mreq_push = (do_read_miss_st1 && ~mshr_pending_st1)
|
||||
|| do_creq_wr_st1;
|
||||
assign mreq_queue_push = (do_read_miss_st1 && ~mshr_pending_st1)
|
||||
|| do_creq_wr_st1;
|
||||
|
||||
assign mreq_pop = mem_req_valid && mem_req_ready;
|
||||
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
|
||||
|
||||
assign mreq_rw = WRITE_ENABLE && rw_st1;
|
||||
assign mreq_addr = addr_st1;
|
||||
assign mreq_id = mshr_id_st1;
|
||||
assign mreq_wsel = wsel_st1;
|
||||
assign mreq_byteen = byteen_st1;
|
||||
assign mreq_data = write_data_st1;
|
||||
assign mreq_queue_rw = WRITE_ENABLE && rw_st1;
|
||||
assign mreq_queue_addr = addr_st1;
|
||||
assign mreq_queue_id = mshr_id_st1;
|
||||
assign mreq_queue_wsel = wsel_st1;
|
||||
assign mreq_queue_byteen = byteen_st1;
|
||||
assign mreq_queue_data = write_data_st1;
|
||||
|
||||
`RESET_RELAY (mreq_reset, reset);
|
||||
`RESET_RELAY (mreq_queue_reset, reset);
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH),
|
||||
.DEPTH (MREQ_SIZE),
|
||||
.ALM_FULL (MREQ_SIZE-2),
|
||||
.ALM_FULL (MREQ_SIZE-PIPELINE_STAGES),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_queue (
|
||||
.clk (clk),
|
||||
.reset (mreq_reset),
|
||||
.push (mreq_push),
|
||||
.pop (mreq_pop),
|
||||
.data_in ({mreq_rw, mreq_addr, mreq_id, mreq_byteen, mreq_wsel, mreq_data}),
|
||||
.reset (mreq_queue_reset),
|
||||
.push (mreq_queue_push),
|
||||
.pop (mreq_queue_pop),
|
||||
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_wsel, mreq_queue_data}),
|
||||
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}),
|
||||
.empty (mreq_empty),
|
||||
.alm_full (mreq_alm_full),
|
||||
.empty (mreq_queue_empty),
|
||||
.alm_full (mreq_queue_alm_full),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
|
||||
assign mem_req_valid = ~mreq_empty;
|
||||
assign mem_req_valid = ~mreq_queue_empty;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -512,12 +514,12 @@ module VX_cache_bank #(
|
|||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CACHE
|
||||
wire crsq_fire = crsq_valid && crsq_ready;
|
||||
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
|
||||
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid)
|
||||
&& ~(replay_fire || mem_rsp_fire || core_req_fire);
|
||||
always @(posedge clk) begin
|
||||
if (pipeline_stall) begin
|
||||
`TRACE(3, ("%d: *** %s-bank%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, BANK_ID, crsq_stall, mreq_alm_full, mshr_alm_full));
|
||||
`TRACE(3, ("%d: *** %s-bank%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, BANK_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full));
|
||||
end
|
||||
if (init_enable) begin
|
||||
`TRACE(2, ("%d: %s-bank%0d init: addr=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(init_line_sel, BANK_ID)));
|
||||
|
@ -534,14 +536,14 @@ module VX_cache_bank #(
|
|||
else
|
||||
`TRACE(2, ("%d: %s-bank%0d core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
|
||||
end
|
||||
if (crsq_fire) begin
|
||||
`TRACE(2, ("%d: %s-bank%0d core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_idx, crsq_data, req_uuid_st1));
|
||||
if (crsp_queue_fire) begin
|
||||
`TRACE(2, ("%d: %s-bank%0d core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
|
||||
end
|
||||
if (mreq_push) begin
|
||||
if (mreq_queue_push) begin
|
||||
if (do_creq_wr_st1)
|
||||
`TRACE(2, ("%d: %s-bank%0d writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_byteen, mreq_data, req_uuid_st1));
|
||||
`TRACE(2, ("%d: %s-bank%0d writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
|
||||
else
|
||||
`TRACE(2, ("%d: %s-bank%0d fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_id, req_uuid_st1));
|
||||
`TRACE(2, ("%d: %s-bank%0d fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -52,16 +52,14 @@ module VX_alu_int #(
|
|||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_result_r;
|
||||
|
||||
`ifdef XLEN_64
|
||||
wire is_alu_w = execute_if.data.op_mod.alu.is_w;
|
||||
wire is_alu_w = execute_if.data.op_args.alu.is_w;
|
||||
`else
|
||||
wire is_alu_w = 0;
|
||||
`endif
|
||||
|
||||
`UNUSED_VAR (execute_if.data.op_mod)
|
||||
|
||||
wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(execute_if.data.op_type);
|
||||
wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(execute_if.data.op_type);
|
||||
wire is_br_op = (execute_if.data.op_mod.alu.xtype == `ALU_TYPE_BRANCH);
|
||||
wire is_br_op = (execute_if.data.op_args.alu.xtype == `ALU_TYPE_BRANCH);
|
||||
wire is_sub_op = `INST_ALU_IS_SUB(alu_op);
|
||||
wire is_signed = `INST_ALU_SIGNED(alu_op);
|
||||
wire [1:0] op_class = is_br_op ? `INST_BR_CLASS(alu_op) : `INST_ALU_CLASS(alu_op);
|
||||
|
@ -69,9 +67,9 @@ module VX_alu_int #(
|
|||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1 = execute_if.data.rs1_data;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2 = execute_if.data.rs2_data;
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1_PC = execute_if.data.op_mod.alu.use_PC ? {NUM_LANES{execute_if.data.PC, 1'd0}} : alu_in1;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_mod.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_mod.alu.imm)}} : alu_in2;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.op_mod.alu.use_imm && ~is_br_op) ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_mod.alu.imm)}} : alu_in2;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1_PC = execute_if.data.op_args.alu.use_PC ? {NUM_LANES{execute_if.data.PC, 1'd0}} : alu_in1;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_args.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.op_args.alu.use_imm && ~is_br_op) ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
|
||||
|
|
|
@ -38,7 +38,7 @@ module VX_alu_muldiv #(
|
|||
wire is_mulx_op = `INST_M_IS_MULX(muldiv_op);
|
||||
wire is_signed_op = `INST_M_SIGNED(muldiv_op);
|
||||
`ifdef XLEN_64
|
||||
wire is_alu_w = execute_if.data.op_mod.alu.is_w;
|
||||
wire is_alu_w = execute_if.data.op_args.alu.is_w;
|
||||
`else
|
||||
wire is_alu_w = 0;
|
||||
`endif
|
||||
|
|
|
@ -59,7 +59,7 @@ module VX_alu_unit #(
|
|||
|
||||
`RESET_RELAY (block_reset, reset);
|
||||
|
||||
wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_mod.alu.xtype == `ALU_TYPE_MULDIV);
|
||||
wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV);
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
|
|
|
@ -25,10 +25,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
// outputs
|
||||
VX_writeback_if.master writeback_if [`ISSUE_WIDTH],
|
||||
VX_commit_csr_if.master commit_csr_if,
|
||||
VX_commit_sched_if.master commit_sched_if,
|
||||
|
||||
// simulation helper signals
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value
|
||||
VX_commit_sched_if.master commit_sched_if
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
|
||||
|
@ -169,15 +166,6 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
assign commit_arb_if[i].ready = 1'b1; // writeback has no backpressure
|
||||
end
|
||||
|
||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||
reg [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value_r;
|
||||
always @(posedge clk) begin
|
||||
if (writeback_if[0].valid) begin
|
||||
sim_wb_value_r[writeback_if[0].data.rd] <= writeback_if[0].data.data[0];
|
||||
end
|
||||
end
|
||||
assign sim_wb_value = sim_wb_value_r;
|
||||
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -17,11 +17,11 @@
|
|||
`include "VX_fpu_define.vh"
|
||||
`endif
|
||||
|
||||
module VX_core import VX_gpu_pkg::*; #(
|
||||
module VX_core import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -40,10 +40,6 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
VX_gbar_bus_if.master gbar_bus_if,
|
||||
`endif
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
|
@ -55,10 +51,10 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
VX_commit_sched_if commit_sched_if();
|
||||
VX_commit_csr_if commit_csr_if();
|
||||
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
|
||||
VX_dispatch_if dispatch_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_commit_if commit_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_commit_if commit_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_lsu_mem_if #(
|
||||
|
@ -69,7 +65,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
|
@ -105,13 +101,13 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_schedule_if (pipeline_perf_if.schedule),
|
||||
`endif
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
|
||||
|
||||
.decode_sched_if(decode_sched_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
|
@ -119,7 +115,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (gbar_bus_if),
|
||||
`endif
|
||||
.sched_csr_if (sched_csr_if),
|
||||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.busy (busy)
|
||||
);
|
||||
|
@ -166,19 +162,19 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.CORE_ID (CORE_ID)
|
||||
) execute (
|
||||
`SCOPE_IO_BIND (2)
|
||||
|
||||
|
||||
.clk (clk),
|
||||
.reset (execute_reset),
|
||||
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
.lsu_mem_if (lsu_mem_if),
|
||||
|
||||
|
||||
.dispatch_if (dispatch_if),
|
||||
.commit_if (commit_if),
|
||||
|
||||
|
@ -186,10 +182,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
|
||||
.sim_ebreak (sim_ebreak)
|
||||
);
|
||||
.branch_ctl_if (branch_ctl_if)
|
||||
);
|
||||
|
||||
VX_commit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
|
@ -198,13 +192,11 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.reset (commit_reset),
|
||||
|
||||
.commit_if (commit_if),
|
||||
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.sim_wb_value (sim_wb_value)
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.commit_sched_if(commit_sched_if)
|
||||
);
|
||||
|
||||
VX_lsu_mem_if #(
|
||||
|
@ -248,10 +240,10 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
`RESET_RELAY (coalescer_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
|
||||
|
||||
VX_mem_coalescer #(
|
||||
.INSTANCE_ID ($sformatf("core%0d-coalescer", CORE_ID)),
|
||||
.NUM_REQS (`NUM_LSU_LANES),
|
||||
.NUM_REQS (`NUM_LSU_LANES),
|
||||
.DATA_IN_SIZE (LSU_WORD_SIZE),
|
||||
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
|
||||
.ADDR_WIDTH (LSU_ADDR_WIDTH),
|
||||
|
@ -262,7 +254,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
) coalescer (
|
||||
.clk (clk),
|
||||
.reset (coalescer_reset),
|
||||
|
||||
|
||||
// Input request
|
||||
.in_req_valid (lsu_dcache_if[i].req_valid),
|
||||
.in_req_mask (lsu_dcache_if[i].req_data.mask),
|
||||
|
@ -320,8 +312,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_lsu_adapter #(
|
||||
.NUM_LANES (DCACHE_CHANNELS),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
|
||||
.REQ_OUT_BUF (0),
|
||||
.RSP_OUT_BUF (0)
|
||||
|
@ -341,7 +333,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||
|
||||
wire [1:0] perf_icache_pending_read_cycle;
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||
|
@ -374,7 +366,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
|
||||
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
|
||||
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
|
||||
|
||||
|
||||
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
|
||||
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
|
||||
|
||||
|
@ -387,7 +379,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -17,9 +17,9 @@
|
|||
`include "VX_fpu_define.vh"
|
||||
`endif
|
||||
|
||||
module VX_core_top import VX_gpu_pkg::*; #(
|
||||
module VX_core_top import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
) (
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -58,34 +58,29 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
`ifdef GBAR_ENABLE
|
||||
output wire gbar_req_valid,
|
||||
output wire [`NB_WIDTH-1:0] gbar_req_id,
|
||||
output wire [`NC_WIDTH-1:0] gbar_req_size_m1,
|
||||
output wire [`NC_WIDTH-1:0] gbar_req_size_m1,
|
||||
output wire [`NC_WIDTH-1:0] gbar_req_core_id,
|
||||
input wire gbar_req_ready,
|
||||
input wire gbar_rsp_valid,
|
||||
input wire [`NB_WIDTH-1:0] gbar_rsp_id,
|
||||
`endif
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if gbar_bus_if();
|
||||
|
||||
assign gbar_req_valid = gbar_bus_if.req_valid;
|
||||
assign gbar_req_id = gbar_bus_if.req_id;
|
||||
assign gbar_req_size_m1 = gbar_bus_if.req_size_m1;
|
||||
assign gbar_req_size_m1 = gbar_bus_if.req_size_m1;
|
||||
assign gbar_req_core_id = gbar_bus_if.req_core_id;
|
||||
assign gbar_bus_if.req_ready = gbar_req_ready;
|
||||
assign gbar_bus_if.rsp_valid = gbar_rsp_valid;
|
||||
assign gbar_bus_if.rsp_id = gbar_rsp_id;
|
||||
`endif
|
||||
|
||||
VX_dcr_bus_if dcr_bus_if();
|
||||
VX_dcr_bus_if dcr_bus_if();
|
||||
|
||||
assign dcr_bus_if.write_valid = dcr_write_valid;
|
||||
assign dcr_bus_if.write_addr = dcr_write_addr;
|
||||
|
@ -132,7 +127,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
assign icache_rsp_ready = icache_bus_if.rsp_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = '0;
|
||||
assign mem_perf_if.dcache = '0;
|
||||
assign mem_perf_if.l2cache = '0;
|
||||
|
@ -142,8 +137,8 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
`ifdef SCOPE
|
||||
wire [0:0] scope_reset_w = 1'b0;
|
||||
wire [0:0] scope_bus_in_w = 1'b0;
|
||||
wire [0:0] scope_reset_w = 1'b0;
|
||||
wire [0:0] scope_bus_in_w = 1'b0;
|
||||
wire [0:0] scope_bus_out_w;
|
||||
`UNUSED_VAR (scope_bus_out_w)
|
||||
`endif
|
||||
|
@ -154,11 +149,11 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
`endif
|
||||
|
||||
|
||||
.dcr_bus_if (dcr_bus_if),
|
||||
|
||||
.dcache_bus_if (dcache_bus_if),
|
||||
|
@ -169,8 +164,6 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
.gbar_bus_if (gbar_bus_if),
|
||||
`endif
|
||||
|
||||
.sim_ebreak (sim_ebreak),
|
||||
.sim_wb_value (sim_wb_value),
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
|
|
|
@ -182,7 +182,7 @@ import VX_fpu_pkg::*;
|
|||
`VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS);
|
||||
`VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS);
|
||||
`VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
|
||||
`VX_CSR_NUM_BARRIERS: read_data_ro_r = `XLEN'(`NUM_BARRIERS);
|
||||
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_r = `XLEN'(`LMEM_BASE_ADDR);
|
||||
|
||||
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles);
|
||||
|
||||
|
|
|
@ -51,8 +51,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
wire csr_wr_enable;
|
||||
wire csr_req_ready;
|
||||
|
||||
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if.data.op_mod.csr.addr;
|
||||
wire [`NRI_BITS-1:0] csr_imm = execute_if.data.op_mod.csr.imm;
|
||||
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if.data.op_args.csr.addr;
|
||||
wire [`NRI_BITS-1:0] csr_imm = execute_if.data.op_args.csr.imm;
|
||||
|
||||
wire is_fpu_csr = (csr_addr <= `VX_CSR_FCSR);
|
||||
|
||||
|
@ -134,7 +134,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
// CSR write
|
||||
|
||||
assign csr_req_data = execute_if.data.op_mod.csr.use_imm ? `XLEN'(csr_imm) : rs1_data[0];
|
||||
assign csr_req_data = execute_if.data.op_args.csr.use_imm ? `XLEN'(csr_imm) : rs1_data[0];
|
||||
assign csr_wr_enable = (csr_write_enable || (| csr_req_data));
|
||||
|
||||
always @(*) begin
|
||||
|
|
|
@ -42,7 +42,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
VX_decode_sched_if.master decode_sched_if
|
||||
);
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4);
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_VAR (clk)
|
||||
|
@ -50,7 +50,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
|
||||
reg [`EX_BITS-1:0] ex_type;
|
||||
reg [`INST_OP_BITS-1:0] op_type;
|
||||
op_mod_t op_mod;
|
||||
op_args_t op_args;
|
||||
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
|
||||
reg use_rd, use_rs1, use_rs2, use_rs3;
|
||||
reg is_wstall;
|
||||
|
@ -149,7 +149,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
|
||||
ex_type = '0;
|
||||
op_type = 'x;
|
||||
op_mod = 'x;
|
||||
op_args = 'x;
|
||||
rd_r = '0;
|
||||
rs1_r = '0;
|
||||
rs2_r = '0;
|
||||
|
@ -164,20 +164,20 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_I: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(r_type);
|
||||
op_mod.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_mod.alu.is_w = 0;
|
||||
op_mod.alu.use_PC = 0;
|
||||
op_mod.alu.use_imm = 1;
|
||||
op_mod.alu.imm = `SEXT(`IMM_BITS, i_imm);
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, i_imm);
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
`INST_R: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_mod.alu.is_w = 0;
|
||||
op_mod.alu.use_PC = 0;
|
||||
op_mod.alu.use_imm = 0;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 0;
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
|
@ -187,19 +187,19 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_R_F7_MUL: begin
|
||||
// MUL, MULH, MULHSU, MULHU
|
||||
op_type = `INST_OP_BITS'(m_type);
|
||||
op_mod.alu.xtype = `ALU_TYPE_MULDIV;
|
||||
op_args.alu.xtype = `ALU_TYPE_MULDIV;
|
||||
end
|
||||
`endif
|
||||
`ifdef EXT_ZICOND_ENABLE
|
||||
`INST_R_F7_ZICOND: begin
|
||||
// CZERO-EQZ, CZERO-NEZ
|
||||
op_type = func3[1] ? `INST_OP_BITS'(`INST_ALU_CZNE) : `INST_OP_BITS'(`INST_ALU_CZEQ);
|
||||
op_mod.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
end
|
||||
`endif
|
||||
default: begin
|
||||
op_type = `INST_OP_BITS'(r_type);
|
||||
op_mod.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
|
@ -208,20 +208,20 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
// ADDIW, SLLIW, SRLIW, SRAIW
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(r_type);
|
||||
op_mod.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_mod.alu.is_w = 1;
|
||||
op_mod.alu.use_PC = 0;
|
||||
op_mod.alu.use_imm = 1;
|
||||
op_mod.alu.imm = `SEXT(`IMM_BITS, iw_imm);
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_args.alu.is_w = 1;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, iw_imm);
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
`INST_R_W: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_mod.alu.is_w = 1;
|
||||
op_mod.alu.use_PC = 0;
|
||||
op_mod.alu.use_imm = 0;
|
||||
op_args.alu.is_w = 1;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 0;
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
|
@ -231,13 +231,13 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_R_F7_MUL: begin
|
||||
// MULW, DIVW, DIVUW, REMW, REMUW
|
||||
op_type = `INST_OP_BITS'(m_type);
|
||||
op_mod.alu.xtype = `ALU_TYPE_MULDIV;
|
||||
op_args.alu.xtype = `ALU_TYPE_MULDIV;
|
||||
end
|
||||
`endif
|
||||
default: begin
|
||||
// ADDW, SUBW, SLLW, SRLW, SRAW
|
||||
op_type = `INST_OP_BITS'(r_type);
|
||||
op_mod.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
|
@ -245,33 +245,33 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_LUI: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_ALU_LUI);
|
||||
op_mod.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_mod.alu.is_w = 0;
|
||||
op_mod.alu.use_PC = 0;
|
||||
op_mod.alu.use_imm = 1;
|
||||
op_mod.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
`INST_AUIPC: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
|
||||
op_mod.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_mod.alu.is_w = 0;
|
||||
op_mod.alu.use_PC = 1;
|
||||
op_mod.alu.use_imm = 1;
|
||||
op_mod.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 1;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
`INST_JAL: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_BR_JAL);
|
||||
op_mod.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
op_mod.alu.is_w = 0;
|
||||
op_mod.alu.use_PC = 1;
|
||||
op_mod.alu.use_imm = 1;
|
||||
op_mod.alu.imm = `SEXT(`IMM_BITS, jal_imm);
|
||||
op_args.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 1;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, jal_imm);
|
||||
use_rd = 1;
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rd);
|
||||
|
@ -279,11 +279,11 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_JALR: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_BR_JALR);
|
||||
op_mod.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
op_mod.alu.is_w = 0;
|
||||
op_mod.alu.use_PC = 0;
|
||||
op_mod.alu.use_imm = 1;
|
||||
op_mod.alu.imm = `SEXT(`IMM_BITS, u_12);
|
||||
op_args.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, u_12);
|
||||
use_rd = 1;
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rd);
|
||||
|
@ -292,11 +292,11 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_B: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(b_type);
|
||||
op_mod.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
op_mod.alu.is_w = 0;
|
||||
op_mod.alu.use_PC = 1;
|
||||
op_mod.alu.use_imm = 1;
|
||||
op_mod.alu.imm = `SEXT(`IMM_BITS, b_imm);
|
||||
op_args.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 1;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, b_imm);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
|
@ -304,29 +304,32 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_FENCE: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_LSU_FENCE;
|
||||
op_args.lsu.is_store = 0;
|
||||
op_args.lsu.is_float = 0;
|
||||
op_args.lsu.offset = 0;
|
||||
end
|
||||
`INST_SYS : begin
|
||||
if (func3[1:0] != 0) begin
|
||||
ex_type = `EX_SFU;
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_CSR(func3[1:0]));
|
||||
op_mod.csr.addr = u_12;
|
||||
op_mod.csr.use_imm = func3[2];
|
||||
op_args.csr.addr = u_12;
|
||||
op_args.csr.use_imm = func3[2];
|
||||
use_rd = 1;
|
||||
is_wstall = is_fpu_csr; // only stall for FPU CSRs
|
||||
`USED_IREG (rd);
|
||||
if (func3[2]) begin
|
||||
op_mod.csr.imm = rs1;
|
||||
op_args.csr.imm = rs1;
|
||||
end else begin
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
end else begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(s_type);
|
||||
op_mod.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
op_mod.alu.is_w = 0;
|
||||
op_mod.alu.use_imm = 1;
|
||||
op_mod.alu.use_PC = 1;
|
||||
op_mod.alu.imm = `IMM_BITS'd4;
|
||||
op_args.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.use_PC = 1;
|
||||
op_args.alu.imm = `IMM_BITS'd4;
|
||||
use_rd = 1;
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rd);
|
||||
|
@ -338,8 +341,9 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_L: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'({1'b0, func3});
|
||||
op_mod.lsu.is_float = opcode[2];
|
||||
op_mod.lsu.offset = u_12;
|
||||
op_args.lsu.is_store = 0;
|
||||
op_args.lsu.is_float = opcode[2];
|
||||
op_args.lsu.offset = u_12;
|
||||
use_rd = 1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (opcode[2]) begin
|
||||
|
@ -355,8 +359,9 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_S: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'({1'b1, func3});
|
||||
op_mod.lsu.is_float = opcode[2];
|
||||
op_mod.lsu.offset = s_imm;
|
||||
op_args.lsu.is_store = 1;
|
||||
op_args.lsu.is_float = opcode[2];
|
||||
op_args.lsu.offset = s_imm;
|
||||
`USED_IREG (rs1);
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (opcode[2]) begin
|
||||
|
@ -372,8 +377,8 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_FNMADD: begin
|
||||
ex_type = `EX_FPU;
|
||||
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
|
||||
op_mod.fpu.frm = func3;
|
||||
op_mod.fpu.fmt[0] = func2[0]; // float / double
|
||||
op_args.fpu.frm = func3;
|
||||
op_args.fpu.fmt[0] = func2[0]; // float / double
|
||||
use_rd = 1;
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
|
@ -382,9 +387,9 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
end
|
||||
`INST_FCI: begin
|
||||
ex_type = `EX_FPU;
|
||||
op_mod.fpu.frm = func3;
|
||||
op_mod.fpu.fmt[0] = func2[0]; // float / double
|
||||
op_mod.fpu.fmt[1] = rs2[1]; // int32 / int64
|
||||
op_args.fpu.frm = func3;
|
||||
op_args.fpu.fmt[0] = func2[0]; // float / double
|
||||
op_args.fpu.fmt[1] = rs2[1]; // int32 / int64
|
||||
use_rd = 1;
|
||||
case (func5)
|
||||
5'b00000, // FADD
|
||||
|
@ -399,7 +404,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
5'b00100: begin
|
||||
// NCP: FSGNJ=0, FSGNJN=1, FSGNJX=2
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_mod.fpu.frm = `INST_FRM_BITS'(func3[1:0]);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(func3[1:0]);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
|
@ -407,7 +412,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
5'b00101: begin
|
||||
// NCP: FMIN=6, FMAX=7
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_mod.fpu.frm = `INST_FRM_BITS'(func3[0] ? 7 : 6);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(func3[0] ? 7 : 6);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
|
@ -449,11 +454,11 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
if (func3[0]) begin
|
||||
// NCP: FCLASS=3
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_mod.fpu.frm = `INST_FRM_BITS'(3);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(3);
|
||||
end else begin
|
||||
// NCP: FMV.X.W=4
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_mod.fpu.frm = `INST_FRM_BITS'(4);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(4);
|
||||
end
|
||||
`USED_IREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
|
@ -461,7 +466,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
5'b11110: begin
|
||||
// NCP: FMV.W.X=5
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_mod.fpu.frm = `INST_FRM_BITS'(5);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(5);
|
||||
`USED_FREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
|
@ -487,7 +492,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
3'h2: begin // SPLIT
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_SPLIT);
|
||||
use_rd = 1;
|
||||
op_mod.wctl.is_neg = rs2[0];
|
||||
op_args.wctl.is_neg = rs2[0];
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
|
@ -502,7 +507,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
end
|
||||
3'h5: begin // PRED
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_PRED);
|
||||
op_mod.wctl.is_neg = rd[0];
|
||||
op_args.wctl.is_neg = rd[0];
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
|
@ -527,8 +532,8 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
.valid_in (fetch_if.valid),
|
||||
.ready_in (fetch_if.ready),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_mod, wb, rd_r, rs1_r, rs2_r, rs3_r}),
|
||||
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_r, rs1_r, rs2_r, rs3_r}),
|
||||
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
|
||||
.valid_out (decode_if.valid),
|
||||
.ready_out (decode_if.ready)
|
||||
);
|
||||
|
@ -550,10 +555,10 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`TRACE(1, ("%d: core%0d-decode: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, CORE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr));
|
||||
trace_ex_type(1, decode_if.data.ex_type);
|
||||
`TRACE(1, (", op="));
|
||||
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod);
|
||||
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
|
||||
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, opds=%b%b%b%b",
|
||||
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3));
|
||||
trace_op_mod(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod);
|
||||
trace_op_args(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
|
||||
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid));
|
||||
end
|
||||
end
|
||||
|
|
|
@ -31,7 +31,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
|
||||
|
||||
wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
|
@ -124,14 +124,14 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), {operands_if[i].data.PC, 1'b0}));
|
||||
trace_ex_type(1, operands_if[i].data.ex_type);
|
||||
`TRACE(1, (", op="));
|
||||
trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_mod);
|
||||
trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_args);
|
||||
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs1_data, `NUM_THREADS);
|
||||
`TRACE(1, (", rs2_data="));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs2_data, `NUM_THREADS);
|
||||
`TRACE(1, (", rs3_data="));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs3_data, `NUM_THREADS);
|
||||
trace_op_mod(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_mod);
|
||||
trace_op_args(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_args);
|
||||
`TRACE(1, (" (#%0d)\n", operands_if[i].data.uuid));
|
||||
end
|
||||
end
|
||||
|
|
|
@ -38,8 +38,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE;
|
||||
localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
|
||||
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
|
||||
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
|
||||
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_MOD_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
|
||||
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
|
||||
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
|
||||
localparam FANOUT_ENABLE= (`NUM_THREADS > MAX_FANOUT);
|
||||
|
||||
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
|
||||
|
|
|
@ -43,10 +43,7 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
VX_warp_ctl_if.master warp_ctl_if,
|
||||
|
||||
// commit interface
|
||||
VX_commit_csr_if.slave commit_csr_if,
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak
|
||||
VX_commit_csr_if.slave commit_csr_if
|
||||
);
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -114,11 +111,4 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
.warp_ctl_if (warp_ctl_if)
|
||||
);
|
||||
|
||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||
assign sim_ebreak = dispatch_if[0].valid && dispatch_if[0].ready
|
||||
&& dispatch_if[0].data.wis == 0
|
||||
&& (dispatch_if[0].data.op_mod.alu.xtype == `ALU_TYPE_BRANCH)
|
||||
&& (`INST_BR_BITS'(dispatch_if[0].data.op_type) == `INST_BR_EBREAK
|
||||
|| `INST_BR_BITS'(dispatch_if[0].data.op_type) == `INST_BR_ECALL);
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -78,8 +78,8 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
|
||||
wire mdata_full;
|
||||
|
||||
wire [`INST_FMT_BITS-1:0] fpu_fmt = per_block_execute_if[block_idx].data.op_mod.fpu.fmt;
|
||||
wire [`INST_FRM_BITS-1:0] fpu_frm = per_block_execute_if[block_idx].data.op_mod.fpu.frm;
|
||||
wire [`INST_FMT_BITS-1:0] fpu_fmt = per_block_execute_if[block_idx].data.op_args.fpu.fmt;
|
||||
wire [`INST_FRM_BITS-1:0] fpu_frm = per_block_execute_if[block_idx].data.op_args.fpu.frm;
|
||||
|
||||
wire execute_fire = per_block_execute_if[block_idx].valid && per_block_execute_if[block_idx].ready;
|
||||
wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
|
||||
|
|
|
@ -25,7 +25,7 @@ module VX_gpr_slice import VX_gpu_pkg::*; #(
|
|||
VX_operands_if.master operands_if
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + `NR_BITS;
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS;
|
||||
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
|
||||
|
||||
localparam STATE_IDLE = 2'd0;
|
||||
|
@ -210,7 +210,7 @@ module VX_gpr_slice import VX_gpu_pkg::*; #(
|
|||
scoreboard_if.data.wb,
|
||||
scoreboard_if.data.ex_type,
|
||||
scoreboard_if.data.op_type,
|
||||
scoreboard_if.data.op_mod,
|
||||
scoreboard_if.data.op_args,
|
||||
scoreboard_if.data.rd
|
||||
}),
|
||||
.ready_in (stg_ready_in),
|
||||
|
@ -223,7 +223,7 @@ module VX_gpr_slice import VX_gpu_pkg::*; #(
|
|||
operands_if.data.wb,
|
||||
operands_if.data.ex_type,
|
||||
operands_if.data.op_type,
|
||||
operands_if.data.op_mod,
|
||||
operands_if.data.op_args,
|
||||
operands_if.data.rd
|
||||
}),
|
||||
.ready_out (operands_if.ready)
|
||||
|
|
|
@ -26,7 +26,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
VX_ibuffer_if.master ibuffer_if [`NUM_WARPS]
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + (`NR_BITS * 4);
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4);
|
||||
|
||||
wire [`NUM_WARPS-1:0] ibuf_ready_in;
|
||||
|
||||
|
@ -47,7 +47,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
decode_if.data.PC,
|
||||
decode_if.data.ex_type,
|
||||
decode_if.data.op_type,
|
||||
decode_if.data.op_mod,
|
||||
decode_if.data.op_args,
|
||||
decode_if.data.wb,
|
||||
decode_if.data.rd,
|
||||
decode_if.data.rs1,
|
||||
|
|
|
@ -38,29 +38,30 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
|
||||
|
||||
// tag_id = wid + PC + rd + op_type + align + pid + pkt_addr
|
||||
localparam TAG_ID_WIDTH = `NW_WIDTH + `PC_BITS + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW;
|
||||
// tag_id = wid + PC + wb + rd + op_type + align + pid + pkt_addr + fence
|
||||
localparam TAG_ID_WIDTH = `NW_WIDTH + `PC_BITS + 1 + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW + 1;
|
||||
|
||||
// tag = uuid + tag_id
|
||||
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) commit_st_if();
|
||||
) commit_rsp_if();
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) commit_ld_if();
|
||||
) commit_no_rsp_if();
|
||||
|
||||
`UNUSED_VAR (execute_if.data.op_mod)
|
||||
`UNUSED_VAR (execute_if.data.rs3_data)
|
||||
`UNUSED_VAR (execute_if.data.tid)
|
||||
|
||||
// full address calculation
|
||||
|
||||
wire req_is_fence, rsp_is_fence;
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_mod.lsu.offset);
|
||||
assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_args.lsu.offset);
|
||||
end
|
||||
|
||||
// address type calculation
|
||||
|
@ -70,7 +71,9 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
|
||||
// is I/O address
|
||||
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
|
||||
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start);
|
||||
wire [MEM_ADDRW-1:0] io_addr_end = MEM_ADDRW'(`XLEN'(`IO_END_ADDR) >> MEM_ASHIFT);
|
||||
assign mem_req_atype[i][`ADDR_TYPE_FLUSH] = req_is_fence;
|
||||
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
|
||||
`ifdef LMEM_ENABLE
|
||||
// is local memory address
|
||||
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
|
||||
|
@ -79,17 +82,6 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
end
|
||||
|
||||
wire mem_req_empty;
|
||||
wire st_rsp_ready;
|
||||
wire lsu_valid, lsu_ready;
|
||||
|
||||
// fence: stall the pipeline until all pending requests are sent
|
||||
wire is_fence = `INST_LSU_IS_FENCE(execute_if.data.op_type);
|
||||
wire fence_wait = is_fence && ~mem_req_empty;
|
||||
|
||||
assign lsu_valid = execute_if.valid && ~fence_wait;
|
||||
assign execute_if.ready = lsu_ready && ~fence_wait;
|
||||
|
||||
// schedule memory request
|
||||
|
||||
wire mem_req_valid;
|
||||
|
@ -109,18 +101,53 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire mem_rsp_eop;
|
||||
wire mem_rsp_ready;
|
||||
|
||||
assign mem_req_valid = lsu_valid;
|
||||
assign lsu_ready = mem_req_ready
|
||||
&& (~mem_req_rw || st_rsp_ready); // writes commit directly
|
||||
|
||||
assign mem_req_mask = execute_if.data.tmask;
|
||||
assign mem_req_rw = ~execute_if.data.wb;
|
||||
|
||||
wire mem_req_fire = mem_req_valid && mem_req_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
`UNUSED_VAR (mem_req_fire)
|
||||
`UNUSED_VAR (mem_rsp_fire)
|
||||
|
||||
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
|
||||
wire no_rsp_buf_valid, no_rsp_buf_ready;
|
||||
|
||||
// fence handling
|
||||
|
||||
reg fence_lock;
|
||||
|
||||
assign req_is_fence = `INST_LSU_IS_FENCE(execute_if.data.op_type);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
fence_lock <= 0;
|
||||
end else begin
|
||||
if (mem_req_fire && req_is_fence && execute_if.data.eop) begin
|
||||
fence_lock <= 1;
|
||||
end
|
||||
if (mem_rsp_fire && rsp_is_fence && mem_rsp_eop_pkt) begin
|
||||
fence_lock <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire req_skip = req_is_fence && ~execute_if.data.eop;
|
||||
wire no_rsp_buf_enable = (mem_req_rw && ~execute_if.data.wb) || req_skip;
|
||||
|
||||
assign mem_req_valid = execute_if.valid
|
||||
&& ~req_skip
|
||||
&& ~(no_rsp_buf_enable && ~no_rsp_buf_ready)
|
||||
&& ~fence_lock;
|
||||
|
||||
assign no_rsp_buf_valid = execute_if.valid
|
||||
&& no_rsp_buf_enable
|
||||
&& (req_skip || mem_req_ready)
|
||||
&& ~fence_lock;
|
||||
|
||||
assign execute_if.ready = (mem_req_ready || req_skip)
|
||||
&& ~(no_rsp_buf_enable && ~no_rsp_buf_ready)
|
||||
&& ~fence_lock;
|
||||
|
||||
assign mem_req_mask = execute_if.data.tmask;
|
||||
assign mem_req_rw = execute_if.data.op_args.lsu.is_store;
|
||||
|
||||
// address formatting
|
||||
|
||||
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
|
||||
|
@ -158,7 +185,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
// memory misalignment not supported!
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire lsu_req_fire = execute_if.valid && execute_if.ready;
|
||||
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
|
||||
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
|
||||
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
|
||||
execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid));
|
||||
end
|
||||
|
@ -185,13 +212,12 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
// track SOP/EOP for out-of-order memory responses
|
||||
|
||||
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
|
||||
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
|
||||
|
||||
if (PID_BITS != 0) begin
|
||||
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
|
||||
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
|
||||
|
||||
wire mem_req_rd_fire = mem_req_fire && execute_if.data.wb;
|
||||
wire mem_req_rd_fire = mem_req_fire && ~mem_req_rw;
|
||||
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if.data.sop;
|
||||
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if.data.eop;
|
||||
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
|
||||
|
@ -258,10 +284,13 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
execute_if.data.uuid,
|
||||
execute_if.data.wid,
|
||||
execute_if.data.PC,
|
||||
execute_if.data.wb,
|
||||
execute_if.data.rd,
|
||||
execute_if.data.op_type,
|
||||
req_align, execute_if.data.pid,
|
||||
pkt_waddr
|
||||
req_align,
|
||||
execute_if.data.pid,
|
||||
pkt_waddr,
|
||||
req_is_fence
|
||||
};
|
||||
|
||||
wire lsu_mem_req_valid;
|
||||
|
@ -311,7 +340,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.core_req_data (mem_req_data),
|
||||
.core_req_tag (mem_req_tag),
|
||||
.core_req_ready (mem_req_ready),
|
||||
.core_req_empty (mem_req_empty),
|
||||
`UNUSED_PIN (core_req_empty),
|
||||
`UNUSED_PIN (core_req_sent),
|
||||
|
||||
// Output response
|
||||
|
@ -361,6 +390,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire [`UUID_WIDTH-1:0] rsp_uuid;
|
||||
wire [`NW_WIDTH-1:0] rsp_wid;
|
||||
wire [`PC_BITS-1:0] rsp_pc;
|
||||
wire rsp_wb;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire [`INST_LSU_BITS-1:0] rsp_op_type;
|
||||
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
|
||||
|
@ -371,11 +401,14 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
assign {
|
||||
rsp_uuid,
|
||||
rsp_wid,
|
||||
rsp_pc, rsp_rd,
|
||||
rsp_pc,
|
||||
rsp_wb,
|
||||
rsp_rd,
|
||||
rsp_op_type,
|
||||
rsp_align,
|
||||
rsp_pid,
|
||||
pkt_raddr
|
||||
pkt_raddr,
|
||||
rsp_is_fence
|
||||
} = mem_rsp_tag;
|
||||
|
||||
// load response formatting
|
||||
|
@ -419,44 +452,38 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
// load commit
|
||||
// commit
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + 1 + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.SIZE (2)
|
||||
) ld_rsp_buf (
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_rsp_valid),
|
||||
.ready_in (mem_rsp_ready),
|
||||
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
|
||||
.data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
|
||||
.valid_out (commit_ld_if.valid),
|
||||
.ready_out (commit_ld_if.ready)
|
||||
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_wb, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
|
||||
.data_out ({commit_rsp_if.data.uuid, commit_rsp_if.data.wid, commit_rsp_if.data.tmask, commit_rsp_if.data.PC, commit_rsp_if.data.wb, commit_rsp_if.data.rd, commit_rsp_if.data.data, commit_rsp_if.data.pid, commit_rsp_if.data.sop, commit_rsp_if.data.eop}),
|
||||
.valid_out (commit_rsp_if.valid),
|
||||
.ready_out (commit_rsp_if.ready)
|
||||
);
|
||||
|
||||
assign commit_ld_if.data.wb = 1'b1;
|
||||
|
||||
// store commit
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + PID_WIDTH + 1 + 1),
|
||||
.SIZE (2)
|
||||
) st_rsp_buf (
|
||||
) no_rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_req_fire && mem_req_rw),
|
||||
.ready_in (st_rsp_ready),
|
||||
.valid_in (no_rsp_buf_valid),
|
||||
.ready_in (no_rsp_buf_ready),
|
||||
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
|
||||
.data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
|
||||
.valid_out (commit_st_if.valid),
|
||||
.ready_out (commit_st_if.ready)
|
||||
.data_out ({commit_no_rsp_if.data.uuid, commit_no_rsp_if.data.wid, commit_no_rsp_if.data.tmask, commit_no_rsp_if.data.PC, commit_no_rsp_if.data.pid, commit_no_rsp_if.data.sop, commit_no_rsp_if.data.eop}),
|
||||
.valid_out (commit_no_rsp_if.valid),
|
||||
.ready_out (commit_no_rsp_if.ready)
|
||||
);
|
||||
assign commit_st_if.data.rd = '0;
|
||||
assign commit_st_if.data.wb = 1'b0;
|
||||
assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru
|
||||
|
||||
// lsu commit
|
||||
assign commit_no_rsp_if.data.rd = '0;
|
||||
assign commit_no_rsp_if.data.wb = 1'b0;
|
||||
assign commit_no_rsp_if.data.data = commit_rsp_if.data.data; // arbiter MUX optimization
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
|
@ -465,9 +492,9 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({commit_st_if.valid, commit_ld_if.valid}),
|
||||
.ready_in ({commit_st_if.ready, commit_ld_if.ready}),
|
||||
.data_in ({commit_st_if.data, commit_ld_if.data}),
|
||||
.valid_in ({commit_no_rsp_if.valid, commit_rsp_if.valid}),
|
||||
.ready_in ({commit_no_rsp_if.ready, commit_rsp_if.ready}),
|
||||
.data_in ({commit_no_rsp_if.data, commit_rsp_if.data}),
|
||||
.data_out (commit_if.data),
|
||||
.valid_out (commit_if.valid),
|
||||
.ready_out (commit_if.ready),
|
||||
|
@ -476,7 +503,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
always @(posedge clk) begin
|
||||
if (execute_if.valid && fence_wait) begin
|
||||
if (execute_if.valid && fence_lock) begin
|
||||
`TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
|
||||
end
|
||||
if (mem_req_fire) begin
|
||||
|
|
|
@ -338,9 +338,9 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
`ifdef SV_DPI
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 32'd0));
|
||||
end else if (schedule_fire) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid)));
|
||||
end
|
||||
end
|
||||
`else
|
||||
|
|
|
@ -30,7 +30,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
VX_scoreboard_if.master scoreboard_if [`ISSUE_WIDTH]
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + (`NR_BITS * 4) + 1;
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
|
||||
|
@ -319,7 +319,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
scoreboard_if[i].data.PC,
|
||||
scoreboard_if[i].data.ex_type,
|
||||
scoreboard_if[i].data.op_type,
|
||||
scoreboard_if[i].data.op_mod,
|
||||
scoreboard_if[i].data.op_args,
|
||||
scoreboard_if[i].data.wb,
|
||||
scoreboard_if[i].data.rd,
|
||||
scoreboard_if[i].data.rs1,
|
||||
|
|
|
@ -27,16 +27,16 @@
|
|||
endtask
|
||||
|
||||
task trace_ex_op(input int level,
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input VX_gpu_pkg::op_mod_t op_mod
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input VX_gpu_pkg::op_args_t op_args
|
||||
);
|
||||
case (ex_type)
|
||||
`EX_ALU: begin
|
||||
case (op_mod.alu.xtype)
|
||||
case (op_args.alu.xtype)
|
||||
`ALU_TYPE_ARITH: begin
|
||||
if (op_mod.alu.is_w) begin
|
||||
if (op_mod.alu.use_imm) begin
|
||||
if (op_args.alu.is_w) begin
|
||||
if (op_args.alu.use_imm) begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADDIW"));
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLLIW"));
|
||||
|
@ -55,7 +55,7 @@
|
|||
endcase
|
||||
end
|
||||
end else begin
|
||||
if (op_mod.alu.use_imm) begin
|
||||
if (op_args.alu.use_imm) begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADDI"));
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLLI"));
|
||||
|
@ -108,7 +108,7 @@
|
|||
endcase
|
||||
end
|
||||
`ALU_TYPE_MULDIV: begin
|
||||
if (op_mod.alu.is_w) begin
|
||||
if (op_args.alu.is_w) begin
|
||||
case (`INST_M_BITS'(op_type))
|
||||
`INST_M_MUL: `TRACE(level, ("MULW"));
|
||||
`INST_M_DIV: `TRACE(level, ("DIVW"));
|
||||
|
@ -131,10 +131,11 @@
|
|||
endcase
|
||||
end
|
||||
end
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
`EX_LSU: begin
|
||||
if (op_mod.lsu.is_float) begin
|
||||
if (op_args.lsu.is_float) begin
|
||||
case (`INST_LSU_BITS'(op_type))
|
||||
`INST_LSU_LW: `TRACE(level, ("FLW"));
|
||||
`INST_LSU_LD: `TRACE(level, ("FLD"));
|
||||
|
@ -163,69 +164,69 @@
|
|||
`EX_FPU: begin
|
||||
case (`INST_FPU_BITS'(op_type))
|
||||
`INST_FPU_ADD: begin
|
||||
if (op_mod.fpu.fmt[0])
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FADD.D"));
|
||||
else
|
||||
`TRACE(level, ("FADD.S"));
|
||||
end
|
||||
`INST_FPU_SUB: begin
|
||||
if (op_mod.fpu.fmt[0])
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FSUB.D"));
|
||||
else
|
||||
`TRACE(level, ("FSUB.S"));
|
||||
end
|
||||
`INST_FPU_MUL: begin
|
||||
if (op_mod.fpu.fmt[0])
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FMUL.D"));
|
||||
else
|
||||
`TRACE(level, ("FMUL.S"));
|
||||
end
|
||||
`INST_FPU_DIV: begin
|
||||
if (op_mod.fpu.fmt[0])
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FDIV.D"));
|
||||
else
|
||||
`TRACE(level, ("FDIV.S"));
|
||||
end
|
||||
`INST_FPU_SQRT: begin
|
||||
if (op_mod.fpu.fmt[0])
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FSQRT.D"));
|
||||
else
|
||||
`TRACE(level, ("FSQRT.S"));
|
||||
end
|
||||
`INST_FPU_MADD: begin
|
||||
if (op_mod.fpu.fmt[0])
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FMADD.D"));
|
||||
else
|
||||
`TRACE(level, ("FMADD.S"));
|
||||
end
|
||||
`INST_FPU_MSUB: begin
|
||||
if (op_mod.fpu.fmt[0])
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FMSUB.D"));
|
||||
else
|
||||
`TRACE(level, ("FMSUB.S"));
|
||||
end
|
||||
`INST_FPU_NMADD: begin
|
||||
if (op_mod.fpu.fmt[0])
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FNMADD.D"));
|
||||
else
|
||||
`TRACE(level, ("FNMADD.S"));
|
||||
end
|
||||
`INST_FPU_NMSUB: begin
|
||||
if (op_mod.fpu.fmt[0])
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FNMSUB.D"));
|
||||
else
|
||||
`TRACE(level, ("FNMSUB.S"));
|
||||
end
|
||||
`INST_FPU_CMP: begin
|
||||
if (op_mod.fpu.fmt[0]) begin
|
||||
case (op_mod.fpu.frm[1:0])
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
case (op_args.fpu.frm[1:0])
|
||||
0: `TRACE(level, ("FLE.D"));
|
||||
1: `TRACE(level, ("FLT.D"));
|
||||
2: `TRACE(level, ("FEQ.D"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end else begin
|
||||
case (op_mod.fpu.frm[1:0])
|
||||
case (op_args.fpu.frm[1:0])
|
||||
0: `TRACE(level, ("FLE.S"));
|
||||
1: `TRACE(level, ("FLT.S"));
|
||||
2: `TRACE(level, ("FEQ.S"));
|
||||
|
@ -234,21 +235,21 @@
|
|||
end
|
||||
end
|
||||
`INST_FPU_F2F: begin
|
||||
if (op_mod.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FCVT.D.S"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.S.D"));
|
||||
end
|
||||
end
|
||||
`INST_FPU_F2I: begin
|
||||
if (op_mod.fpu.fmt[0]) begin
|
||||
if (op_mod.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.L.D"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.W.D"));
|
||||
end
|
||||
end else begin
|
||||
if (op_mod.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.L.S"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.W.S"));
|
||||
|
@ -256,14 +257,14 @@
|
|||
end
|
||||
end
|
||||
`INST_FPU_F2U: begin
|
||||
if (op_mod.fpu.fmt[0]) begin
|
||||
if (op_mod.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.LU.D"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.WU.D"));
|
||||
end
|
||||
end else begin
|
||||
if (op_mod.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.LU.S"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.WU.S"));
|
||||
|
@ -271,14 +272,14 @@
|
|||
end
|
||||
end
|
||||
`INST_FPU_I2F: begin
|
||||
if (op_mod.fpu.fmt[0]) begin
|
||||
if (op_mod.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.D.L"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.D.W"));
|
||||
end
|
||||
end else begin
|
||||
if (op_mod.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.S.L"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.S.W"));
|
||||
|
@ -286,14 +287,14 @@
|
|||
end
|
||||
end
|
||||
`INST_FPU_U2F: begin
|
||||
if (op_mod.fpu.fmt[0]) begin
|
||||
if (op_mod.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.D.LU"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.D.WU"));
|
||||
end
|
||||
end else begin
|
||||
if (op_mod.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.S.LU"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.S.WU"));
|
||||
|
@ -301,8 +302,8 @@
|
|||
end
|
||||
end
|
||||
`INST_FPU_MISC: begin
|
||||
if (op_mod.fpu.fmt[0]) begin
|
||||
case (op_mod)
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
case (op_args.fpu.frm)
|
||||
0: `TRACE(level, ("FSGNJ.D"));
|
||||
1: `TRACE(level, ("FSGNJN.D"));
|
||||
2: `TRACE(level, ("FSGNJX.D"));
|
||||
|
@ -313,7 +314,7 @@
|
|||
7: `TRACE(level, ("FMAX.D"));
|
||||
endcase
|
||||
end else begin
|
||||
case (op_mod)
|
||||
case (op_args.fpu.frm)
|
||||
0: `TRACE(level, ("FSGNJ.S"));
|
||||
1: `TRACE(level, ("FSGNJN.S"));
|
||||
2: `TRACE(level, ("FSGNJX.S"));
|
||||
|
@ -332,13 +333,13 @@
|
|||
case (`INST_SFU_BITS'(op_type))
|
||||
`INST_SFU_TMC: `TRACE(level, ("TMC"));
|
||||
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"));
|
||||
`INST_SFU_SPLIT: begin if (op_mod.wctl.is_neg) `TRACE(level, ("SPLIT.N")); else `TRACE(level, ("SPLIT")); end
|
||||
`INST_SFU_SPLIT: begin if (op_args.wctl.is_neg) `TRACE(level, ("SPLIT.N")); else `TRACE(level, ("SPLIT")); end
|
||||
`INST_SFU_JOIN: `TRACE(level, ("JOIN"));
|
||||
`INST_SFU_BAR: `TRACE(level, ("BAR"));
|
||||
`INST_SFU_PRED: begin if (op_mod.wctl.is_neg) `TRACE(level, ("PRED.N")); else `TRACE(level, ("PRED")); end
|
||||
`INST_SFU_CSRRW: begin if (op_mod.csr.use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
|
||||
`INST_SFU_CSRRS: begin if (op_mod.csr.use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
|
||||
`INST_SFU_CSRRC: begin if (op_mod.csr.use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
|
||||
`INST_SFU_PRED: begin if (op_args.wctl.is_neg) `TRACE(level, ("PRED.N")); else `TRACE(level, ("PRED")); end
|
||||
`INST_SFU_CSRRW: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
|
||||
`INST_SFU_CSRRS: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
|
||||
`INST_SFU_CSRRC: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
|
@ -346,24 +347,24 @@
|
|||
endcase
|
||||
endtask
|
||||
|
||||
task trace_op_mod(input int level,
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input VX_gpu_pkg::op_mod_t op_mod
|
||||
task trace_op_args(input int level,
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input VX_gpu_pkg::op_args_t op_args
|
||||
);
|
||||
case (ex_type)
|
||||
`EX_ALU: begin
|
||||
`TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_mod.alu.use_PC, op_mod.alu.use_imm, op_mod.alu.imm));
|
||||
`TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm));
|
||||
end
|
||||
`EX_LSU: begin
|
||||
`TRACE(level, (", offset=0x%0h", op_mod.lsu.offset));
|
||||
`TRACE(level, (", offset=0x%0h", op_args.lsu.offset));
|
||||
end
|
||||
`EX_FPU: begin
|
||||
`TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_mod.fpu.fmt, op_mod.fpu.frm));
|
||||
`TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm));
|
||||
end
|
||||
`EX_SFU: begin
|
||||
if (`INST_SFU_IS_CSR(op_type)) begin
|
||||
`TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_mod.csr.addr, op_mod.csr.use_imm, op_mod.csr.imm));
|
||||
`TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm));
|
||||
end
|
||||
end
|
||||
default:;
|
||||
|
|
|
@ -60,7 +60,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
wire [`XLEN-1:0] rs2_data = execute_if.data.rs2_data[tid];
|
||||
`UNUSED_VAR (rs1_data)
|
||||
|
||||
wire not_pred = execute_if.data.op_mod.wctl.is_neg;
|
||||
wire not_pred = execute_if.data.op_args.wctl.is_neg;
|
||||
|
||||
wire [NUM_LANES-1:0] taken;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
|
@ -107,7 +107,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
assign split.is_dvg = has_then && has_else;
|
||||
assign split.then_tmask = taken_tmask;
|
||||
assign split.else_tmask = ntaken_tmask;
|
||||
assign split.next_pc = execute_if.data.PC + 2;
|
||||
assign split.next_pc = execute_if.data.PC + `PC_BITS'(2);
|
||||
|
||||
assign warp_ctl_if.dvstack_wid = execute_if.data.wid;
|
||||
wire [`DV_STACK_SIZEW-1:0] dvstack_ptr;
|
||||
|
|
|
@ -22,7 +22,7 @@ interface VX_decode_if import VX_gpu_pkg::*; ();
|
|||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`EX_BITS-1:0] ex_type;
|
||||
logic [`INST_OP_BITS-1:0] op_type;
|
||||
op_mod_t op_mod;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NR_BITS-1:0] rs1;
|
||||
|
|
|
@ -21,7 +21,7 @@ interface VX_dispatch_if import VX_gpu_pkg::*; ();
|
|||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`INST_ALU_BITS-1:0] op_type;
|
||||
op_mod_t op_mod;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NT_WIDTH-1:0] tid;
|
||||
|
|
|
@ -23,7 +23,7 @@ interface VX_execute_if import VX_gpu_pkg::*; #(
|
|||
logic [NUM_LANES-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`INST_ALU_BITS-1:0] op_type;
|
||||
op_mod_t op_mod;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NT_WIDTH-1:0] tid;
|
||||
|
|
|
@ -21,7 +21,7 @@ interface VX_ibuffer_if import VX_gpu_pkg::*; ();
|
|||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`EX_BITS-1:0] ex_type;
|
||||
logic [`INST_OP_BITS-1:0] op_type;
|
||||
op_mod_t op_mod;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NR_BITS-1:0] rs1;
|
||||
|
|
|
@ -22,7 +22,7 @@ interface VX_operands_if import VX_gpu_pkg::*; ();
|
|||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`EX_BITS-1:0] ex_type;
|
||||
logic [`INST_OP_BITS-1:0] op_type;
|
||||
op_mod_t op_mod;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data;
|
||||
|
|
|
@ -22,7 +22,7 @@ interface VX_scoreboard_if import VX_gpu_pkg::*; ();
|
|||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`EX_BITS-1:0] ex_type;
|
||||
logic [`INST_OP_BITS-1:0] op_type;
|
||||
op_mod_t op_mod;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NR_BITS-1:0] rs1;
|
||||
|
|
|
@ -13,19 +13,17 @@ SRC_DIR := $(VORTEX_HOME)/kernel/src
|
|||
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
|
||||
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
|
||||
LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex -mllvm -vortex-branch-divergence=0
|
||||
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
|
||||
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
|
||||
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0
|
||||
#LLVM_CFLAGS += -Wl,-L$(RISCV_TOOLCHAIN_PATH)/lib/gcc/$(RISCV_PREFIX)/9.2.0
|
||||
#LLVM_CFLAGS += --rtlib=libgcc
|
||||
|
||||
#CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS)
|
||||
#CXX = $(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS)
|
||||
#AR = $(LLVM_VORTEX)/bin/llvm-ar
|
||||
#DP = $(LLVM_VORTEX)/bin/llvm-objdump
|
||||
#CP = $(LLVM_VORTEX)/bin/llvm-objcopy
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
|
||||
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
|
||||
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
|
||||
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
|
||||
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
|
||||
|
@ -34,7 +32,7 @@ CFLAGS += -O3 -mcmodel=medany -fno-exceptions -fdata-sections -ffunction-section
|
|||
CFLAGS += -I$(INC_DIR) -I$(ROOT_DIR)/hw
|
||||
CFLAGS += -DXLEN_$(XLEN)
|
||||
|
||||
PROJECT := libvortexrt
|
||||
PROJECT := libvortex
|
||||
|
||||
SRCS = $(SRC_DIR)/vx_start.S $(SRC_DIR)/vx_syscalls.c $(SRC_DIR)/vx_print.S $(SRC_DIR)/tinyprintf.c $(SRC_DIR)/vx_print.c $(SRC_DIR)/vx_spawn.c $(SRC_DIR)/vx_serial.S $(SRC_DIR)/vx_perf.c
|
||||
|
||||
|
@ -48,9 +46,6 @@ $(PROJECT).dump: $(PROJECT).a
|
|||
%.S.o: $(SRC_DIR)/%.S
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
%.cpp.o: $(SRC_DIR)/%.cpp
|
||||
$(CXX) $(CFLAGS) -c $< -o $@
|
||||
|
||||
%.c.o: $(SRC_DIR)/%.c
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
|
|
|
@ -17,9 +17,17 @@
|
|||
#ifndef __VX_INTRINSICS_H__
|
||||
#define __VX_INTRINSICS_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
#if __riscv_xlen == 64
|
||||
typedef unsigned long size_t; // 64-bit RISC-V
|
||||
#elif __riscv_xlen == 32
|
||||
typedef unsigned int size_t; // 32-bit RISC-V
|
||||
#else
|
||||
#error "Unknown RISC-V architecture"
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#define __UNIFORM__ __attribute__((annotate("vortex.uniform")))
|
||||
#else
|
||||
|
@ -103,17 +111,17 @@ extern "C" {
|
|||
|
||||
// Set thread mask
|
||||
inline void vx_tmc(size_t thread_mask) {
|
||||
asm volatile (".insn r %0, 0, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(thread_mask));
|
||||
__asm__ volatile (".insn r %0, 0, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(thread_mask));
|
||||
}
|
||||
|
||||
// disable all threads in the current warp
|
||||
inline void vx_tmc_zero() {
|
||||
asm volatile (".insn r %0, 0, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0));
|
||||
__asm__ volatile (".insn r %0, 0, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0));
|
||||
}
|
||||
|
||||
// switch execution to single thread zero
|
||||
inline void vx_tmc_one() {
|
||||
asm volatile (
|
||||
__asm__ volatile (
|
||||
"li a0, 1\n\t" // Load immediate value 1 into a0 (x10) register
|
||||
".insn r %0, 0, 0, x0, a0, x0" :: "i"(RISCV_CUSTOM0) : "a0"
|
||||
);
|
||||
|
@ -121,116 +129,109 @@ inline void vx_tmc_one() {
|
|||
|
||||
// Set thread predicate
|
||||
inline void vx_pred(int condition, int thread_mask) {
|
||||
asm volatile (".insn r %0, 5, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
|
||||
__asm__ volatile (".insn r %0, 5, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
|
||||
}
|
||||
|
||||
// Set thread not predicate
|
||||
inline void vx_pred_n(int condition, int thread_mask) {
|
||||
asm volatile (".insn r %0, 5, 0, x1, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
|
||||
__asm__ volatile (".insn r %0, 5, 0, x1, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
|
||||
}
|
||||
|
||||
// Spawn warps
|
||||
typedef void (*vx_wspawn_pfn)();
|
||||
inline void vx_wspawn(size_t num_warps, vx_wspawn_pfn func_ptr) {
|
||||
asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
|
||||
__asm__ volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
|
||||
}
|
||||
|
||||
// Split on a predicate
|
||||
inline int vx_split(int predicate) {
|
||||
size_t ret;
|
||||
asm volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
|
||||
__asm__ volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Split on a not predicate
|
||||
inline int vx_split_n(int predicate) {
|
||||
size_t ret;
|
||||
asm volatile (".insn r %1, 2, 0, %0, %2, x1" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
|
||||
__asm__ volatile (".insn r %1, 2, 0, %0, %2, x1" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Join
|
||||
inline void vx_join(int stack_ptr) {
|
||||
asm volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
|
||||
__asm__ volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
|
||||
}
|
||||
|
||||
// Warp Barrier
|
||||
inline void vx_barrier(int barried_id, int num_warps) {
|
||||
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(num_warps));
|
||||
__asm__ volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(num_warps));
|
||||
}
|
||||
|
||||
// Return current thread identifier
|
||||
inline int vx_thread_id() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID));
|
||||
__asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return current warp identifier
|
||||
inline int vx_warp_id() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID));
|
||||
__asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return current core identifier
|
||||
inline int vx_core_id() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID));
|
||||
__asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return active threads mask
|
||||
inline int vx_active_threads() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_ACTIVE_THREADS));
|
||||
__asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_ACTIVE_THREADS));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return active warps mask
|
||||
inline int vx_active_warps() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_ACTIVE_WARPS));
|
||||
__asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_ACTIVE_WARPS));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the number of threads per warp
|
||||
inline int vx_num_threads() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS));
|
||||
__asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the number of warps per core
|
||||
inline int vx_num_warps() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
|
||||
__asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the number of cores per cluster
|
||||
inline int vx_num_cores() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the number of barriers
|
||||
inline int vx_num_barriers() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_BARRIERS));
|
||||
__asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the hart identifier (thread id across the processor)
|
||||
inline int vx_hart_id() {
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID));
|
||||
__asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline void vx_fence() {
|
||||
asm volatile ("fence iorw, iorw");
|
||||
__asm__ volatile ("fence iorw, iorw");
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
|
|
@ -14,24 +14,49 @@
|
|||
#ifndef __VX_SPAWN_H__
|
||||
#define __VX_SPAWN_H__
|
||||
|
||||
#include <vx_intrinsics.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);
|
||||
typedef union {
|
||||
struct {
|
||||
uint32_t x;
|
||||
uint32_t y;
|
||||
uint32_t z;
|
||||
};
|
||||
uint32_t m[3];
|
||||
} dim3_t;
|
||||
|
||||
typedef void (*vx_spawn_task_groups_cb)(int local_task_id, int group_id, int local_group_id, int warps_per_group, void *arg);
|
||||
extern __thread dim3_t blockIdx;
|
||||
extern __thread dim3_t threadIdx;
|
||||
extern dim3_t gridDim;
|
||||
extern dim3_t blockDim;
|
||||
|
||||
extern __thread uint32_t __local_group_id;
|
||||
extern uint32_t __warps_per_group;
|
||||
|
||||
typedef void (*vx_kernel_func_cb)(void *arg);
|
||||
|
||||
typedef void (*vx_serial_cb)(void *arg);
|
||||
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
|
||||
#define __local_mem(size) \
|
||||
(void*)((int8_t*)csr_read(VX_CSR_LOCAL_MEM_BASE) + __local_group_id * size)
|
||||
|
||||
void vx_spawn_task_groups(int num_groups, int group_size, vx_spawn_task_groups_cb callback, void * arg);
|
||||
#define __syncthreads() \
|
||||
vx_barrier(__local_group_id, __warps_per_group)
|
||||
|
||||
void vx_serial(vx_serial_cb callback, void * arg);
|
||||
// launch a kernel function with a grid of blocks and block of threads
|
||||
int vx_spawn_threads(uint32_t dimension,
|
||||
const uint32_t* grid_dim,
|
||||
const uint32_t* block_dim,
|
||||
vx_kernel_func_cb kernel_func,
|
||||
const void* arg);
|
||||
|
||||
// function call serialization
|
||||
void vx_serial(vx_serial_cb callback, const void * arg);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -10,6 +10,34 @@ ENTRY(_start)
|
|||
SECTIONS
|
||||
{
|
||||
. = STARTUP_ADDR;
|
||||
/DISCARD/ : {
|
||||
*(.interp)
|
||||
*(.dynsym)
|
||||
*(.dynstr)
|
||||
*(.gnu.version)
|
||||
*(.gnu.version_d)
|
||||
*(.gnu.version_r)
|
||||
*(.rela.dyn)
|
||||
*(.rela.init)
|
||||
*(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
|
||||
*(.rela.fini)
|
||||
*(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
|
||||
*(.rela.data.rel.ro .rela.data.rel.ro.* .rela.gnu.linkonce.d.rel.ro.*)
|
||||
*(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
|
||||
*(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
|
||||
*(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
|
||||
*(.rela.ctors)
|
||||
*(.rela.dtors)
|
||||
*(.rela.got)
|
||||
*(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*)
|
||||
*(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*)
|
||||
*(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*)
|
||||
*(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*)
|
||||
*(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
|
||||
*(.rela.iplt)
|
||||
*(.rela.plt)
|
||||
*(.dynamic)
|
||||
}
|
||||
.interp : { *(.interp) }
|
||||
.note.gnu.build-id : { *(.note.gnu.build-id) }
|
||||
.hash : { *(.hash) }
|
||||
|
@ -19,32 +47,6 @@ SECTIONS
|
|||
.gnu.version : { *(.gnu.version) }
|
||||
.gnu.version_d : { *(.gnu.version_d) }
|
||||
.gnu.version_r : { *(.gnu.version_r) }
|
||||
.rela.init : { *(.rela.init) }
|
||||
.rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) }
|
||||
.rela.fini : { *(.rela.fini) }
|
||||
.rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) }
|
||||
.rela.data.rel.ro : { *(.rela.data.rel.ro .rela.data.rel.ro.* .rela.gnu.linkonce.d.rel.ro.*) }
|
||||
.rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) }
|
||||
.rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) }
|
||||
.rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) }
|
||||
.rela.ctors : { *(.rela.ctors) }
|
||||
.rela.dtors : { *(.rela.dtors) }
|
||||
.rela.got : { *(.rela.got) }
|
||||
.rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) }
|
||||
.rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) }
|
||||
.rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) }
|
||||
.rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) }
|
||||
.rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) }
|
||||
.rela.iplt :
|
||||
{
|
||||
PROVIDE_HIDDEN (__rela_iplt_start = .);
|
||||
*(.rela.iplt)
|
||||
PROVIDE_HIDDEN (__rela_iplt_end = .);
|
||||
}
|
||||
.rela.plt :
|
||||
{
|
||||
*(.rela.plt)
|
||||
}
|
||||
.init :
|
||||
{
|
||||
KEEP (*(SORT_NONE(.init)))
|
||||
|
@ -98,15 +100,14 @@ SECTIONS
|
|||
PROVIDE_HIDDEN (__tdata_end = .);
|
||||
}
|
||||
PROVIDE (__tdata_size = SIZEOF (.tdata));
|
||||
.tbss :
|
||||
.tbss :
|
||||
{
|
||||
PROVIDE_HIDDEN (__tbss_start = .);
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
|
||||
PROVIDE_HIDDEN (__tbss_end = .);
|
||||
}
|
||||
PROVIDE (__tbss_size = SIZEOF (.tbss));
|
||||
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
|
||||
.preinit_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__preinit_array_start = .);
|
||||
|
|
|
@ -10,6 +10,34 @@ ENTRY(_start)
|
|||
SECTIONS
|
||||
{
|
||||
. = STARTUP_ADDR;
|
||||
/DISCARD/ : {
|
||||
*(.interp)
|
||||
*(.dynsym)
|
||||
*(.dynstr)
|
||||
*(.gnu.version)
|
||||
*(.gnu.version_d)
|
||||
*(.gnu.version_r)
|
||||
*(.rela.dyn)
|
||||
*(.rela.init)
|
||||
*(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
|
||||
*(.rela.fini)
|
||||
*(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
|
||||
*(.rela.data.rel.ro .rela.data.rel.ro.* .rela.gnu.linkonce.d.rel.ro.*)
|
||||
*(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
|
||||
*(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
|
||||
*(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
|
||||
*(.rela.ctors)
|
||||
*(.rela.dtors)
|
||||
*(.rela.got)
|
||||
*(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*)
|
||||
*(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*)
|
||||
*(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*)
|
||||
*(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*)
|
||||
*(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
|
||||
*(.rela.iplt)
|
||||
*(.rela.plt)
|
||||
*(.dynamic)
|
||||
}
|
||||
.interp : { *(.interp) }
|
||||
.note.gnu.build-id : { *(.note.gnu.build-id) }
|
||||
.hash : { *(.hash) }
|
||||
|
@ -19,32 +47,6 @@ SECTIONS
|
|||
.gnu.version : { *(.gnu.version) }
|
||||
.gnu.version_d : { *(.gnu.version_d) }
|
||||
.gnu.version_r : { *(.gnu.version_r) }
|
||||
.rela.init : { *(.rela.init) }
|
||||
.rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) }
|
||||
.rela.fini : { *(.rela.fini) }
|
||||
.rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) }
|
||||
.rela.data.rel.ro : { *(.rela.data.rel.ro .rela.data.rel.ro.* .rela.gnu.linkonce.d.rel.ro.*) }
|
||||
.rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) }
|
||||
.rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) }
|
||||
.rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) }
|
||||
.rela.ctors : { *(.rela.ctors) }
|
||||
.rela.dtors : { *(.rela.dtors) }
|
||||
.rela.got : { *(.rela.got) }
|
||||
.rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) }
|
||||
.rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) }
|
||||
.rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) }
|
||||
.rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) }
|
||||
.rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) }
|
||||
.rela.iplt :
|
||||
{
|
||||
PROVIDE_HIDDEN (__rela_iplt_start = .);
|
||||
*(.rela.iplt)
|
||||
PROVIDE_HIDDEN (__rela_iplt_end = .);
|
||||
}
|
||||
.rela.plt :
|
||||
{
|
||||
*(.rela.plt)
|
||||
}
|
||||
.init :
|
||||
{
|
||||
KEEP (*(SORT_NONE(.init)))
|
||||
|
@ -98,15 +100,14 @@ SECTIONS
|
|||
PROVIDE_HIDDEN (__tdata_end = .);
|
||||
}
|
||||
PROVIDE (__tdata_size = SIZEOF (.tdata));
|
||||
.tbss :
|
||||
.tbss :
|
||||
{
|
||||
PROVIDE_HIDDEN (__tbss_start = .);
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
|
||||
PROVIDE_HIDDEN (__tbss_end = .);
|
||||
}
|
||||
PROVIDE (__tbss_size = SIZEOF (.tbss));
|
||||
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
|
||||
.preinit_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__preinit_array_start = .);
|
||||
|
|
|
@ -42,7 +42,7 @@ def get_vma_size(elf_file):
|
|||
min_vma = min(min_vma, vma)
|
||||
max_vma = max(max_vma, end_vma)
|
||||
vma_size = max_vma - min_vma
|
||||
#print("vma={0:x}, size={1}, min_vma={2:x}, max_vma={3:x}, vma_size={4}".format(vma, size, min_vma, max_vma, vma_size))
|
||||
#print("vma={0:x}, size={1}, min_vma=0x{2:x}, max_vma=0x{3:x}, vma_size={4}".format(vma, size, min_vma, max_vma, vma_size))
|
||||
|
||||
return min_vma, max_vma
|
||||
|
||||
|
@ -73,7 +73,7 @@ def create_vxbin_binary(input_elf, output_bin, objcopy_path):
|
|||
|
||||
# Remove the temporary binary file
|
||||
os.remove(temp_bin_path)
|
||||
print("Binary created successfully: {}, min_vma={:x}, max_vma={:x}".format(output_bin, min_vma, max_vma))
|
||||
# print("Binary created successfully: {}, min_vma={:x}, max_vma={:x}".format(output_bin, min_vma, max_vma))
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) != 3:
|
||||
|
|
27
kernel/src/common.h
Normal file
27
kernel/src/common.h
Normal file
|
@ -0,0 +1,27 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#define RISCV_CUSTOM0 0x0B
|
||||
|
||||
#define IO_MPM_EXITCODE (IO_MPM_ADDR + 8)
|
||||
|
||||
#ifdef XLEN_64
|
||||
#define LOAD_IMMEDIATE64(rd, imm) \
|
||||
li t0, (imm >> 32); \
|
||||
slli t0, t0, 32; \
|
||||
li rd, (imm & 0xffffffff); \
|
||||
or rd, rd, t0
|
||||
#else
|
||||
#define LOAD_IMMEDIATE64(rd, imm) \
|
||||
li rd, imm
|
||||
#endif
|
|
@ -34,7 +34,7 @@ void vx_perf_dump() {
|
|||
int core_id = vx_core_id();
|
||||
uint32_t * const csr_mem = (uint32_t*)(IO_MPM_ADDR + 64 * sizeof(uint32_t) * core_id);
|
||||
DUMP_CSRS(0);
|
||||
DUMP_CSRS(1);
|
||||
//DUMP_CSRS(1); reserved for exitcode
|
||||
DUMP_CSRS(2);
|
||||
DUMP_CSRS(3);
|
||||
DUMP_CSRS(4);
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -13,20 +13,14 @@
|
|||
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
#include "common.h"
|
||||
|
||||
.type vx_putchar, @function
|
||||
.global vx_putchar
|
||||
vx_putchar:
|
||||
csrr t0, VX_CSR_MHARTID
|
||||
andi t0, t0, %lo(IO_COUT_SIZE-1)
|
||||
#if (XLEN == 64)
|
||||
li t1, (IO_COUT_ADDR >> 32)
|
||||
slli t1, t1, 32
|
||||
li t2, (IO_COUT_ADDR & 0xffffffff)
|
||||
or t1, t1, t2
|
||||
#else
|
||||
li t1, IO_COUT_ADDR
|
||||
#endif
|
||||
add t0, t0, t1
|
||||
add t0, t0, t1
|
||||
sb a0, 0(t0)
|
||||
ret
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -41,14 +41,14 @@ typedef struct {
|
|||
int precision;
|
||||
} putfloat_arg_t;
|
||||
|
||||
static void __putint_cb(const putint_arg_t* arg) {
|
||||
static void __putint_cb(const putint_arg_t* arg) {
|
||||
char tmp[33];
|
||||
float value = arg->value;
|
||||
int base = arg->base;
|
||||
itoa(value, tmp, base);
|
||||
for (int i = 0; i < 33; ++i) {
|
||||
int c = tmp[i];
|
||||
if (!c)
|
||||
if (!c)
|
||||
break;
|
||||
vx_putchar(c);
|
||||
}
|
||||
|
@ -58,13 +58,13 @@ static void __putfloat_cb(const putfloat_arg_t* arg) {
|
|||
float value = arg->value;
|
||||
int precision = arg->precision;
|
||||
int ipart = (int)value;
|
||||
vx_putint(ipart, 10);
|
||||
if (precision != 0) {
|
||||
vx_putchar('.');
|
||||
vx_putint(ipart, 10);
|
||||
if (precision != 0) {
|
||||
vx_putchar('.');
|
||||
float frac = value - (float)ipart;
|
||||
float fscaled = frac * pow(10, precision);
|
||||
vx_putint((int)fscaled, 10);
|
||||
}
|
||||
float fscaled = frac * pow(10, precision);
|
||||
vx_putint((int)fscaled, 10);
|
||||
}
|
||||
}
|
||||
|
||||
static void __vprintf_cb(printf_arg_t* arg) {
|
||||
|
@ -90,7 +90,7 @@ int vx_vprintf(const char* format, va_list va) {
|
|||
arg.format = format;
|
||||
arg.va = &va;
|
||||
vx_serial((vx_serial_cb)__vprintf_cb, &arg);
|
||||
return arg.ret;
|
||||
return arg.ret;
|
||||
}
|
||||
|
||||
int vx_printf(const char * format, ...) {
|
||||
|
@ -98,8 +98,8 @@ int vx_printf(const char * format, ...) {
|
|||
va_list va;
|
||||
va_start(va, format);
|
||||
ret = vx_vprintf(format, va);
|
||||
va_end(va);
|
||||
return ret;
|
||||
va_end(va);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -18,7 +18,7 @@
|
|||
|
||||
.type vx_serial, @function
|
||||
.global vx_serial
|
||||
vx_serial:
|
||||
vx_serial:
|
||||
#if (XLEN == 64)
|
||||
addi sp, sp, -56
|
||||
sd ra, 48(sp)
|
||||
|
@ -41,7 +41,7 @@ vx_serial:
|
|||
mv s4, a0 # s4 <- callback
|
||||
mv s3, a1 # s3 <- arg
|
||||
csrr s2, VX_CSR_NUM_THREADS # s2 <- NT
|
||||
csrr s1, VX_CSR_THREAD_ID # s1 <- tid
|
||||
csrr s1, VX_CSR_THREAD_ID # s1 <- tid
|
||||
li s0, 0 # s0 <- index
|
||||
label_loop:
|
||||
sub t0, s0, s1
|
||||
|
@ -72,6 +72,5 @@ label_join:
|
|||
lw s1, 4(sp)
|
||||
lw s0, 0(sp)
|
||||
addi sp, sp, 28
|
||||
#endif
|
||||
#endif
|
||||
ret
|
||||
|
||||
|
|
|
@ -13,7 +13,6 @@
|
|||
|
||||
#include <vx_spawn.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <inttypes.h>
|
||||
#include <vx_print.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -24,267 +23,302 @@ extern "C" {
|
|||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
__thread dim3_t blockIdx;
|
||||
__thread dim3_t threadIdx;
|
||||
dim3_t gridDim;
|
||||
dim3_t blockDim;
|
||||
|
||||
__thread uint32_t __local_group_id;
|
||||
uint32_t __warps_per_group;
|
||||
|
||||
typedef struct {
|
||||
vx_spawn_tasks_cb callback;
|
||||
void* arg;
|
||||
int all_tasks_offset;
|
||||
int remain_tasks_offset;
|
||||
int warp_batches;
|
||||
int remaining_warps;
|
||||
} wspawn_tasks_args_t;
|
||||
vx_kernel_func_cb callback;
|
||||
const void* arg;
|
||||
uint32_t group_offset;
|
||||
uint32_t warp_batches;
|
||||
uint32_t remaining_warps;
|
||||
uint32_t warps_per_group;
|
||||
uint32_t groups_per_core;
|
||||
uint32_t remaining_mask;
|
||||
} wspawn_groups_args_t;
|
||||
|
||||
static void __attribute__ ((noinline)) process_all_tasks() {
|
||||
wspawn_tasks_args_t* targs = (wspawn_tasks_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
typedef struct {
|
||||
vx_kernel_func_cb callback;
|
||||
const void* arg;
|
||||
uint32_t all_tasks_offset;
|
||||
uint32_t remain_tasks_offset;
|
||||
uint32_t warp_batches;
|
||||
uint32_t remaining_warps;
|
||||
} wspawn_threads_args_t;
|
||||
|
||||
int threads_per_warp = vx_num_threads();
|
||||
int warp_id = vx_warp_id();
|
||||
int thread_id = vx_thread_id();
|
||||
static void __attribute__ ((noinline)) process_threads() {
|
||||
wspawn_threads_args_t* targs = (wspawn_threads_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
|
||||
int start_warp = (warp_id * targs->warp_batches) + MIN(warp_id, targs->remaining_warps);
|
||||
int iterations = targs->warp_batches + (warp_id < targs->remaining_warps);
|
||||
uint32_t threads_per_warp = vx_num_threads();
|
||||
uint32_t warp_id = vx_warp_id();
|
||||
uint32_t thread_id = vx_thread_id();
|
||||
|
||||
int start_task_id = targs->all_tasks_offset + (start_warp * threads_per_warp) + thread_id;
|
||||
int end_task_id = start_task_id + iterations * threads_per_warp;
|
||||
uint32_t start_warp = (warp_id * targs->warp_batches) + MIN(warp_id, targs->remaining_warps);
|
||||
uint32_t iterations = targs->warp_batches + (warp_id < targs->remaining_warps);
|
||||
|
||||
vx_spawn_tasks_cb callback = targs->callback;
|
||||
void* arg = targs->arg;
|
||||
for (int task_id = start_task_id; task_id < end_task_id; task_id += threads_per_warp) {
|
||||
callback(task_id, arg);
|
||||
uint32_t start_task_id = targs->all_tasks_offset + (start_warp * threads_per_warp) + thread_id;
|
||||
uint32_t end_task_id = start_task_id + iterations * threads_per_warp;
|
||||
|
||||
__local_group_id = 0;
|
||||
threadIdx.x = 0;
|
||||
threadIdx.y = 0;
|
||||
threadIdx.z = 0;
|
||||
|
||||
vx_kernel_func_cb callback = targs->callback;
|
||||
const void* arg = targs->arg;
|
||||
|
||||
for (uint32_t task_id = start_task_id; task_id < end_task_id; task_id += threads_per_warp) {
|
||||
blockIdx.x = task_id % gridDim.x;
|
||||
blockIdx.y = (task_id / gridDim.x) % gridDim.y;
|
||||
blockIdx.z = task_id / (gridDim.x * gridDim.y);
|
||||
callback((void*)arg);
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) process_remaining_tasks() {
|
||||
wspawn_tasks_args_t* targs = (wspawn_tasks_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
static void __attribute__ ((noinline)) process_remaining_threads() {
|
||||
wspawn_threads_args_t* targs = (wspawn_threads_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
|
||||
int thread_id = vx_thread_id();
|
||||
int task_id = targs->remain_tasks_offset + thread_id;
|
||||
uint32_t thread_id = vx_thread_id();
|
||||
uint32_t task_id = targs->remain_tasks_offset + thread_id;
|
||||
|
||||
(targs->callback)(task_id, targs->arg);
|
||||
(targs->callback)((void*)targs->arg);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) process_all_tasks_stub() {
|
||||
static void __attribute__ ((noinline)) process_threads_stub() {
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
// process all tasks
|
||||
process_all_tasks();
|
||||
process_threads();
|
||||
|
||||
// disable warp
|
||||
vx_tmc_zero();
|
||||
}
|
||||
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
|
||||
// device specifications
|
||||
int num_cores = vx_num_cores();
|
||||
int warps_per_core = vx_num_warps();
|
||||
int threads_per_warp = vx_num_threads();
|
||||
int core_id = vx_core_id();
|
||||
static void __attribute__ ((noinline)) process_thread_groups() {
|
||||
wspawn_groups_args_t* targs = (wspawn_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
|
||||
// calculate necessary active cores
|
||||
int threads_per_core = warps_per_core * threads_per_warp;
|
||||
int needed_cores = (num_tasks + threads_per_core - 1) / threads_per_core;
|
||||
int active_cores = MIN(needed_cores, num_cores);
|
||||
uint32_t threads_per_warp = vx_num_threads();
|
||||
uint32_t warp_id = vx_warp_id();
|
||||
uint32_t thread_id = vx_thread_id();
|
||||
|
||||
// only active cores participate
|
||||
if (core_id >= active_cores)
|
||||
return;
|
||||
uint32_t warps_per_group = targs->warps_per_group;
|
||||
uint32_t groups_per_core = targs->groups_per_core;
|
||||
|
||||
// number of tasks per core
|
||||
int tasks_per_core = num_tasks / active_cores;
|
||||
int remaining_tasks_per_core = num_tasks - tasks_per_core * active_cores;
|
||||
if (core_id < remaining_tasks_per_core)
|
||||
tasks_per_core++;
|
||||
uint32_t iterations = targs->warp_batches + (warp_id < targs->remaining_warps);
|
||||
|
||||
// calculate number of warps to activate
|
||||
int total_warps_per_core = tasks_per_core / threads_per_warp;
|
||||
int remaining_tasks = tasks_per_core - total_warps_per_core * threads_per_warp;
|
||||
int active_warps = total_warps_per_core;
|
||||
int warp_batches = 1, remaining_warps = 0;
|
||||
if (active_warps > warps_per_core) {
|
||||
active_warps = warps_per_core;
|
||||
warp_batches = total_warps_per_core / active_warps;
|
||||
remaining_warps = total_warps_per_core - warp_batches * active_warps;
|
||||
}
|
||||
uint32_t local_group_id = warp_id / warps_per_group;
|
||||
uint32_t group_warp_id = warp_id - local_group_id * warps_per_group;
|
||||
uint32_t local_task_id = group_warp_id * threads_per_warp + thread_id;
|
||||
|
||||
// calculate offsets for task distribution
|
||||
int all_tasks_offset = core_id * tasks_per_core + MIN(core_id, remaining_tasks_per_core);
|
||||
int remain_tasks_offset = all_tasks_offset + (tasks_per_core - remaining_tasks);
|
||||
uint32_t start_group = targs->group_offset + local_group_id;
|
||||
uint32_t end_group = start_group + iterations * groups_per_core;
|
||||
|
||||
// prepare scheduler arguments
|
||||
wspawn_tasks_args_t wspawn_args = {
|
||||
callback,
|
||||
arg,
|
||||
all_tasks_offset,
|
||||
remain_tasks_offset,
|
||||
warp_batches,
|
||||
remaining_warps
|
||||
};
|
||||
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
|
||||
__local_group_id = local_group_id;
|
||||
|
||||
if (active_warps >= 1) {
|
||||
// execute callback on other warps
|
||||
vx_wspawn(active_warps, process_all_tasks_stub);
|
||||
threadIdx.x = local_task_id % blockDim.x;
|
||||
threadIdx.y = (local_task_id / blockDim.x) % blockDim.y;
|
||||
threadIdx.z = local_task_id / (blockDim.x * blockDim.y);
|
||||
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
vx_kernel_func_cb callback = targs->callback;
|
||||
const void* arg = targs->arg;
|
||||
|
||||
// process all tasks
|
||||
process_all_tasks();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
|
||||
if (remaining_tasks != 0) {
|
||||
// activate remaining threads
|
||||
int tmask = (1 << remaining_tasks) - 1;
|
||||
vx_tmc(tmask);
|
||||
|
||||
// process remaining tasks
|
||||
process_remaining_tasks();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
|
||||
// wait for spawned tasks to complete
|
||||
vx_wspawn(1, 0);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef struct {
|
||||
vx_spawn_task_groups_cb callback;
|
||||
void* arg;
|
||||
int group_offset;
|
||||
int warp_batches;
|
||||
int remaining_warps;
|
||||
int warps_per_group;
|
||||
int groups_per_core;
|
||||
int remaining_mask;
|
||||
} wspawn_task_groups_args_t;
|
||||
|
||||
static void __attribute__ ((noinline)) process_all_task_groups() {
|
||||
wspawn_task_groups_args_t* targs = (wspawn_task_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
|
||||
int warps_per_group = targs->warps_per_group;
|
||||
int groups_per_core = targs->groups_per_core;
|
||||
|
||||
int threads_per_warp = vx_num_threads();
|
||||
int warp_id = vx_warp_id();
|
||||
int thread_id = vx_thread_id();
|
||||
|
||||
int iterations = targs->warp_batches + (warp_id < targs->remaining_warps);
|
||||
|
||||
int local_group_id = warp_id / warps_per_group;
|
||||
int group_warp_id = warp_id - local_group_id * warps_per_group;
|
||||
int local_task_id = group_warp_id * threads_per_warp + thread_id;
|
||||
|
||||
int start_group = targs->group_offset + local_group_id;
|
||||
int end_group = start_group + iterations * groups_per_core;
|
||||
|
||||
vx_spawn_task_groups_cb callback = targs->callback;
|
||||
void* arg = targs->arg;
|
||||
|
||||
for (int group_id = start_group; group_id < end_group; group_id += groups_per_core) {
|
||||
callback(local_task_id, group_id, start_group, warps_per_group, arg);
|
||||
for (uint32_t group_id = start_group; group_id < end_group; group_id += groups_per_core) {
|
||||
blockIdx.x = group_id % gridDim.x;
|
||||
blockIdx.y = (group_id / gridDim.x) % gridDim.y;
|
||||
blockIdx.z = group_id / (gridDim.x * gridDim.y);
|
||||
callback((void*)arg);
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) process_all_task_groups_stub() {
|
||||
wspawn_task_groups_args_t* targs = (wspawn_task_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
int warps_per_group = targs->warps_per_group;
|
||||
int remaining_mask = targs->remaining_mask;
|
||||
int warp_id = vx_warp_id();
|
||||
int group_warp_id = warp_id % warps_per_group;
|
||||
int threads_mask = (group_warp_id == warps_per_group-1) ? remaining_mask : -1;
|
||||
static void __attribute__ ((noinline)) process_thread_groups_stub() {
|
||||
wspawn_groups_args_t* targs = (wspawn_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
uint32_t warps_per_group = targs->warps_per_group;
|
||||
uint32_t remaining_mask = targs->remaining_mask;
|
||||
uint32_t warp_id = vx_warp_id();
|
||||
uint32_t group_warp_id = warp_id % warps_per_group;
|
||||
uint32_t threads_mask = (group_warp_id == warps_per_group-1) ? remaining_mask : -1;
|
||||
|
||||
// activate threads
|
||||
vx_tmc(threads_mask);
|
||||
|
||||
// process all tasks
|
||||
process_all_task_groups();
|
||||
// process thread groups
|
||||
process_thread_groups();
|
||||
|
||||
// disable all warps except warp0
|
||||
vx_tmc(0 == vx_warp_id());
|
||||
}
|
||||
|
||||
void vx_syncthreads(int barrier_id) {
|
||||
wspawn_task_groups_args_t* targs = (wspawn_task_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
int warps_per_group = targs->warps_per_group;
|
||||
vx_barrier(barrier_id, warps_per_group);
|
||||
}
|
||||
int vx_spawn_threads(uint32_t dimension,
|
||||
const uint32_t* grid_dim,
|
||||
const uint32_t * block_dim,
|
||||
vx_kernel_func_cb kernel_func,
|
||||
const void* arg) {
|
||||
// calculate number of groups and group size
|
||||
uint32_t num_groups = 1;
|
||||
uint32_t group_size = 1;
|
||||
for (uint32_t i = 0; i < 3; ++i) {
|
||||
uint32_t gd = (grid_dim && (i < dimension)) ? grid_dim[i] : 1;
|
||||
uint32_t bd = (block_dim && (i < dimension)) ? block_dim[i] : 1;
|
||||
num_groups *= gd;
|
||||
group_size *= bd;
|
||||
gridDim.m[i] = gd;
|
||||
blockDim.m[i] = bd;
|
||||
}
|
||||
|
||||
void vx_spawn_task_groups(int num_groups, int group_size, vx_spawn_task_groups_cb callback, void * arg) {
|
||||
// device specifications
|
||||
int num_cores = vx_num_cores();
|
||||
int warps_per_core = vx_num_warps();
|
||||
int threads_per_warp = vx_num_threads();
|
||||
int core_id = vx_core_id();
|
||||
uint32_t num_cores = vx_num_cores();
|
||||
uint32_t warps_per_core = vx_num_warps();
|
||||
uint32_t threads_per_warp = vx_num_threads();
|
||||
uint32_t core_id = vx_core_id();
|
||||
|
||||
// check group size
|
||||
int threads_per_core = warps_per_core * threads_per_warp;
|
||||
uint32_t threads_per_core = warps_per_core * threads_per_warp;
|
||||
if (threads_per_core < group_size) {
|
||||
vx_printf("error: group_size > threads_per_core (%d)\n", threads_per_core);
|
||||
return;
|
||||
vx_printf("error: group_size > threads_per_core (%d,%d)\n", group_size, threads_per_core);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int warps_per_group = group_size / threads_per_warp;
|
||||
int remaining_threads = group_size - warps_per_group * threads_per_warp;
|
||||
int remaining_mask = -1;
|
||||
if (remaining_threads != 0) {
|
||||
remaining_mask = (1 << remaining_threads) - 1;
|
||||
warps_per_group++;
|
||||
if (group_size > 1) {
|
||||
// calculate number of warps per group
|
||||
uint32_t warps_per_group = group_size / threads_per_warp;
|
||||
uint32_t remaining_threads = group_size - warps_per_group * threads_per_warp;
|
||||
uint32_t remaining_mask = -1;
|
||||
if (remaining_threads != 0) {
|
||||
remaining_mask = (1 << remaining_threads) - 1;
|
||||
++warps_per_group;
|
||||
}
|
||||
|
||||
// calculate necessary active cores
|
||||
uint32_t needed_warps = num_groups * warps_per_group;
|
||||
uint32_t needed_cores = (needed_warps + warps_per_core-1) / warps_per_core;
|
||||
uint32_t active_cores = MIN(needed_cores, num_cores);
|
||||
|
||||
// only active cores participate
|
||||
if (core_id >= active_cores)
|
||||
return 0;
|
||||
|
||||
// total number of groups per core
|
||||
uint32_t total_groups_per_core = num_groups / active_cores;
|
||||
uint32_t remaining_groups_per_core = num_groups - active_cores * total_groups_per_core;
|
||||
if (core_id < remaining_groups_per_core)
|
||||
++total_groups_per_core;
|
||||
|
||||
// calculate number of warps to activate
|
||||
uint32_t groups_per_core = warps_per_core / warps_per_group;
|
||||
uint32_t total_warps_per_core = total_groups_per_core * warps_per_group;
|
||||
uint32_t active_warps = total_warps_per_core;
|
||||
uint32_t warp_batches = 1, remaining_warps = 0;
|
||||
if (active_warps > warps_per_core) {
|
||||
active_warps = groups_per_core * warps_per_group;
|
||||
warp_batches = total_warps_per_core / active_warps;
|
||||
remaining_warps = total_warps_per_core - warp_batches * active_warps;
|
||||
}
|
||||
|
||||
// calculate offsets for group distribution
|
||||
uint32_t group_offset = core_id * total_groups_per_core + MIN(core_id, remaining_groups_per_core);
|
||||
|
||||
// set scheduler arguments
|
||||
wspawn_groups_args_t wspawn_args = {
|
||||
kernel_func,
|
||||
arg,
|
||||
group_offset,
|
||||
warp_batches,
|
||||
remaining_warps,
|
||||
warps_per_group,
|
||||
groups_per_core,
|
||||
remaining_mask
|
||||
};
|
||||
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
|
||||
|
||||
// set global variables
|
||||
__warps_per_group = warps_per_group;
|
||||
|
||||
// execute callback on other warps
|
||||
vx_wspawn(active_warps, process_thread_groups_stub);
|
||||
|
||||
// execute callback on warp0
|
||||
process_thread_groups_stub();
|
||||
} else {
|
||||
uint32_t num_tasks = num_groups;
|
||||
__warps_per_group = 0;
|
||||
|
||||
// calculate necessary active cores
|
||||
uint32_t needed_cores = (num_tasks + threads_per_core - 1) / threads_per_core;
|
||||
uint32_t active_cores = MIN(needed_cores, num_cores);
|
||||
|
||||
// only active cores participate
|
||||
if (core_id >= active_cores)
|
||||
return 0;
|
||||
|
||||
// number of tasks per core
|
||||
uint32_t tasks_per_core = num_tasks / active_cores;
|
||||
uint32_t remaining_tasks_per_core = num_tasks - tasks_per_core * active_cores;
|
||||
if (core_id < remaining_tasks_per_core)
|
||||
++tasks_per_core;
|
||||
|
||||
// calculate number of warps to activate
|
||||
uint32_t total_warps_per_core = tasks_per_core / threads_per_warp;
|
||||
uint32_t remaining_tasks = tasks_per_core - total_warps_per_core * threads_per_warp;
|
||||
uint32_t active_warps = total_warps_per_core;
|
||||
uint32_t warp_batches = 1, remaining_warps = 0;
|
||||
if (active_warps > warps_per_core) {
|
||||
active_warps = warps_per_core;
|
||||
warp_batches = total_warps_per_core / active_warps;
|
||||
remaining_warps = total_warps_per_core - warp_batches * active_warps;
|
||||
}
|
||||
|
||||
// calculate offsets for task distribution
|
||||
uint32_t all_tasks_offset = core_id * tasks_per_core + MIN(core_id, remaining_tasks_per_core);
|
||||
uint32_t remain_tasks_offset = all_tasks_offset + (tasks_per_core - remaining_tasks);
|
||||
|
||||
// prepare scheduler arguments
|
||||
wspawn_threads_args_t wspawn_args = {
|
||||
kernel_func,
|
||||
arg,
|
||||
all_tasks_offset,
|
||||
remain_tasks_offset,
|
||||
warp_batches,
|
||||
remaining_warps
|
||||
};
|
||||
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
|
||||
|
||||
if (active_warps >= 1) {
|
||||
// execute callback on other warps
|
||||
vx_wspawn(active_warps, process_threads_stub);
|
||||
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
// process threads
|
||||
process_threads();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
|
||||
if (remaining_tasks != 0) {
|
||||
// activate remaining threads
|
||||
uint32_t tmask = (1 << remaining_tasks) - 1;
|
||||
vx_tmc(tmask);
|
||||
|
||||
// process remaining threads
|
||||
process_remaining_threads();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
}
|
||||
|
||||
int needed_warps = num_groups * warps_per_group;
|
||||
int needed_cores = (needed_warps + warps_per_core-1) / warps_per_core;
|
||||
int active_cores = MIN(needed_cores, num_cores);
|
||||
|
||||
// only active cores participate
|
||||
if (core_id >= active_cores)
|
||||
return;
|
||||
|
||||
int total_groups_per_core = num_groups / active_cores;
|
||||
int remaining_groups_per_core = num_groups - active_cores * total_groups_per_core;
|
||||
if (core_id < remaining_groups_per_core)
|
||||
total_groups_per_core++;
|
||||
|
||||
// calculate number of warps to activate
|
||||
int groups_per_core = warps_per_core / warps_per_group;
|
||||
int total_warps_per_core = total_groups_per_core * warps_per_group;
|
||||
int active_warps = total_warps_per_core;
|
||||
int warp_batches = 1, remaining_warps = 0;
|
||||
if (active_warps > warps_per_core) {
|
||||
active_warps = groups_per_core * warps_per_group;
|
||||
warp_batches = total_warps_per_core / active_warps;
|
||||
remaining_warps = total_warps_per_core - warp_batches * active_warps;
|
||||
}
|
||||
|
||||
// calculate offsets for group distribution
|
||||
int group_offset = core_id * total_groups_per_core + MIN(core_id, remaining_groups_per_core);
|
||||
|
||||
// prepare scheduler arguments
|
||||
wspawn_task_groups_args_t wspawn_args = {
|
||||
callback,
|
||||
arg,
|
||||
group_offset,
|
||||
warp_batches,
|
||||
remaining_warps,
|
||||
warps_per_group,
|
||||
groups_per_core,
|
||||
remaining_mask
|
||||
};
|
||||
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
|
||||
|
||||
// execute callback on other warps
|
||||
vx_wspawn(active_warps, process_all_task_groups_stub);
|
||||
|
||||
// execute callback on warp0
|
||||
process_all_task_groups_stub();
|
||||
|
||||
// wait for spawned tasks to complete
|
||||
// wait for spawned warps to complete
|
||||
vx_wspawn(1, 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -13,8 +13,7 @@
|
|||
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
#define RISCV_CUSTOM0 0x0B
|
||||
#include "common.h"
|
||||
|
||||
.section .init, "ax"
|
||||
.global _start
|
||||
|
@ -70,9 +69,10 @@ _start:
|
|||
.type _Exit, @function
|
||||
.global _Exit
|
||||
_Exit:
|
||||
mv s0, a0
|
||||
call vx_perf_dump
|
||||
mv gp, s0
|
||||
call vx_perf_dump
|
||||
li t0, IO_MPM_EXITCODE
|
||||
sw a0, 0(t0)
|
||||
fence
|
||||
.insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0 # tmc x0
|
||||
|
||||
.section .text
|
||||
|
@ -86,27 +86,17 @@ init_regs:
|
|||
.option pop
|
||||
|
||||
# set stack pointer register
|
||||
#if (XLEN == 64)
|
||||
li t0, (STACK_BASE_ADDR >> 32)
|
||||
slli t0, t0, 32
|
||||
li sp, (STACK_BASE_ADDR & 0xffffffff)
|
||||
or sp, sp, t0
|
||||
#else
|
||||
li sp, STACK_BASE_ADDR
|
||||
#endif
|
||||
LOAD_IMMEDIATE64(sp, STACK_BASE_ADDR)
|
||||
csrr t0, VX_CSR_MHARTID
|
||||
sll t1, t0, STACK_LOG2_SIZE
|
||||
sub sp, sp, t1
|
||||
|
||||
# set thread pointer register
|
||||
# use address space after BSS region
|
||||
# ensure cache line alignment
|
||||
la t1, __tcb_aligned_size
|
||||
la t1, __tbss_size
|
||||
mul t0, t0, t1
|
||||
la tp, _end
|
||||
addi tp, tp, 63
|
||||
add tp, tp, t0
|
||||
and tp, tp, -64
|
||||
ret
|
||||
|
||||
.section .text
|
||||
|
|
95
miscs/patches/riscv-test-env.patch
Normal file
95
miscs/patches/riscv-test-env.patch
Normal file
|
@ -0,0 +1,95 @@
|
|||
diff --git a/encoding.h b/encoding.h
|
||||
index 01889d1..362ebd8 100644
|
||||
--- a/encoding.h
|
||||
+++ b/encoding.h
|
||||
@@ -231,6 +231,13 @@
|
||||
#define MSECCFG_USEED 0x00000100
|
||||
#define MSECCFG_SSEED 0x00000200
|
||||
|
||||
+// Vortex defines
|
||||
+#define RISCV_CUSTOM0 0x0B
|
||||
+#define VX_IO_COUT_ADDR 0x40
|
||||
+#define VX_IO_COUT_SIZE 64
|
||||
+#define VX_IO_MPM_ADDR (VX_IO_COUT_ADDR + VX_IO_COUT_SIZE)
|
||||
+#define VX_IO_MPM_EXITCODE (VX_IO_MPM_ADDR + 8)
|
||||
+
|
||||
/* jvt fields */
|
||||
#define JVT_MODE 0x3F
|
||||
#define JVT_BASE (~0x3F)
|
||||
diff --git a/p/riscv_test.h b/p/riscv_test.h
|
||||
index 7bf35cf..6dc759f 100644
|
||||
--- a/p/riscv_test.h
|
||||
+++ b/p/riscv_test.h
|
||||
@@ -174,6 +174,13 @@
|
||||
_start: \
|
||||
/* reset vector */ \
|
||||
j reset_vector; \
|
||||
+ .globl exit; \
|
||||
+ .align 2; \
|
||||
+exit: \
|
||||
+ li t0, VX_IO_MPM_EXITCODE; \
|
||||
+ sw a0, 0(t0); \
|
||||
+ fence; \
|
||||
+ .insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0; \
|
||||
.align 2; \
|
||||
trap_vector: \
|
||||
/* test whether the test came from pass/fail */ \
|
||||
@@ -245,21 +252,14 @@ reset_vector: \
|
||||
//-----------------------------------------------------------------------
|
||||
|
||||
#define RVTEST_PASS \
|
||||
- fence; \
|
||||
- li TESTNUM, 1; \
|
||||
- li a7, 93; \
|
||||
li a0, 0; \
|
||||
- ecall
|
||||
+ call exit
|
||||
|
||||
#define TESTNUM gp
|
||||
#define RVTEST_FAIL \
|
||||
- fence; \
|
||||
-1: beqz TESTNUM, 1b; \
|
||||
- sll TESTNUM, TESTNUM, 1; \
|
||||
- or TESTNUM, TESTNUM, 1; \
|
||||
- li a7, 93; \
|
||||
+ 1: beqz TESTNUM, 1b; \
|
||||
addi a0, TESTNUM, 0; \
|
||||
- ecall
|
||||
+ call exit
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Data Section Macro
|
||||
diff --git a/v/entry.S b/v/entry.S
|
||||
index 13d46a3..39a6c34 100644
|
||||
--- a/v/entry.S
|
||||
+++ b/v/entry.S
|
||||
@@ -17,8 +17,14 @@
|
||||
.align 2
|
||||
_start:
|
||||
j handle_reset
|
||||
-
|
||||
- /* NMI vector */
|
||||
+ .globl exit
|
||||
+ .align 2
|
||||
+exit:
|
||||
+ li t0, VX_IO_MPM_EXITCODE
|
||||
+ sw a0, 0(t0)
|
||||
+ fence
|
||||
+ .insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0
|
||||
+/* NMI vector */
|
||||
.align 2
|
||||
nmi_vector:
|
||||
j wtf
|
||||
diff --git a/v/riscv_test.h b/v/riscv_test.h
|
||||
index f56c022..7341d38 100644
|
||||
--- a/v/riscv_test.h
|
||||
+++ b/v/riscv_test.h
|
||||
@@ -46,7 +46,7 @@ userstart: \
|
||||
#define RVTEST_PASS li a0, 1; scall
|
||||
|
||||
#undef RVTEST_FAIL
|
||||
-#define RVTEST_FAIL sll a0, TESTNUM, 1; 1:beqz a0, 1b; or a0, a0, 1; scall;
|
||||
+#define RVTEST_FAIL 1:beqz TESTNUM, 1b; addi a0, TESTNUM, 0; call exit
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Data Section Macro
|
240
miscs/patches/riscv-tests.patch
Normal file
240
miscs/patches/riscv-tests.patch
Normal file
|
@ -0,0 +1,240 @@
|
|||
diff --git a/benchmarks/Makefile b/benchmarks/Makefile
|
||||
index fde4f23..317fd77 100644
|
||||
--- a/benchmarks/Makefile
|
||||
+++ b/benchmarks/Makefile
|
||||
@@ -22,7 +22,6 @@ bmarks = \
|
||||
qsort \
|
||||
rsort \
|
||||
towers \
|
||||
- vvadd \
|
||||
memcpy \
|
||||
multiply \
|
||||
mm \
|
||||
@@ -31,11 +30,7 @@ bmarks = \
|
||||
mt-vvadd \
|
||||
mt-matmul \
|
||||
mt-memcpy \
|
||||
- pmp \
|
||||
- vec-memcpy \
|
||||
- vec-daxpy \
|
||||
- vec-sgemm \
|
||||
- vec-strcmp \
|
||||
+ pmp
|
||||
|
||||
#--------------------------------------------------------------------
|
||||
# Build rules
|
||||
@@ -43,9 +38,15 @@ bmarks = \
|
||||
|
||||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
|
||||
RISCV_GCC ?= $(RISCV_PREFIX)gcc
|
||||
-RISCV_GCC_OPTS ?= -DPREALLOCATE=1 -mcmodel=medany -static -std=gnu99 -O2 -ffast-math -fno-common -fno-builtin-printf -fno-tree-loop-distribute-patterns -march=rv$(XLEN)gcv -mabi=lp64d
|
||||
+ifeq ($(XLEN),64)
|
||||
+RISCV_GCC_OPTS ?= -DPREALLOCATE=1 -mcmodel=medany -static -std=gnu99 -O2 -ffast-math -fno-common -fno-builtin-printf -fno-tree-loop-distribute-patterns -march=rv64imafd -mabi=lp64d
|
||||
+else
|
||||
+RISCV_GCC_OPTS ?= -DPREALLOCATE=1 -mcmodel=medany -static -std=gnu99 -O2 -ffast-math -fno-common -fno-builtin-printf -fno-tree-loop-distribute-patterns -march=rv32imaf -mabi=ilp32f
|
||||
+endif
|
||||
RISCV_LINK ?= $(RISCV_GCC) -T $(src_dir)/common/test.ld $(incs)
|
||||
RISCV_LINK_OPTS ?= -static -nostdlib -nostartfiles -lm -lgcc -T $(src_dir)/common/test.ld
|
||||
+RISCV_HEX ?= $(RISCV_PREFIX)objcopy -O ihex
|
||||
+RISCV_BIN ?= $(RISCV_PREFIX)objcopy -O binary
|
||||
RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes --section=.text --section=.text.startup --section=.text.init --section=.data
|
||||
RISCV_SIM ?= spike --isa=rv$(XLEN)gcv
|
||||
|
||||
@@ -53,7 +54,7 @@ incs += -I$(src_dir)/../env -I$(src_dir)/common $(addprefix -I$(src_dir)/, $(bm
|
||||
objs :=
|
||||
|
||||
define compile_template
|
||||
-$(1).riscv: $(wildcard $(src_dir)/$(1)/*) $(wildcard $(src_dir)/common/*)
|
||||
+$(1).elf: $(wildcard $(src_dir)/$(1)/*) $(wildcard $(src_dir)/common/*)
|
||||
$$(RISCV_GCC) $$(incs) $$(RISCV_GCC_OPTS) -o $$@ $(wildcard $(src_dir)/$(1)/*.c) $(wildcard $(src_dir)/$(1)/*.S) $(wildcard $(src_dir)/common/*.c) $(wildcard $(src_dir)/common/*.S) $$(RISCV_LINK_OPTS)
|
||||
endef
|
||||
|
||||
@@ -62,20 +63,28 @@ $(foreach bmark,$(bmarks),$(eval $(call compile_template,$(bmark))))
|
||||
#------------------------------------------------------------
|
||||
# Build and run benchmarks on riscv simulator
|
||||
|
||||
-bmarks_riscv_bin = $(addsuffix .riscv, $(bmarks))
|
||||
-bmarks_riscv_dump = $(addsuffix .riscv.dump, $(bmarks))
|
||||
-bmarks_riscv_out = $(addsuffix .riscv.out, $(bmarks))
|
||||
+bmarks_riscv_elf = $(addsuffix .elf, $(bmarks))
|
||||
+bmarks_riscv_bin = $(addsuffix .bin, $(bmarks))
|
||||
+bmarks_riscv_hex = $(addsuffix .hex, $(bmarks))
|
||||
+bmarks_riscv_dump = $(addsuffix .dump, $(bmarks))
|
||||
+bmarks_riscv_out = $(addsuffix .out, $(bmarks))
|
||||
|
||||
-$(bmarks_riscv_dump): %.riscv.dump: %.riscv
|
||||
+$(bmarks_riscv_hex): %.hex: %.elf
|
||||
+ $(RISCV_HEX) $< $@
|
||||
+
|
||||
+$(bmarks_riscv_bin): %.bin: %.elf
|
||||
+ $(RISCV_BIN) $< $@
|
||||
+
|
||||
+$(bmarks_riscv_dump): %.dump: %.elf
|
||||
$(RISCV_OBJDUMP) $< > $@
|
||||
|
||||
-$(bmarks_riscv_out): %.riscv.out: %.riscv
|
||||
+$(bmarks_riscv_out): %.out: %.elf
|
||||
$(RISCV_SIM) $< > $@
|
||||
|
||||
-riscv: $(bmarks_riscv_dump)
|
||||
+riscv: $(bmarks_riscv_bin) $(bmarks_riscv_hex) $(bmarks_riscv_dump)
|
||||
run: $(bmarks_riscv_out)
|
||||
|
||||
-junk += $(bmarks_riscv_bin) $(bmarks_riscv_dump) $(bmarks_riscv_hex) $(bmarks_riscv_out)
|
||||
+junk += $(bmarks_riscv_elf) $(bmarks_riscv_dump) $(bmarks_riscv_hex) $(bmarks_riscv_bin) $(bmarks_riscv_out)
|
||||
|
||||
#------------------------------------------------------------
|
||||
# Default
|
||||
@@ -91,7 +100,7 @@ latest_install = $(shell ls -1 -d $(instbasedir)/$(instname)* | tail -n 1)
|
||||
|
||||
install:
|
||||
mkdir $(install_dir)
|
||||
- cp -r $(bmarks_riscv_bin) $(bmarks_riscv_dump) $(install_dir)
|
||||
+ cp -r $(bmarks_riscv_elf) $(bmarks_riscv_dump) $(bmarks_riscv_hex) $(bmarks_riscv_bin) $(install_dir)
|
||||
|
||||
install-link:
|
||||
rm -rf $(instbasedir)/$(instname)
|
||||
diff --git a/benchmarks/common/crt.S b/benchmarks/common/crt.S
|
||||
index 3f5bb2c..811412c 100644
|
||||
--- a/benchmarks/common/crt.S
|
||||
+++ b/benchmarks/common/crt.S
|
||||
@@ -134,7 +134,22 @@ _start:
|
||||
add tp, tp, a2
|
||||
|
||||
j _init
|
||||
-
|
||||
+ .align 2
|
||||
+ .globl exit
|
||||
+exit:
|
||||
+ li t0, VX_IO_MPM_EXITCODE
|
||||
+ sw a0, 0(t0)
|
||||
+ fence
|
||||
+ .insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0
|
||||
+ .align 2
|
||||
+ .globl putchar
|
||||
+putchar:
|
||||
+ li t1, VX_IO_COUT_ADDR
|
||||
+ csrr t0, CSR_MHARTID
|
||||
+ andi t0, t0, %lo(VX_IO_COUT_SIZE-1)
|
||||
+ add t0, t0, t1
|
||||
+ sb a0, 0(t0)
|
||||
+ ret
|
||||
.align 2
|
||||
trap_entry:
|
||||
addi sp, sp, -272
|
||||
diff --git a/benchmarks/common/syscalls.c b/benchmarks/common/syscalls.c
|
||||
index 7a7b7fd..1195fb4 100644
|
||||
--- a/benchmarks/common/syscalls.c
|
||||
+++ b/benchmarks/common/syscalls.c
|
||||
@@ -64,10 +64,10 @@ uintptr_t __attribute__((weak)) handle_trap(uintptr_t cause, uintptr_t epc, uint
|
||||
tohost_exit(1337);
|
||||
}
|
||||
|
||||
-void exit(int code)
|
||||
+/*void exit(int code)
|
||||
{
|
||||
tohost_exit(code);
|
||||
-}
|
||||
+}*/
|
||||
|
||||
void abort()
|
||||
{
|
||||
@@ -76,7 +76,7 @@ void abort()
|
||||
|
||||
void printstr(const char* s)
|
||||
{
|
||||
- syscall(SYS_write, 1, (uintptr_t)s, strlen(s));
|
||||
+ printf(s);
|
||||
}
|
||||
|
||||
void __attribute__((weak)) thread_entry(int cid, int nc)
|
||||
@@ -122,7 +122,7 @@ void _init(int cid, int nc)
|
||||
exit(ret);
|
||||
}
|
||||
|
||||
-#undef putchar
|
||||
+/*#undef putchar
|
||||
int putchar(int ch)
|
||||
{
|
||||
static __thread char buf[64] __attribute__((aligned(64)));
|
||||
@@ -137,7 +137,7 @@ int putchar(int ch)
|
||||
}
|
||||
|
||||
return 0;
|
||||
-}
|
||||
+}*/
|
||||
|
||||
void printhex(uint64_t x)
|
||||
{
|
||||
@@ -226,7 +226,7 @@ static void vprintfmt(void (*putch)(int, void**), void **putdat, const char *fmt
|
||||
case '-':
|
||||
padc = '-';
|
||||
goto reswitch;
|
||||
-
|
||||
+
|
||||
// flag to pad with 0's instead of spaces
|
||||
case '0':
|
||||
padc = '0';
|
||||
@@ -335,7 +335,7 @@ static void vprintfmt(void (*putch)(int, void**), void **putdat, const char *fmt
|
||||
case '%':
|
||||
putch(ch, putdat);
|
||||
break;
|
||||
-
|
||||
+
|
||||
// unrecognized escape sequence - just print it literally
|
||||
default:
|
||||
putch('%', putdat);
|
||||
diff --git a/env b/env
|
||||
index 4fabfb4..1c577dc 160000
|
||||
--- a/env
|
||||
+++ b/env
|
||||
@@ -1 +1 @@
|
||||
-Subproject commit 4fabfb4e0d3eacc1dc791da70e342e4b68ea7e46
|
||||
+Subproject commit 1c577dc7c7d6aee27b8d5cb0e2e87c8473e3ad12-dirty
|
||||
diff --git a/isa/Makefile b/isa/Makefile
|
||||
index bf85e1f..3ba32bd 100644
|
||||
--- a/isa/Makefile
|
||||
+++ b/isa/Makefile
|
||||
@@ -47,6 +47,7 @@ RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
|
||||
RISCV_GCC ?= $(RISCV_PREFIX)gcc
|
||||
RISCV_GCC_OPTS ?= -static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles
|
||||
RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes --section=.text --section=.text.startup --section=.text.init --section=.data
|
||||
+RISCV_BIN ?= $(RISCV_PREFIX)objcopy -O binary
|
||||
RISCV_SIM ?= spike
|
||||
|
||||
vpath %.S $(src_dir)
|
||||
@@ -57,6 +58,12 @@ vpath %.S $(src_dir)
|
||||
%.dump: %
|
||||
$(RISCV_OBJDUMP) $< > $@
|
||||
|
||||
+%.hex: %
|
||||
+ $(RISCV_HEX) $< $@
|
||||
+
|
||||
+%.bin: %
|
||||
+ $(RISCV_BIN) $< $@
|
||||
+
|
||||
%.out: %
|
||||
$(RISCV_SIM) --isa=rv64gc_zfh_zicboz_svnapot_zicntr_zba_zbb_zbc_zbs --misaligned $< 2> $@
|
||||
|
||||
@@ -119,18 +126,19 @@ $(eval $(call compile_template,rv64mi,-march=rv64g -mabi=lp64))
|
||||
endif
|
||||
|
||||
tests_dump = $(addsuffix .dump, $(tests))
|
||||
+tests_bin = $(addsuffix .bin, $(tests))
|
||||
tests_hex = $(addsuffix .hex, $(tests))
|
||||
tests_out = $(addsuffix .out, $(filter rv64%,$(tests)))
|
||||
tests32_out = $(addsuffix .out32, $(filter rv32%,$(tests)))
|
||||
|
||||
run: $(tests_out) $(tests32_out)
|
||||
|
||||
-junk += $(tests) $(tests_dump) $(tests_hex) $(tests_out) $(tests32_out)
|
||||
+junk += $(tests) $(tests_dump) $(tests_hex) $(tests_bin) $(tests_out) $(tests32_out)
|
||||
|
||||
#------------------------------------------------------------
|
||||
# Default
|
||||
|
||||
-all: $(tests_dump)
|
||||
+all: $(tests_dump) $(tests_bin)
|
||||
|
||||
#------------------------------------------------------------
|
||||
# Clean up
|
|
@ -27,7 +27,7 @@ set(CMAKE_CXX_FLAGS "-v --gcc-toolchain=${TOOLDIR}/riscv-gnu-toolchain -march=rv
|
|||
set(CMAKE_SYSROOT "${TOOLDIR}/riscv32-gnu-toolchain/riscv32-unknown-elf")
|
||||
|
||||
# Linker flags
|
||||
set(CMAKE_EXE_LINKER_FLAGS "-fuse-ld=lld -nostartfiles -Wl,-Bstatic,--gc-sections,-T,${VORTEX_HOME}/kernel/scripts/link32.ld ${VORTEX_BUILD}/kernel/libvortexrt.a")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "-fuse-ld=lld -nostartfiles -Wl,-Bstatic,--gc-sections,-T,${VORTEX_HOME}/kernel/scripts/link32.ld ${VORTEX_BUILD}/kernel/libvortex.a")
|
||||
|
||||
# Don't run the linker on compiler check
|
||||
set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
|
||||
|
|
|
@ -19,10 +19,10 @@ xrt:
|
|||
$(MAKE) -C xrt
|
||||
|
||||
clean:
|
||||
$(MAKE) clean -C stub
|
||||
$(MAKE) clean -C simx
|
||||
$(MAKE) clean -C rtlsim
|
||||
$(MAKE) clean -C opae
|
||||
$(MAKE) clean -C xrt
|
||||
$(MAKE) -C stub clean
|
||||
$(MAKE) -C simx clean
|
||||
$(MAKE) -C rtlsim clean
|
||||
$(MAKE) -C opae clean
|
||||
$(MAKE) -C xrt clean
|
||||
|
||||
.PHONY: all stub simx rtlsim opae xrt clean
|
80
runtime/common/callbacks.h
Normal file
80
runtime/common/callbacks.h
Normal file
|
@ -0,0 +1,80 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef CALLBACKS_H
|
||||
#define CALLBACKS_H
|
||||
|
||||
#include <vortex.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Table of driver entry points. vx_dev_init() receives a pointer to this
// table; every entry returns an int.
typedef struct {
|
||||
// Open the device and connect to it; the new handle is returned through hdevice.
|
||||
int (*dev_open) (vx_device_h* hdevice);
|
||||
|
||||
// Close the device once all operations are done.
|
||||
int (*dev_close) (vx_device_h hdevice);
|
||||
|
||||
// Query the device configuration item selected by caps_id; the result is
// returned through value.
|
||||
int (*dev_caps) (vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
|
||||
|
||||
// Allocate device memory of the given size; the buffer handle is returned
// through hbuffer.
|
||||
int (*mem_alloc) (vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer);
|
||||
|
||||
// Reserve the memory address range starting at address and spanning size;
// the buffer handle is returned through hbuffer.
|
||||
int (*mem_reserve) (vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer);
|
||||
|
||||
// Release device memory.
|
||||
int (*mem_free) (vx_buffer_h hbuffer);
|
||||
|
||||
// Set device memory access rights (flags) on the [offset, offset + size)
// part of a buffer.
|
||||
int (*mem_access) (vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags);
|
||||
|
||||
// Return the device memory address of a buffer through address.
|
||||
int (*mem_address) (vx_buffer_h hbuffer, uint64_t* address);
|
||||
|
||||
// Get device memory info: free and used amounts.
|
||||
int (*mem_info) (vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used);
|
||||
|
||||
// Copy size bytes from host memory to device memory at dst_offset within the buffer.
|
||||
int (*copy_to_dev) (vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size);
|
||||
|
||||
// Copy size bytes from device memory at src_offset within the buffer to host memory.
|
||||
int (*copy_from_dev) (void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size);
|
||||
|
||||
// Start device execution with the given kernel and arguments buffers.
|
||||
int (*start) (vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments);
|
||||
|
||||
// Wait for the device to become ready; timeout is in milliseconds.
|
||||
int (*ready_wait) (vx_device_h hdevice, uint64_t timeout);
|
||||
|
||||
// Read a device configuration register; the result is returned through value.
|
||||
int (*dcr_read) (vx_device_h hdevice, uint32_t addr, uint32_t* value);
|
||||
|
||||
// Write a device configuration register.
|
||||
int (*dcr_write) (vx_device_h hdevice, uint32_t addr, uint32_t value);
|
||||
|
||||
// Query a device performance counter for the given core; the result is
// returned through value.
|
||||
int (*mpm_query) (vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value);
|
||||
|
||||
} callbacks_t;
|
||||
|
||||
int vx_dev_init(callbacks_t* callbacks);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
225
runtime/common/callbacks.inc
Normal file
225
runtime/common/callbacks.inc
Normal file
|
@ -0,0 +1,225 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Bookkeeping record behind a vx_buffer_h handle.
// NOTE: member order matters; instances are brace-initialized positionally as
// {device, addr, size}.
struct vx_buffer {
|
||||
// device the buffer was allocated or reserved on
vx_device* device;
|
||||
// device address of the start of the buffer
uint64_t addr;
|
||||
// size of the buffer as requested at allocation/reservation time
uint64_t size;
|
||||
};
|
||||
|
||||
// Fills 'callbacks' with this backend's driver entry points.
//
// Every entry is a capture-less lambda that validates its handle and pointer
// arguments and then forwards to the corresponding vx_device method. Entries
// return -1 on invalid arguments (mem_free treats a null buffer as a no-op and
// returns 0); otherwise they return 0 or the error code produced by the
// device call.
//
// Returns -1 when 'callbacks' is null, 0 once the table has been populated.
extern int vx_dev_init(callbacks_t* callbacks) {
  if (nullptr == callbacks)
    return -1;

  // Create and initialize a device object; the handle is returned via hdevice.
  callbacks->dev_open = [](vx_device_h* hdevice) -> int {
    if (nullptr == hdevice)
      return -1;
    auto device = new vx_device();
    if (device == nullptr)
      return -1;
    CHECK_ERR(device->init(), {
      delete device;
      return err;
    });
    DBGPRINT("DEV_OPEN: hdevice=%p\n", (void*)device);
    *hdevice = device;
    return 0;
  };

  // Destroy the device object behind the handle.
  callbacks->dev_close = [](vx_device_h hdevice) -> int {
    if (nullptr == hdevice)
      return -1;
    DBGPRINT("DEV_CLOSE: hdevice=%p\n", hdevice);
    auto device = ((vx_device*)hdevice);
    delete device;
    return 0;
  };

  // Query a device capability; '*value' is only written on success.
  callbacks->dev_caps = [](vx_device_h hdevice, uint32_t caps_id, uint64_t* value) -> int {
    if (nullptr == hdevice || nullptr == value)
      return -1;
    auto device = ((vx_device*)hdevice);
    uint64_t _value;
    CHECK_ERR(device->get_caps(caps_id, &_value), {
      return err;
    });
    DBGPRINT("DEV_CAPS: hdevice=%p, caps_id=%d, value=%ld\n", hdevice, caps_id, _value);
    *value = _value;
    return 0;
  };

  // Allocate device memory and wrap it in a new vx_buffer record.
  callbacks->mem_alloc = [](vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) -> int {
    if (nullptr == hdevice || nullptr == hbuffer || 0 == size)
      return -1;
    auto device = ((vx_device*)hdevice);
    uint64_t dev_addr;
    CHECK_ERR(device->mem_alloc(size, flags, &dev_addr), {
      return err;
    });
    auto buffer = new vx_buffer{device, dev_addr, size};
    if (nullptr == buffer) {
      // do not leak the device allocation when the record cannot be created
      device->mem_free(dev_addr);
      return -1;
    }
    DBGPRINT("MEM_ALLOC: hdevice=%p, size=%ld, flags=0x%x, hbuffer=%p\n", hdevice, size, (unsigned)flags, (void*)buffer);
    *hbuffer = buffer;
    return 0;
  };

  // Reserve a caller-chosen address range and wrap it in a new vx_buffer record.
  callbacks->mem_reserve = [](vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) -> int {
    if (nullptr == hdevice || nullptr == hbuffer || 0 == size)
      return -1;
    auto device = ((vx_device*)hdevice);
    CHECK_ERR(device->mem_reserve(address, size, flags), {
      return err;
    });
    auto buffer = new vx_buffer{device, address, size};
    if (nullptr == buffer) {
      // undo the reservation when the record cannot be created
      device->mem_free(address);
      return -1;
    }
    DBGPRINT("MEM_RESERVE: hdevice=%p, address=0x%lx, size=%ld, flags=0x%x, hbuffer=%p\n", hdevice, address, size, (unsigned)flags, (void*)buffer);
    *hbuffer = buffer;
    return 0;
  };

  // Release a buffer; a null handle is accepted and reported as success.
  callbacks->mem_free = [](vx_buffer_h hbuffer) -> int {
    if (nullptr == hbuffer)
      return 0;
    DBGPRINT("MEM_FREE: hbuffer=%p\n", hbuffer);
    auto buffer = ((vx_buffer*)hbuffer);
    auto device = ((vx_device*)buffer->device);
    // reset the range's access flags to 0 before freeing; the result is
    // deliberately ignored (presumably this clears access rights - TODO confirm)
    device->mem_access(buffer->addr, buffer->size, 0);
    int err = device->mem_free(buffer->addr);
    delete buffer;
    return err;
  };

  // Set access flags on [offset, offset + size) of a buffer.
  callbacks->mem_access = [](vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) -> int {
    if (nullptr == hbuffer)
      return -1;
    auto buffer = ((vx_buffer*)hbuffer);
    auto device = ((vx_device*)buffer->device);
    // bounds check written so that 'offset + size' cannot wrap around
    if (offset > buffer->size || size > (buffer->size - offset))
      return -1;
    DBGPRINT("MEM_ACCESS: hbuffer=%p, offset=%ld, size=%ld, flags=%d\n", hbuffer, offset, size, flags);
    return device->mem_access(buffer->addr + offset, size, flags);
  };

  // Report the device address of a buffer.
  callbacks->mem_address = [](vx_buffer_h hbuffer, uint64_t* address) -> int {
    if (nullptr == hbuffer || nullptr == address)
      return -1;
    auto buffer = ((vx_buffer*)hbuffer);
    DBGPRINT("MEM_ADDRESS: hbuffer=%p, address=0x%lx\n", hbuffer, buffer->addr);
    *address = buffer->addr;
    return 0;
  };

  // Report free/used device memory; either output pointer may be null.
  callbacks->mem_info = [](vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) -> int {
    if (nullptr == hdevice)
      return -1;
    auto device = ((vx_device*)hdevice);
    uint64_t _mem_free, _mem_used;
    CHECK_ERR(device->mem_info(&_mem_free, &_mem_used), {
      return err;
    });
    DBGPRINT("MEM_INFO: hdevice=%p, mem_free=%ld, mem_used=%ld\n", hdevice, _mem_free, _mem_used);
    if (mem_free) {
      *mem_free = _mem_free;
    }
    if (mem_used) {
      *mem_used = _mem_used;
    }
    return 0;
  };

  // Upload 'size' bytes from host memory into the buffer at 'dst_offset'.
  callbacks->copy_to_dev = [](vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) -> int {
    if (nullptr == hbuffer || nullptr == host_ptr)
      return -1;
    auto buffer = ((vx_buffer*)hbuffer);
    auto device = ((vx_device*)buffer->device);
    // bounds check written so that 'dst_offset + size' cannot wrap around
    if (dst_offset > buffer->size || size > (buffer->size - dst_offset))
      return -1;
    DBGPRINT("COPY_TO_DEV: hbuffer=%p, host_addr=%p, dst_offset=%ld, size=%ld\n", hbuffer, host_ptr, dst_offset, size);
    return device->upload(buffer->addr + dst_offset, host_ptr, size);
  };

  // Download 'size' bytes from the buffer at 'src_offset' into host memory.
  callbacks->copy_from_dev = [](void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) -> int {
    if (nullptr == hbuffer || nullptr == host_ptr)
      return -1;
    auto buffer = ((vx_buffer*)hbuffer);
    auto device = ((vx_device*)buffer->device);
    // bounds check written so that 'src_offset + size' cannot wrap around
    if (src_offset > buffer->size || size > (buffer->size - src_offset))
      return -1;
    DBGPRINT("COPY_FROM_DEV: hbuffer=%p, host_addr=%p, src_offset=%ld, size=%ld\n", hbuffer, host_ptr, src_offset, size);
    return device->download(host_ptr, buffer->addr + src_offset, size);
  };

  // Start execution using the addresses of the kernel and arguments buffers.
  callbacks->start = [](vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) -> int {
    if (nullptr == hdevice || nullptr == hkernel || nullptr == harguments)
      return -1;
    DBGPRINT("START: hdevice=%p, hkernel=%p, harguments=%p\n", hdevice, hkernel, harguments);
    auto device = ((vx_device*)hdevice);
    auto kernel = ((vx_buffer*)hkernel);
    auto arguments = ((vx_buffer*)harguments);
    return device->start(kernel->addr, arguments->addr);
  };

  // Wait for the device to become ready, forwarding the caller's timeout.
  callbacks->ready_wait = [](vx_device_h hdevice, uint64_t timeout) -> int {
    if (nullptr == hdevice)
      return -1;
    DBGPRINT("READY_WAIT: hdevice=%p, timeout=%ld\n", hdevice, timeout);
    auto device = ((vx_device*)hdevice);
    return device->ready_wait(timeout);
  };

  // Read a configuration register; '*value' is only written on success.
  callbacks->dcr_read = [](vx_device_h hdevice, uint32_t addr, uint32_t* value) -> int {
    if (nullptr == hdevice || nullptr == value)
      return -1;
    auto device = ((vx_device*)hdevice);
    uint32_t _value;
    CHECK_ERR(device->dcr_read(addr, &_value), {
      return err;
    });
    DBGPRINT("DCR_READ: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, _value);
    *value = _value;
    return 0;
  };

  // Write a configuration register.
  callbacks->dcr_write = [](vx_device_h hdevice, uint32_t addr, uint32_t value) -> int {
    if (nullptr == hdevice)
      return -1;
    DBGPRINT("DCR_WRITE: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, value);
    auto device = ((vx_device*)hdevice);
    return device->dcr_write(addr, value);
  };

  // Query a performance counter; '*value' is only written on success.
  callbacks->mpm_query = [](vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) -> int {
    if (nullptr == hdevice || nullptr == value)
      return -1;
    auto device = ((vx_device*)hdevice);
    uint64_t _value;
    CHECK_ERR(device->mpm_query(addr, core_id, &_value), {
      return err;
    });
    DBGPRINT("MPM_QUERY: hdevice=%p, addr=0x%x, core_id=%d, value=0x%lx\n", hdevice, addr, core_id, _value);
    *value = _value;
    return 0;
  };

  return 0;
}
|
76
runtime/common/common.h
Normal file
76
runtime/common/common.h
Normal file
|
@ -0,0 +1,76 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vortex.h>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
#include <callbacks.h>
|
||||
#include <malloc.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <unordered_map>
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
#define ALLOC_BASE_ADDR USER_BASE_ADDR
|
||||
|
||||
#if (XLEN == 64)
|
||||
#define GLOBAL_MEM_SIZE 0x200000000 // 8 GB
|
||||
#else
|
||||
#define GLOBAL_MEM_SIZE 0x100000000 // 4 GB
|
||||
#endif
|
||||
|
||||
// Debug trace helper: when NDEBUG is not defined it printf()s the message
// prefixed with "[VXDRV] "; otherwise it expands to a no-op, so its arguments
// are not evaluated.
#ifndef NDEBUG
|
||||
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define DBGPRINT(format, ...) ((void)0)
|
||||
#endif
|
||||
|
||||
// Evaluates _expr once into a local named 'err'. If the result is 0 nothing
// else happens; otherwise the failing expression text and its value are
// printed and the _cleanup statements run. _cleanup can refer to 'err'
// (e.g. "return err;").
#define CHECK_ERR(_expr, _cleanup) \
|
||||
do { \
|
||||
auto err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
// In-memory store of 32-bit configuration values keyed by 32-bit address.
class DeviceConfig {
public:
  // Records 'value' for 'addr', replacing any value written earlier.
  void write(uint32_t addr, uint32_t value) {
    store_[addr] = value;
  }

  // Looks up the value last written for 'addr'.
  // Returns 0 and stores the value in '*value' on success. Returns -1, leaving
  // '*value' untouched, when 'addr' was never written or 'value' is null.
  int read(uint32_t addr, uint32_t* value) const {
    if (value == nullptr)
      return -1;
    const auto entry = store_.find(addr);
    if (entry == store_.end())
      return -1;
    *value = entry->second;
    return 0;
  }

private:
  std::unordered_map<uint32_t, uint32_t> store_;
};
|
||||
|
||||
// Rounds 'size' up to the nearest multiple of 'alignment'.
// 'alignment' must be a non-zero power of two (asserted in debug builds);
// a zero alignment would otherwise pass the power-of-two test and silently
// produce 0.
inline uint64_t aligned_size(uint64_t size, uint64_t alignment) {
  assert(alignment != 0 && 0 == (alignment & (alignment - 1)));
  const uint64_t mask = alignment - 1;
  return (size + mask) & ~mask;
}
|
||||
|
||||
// Returns true when 'addr' is a multiple of 'alignment'.
// 'alignment' must be a non-zero power of two (asserted in debug builds);
// a zero alignment would otherwise pass the power-of-two test and report
// every address as unaligned except 0.
inline bool is_aligned(uint64_t addr, uint64_t alignment) {
  assert(alignment != 0 && 0 == (alignment & (alignment - 1)));
  const uint64_t mask = alignment - 1;
  return (addr & mask) == 0;
}
|
|
@ -21,523 +21,522 @@ namespace vortex {
|
|||
|
||||
class MemoryAllocator {
|
||||
public:
|
||||
MemoryAllocator(
|
||||
uint64_t baseAddress,
|
||||
uint64_t capacity,
|
||||
uint32_t pageAlign,
|
||||
uint32_t blockAlign)
|
||||
: baseAddress_(baseAddress)
|
||||
, capacity_(capacity)
|
||||
, pageAlign_(pageAlign)
|
||||
, blockAlign_(blockAlign)
|
||||
, pages_(nullptr)
|
||||
, allocated_(0)
|
||||
{}
|
||||
MemoryAllocator(
|
||||
uint64_t baseAddress,
|
||||
uint64_t capacity,
|
||||
uint32_t pageAlign,
|
||||
uint32_t blockAlign)
|
||||
: baseAddress_(baseAddress)
|
||||
, capacity_(capacity)
|
||||
, pageAlign_(pageAlign)
|
||||
, blockAlign_(blockAlign)
|
||||
, pages_(nullptr)
|
||||
, allocated_(0)
|
||||
{}
|
||||
|
||||
~MemoryAllocator() {
|
||||
// Free allocated pages
|
||||
page_t* currPage = pages_;
|
||||
while (currPage) {
|
||||
auto nextPage = currPage->next;
|
||||
delete currPage;
|
||||
currPage = nextPage;
|
||||
}
|
||||
~MemoryAllocator() {
|
||||
// Free allocated pages
|
||||
page_t* currPage = pages_;
|
||||
while (currPage) {
|
||||
auto nextPage = currPage->next;
|
||||
delete currPage;
|
||||
currPage = nextPage;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t baseAddress() const {
|
||||
return baseAddress_;
|
||||
}
|
||||
|
||||
uint32_t capacity() const {
|
||||
return capacity_;
|
||||
}
|
||||
|
||||
uint64_t free() const {
|
||||
return (capacity_ - allocated_);
|
||||
}
|
||||
|
||||
uint64_t allocated() const {
|
||||
return allocated_;
|
||||
}
|
||||
|
||||
int reserve(uint64_t addr, uint64_t size) {
|
||||
if (size == 0) {
|
||||
printf("error: invalid arguments\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint32_t baseAddress() const {
|
||||
return baseAddress_;
|
||||
// Align allocation size
|
||||
size = alignSize(size, pageAlign_);
|
||||
|
||||
// Check if the reservation is within memory capacity bounds
|
||||
if (addr + size > capacity_) {
|
||||
printf("error: address range out of bounds\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint32_t capacity() const {
|
||||
return capacity_;
|
||||
// Ensure the reservation does not overlap with existing pages
|
||||
if (hasPageOverlap(addr, size)) {
|
||||
printf("error: address range overlaps with existing allocation\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint64_t free() const {
|
||||
return (capacity_ - allocated_);
|
||||
// allocate a new page for segment
|
||||
auto newPage = this->createPage(addr, size);
|
||||
|
||||
// allocate space on free block
|
||||
auto freeBlock = newPage->findFreeBlock(size);
|
||||
newPage->allocate(size, freeBlock);
|
||||
|
||||
// Update allocated size
|
||||
allocated_ += size;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int allocate(uint64_t size, uint64_t* addr) {
|
||||
if (size == 0 || addr == nullptr) {
|
||||
printf("error: invalid arguments\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint64_t allocated() const {
|
||||
return allocated_;
|
||||
// Align allocation size
|
||||
size = alignSize(size, blockAlign_);
|
||||
|
||||
// Walk thru all pages to find a free block
|
||||
block_t* freeBlock = nullptr;
|
||||
auto currPage = pages_;
|
||||
while (currPage) {
|
||||
freeBlock = currPage->findFreeBlock(size);
|
||||
if (freeBlock != nullptr)
|
||||
break;
|
||||
currPage = currPage->next;
|
||||
}
|
||||
|
||||
int reserve(uint64_t addr, uint64_t size) {
|
||||
if (size == 0) {
|
||||
printf("error: invalid arguments\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Align allocation size
|
||||
size = alignSize(size, pageAlign_);
|
||||
|
||||
// Check if the reservation is within memory capacity bounds
|
||||
if (addr < baseAddress_ || addr + size > baseAddress_ + capacity_) {
|
||||
printf("error: address range out of bounds\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Ensure the reservation does not overlap with existing pages
|
||||
if (hasPageOverlap(addr, size)) {
|
||||
printf("error: address range overlaps with existing allocation\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// allocate a new page for segment
|
||||
auto newPage = this->createPage(addr, size);
|
||||
|
||||
// allocate space on free block
|
||||
auto freeBlock = newPage->findFreeBlock(size);
|
||||
newPage->allocate(size, freeBlock);
|
||||
|
||||
// Update allocated size
|
||||
allocated_ += size;
|
||||
|
||||
return 0;
|
||||
// Allocate a new page if no free block is found
|
||||
if (freeBlock == nullptr) {
|
||||
auto pageSize = alignSize(size, pageAlign_);
|
||||
uint64_t pageAddr;
|
||||
if (!this->findNextAddress(pageSize, &pageAddr)) {
|
||||
printf("error: out of memory\n");
|
||||
return -1;
|
||||
}
|
||||
currPage = this->createPage(pageAddr, pageSize);
|
||||
if (nullptr == currPage) {
|
||||
printf("error: out of memory\n");
|
||||
return -1;
|
||||
}
|
||||
freeBlock = currPage->findFreeBlock(size);
|
||||
}
|
||||
|
||||
int allocate(uint64_t size, uint64_t* addr) {
|
||||
if (size == 0 || addr == nullptr) {
|
||||
printf("error: invalid arguments\n");
|
||||
return -1;
|
||||
}
|
||||
// allocate space on free block
|
||||
currPage->allocate(size, freeBlock);
|
||||
|
||||
// Align allocation size
|
||||
size = alignSize(size, blockAlign_);
|
||||
// Return the free block address
|
||||
*addr = freeBlock->addr;
|
||||
|
||||
// Walk thru all pages to find a free block
|
||||
block_t* freeBlock = nullptr;
|
||||
auto currPage = pages_;
|
||||
while (currPage) {
|
||||
freeBlock = currPage->findFreeBlock(size);
|
||||
if (freeBlock != nullptr)
|
||||
break;
|
||||
currPage = currPage->next;
|
||||
}
|
||||
// Update allocated size
|
||||
allocated_ += size;
|
||||
|
||||
// Allocate a new page if no free block is found
|
||||
if (freeBlock == nullptr) {
|
||||
auto pageSize = alignSize(size, pageAlign_);
|
||||
uint64_t pageAddr;
|
||||
if (!this->findNextAddress(pageSize, &pageAddr)) {
|
||||
printf("error: out of memory\n");
|
||||
return -1;
|
||||
}
|
||||
currPage = this->createPage(pageAddr, pageSize);
|
||||
if (nullptr == currPage) {
|
||||
printf("error: out of memory\n");
|
||||
return -1;
|
||||
}
|
||||
freeBlock = currPage->findFreeBlock(size);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// allocate space on free block
|
||||
currPage->allocate(size, freeBlock);
|
||||
|
||||
// Return the free block address
|
||||
*addr = freeBlock->addr;
|
||||
|
||||
// Update allocated size
|
||||
allocated_ += size;
|
||||
|
||||
return 0;
|
||||
int release(uint64_t addr) {
|
||||
// Walk all pages to find the pointer
|
||||
block_t* usedBlock = nullptr;
|
||||
auto currPage = pages_;
|
||||
while (currPage) {
|
||||
usedBlock = currPage->findUsedBlock(addr);
|
||||
if (usedBlock != nullptr)
|
||||
break;
|
||||
currPage = currPage->next;
|
||||
}
|
||||
|
||||
int release(uint64_t addr) {
|
||||
// Walk all pages to find the pointer
|
||||
block_t* usedBlock = nullptr;
|
||||
auto currPage = pages_;
|
||||
while (currPage) {
|
||||
usedBlock = currPage->findUsedBlock(addr);
|
||||
if (usedBlock != nullptr)
|
||||
break;
|
||||
currPage = currPage->next;
|
||||
}
|
||||
|
||||
// found the corresponding block?
|
||||
if (nullptr == usedBlock) {
|
||||
printf("warning: release address not found: 0x%lx\n", addr);
|
||||
return -1;
|
||||
}
|
||||
|
||||
auto size = usedBlock->size;
|
||||
|
||||
// release the used block
|
||||
currPage->release(usedBlock);
|
||||
|
||||
// Free the page if empty
|
||||
if (currPage->empty()) {
|
||||
this->deletePage(currPage);
|
||||
}
|
||||
|
||||
// update allocated size
|
||||
allocated_ -= size;
|
||||
|
||||
return 0;
|
||||
// found the corresponding block?
|
||||
if (nullptr == usedBlock) {
|
||||
printf("warning: release address not found: 0x%lx\n", addr);
|
||||
return -1;
|
||||
}
|
||||
|
||||
auto size = usedBlock->size;
|
||||
|
||||
// release the used block
|
||||
currPage->release(usedBlock);
|
||||
|
||||
// Free the page if empty
|
||||
if (currPage->empty()) {
|
||||
this->deletePage(currPage);
|
||||
}
|
||||
|
||||
// update allocated size
|
||||
allocated_ -= size;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
struct block_t {
|
||||
block_t* nextFreeS;
|
||||
block_t* prevFreeS;
|
||||
struct block_t {
|
||||
block_t* nextFreeS;
|
||||
block_t* prevFreeS;
|
||||
|
||||
block_t* nextFreeM;
|
||||
block_t* prevFreeM;
|
||||
block_t* nextFreeM;
|
||||
block_t* prevFreeM;
|
||||
|
||||
block_t* nextUsed;
|
||||
block_t* prevUsed;
|
||||
block_t* nextUsed;
|
||||
block_t* prevUsed;
|
||||
|
||||
uint64_t addr;
|
||||
uint64_t size;
|
||||
uint64_t addr;
|
||||
uint64_t size;
|
||||
|
||||
block_t(uint64_t addr, uint64_t size)
|
||||
: nextFreeS(nullptr)
|
||||
, prevFreeS(nullptr)
|
||||
, nextFreeM(nullptr)
|
||||
, prevFreeM(nullptr)
|
||||
, nextUsed(nullptr)
|
||||
, prevUsed(nullptr)
|
||||
, addr(addr)
|
||||
, size(size)
|
||||
{}
|
||||
};
|
||||
block_t(uint64_t addr, uint64_t size)
|
||||
: nextFreeS(nullptr)
|
||||
, prevFreeS(nullptr)
|
||||
, nextFreeM(nullptr)
|
||||
, prevFreeM(nullptr)
|
||||
, nextUsed(nullptr)
|
||||
, prevUsed(nullptr)
|
||||
, addr(addr)
|
||||
, size(size)
|
||||
{}
|
||||
};
|
||||
|
||||
struct page_t {
|
||||
page_t* next;
|
||||
uint64_t addr;
|
||||
uint64_t size;
|
||||
struct page_t {
|
||||
page_t* next;
|
||||
uint64_t addr;
|
||||
uint64_t size;
|
||||
|
||||
page_t(uint64_t addr, uint64_t size, uint32_t blockAlign) :
|
||||
next(nullptr),
|
||||
addr(addr),
|
||||
size(size),
|
||||
blockAlign_(blockAlign),
|
||||
usedList_(nullptr) {
|
||||
freeSList_ = freeMList_ = new block_t(addr, size);
|
||||
}
|
||||
|
||||
~page_t() {
  // A page is only expected to be destroyed once everything in it has been
  // released: no used blocks remain and the address-sorted free list has
  // collapsed back to exactly one block.
  assert(nullptr == usedList_);
  assert(freeMList_ != nullptr);
  assert(nullptr == freeMList_->nextFreeM);
  assert(nullptr == freeMList_->prevFreeM);
  delete freeMList_;
}
|
||||
|
||||
bool empty() const {
|
||||
return (usedList_ == nullptr);
|
||||
}
|
||||
|
||||
void allocate(uint64_t size, block_t* freeBlock) {
|
||||
// Remove the block from the free lists
|
||||
this->removeFreeMList(freeBlock);
|
||||
this->removeFreeSList(freeBlock);
|
||||
|
||||
// If the free block we have found is larger than what we are looking for,
|
||||
// we may be able to split our free block in two.
|
||||
uint64_t extraBytes = freeBlock->size - size;
|
||||
if (extraBytes >= blockAlign_) {
|
||||
// Reduce the free block size to the requested value
|
||||
freeBlock->size = size;
|
||||
|
||||
// Allocate a new block to contain the extra buffer
|
||||
auto nextAddr = freeBlock->addr + size;
|
||||
auto newBlock = new block_t(nextAddr, extraBytes);
|
||||
|
||||
// Add the new block to the free lists
|
||||
this->insertFreeMList(newBlock);
|
||||
this->insertFreeSList(newBlock);
|
||||
}
|
||||
|
||||
// Insert the free block into the used list
|
||||
this->insertUsedList(freeBlock);
|
||||
}
|
||||
|
||||
// Returns `usedBlock` to the free pool, coalescing it with the free blocks
// that sit directly before and/or after it in address order.
void release(block_t* usedBlock) {
  // Move the block from the used list to the address-sorted free list so
  // that its free neighbours become reachable through prevFreeM/nextFreeM.
  this->removeUsedList(usedBlock);
  this->insertFreeMList(usedBlock);

  // Left merge: the preceding free block ends exactly where this one starts.
  block_t* const leftBlock = usedBlock->prevFreeM;
  if (leftBlock != nullptr
   && (leftBlock->addr + leftBlock->size) == usedBlock->addr) {
    // Absorb the released block into its left neighbour and bypass it in
    // the M-list.
    leftBlock->size += usedBlock->size;
    leftBlock->nextFreeM = usedBlock->nextFreeM;
    if (leftBlock->nextFreeM != nullptr) {
      leftBlock->nextFreeM->prevFreeM = leftBlock;
    }

    // The neighbour grew, so its position in the size-sorted list is stale;
    // it is re-inserted at the end of this function.
    this->removeFreeSList(leftBlock);

    // Continue with the merged block.
    delete usedBlock;
    usedBlock = leftBlock;
  }

  // Right merge: the following free block starts exactly where this one ends.
  block_t* const rightBlock = usedBlock->nextFreeM;
  if (rightBlock != nullptr
   && rightBlock->addr == (usedBlock->addr + usedBlock->size)) {
    // Absorb the right neighbour and bypass it in the M-list.
    usedBlock->size += rightBlock->size;
    usedBlock->nextFreeM = rightBlock->nextFreeM;
    if (usedBlock->nextFreeM != nullptr) {
      usedBlock->nextFreeM->prevFreeM = usedBlock;
    }

    // The absorbed block no longer exists as a free block of its own.
    this->removeFreeSList(rightBlock);
    delete rightBlock;
  }

  // Finally place the resulting block at its size-sorted position.
  this->insertFreeSList(usedBlock);
}
|
||||
|
||||
// Returns the smallest free block of this page that can hold `size` bytes,
// or nullptr when no free block is large enough.
block_t* findFreeBlock(uint64_t size) {
  // The S-list is sorted by decreasing size, so if the head cannot satisfy
  // the request nothing behind it can either.
  block_t* candidate = freeSList_;
  if (candidate == nullptr || candidate->size < size) {
    return nullptr;
  }

  // Advance while the next (smaller or equal) block still fits; this stops
  // on the last block in the list that is large enough.
  while (candidate->nextFreeS != nullptr
      && candidate->nextFreeS->size >= size) {
    candidate = candidate->nextFreeS;
  }
  return candidate;
}
|
||||
|
||||
// Looks up the used block whose start address is exactly `addr`.
// Returns nullptr when `addr` lies outside this page or no used block
// starts at that address.
block_t* findUsedBlock(uint64_t addr) {
  // Cheap range rejection before scanning the used list.
  if (addr < this->addr || addr >= (this->addr + this->size)) {
    return nullptr;
  }
  for (block_t* block = usedList_; block != nullptr; block = block->nextUsed) {
    if (block->addr == addr) {
      return block;
    }
  }
  return nullptr;
}
|
||||
private:
|
||||
|
||||
// Pushes `block` onto the front of the used list.
void insertUsedList(block_t* block) {
  block_t* const oldHead = usedList_;
  block->nextUsed = oldHead;
  if (oldHead != nullptr) {
    oldHead->prevUsed = block;
  }
  usedList_ = block;
}
|
||||
|
||||
// Unlinks `block` from the used list and clears its used-list links.
void removeUsedList(block_t* block) {
  block_t* const before = block->prevUsed;
  block_t* const after = block->nextUsed;

  // Without a predecessor the block is the list head.
  if (before != nullptr) {
    before->nextUsed = after;
  } else {
    usedList_ = after;
  }
  if (after != nullptr) {
    after->prevUsed = before;
  }

  block->nextUsed = nullptr;
  block->prevUsed = nullptr;
}
|
||||
|
||||
// Inserts `block` into the free M-list, keeping the list sorted by
// increasing address (the block goes in front of the first entry whose
// address is not lower than its own).
void insertFreeMList(block_t* block) {
  block_t* before = nullptr;
  block_t* after = freeMList_;
  for (; after != nullptr && after->addr < block->addr; after = after->nextFreeM) {
    before = after;
  }

  block->nextFreeM = after;
  block->prevFreeM = before;

  // Without a predecessor the block becomes the new list head.
  if (before != nullptr) {
    before->nextFreeM = block;
  } else {
    freeMList_ = block;
  }
  if (after != nullptr) {
    after->prevFreeM = block;
  }
}
|
||||
|
||||
// Unlinks `block` from the address-sorted free list and clears its M-links.
void removeFreeMList(block_t* block) {
  block_t* const before = block->prevFreeM;
  block_t* const after = block->nextFreeM;

  // Without a predecessor the block is the list head.
  if (before != nullptr) {
    before->nextFreeM = after;
  } else {
    freeMList_ = after;
  }
  if (after != nullptr) {
    after->prevFreeM = before;
  }

  block->nextFreeM = nullptr;
  block->prevFreeM = nullptr;
}
|
||||
|
||||
// Inserts `block` into the free S-list, keeping the list sorted by
// decreasing size (the block goes in front of the first entry that is not
// strictly larger than it).
void insertFreeSList(block_t* block) {
  block_t* before = nullptr;
  block_t* after = freeSList_;
  for (; after != nullptr && after->size > block->size; after = after->nextFreeS) {
    before = after;
  }

  block->nextFreeS = after;
  block->prevFreeS = before;

  // Without a predecessor the block becomes the new list head.
  if (before != nullptr) {
    before->nextFreeS = block;
  } else {
    freeSList_ = block;
  }
  if (after != nullptr) {
    after->prevFreeS = block;
  }
}
|
||||
|
||||
// Unlinks `block` from the size-sorted free list and clears its S-links.
void removeFreeSList(block_t* block) {
  block_t* const before = block->prevFreeS;
  block_t* const after = block->nextFreeS;

  // Without a predecessor the block is the list head.
  if (before != nullptr) {
    before->nextFreeS = after;
  } else {
    freeSList_ = after;
  }
  if (after != nullptr) {
    after->prevFreeS = before;
  }

  block->nextFreeS = nullptr;
  block->prevFreeS = nullptr;
}
|
||||
|
||||
// block alignment
|
||||
uint32_t blockAlign_;
|
||||
|
||||
// List of used blocks
|
||||
block_t* usedList_;
|
||||
|
||||
// List with blocks sorted by decreasing sizes
|
||||
// Used for block lookup during memory allocation.
|
||||
block_t* freeSList_;
|
||||
|
||||
// List with blocks sorted by increasing memory addresses
|
||||
// Used for block merging during memory release.
|
||||
block_t* freeMList_;
|
||||
};
|
||||
|
||||
page_t* createPage(uint64_t addr, uint64_t size) {
|
||||
// Allocate object
|
||||
auto newPage = new page_t(addr, size, blockAlign_);
|
||||
|
||||
// Insert the new page into the list in address sorted order
|
||||
if (pages_ == nullptr || pages_->addr > newPage->addr) {
|
||||
newPage->next = pages_;
|
||||
pages_ = newPage;
|
||||
} else {
|
||||
page_t* current = pages_;
|
||||
while (current->next != nullptr && current->next->addr < newPage->addr) {
|
||||
current = current->next;
|
||||
}
|
||||
newPage->next = current->next;
|
||||
current->next = newPage;
|
||||
}
|
||||
|
||||
return newPage;
|
||||
page_t(uint64_t addr, uint64_t size, uint32_t blockAlign) :
|
||||
next(nullptr),
|
||||
addr(addr),
|
||||
size(size),
|
||||
blockAlign_(blockAlign),
|
||||
usedList_(nullptr) {
|
||||
freeSList_ = freeMList_ = new block_t(addr, size);
|
||||
}
|
||||
|
||||
void deletePage(page_t* page) {
|
||||
// Remove the page from the list
|
||||
page_t* prevPage = nullptr;
|
||||
auto currPage = pages_;
|
||||
while (currPage) {
|
||||
if (currPage == page) {
|
||||
if (prevPage) {
|
||||
prevPage->next = currPage->next;
|
||||
} else {
|
||||
pages_ = currPage->next;
|
||||
}
|
||||
break;
|
||||
}
|
||||
prevPage = currPage;
|
||||
currPage = currPage->next;
|
||||
}
|
||||
// Delete the page
|
||||
delete page;
|
||||
~page_t() {
|
||||
// The page should be empty
|
||||
assert(nullptr == usedList_);
|
||||
assert(freeMList_
|
||||
&& (nullptr == freeMList_->nextFreeM)
|
||||
&& (nullptr == freeMList_->prevFreeM));
|
||||
delete freeMList_;
|
||||
}
|
||||
|
||||
bool findNextAddress(uint64_t size, uint64_t* addr) {
|
||||
if (pages_ == nullptr) {
|
||||
*addr = baseAddress_;
|
||||
return true;
|
||||
}
|
||||
|
||||
page_t* current = pages_;
|
||||
uint64_t endOfLastPage = baseAddress_;
|
||||
|
||||
while (current != nullptr) {
|
||||
uint64_t startOfCurrentPage = current->addr;
|
||||
if ((endOfLastPage + size) <= startOfCurrentPage) {
|
||||
*addr = endOfLastPage;
|
||||
return true;
|
||||
}
|
||||
// Update the end of the last page to the end of the current page
|
||||
// Move to the next page in the sorted list
|
||||
endOfLastPage = current->addr + current->size;
|
||||
current = current->next;
|
||||
}
|
||||
|
||||
// If no suitable gap is found, place the new page at the end of the last page
|
||||
// Check if the allocator has enough capacity
|
||||
if ((endOfLastPage + size) <= capacity_) {
|
||||
*addr = endOfLastPage;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
bool empty() const {
|
||||
return (usedList_ == nullptr);
|
||||
}
|
||||
|
||||
bool hasPageOverlap(uint64_t start, uint64_t size) {
|
||||
page_t* current = pages_;
|
||||
while (current != nullptr) {
|
||||
uint64_t pageStart = current->addr;
|
||||
uint64_t pageEnd = pageStart + current->size;
|
||||
uint64_t requestEnd = start + size;
|
||||
if ((start >= pageStart && start < pageEnd) || // Start of request is inside the page
|
||||
(requestEnd > pageStart && requestEnd <= pageEnd) || // End of request is inside the page
|
||||
(start <= pageStart && requestEnd >= pageEnd)) { // Request envelops the page
|
||||
return true;
|
||||
}
|
||||
current = current->next;
|
||||
void allocate(uint64_t size, block_t* freeBlock) {
|
||||
// Remove the block from the free lists
|
||||
this->removeFreeMList(freeBlock);
|
||||
this->removeFreeSList(freeBlock);
|
||||
|
||||
// If the free block we have found is larger than what we are looking for,
|
||||
// we may be able to split our free block in two.
|
||||
uint64_t extraBytes = freeBlock->size - size;
|
||||
if (extraBytes >= blockAlign_) {
|
||||
// Reduce the free block size to the requested value
|
||||
freeBlock->size = size;
|
||||
|
||||
// Allocate a new block to contain the extra buffer
|
||||
auto nextAddr = freeBlock->addr + size;
|
||||
auto newBlock = new block_t(nextAddr, extraBytes);
|
||||
|
||||
// Add the new block to the free lists
|
||||
this->insertFreeMList(newBlock);
|
||||
this->insertFreeSList(newBlock);
|
||||
}
|
||||
|
||||
// Insert the free block into the used list
|
||||
this->insertUsedList(freeBlock);
|
||||
}
|
||||
|
||||
void release(block_t* usedBlock) {
|
||||
// Remove the block from the used list
|
||||
this->removeUsedList(usedBlock);
|
||||
|
||||
// Insert the block into the free M-list.
|
||||
this->insertFreeMList(usedBlock);
|
||||
|
||||
// Check if we can merge adjacent free blocks from the left.
|
||||
if (usedBlock->prevFreeM) {
|
||||
// Calculate the previous address
|
||||
auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
|
||||
if (usedBlock->addr == prevAddr) {
|
||||
auto prevBlock = usedBlock->prevFreeM;
|
||||
|
||||
// Merge the blocks to the left
|
||||
prevBlock->size += usedBlock->size;
|
||||
prevBlock->nextFreeM = usedBlock->nextFreeM;
|
||||
if (prevBlock->nextFreeM) {
|
||||
prevBlock->nextFreeM->prevFreeM = prevBlock;
|
||||
}
|
||||
|
||||
// Detach previous block from the free S-list since size increased
|
||||
this->removeFreeSList(prevBlock);
|
||||
|
||||
// reset usedBlock
|
||||
delete usedBlock;
|
||||
usedBlock = prevBlock;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if we can merge adjacent free blocks from the right.
|
||||
if (usedBlock->nextFreeM) {
|
||||
// Calculate the next allocation start address
|
||||
auto nextAddr = usedBlock->addr + usedBlock->size;
|
||||
if (usedBlock->nextFreeM->addr == nextAddr) {
|
||||
auto nextBlock = usedBlock->nextFreeM;
|
||||
|
||||
// Merge the blocks to the right
|
||||
usedBlock->size += nextBlock->size;
|
||||
usedBlock->nextFreeM = nextBlock->nextFreeM;
|
||||
if (usedBlock->nextFreeM) {
|
||||
usedBlock->nextFreeM->prevFreeM = usedBlock;
|
||||
}
|
||||
|
||||
// Delete next block
|
||||
this->removeFreeSList(nextBlock);
|
||||
delete nextBlock;
|
||||
}
|
||||
}
|
||||
|
||||
// Insert the block into the free S-list.
|
||||
this->insertFreeSList(usedBlock);
|
||||
}
|
||||
|
||||
static uint64_t alignSize(uint64_t size, uint64_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return (size + alignment - 1) & ~(alignment - 1);
|
||||
block_t* findFreeBlock(uint64_t size) {
|
||||
auto freeBlock = freeSList_;
|
||||
if (freeBlock) {
|
||||
// The free S-list is already sorted with the largest block first
|
||||
// Quick check if the head block has enough space.
|
||||
if (freeBlock->size >= size) {
|
||||
// Find the smallest matching block in the S-list
|
||||
while (freeBlock->nextFreeS && (freeBlock->nextFreeS->size >= size)) {
|
||||
freeBlock = freeBlock->nextFreeS;
|
||||
}
|
||||
// Return the free block
|
||||
return freeBlock;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
uint64_t baseAddress_;
|
||||
uint64_t capacity_;
|
||||
uint32_t pageAlign_;
|
||||
block_t* findUsedBlock(uint64_t addr) {
|
||||
if (addr >= this->addr && addr < (this->addr + this->size)) {
|
||||
auto useBlock = usedList_;
|
||||
while (useBlock) {
|
||||
if (useBlock->addr == addr)
|
||||
return useBlock;
|
||||
useBlock = useBlock->nextUsed;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void insertUsedList(block_t* block) {
|
||||
block->nextUsed = usedList_;
|
||||
if (usedList_) {
|
||||
usedList_->prevUsed = block;
|
||||
}
|
||||
usedList_ = block;
|
||||
}
|
||||
|
||||
void removeUsedList(block_t* block) {
|
||||
if (block->prevUsed) {
|
||||
block->prevUsed->nextUsed = block->nextUsed;
|
||||
} else {
|
||||
usedList_ = block->nextUsed;
|
||||
}
|
||||
if (block->nextUsed) {
|
||||
block->nextUsed->prevUsed = block->prevUsed;
|
||||
}
|
||||
block->nextUsed = nullptr;
|
||||
block->prevUsed = nullptr;
|
||||
}
|
||||
|
||||
void insertFreeMList(block_t* block) {
|
||||
block_t* currBlock = freeMList_;
|
||||
block_t* prevBlock = nullptr;
|
||||
while (currBlock && (currBlock->addr < block->addr)) {
|
||||
prevBlock = currBlock;
|
||||
currBlock = currBlock->nextFreeM;
|
||||
}
|
||||
block->nextFreeM = currBlock;
|
||||
block->prevFreeM = prevBlock;
|
||||
if (prevBlock) {
|
||||
prevBlock->nextFreeM = block;
|
||||
} else {
|
||||
freeMList_ = block;
|
||||
}
|
||||
if (currBlock) {
|
||||
currBlock->prevFreeM = block;
|
||||
}
|
||||
}
|
||||
|
||||
void removeFreeMList(block_t* block) {
|
||||
if (block->prevFreeM) {
|
||||
block->prevFreeM->nextFreeM = block->nextFreeM;
|
||||
} else {
|
||||
freeMList_ = block->nextFreeM;
|
||||
}
|
||||
if (block->nextFreeM) {
|
||||
block->nextFreeM->prevFreeM = block->prevFreeM;
|
||||
}
|
||||
block->nextFreeM = nullptr;
|
||||
block->prevFreeM = nullptr;
|
||||
}
|
||||
|
||||
void insertFreeSList(block_t* block) {
|
||||
block_t* currBlock = freeSList_;
|
||||
block_t* prevBlock = nullptr;
|
||||
while (currBlock && (currBlock->size > block->size)) {
|
||||
prevBlock = currBlock;
|
||||
currBlock = currBlock->nextFreeS;
|
||||
}
|
||||
block->nextFreeS = currBlock;
|
||||
block->prevFreeS = prevBlock;
|
||||
if (prevBlock) {
|
||||
prevBlock->nextFreeS = block;
|
||||
} else {
|
||||
freeSList_ = block;
|
||||
}
|
||||
if (currBlock) {
|
||||
currBlock->prevFreeS = block;
|
||||
}
|
||||
}
|
||||
|
||||
void removeFreeSList(block_t* block) {
|
||||
if (block->prevFreeS) {
|
||||
block->prevFreeS->nextFreeS = block->nextFreeS;
|
||||
} else {
|
||||
freeSList_ = block->nextFreeS;
|
||||
}
|
||||
if (block->nextFreeS) {
|
||||
block->nextFreeS->prevFreeS = block->prevFreeS;
|
||||
}
|
||||
block->nextFreeS = nullptr;
|
||||
block->prevFreeS = nullptr;
|
||||
}
|
||||
|
||||
// block alignment
|
||||
uint32_t blockAlign_;
|
||||
page_t* pages_;
|
||||
uint64_t nextAddress_;
|
||||
uint64_t allocated_;
|
||||
|
||||
// List of used blocks
|
||||
block_t* usedList_;
|
||||
|
||||
// List with blocks sorted by decreasing sizes
|
||||
// Used for block lookup during memory allocation.
|
||||
block_t* freeSList_;
|
||||
|
||||
// List with blocks sorted by increasing memory addresses
|
||||
// Used for block merging during memory release.
|
||||
block_t* freeMList_;
|
||||
};
|
||||
|
||||
// Creates a page for [addr, addr + size) and links it into pages_, which is
// kept ordered by page address. Returns the new page.
page_t* createPage(uint64_t addr, uint64_t size) {
  page_t* const page = new page_t(addr, size, blockAlign_);

  // Head insertion: empty list, or the current head starts above the new page.
  if (pages_ == nullptr || pages_->addr > page->addr) {
    page->next = pages_;
    pages_ = page;
    return page;
  }

  // Otherwise find the last page whose successor does not start below the
  // new page and splice the new page in right after it.
  page_t* prior = pages_;
  for (page_t* ahead = prior->next;
       ahead != nullptr && ahead->addr < page->addr;
       ahead = prior->next) {
    prior = ahead;
  }
  page->next = prior->next;
  prior->next = page;

  return page;
}
|
||||
|
||||
// Unlinks `page` from pages_ (when it is on the list) and destroys it.
void deletePage(page_t* page) {
  // Walk the links themselves rather than the nodes, so removing the head
  // and removing an interior page are the same operation.
  for (page_t** link = &pages_; *link != nullptr; link = &(*link)->next) {
    if (*link == page) {
      *link = page->next;
      break;
    }
  }

  // The page is destroyed whether or not it was found on the list.
  delete page;
}
|
||||
|
||||
bool findNextAddress(uint64_t size, uint64_t* addr) {
|
||||
if (pages_ == nullptr) {
|
||||
*addr = baseAddress_;
|
||||
return true;
|
||||
}
|
||||
|
||||
page_t* current = pages_;
|
||||
uint64_t endOfLastPage = baseAddress_;
|
||||
|
||||
while (current != nullptr) {
|
||||
uint64_t startOfCurrentPage = current->addr;
|
||||
if ((endOfLastPage + size) <= startOfCurrentPage) {
|
||||
*addr = endOfLastPage;
|
||||
return true;
|
||||
}
|
||||
// Update the end of the last page to the end of the current page
|
||||
// Move to the next page in the sorted list
|
||||
endOfLastPage = current->addr + current->size;
|
||||
current = current->next;
|
||||
}
|
||||
|
||||
// If no suitable gap is found, place the new page at the end of the last page
|
||||
// Check if the allocator has enough capacity
|
||||
if ((endOfLastPage + size) <= capacity_) {
|
||||
*addr = endOfLastPage;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool hasPageOverlap(uint64_t start, uint64_t size) {
|
||||
page_t* current = pages_;
|
||||
while (current != nullptr) {
|
||||
uint64_t pageStart = current->addr;
|
||||
uint64_t pageEnd = pageStart + current->size;
|
||||
uint64_t requestEnd = start + size;
|
||||
if ((start >= pageStart && start < pageEnd) || // Start of request is inside the page
|
||||
(requestEnd > pageStart && requestEnd <= pageEnd) || // End of request is inside the page
|
||||
(start <= pageStart && requestEnd >= pageEnd)) { // Request envelops the page
|
||||
return true;
|
||||
}
|
||||
current = current->next;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Rounds `size` up to the next multiple of `alignment`.
// `alignment` must be a power of two (checked by assert).
static uint64_t alignSize(uint64_t size, uint64_t alignment) {
  const uint64_t mask = alignment - 1;
  assert((alignment & mask) == 0);
  return (size + mask) & ~mask;
}
|
||||
|
||||
uint64_t baseAddress_;
|
||||
uint64_t capacity_;
|
||||
uint32_t pageAlign_;
|
||||
uint32_t blockAlign_;
|
||||
page_t* pages_;
|
||||
uint64_t nextAddress_;
|
||||
uint64_t allocated_;
|
||||
};
|
||||
|
||||
} // namespace vortex
|
||||
} // namespace vortex
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -41,28 +41,28 @@
|
|||
#define CMD_SET_STOP 5
|
||||
|
||||
#define CHECK_ERR(_expr) \
|
||||
do { \
|
||||
int err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[SCOPE] error: '%s' returned %d!\n", #_expr, err); \
|
||||
return err; \
|
||||
} while (false)
|
||||
do { \
|
||||
int err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[SCOPE] error: '%s' returned %d!\n", #_expr, err); \
|
||||
return err; \
|
||||
} while (false)
|
||||
|
||||
struct tap_signal_t {
|
||||
uint32_t id;
|
||||
std::string name;
|
||||
uint32_t width;
|
||||
uint32_t id;
|
||||
std::string name;
|
||||
uint32_t width;
|
||||
};
|
||||
|
||||
struct tap_t {
|
||||
uint32_t id;
|
||||
uint32_t width;
|
||||
uint32_t frames;
|
||||
uint32_t cur_frame;
|
||||
uint64_t cycle_time;
|
||||
std::string path;
|
||||
std::vector<tap_signal_t> signals;
|
||||
uint32_t id;
|
||||
uint32_t width;
|
||||
uint32_t frames;
|
||||
uint32_t cur_frame;
|
||||
uint64_t cycle_time;
|
||||
std::string path;
|
||||
std::vector<tap_signal_t> signals;
|
||||
};
|
||||
|
||||
static scope_callback_t g_callback;
|
||||
|
@ -70,290 +70,290 @@ static scope_callback_t g_callback;
|
|||
using json = nlohmann::json;
|
||||
|
||||
static std::vector<std::string> split(const std::string &s, char delimiter) {
|
||||
std::vector<std::string> tokens;
|
||||
std::string token;
|
||||
std::istringstream tokenStream(s);
|
||||
while (std::getline(tokenStream, token, delimiter)) {
|
||||
tokens.push_back(token);
|
||||
}
|
||||
return tokens;
|
||||
std::vector<std::string> tokens;
|
||||
std::string token;
|
||||
std::istringstream tokenStream(s);
|
||||
while (std::getline(tokenStream, token, delimiter)) {
|
||||
tokens.push_back(token);
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
static void dump_module(std::ofstream& ofs,
|
||||
static void dump_module(std::ofstream& ofs,
|
||||
const std::string& name,
|
||||
std::unordered_map<std::string, std::unordered_set<std::string>>& hierarchy,
|
||||
std::unordered_map<std::string, tap_t*>& tails,
|
||||
int indentation) {
|
||||
std::string indent(indentation, ' ');
|
||||
ofs << indent << "$scope module " << name << " $end" << std::endl;
|
||||
std::string indent(indentation, ' ');
|
||||
ofs << indent << "$scope module " << name << " $end" << std::endl;
|
||||
|
||||
auto itt = tails.find(name);
|
||||
if (itt != tails.end()) {
|
||||
for (auto& signal : itt->second->signals) {
|
||||
ofs << indent << " $var reg " << signal.width << " " << signal.id << " " << signal.name << " $end" << std::endl;
|
||||
}
|
||||
auto itt = tails.find(name);
|
||||
if (itt != tails.end()) {
|
||||
for (auto& signal : itt->second->signals) {
|
||||
ofs << indent << " $var reg " << signal.width << " " << signal.id << " " << signal.name << " $end" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
auto ith = hierarchy.find(name);
|
||||
if (ith != hierarchy.end()) {
|
||||
for (auto& child : ith->second) {
|
||||
dump_module(ofs, child, hierarchy, tails, indentation + 1);
|
||||
}
|
||||
auto ith = hierarchy.find(name);
|
||||
if (ith != hierarchy.end()) {
|
||||
for (auto& child : ith->second) {
|
||||
dump_module(ofs, child, hierarchy, tails, indentation + 1);
|
||||
}
|
||||
}
|
||||
|
||||
ofs << indent << "$upscope $end" << std::endl;
|
||||
ofs << indent << "$upscope $end" << std::endl;
|
||||
}
|
||||
|
||||
static void dump_header(std::ofstream& ofs, std::vector<tap_t>& taps) {
|
||||
ofs << "$version Generated by Vortex Scope Analyzer $end" << std::endl;
|
||||
ofs << "$timescale 1 ns $end" << std::endl;
|
||||
ofs << "$scope module TOP $end" << std::endl;
|
||||
ofs << " $var reg 1 0 clk $end" << std::endl;
|
||||
ofs << "$version Generated by Vortex Scope Analyzer $end" << std::endl;
|
||||
ofs << "$timescale 1 ns $end" << std::endl;
|
||||
ofs << "$scope module TOP $end" << std::endl;
|
||||
ofs << " $var reg 1 0 clk $end" << std::endl;
|
||||
|
||||
std::unordered_map<std::string, std::unordered_set<std::string>> hierarchy;
|
||||
std::unordered_set<std::string> heads;
|
||||
std::unordered_map<std::string, tap_t*> tails;
|
||||
std::unordered_map<std::string, std::unordered_set<std::string>> hierarchy;
|
||||
std::unordered_set<std::string> heads;
|
||||
std::unordered_map<std::string, tap_t*> tails;
|
||||
|
||||
// Build hierarchy
|
||||
for (auto& tap : taps) {
|
||||
std::vector<std::string> tokens = split(tap.path, '.');
|
||||
for (size_t i = 1; i < tokens.size(); ++i) {
|
||||
hierarchy[tokens[i-1]].insert(tokens[i]);
|
||||
}
|
||||
auto h = tokens[0];
|
||||
auto t = tokens[tokens.size()-1];
|
||||
heads.insert(h);
|
||||
tails[t] = &tap;
|
||||
// Build hierarchy
|
||||
for (auto& tap : taps) {
|
||||
std::vector<std::string> tokens = split(tap.path, '.');
|
||||
for (size_t i = 1; i < tokens.size(); ++i) {
|
||||
hierarchy[tokens[i-1]].insert(tokens[i]);
|
||||
}
|
||||
auto h = tokens[0];
|
||||
auto t = tokens[tokens.size()-1];
|
||||
heads.insert(h);
|
||||
tails[t] = &tap;
|
||||
}
|
||||
|
||||
// Dump module huierarchy
|
||||
for (auto& head : heads) {
|
||||
dump_module(ofs, head, hierarchy, tails, 1);
|
||||
}
|
||||
// Dump module hierarchy
|
||||
for (auto& head : heads) {
|
||||
dump_module(ofs, head, hierarchy, tails, 1);
|
||||
}
|
||||
|
||||
ofs << "$upscope $end" << std::endl;
|
||||
ofs << "enddefinitions $end" << std::endl;
|
||||
ofs << "$upscope $end" << std::endl;
|
||||
ofs << "enddefinitions $end" << std::endl;
|
||||
}
|
||||
|
||||
static tap_t* find_nearest_tap(std::vector<tap_t>& taps) {
|
||||
tap_t* nearest = nullptr;
|
||||
for (auto& tap : taps) {
|
||||
if (tap.cur_frame == tap.frames)
|
||||
continue;
|
||||
if (nearest != nullptr) {
|
||||
if (tap.cycle_time < nearest->cycle_time)
|
||||
nearest = &tap;
|
||||
} else {
|
||||
nearest = &tap;
|
||||
}
|
||||
tap_t* nearest = nullptr;
|
||||
for (auto& tap : taps) {
|
||||
if (tap.cur_frame == tap.frames)
|
||||
continue;
|
||||
if (nearest != nullptr) {
|
||||
if (tap.cycle_time < nearest->cycle_time)
|
||||
nearest = &tap;
|
||||
} else {
|
||||
nearest = &tap;
|
||||
}
|
||||
return nearest;
|
||||
}
|
||||
return nearest;
|
||||
}
|
||||
|
||||
static uint64_t advance_time(std::ofstream& ofs, uint64_t next_time, uint64_t cur_time) {
|
||||
while (cur_time < next_time) {
|
||||
ofs << '#' << (cur_time * 2 + 0) << std::endl;
|
||||
ofs << "b0 0" << std::endl;
|
||||
ofs << '#' << (cur_time * 2 + 1) << std::endl;
|
||||
ofs << "b1 0" << std::endl;
|
||||
++cur_time;
|
||||
}
|
||||
return cur_time;
|
||||
while (cur_time < next_time) {
|
||||
ofs << '#' << (cur_time * 2 + 0) << std::endl;
|
||||
ofs << "b0 0" << std::endl;
|
||||
ofs << '#' << (cur_time * 2 + 1) << std::endl;
|
||||
ofs << "b1 0" << std::endl;
|
||||
++cur_time;
|
||||
}
|
||||
return cur_time;
|
||||
}
|
||||
|
||||
static int dump_tap(std::ofstream& ofs, tap_t* tap, vx_device_h hdevice) {
|
||||
uint32_t signal_offset = 0;
|
||||
uint32_t frame_offset = 0;
|
||||
uint64_t word;
|
||||
uint32_t signal_offset = 0;
|
||||
uint32_t frame_offset = 0;
|
||||
uint64_t word;
|
||||
|
||||
std::vector<char> signal_data(tap->width);
|
||||
auto signal_it = tap->signals.rbegin();
|
||||
uint32_t signal_width = signal_it->width;
|
||||
std::vector<char> signal_data(tap->width);
|
||||
auto signal_it = tap->signals.rbegin();
|
||||
uint32_t signal_width = signal_it->width;
|
||||
|
||||
do {
|
||||
// read data
|
||||
uint64_t cmd_data = (tap->id << 3) | CMD_GET_DATA;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &word));
|
||||
do {
|
||||
// read data
|
||||
uint64_t cmd_data = (tap->id << 3) | CMD_GET_DATA;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &word));
|
||||
do {
|
||||
uint32_t word_offset = frame_offset % 64;
|
||||
signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
|
||||
++signal_offset;
|
||||
++frame_offset;
|
||||
if (signal_offset == signal_width) {
|
||||
signal_data[signal_width] = 0; // string null termination
|
||||
ofs << 'b' << signal_data.data() << ' ' << signal_it->id << std::endl;
|
||||
if (frame_offset == tap->width) {
|
||||
// end-of-frame
|
||||
++tap->cur_frame;
|
||||
if (tap->cur_frame != tap->frames) {
|
||||
// read next delta
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &word));
|
||||
tap->cycle_time += 1 + word;
|
||||
if (0 == (tap->cur_frame % FRAME_FLUSH_SIZE)) {
|
||||
ofs << std::flush;
|
||||
std::cout << std::dec << "[SCOPE] flush tap #" << tap->id << ": "<< tap->cur_frame << "/" << tap->frames << " frames, next_time=" << tap->cycle_time << std::endl;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
signal_offset = 0;
|
||||
++signal_it;
|
||||
signal_width = signal_it->width;
|
||||
uint32_t word_offset = frame_offset % 64;
|
||||
signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
|
||||
++signal_offset;
|
||||
++frame_offset;
|
||||
if (signal_offset == signal_width) {
|
||||
signal_data[signal_width] = 0; // string null termination
|
||||
ofs << 'b' << signal_data.data() << ' ' << signal_it->id << std::endl;
|
||||
if (frame_offset == tap->width) {
|
||||
// end-of-frame
|
||||
++tap->cur_frame;
|
||||
if (tap->cur_frame != tap->frames) {
|
||||
// read next delta
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &word));
|
||||
tap->cycle_time += 1 + word;
|
||||
if (0 == (tap->cur_frame % FRAME_FLUSH_SIZE)) {
|
||||
ofs << std::flush;
|
||||
std::cout << std::dec << "[SCOPE] flush tap #" << tap->id << ": "<< tap->cur_frame << "/" << tap->frames << " frames, next_time=" << tap->cycle_time << std::endl;
|
||||
}
|
||||
} while ((frame_offset % 64) != 0);
|
||||
} while (frame_offset != tap->width);
|
||||
}
|
||||
break;
|
||||
}
|
||||
signal_offset = 0;
|
||||
++signal_it;
|
||||
signal_width = signal_it->width;
|
||||
}
|
||||
} while ((frame_offset % 64) != 0);
|
||||
} while (frame_offset != tap->width);
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time) {
|
||||
if (nullptr == hdevice || nullptr == callback)
|
||||
return -1;
|
||||
int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time) {
|
||||
if (nullptr == hdevice || nullptr == callback)
|
||||
return -1;
|
||||
|
||||
const char* json_path = getenv("SCOPE_JSON_PATH");
|
||||
std::ifstream ifs(json_path);
|
||||
if (!ifs) {
|
||||
std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
|
||||
return -1;
|
||||
}
|
||||
auto json_obj = json::parse(ifs);
|
||||
if (json_obj.is_null()) {
|
||||
std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
|
||||
return -1;
|
||||
const char* json_path = getenv("SCOPE_JSON_PATH");
|
||||
std::ifstream ifs(json_path);
|
||||
if (!ifs) {
|
||||
std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
|
||||
return -1;
|
||||
}
|
||||
auto json_obj = json::parse(ifs);
|
||||
if (json_obj.is_null()) {
|
||||
std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
g_callback = *callback;
|
||||
|
||||
// validate scope manifest
|
||||
for (auto& tap : json_obj["taps"]) {
|
||||
auto id = tap["id"].get<uint32_t>();
|
||||
auto width = tap["width"].get<uint32_t>();
|
||||
|
||||
uint64_t cmd_width = (id << 3) | CMD_GET_WIDTH;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_width));
|
||||
uint64_t dev_width;
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &dev_width));
|
||||
if (width != dev_width) {
|
||||
std::cerr << "[SCOPE] error: invalid tap #" << id << " width, actual=" << dev_width << ", expected=" << width << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
g_callback = *callback;
|
||||
|
||||
// validate scope manifest
|
||||
// set stop time
|
||||
if (stop_time != uint64_t(-1)) {
|
||||
std::cout << "[SCOPE] stop time: " << std::dec << stop_time << "s" << std::endl;
|
||||
for (auto& tap : json_obj["taps"]) {
|
||||
auto id = tap["id"].get<uint32_t>();
|
||||
auto width = tap["width"].get<uint32_t>();
|
||||
|
||||
uint64_t cmd_width = (id << 3) | CMD_GET_WIDTH;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_width));
|
||||
uint64_t dev_width;
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &dev_width));
|
||||
if (width != dev_width) {
|
||||
std::cerr << "[SCOPE] error: invalid tap #" << id << " width, actual=" << dev_width << ", expected=" << width << std::endl;
|
||||
return 1;
|
||||
}
|
||||
auto id = tap["id"].get<uint32_t>();
|
||||
uint64_t cmd_stop = (stop_time << 11) | (id << 3) | CMD_SET_STOP;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
|
||||
}
|
||||
}
|
||||
|
||||
// set stop time
|
||||
if (stop_time != uint64_t(-1)) {
|
||||
std::cout << "[SCOPE] stop time: " << std::dec << stop_time << "s" << std::endl;
|
||||
for (auto& tap : json_obj["taps"]) {
|
||||
auto id = tap["id"].get<uint32_t>();
|
||||
uint64_t cmd_stop = (stop_time << 11) | (id << 3) | CMD_SET_STOP;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
|
||||
}
|
||||
// start recording
|
||||
if (start_time != uint64_t(-1)) {
|
||||
std::cout << "[SCOPE] start time: " << std::dec << start_time << "s" << std::endl;
|
||||
for (auto& tap : json_obj["taps"]) {
|
||||
auto id = tap["id"].get<uint32_t>();
|
||||
uint64_t cmd_start = (start_time << 11) | (id << 3) | CMD_SET_START;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
|
||||
}
|
||||
}
|
||||
|
||||
// start recording
|
||||
if (start_time != uint64_t(-1)) {
|
||||
std::cout << "[SCOPE] start time: " << std::dec << start_time << "s" << std::endl;
|
||||
for (auto& tap : json_obj["taps"]) {
|
||||
auto id = tap["id"].get<uint32_t>();
|
||||
uint64_t cmd_start = (start_time << 11) | (id << 3) | CMD_SET_START;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int vx_scope_stop(vx_device_h hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
std::vector<tap_t> taps;
|
||||
std::vector<tap_t> taps;
|
||||
|
||||
{
|
||||
const char* json_path = getenv("SCOPE_JSON_PATH");
|
||||
std::ifstream ifs(json_path);
|
||||
auto json_obj = json::parse(ifs);
|
||||
if (json_obj.is_null())
|
||||
return 0;
|
||||
{
|
||||
const char* json_path = getenv("SCOPE_JSON_PATH");
|
||||
std::ifstream ifs(json_path);
|
||||
auto json_obj = json::parse(ifs);
|
||||
if (json_obj.is_null())
|
||||
return 0;
|
||||
|
||||
uint32_t signal_id = 1;
|
||||
uint32_t signal_id = 1;
|
||||
|
||||
for (auto& tap : json_obj["taps"]) {
|
||||
tap_t _tap;
|
||||
_tap.id = tap["id"].get<uint32_t>();
|
||||
_tap.width = tap["width"].get<uint32_t>();
|
||||
_tap.path = tap["path"].get<std::string>();
|
||||
_tap.cycle_time = 0;
|
||||
_tap.frames = 0;
|
||||
_tap.cur_frame = 0;
|
||||
for (auto& tap : json_obj["taps"]) {
|
||||
tap_t _tap;
|
||||
_tap.id = tap["id"].get<uint32_t>();
|
||||
_tap.width = tap["width"].get<uint32_t>();
|
||||
_tap.path = tap["path"].get<std::string>();
|
||||
_tap.cycle_time = 0;
|
||||
_tap.frames = 0;
|
||||
_tap.cur_frame = 0;
|
||||
|
||||
for (auto& signal : tap["signals"]) {
|
||||
auto name = signal[0].get<std::string>();
|
||||
auto width = signal[1].get<uint32_t>();
|
||||
_tap.signals.push_back({signal_id, name, width});
|
||||
++signal_id;
|
||||
}
|
||||
for (auto& signal : tap["signals"]) {
|
||||
auto name = signal[0].get<std::string>();
|
||||
auto width = signal[1].get<uint32_t>();
|
||||
_tap.signals.push_back({signal_id, name, width});
|
||||
++signal_id;
|
||||
}
|
||||
|
||||
taps.emplace_back(std::move(_tap));
|
||||
}
|
||||
taps.emplace_back(std::move(_tap));
|
||||
}
|
||||
}
|
||||
|
||||
// stop recording
|
||||
for (auto& tap : taps) {
|
||||
uint64_t cmd_stop = (0 << 11) | (tap.id << 3) | CMD_SET_STOP;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
|
||||
}
|
||||
// stop recording
|
||||
for (auto& tap : taps) {
|
||||
uint64_t cmd_stop = (0 << 11) | (tap.id << 3) | CMD_SET_STOP;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
|
||||
}
|
||||
|
||||
std::cout << "[SCOPE] trace dump begin..." << std::endl;
|
||||
std::cout << "[SCOPE] trace dump begin..." << std::endl;
|
||||
|
||||
std::ofstream ofs("scope.vcd");
|
||||
std::ofstream ofs("scope.vcd");
|
||||
|
||||
dump_header(ofs, taps);
|
||||
dump_header(ofs, taps);
|
||||
|
||||
// load trace info
|
||||
for (auto& tap : taps) {
|
||||
uint64_t count, start, delta;
|
||||
// load trace info
|
||||
for (auto& tap : taps) {
|
||||
uint64_t count, start, delta;
|
||||
|
||||
// get count
|
||||
uint64_t cmd_count = (tap.id << 3) | CMD_GET_COUNT;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_count));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &count));
|
||||
// get count
|
||||
uint64_t cmd_count = (tap.id << 3) | CMD_GET_COUNT;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_count));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &count));
|
||||
|
||||
// get start
|
||||
uint64_t cmd_start = (tap.id << 3) | CMD_GET_START;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &start));
|
||||
// get start
|
||||
uint64_t cmd_start = (tap.id << 3) | CMD_GET_START;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &start));
|
||||
|
||||
// get data
|
||||
uint64_t cmd_data = (tap.id << 3) | CMD_GET_DATA;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &delta));
|
||||
// get data
|
||||
uint64_t cmd_data = (tap.id << 3) | CMD_GET_DATA;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &delta));
|
||||
|
||||
tap.frames = count;
|
||||
tap.cycle_time = 1 + start + delta;
|
||||
tap.frames = count;
|
||||
tap.cycle_time = 1 + start + delta;
|
||||
|
||||
std::cout << std::dec << "[SCOPE] tap #" << tap.id
|
||||
<< ": width=" << tap.width
|
||||
<< ", num_frames=" << tap.frames
|
||||
<< ", start_time=" << tap.cycle_time
|
||||
<< ", path=" << tap.path << std::endl;
|
||||
}
|
||||
std::cout << std::dec << "[SCOPE] tap #" << tap.id
|
||||
<< ": width=" << tap.width
|
||||
<< ", num_frames=" << tap.frames
|
||||
<< ", start_time=" << tap.cycle_time
|
||||
<< ", path=" << tap.path << std::endl;
|
||||
}
|
||||
|
||||
uint64_t cur_time = 0;
|
||||
uint64_t cur_time = 0;
|
||||
|
||||
while (true) {
|
||||
// find the nearest tap
|
||||
auto tap = find_nearest_tap(taps);
|
||||
if (tap == nullptr)
|
||||
break;
|
||||
// advance clock
|
||||
cur_time = advance_time(ofs, tap->cycle_time, cur_time);
|
||||
// dump tap
|
||||
CHECK_ERR(dump_tap(ofs, tap, hdevice));
|
||||
};
|
||||
while (true) {
|
||||
// find the nearest tap
|
||||
auto tap = find_nearest_tap(taps);
|
||||
if (tap == nullptr)
|
||||
break;
|
||||
// advance clock
|
||||
cur_time = advance_time(ofs, tap->cycle_time, cur_time);
|
||||
// dump tap
|
||||
CHECK_ERR(dump_tap(ofs, tap, hdevice));
|
||||
};
|
||||
|
||||
std::cout << "[SCOPE] trace dump done! - " << (cur_time/2) << " cycles" << std::endl;
|
||||
std::cout << "[SCOPE] trace dump done! - " << (cur_time/2) << " cycles" << std::endl;
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1,50 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vortex.h>
|
||||
#include <cstdint>
|
||||
#include <unordered_map>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
class DeviceConfig {
|
||||
public:
|
||||
void write(uint32_t addr, uint32_t value);
|
||||
int read(uint32_t addr, uint32_t* value) const;
|
||||
private:
|
||||
std::unordered_map<uint32_t, uint32_t> store_;
|
||||
};
|
||||
|
||||
int dcr_initialize(vx_device_h device);
|
||||
|
||||
uint64_t aligned_size(uint64_t size, uint64_t alignment);
|
||||
|
||||
bool is_aligned(uint64_t addr, uint64_t alignment);
|
||||
|
||||
int profiling_add(vx_device_h device);
|
||||
|
||||
void profiling_remove(int id);
|
||||
|
||||
void profiling_begin(int id);
|
||||
|
||||
void profiling_end(int id);
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#define ALLOC_BASE_ADDR CACHE_BLOCK_SIZE
|
||||
#if (XLEN == 64)
|
||||
#define GLOBAL_MEM_SIZE 0x200000000 // 8 GB
|
||||
#else
|
||||
#define GLOBAL_MEM_SIZE 0x100000000 // 4 GB
|
||||
#endif
|
|
@ -33,9 +33,7 @@ typedef void* vx_buffer_h;
|
|||
#define VX_CAPS_CACHE_LINE_SIZE 0x4
|
||||
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
|
||||
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
|
||||
#define VX_CAPS_LOCAL_MEM_ADDR 0x7
|
||||
#define VX_CAPS_ISA_FLAGS 0x8
|
||||
#define VX_CAPS_NUM_BARRIERS 0x9
|
||||
#define VX_CAPS_ISA_FLAGS 0x7
|
||||
|
||||
// device isa flags
|
||||
#define VX_ISA_STD_A (1ull << 0)
|
||||
|
@ -127,7 +125,7 @@ int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_
|
|||
int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer);
|
||||
|
||||
// calculate cooperative threads array occupancy
|
||||
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_barriers, uint32_t* max_localmem);
|
||||
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_localmem);
|
||||
|
||||
// performance counters
|
||||
int vx_dump_perf(vx_device_h hdevice, FILE* stream);
|
||||
|
|
|
@ -2,7 +2,7 @@ include ../common.mk
|
|||
|
||||
TARGET ?= opaesim
|
||||
|
||||
DESTDIR ?= $(CURDIR)
|
||||
DESTDIR ?= $(CURDIR)/..
|
||||
|
||||
SYN_DIR := $(HW_DIR)/syn/altera/opae
|
||||
|
||||
|
@ -20,7 +20,7 @@ CXXFLAGS += $(CONFIGS)
|
|||
|
||||
LDFLAGS += -shared -luuid -ldl -pthread
|
||||
|
||||
SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp $(COMMON_DIR)/utils.cpp
|
||||
SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp
|
||||
|
||||
# set up target types
|
||||
ifeq ($(TARGET), opaesim)
|
||||
|
@ -49,21 +49,24 @@ ifdef SCOPE
|
|||
SRCS += $(COMMON_DIR)/scope.cpp
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
CXXFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
PROJECT := libvortex.so
|
||||
PROJECT := libvortex-opae.so
|
||||
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
||||
driver: $(DESTDIR)/libopae-c-sim.so
|
||||
|
||||
$(DESTDIR)/libopae-c-sim.so:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/opaesim $(DESTDIR)/libopae-c-sim.so
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) $(OPAESIM)
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $@
|
||||
|
||||
clean:
|
||||
clean-driver:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/opaesim clean
|
||||
rm -rf $(DESTDIR)/$(PROJECT)
|
||||
|
||||
clean-runtime:
|
||||
rm -f $(DESTDIR)/$(PROJECT)
|
||||
|
||||
clean: clean-driver clean-runtime
|
||||
|
||||
.PHONY: all driver clean-driver clean-runtime clean
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -33,49 +33,50 @@
|
|||
#define SET_API(func) \
|
||||
opae_drv_funcs->func = (pfn_##func)dlsym(dl_handle, #func); \
|
||||
if (opae_drv_funcs->func == nullptr) { \
|
||||
printf("dlsym failed: %s\n", dlerror()); \
|
||||
printf("dlsym failed: %s\n", dlerror()); \
|
||||
dlclose(dl_handle); \
|
||||
return -1; \
|
||||
return -1; \
|
||||
}
|
||||
|
||||
void* dl_handle = nullptr;
|
||||
|
||||
int drv_init(opae_drv_api_t* opae_drv_funcs) {
|
||||
if (opae_drv_funcs == nullptr)
|
||||
return -1;
|
||||
if (opae_drv_funcs == nullptr)
|
||||
return -1;
|
||||
|
||||
const char* api_path_s = getenv("OPAE_DRV_PATHS");
|
||||
if (api_path_s == nullptr || api_path_s[0] == '\0') {
|
||||
api_path_s = DEFAULT_OPAE_DRV_PATHS;
|
||||
}
|
||||
const char* api_path_s = getenv("OPAE_DRV_PATHS");
|
||||
if (api_path_s == nullptr || api_path_s[0] == '\0') {
|
||||
api_path_s = DEFAULT_OPAE_DRV_PATHS;
|
||||
}
|
||||
|
||||
std::vector<std::string> api_paths;
|
||||
{
|
||||
std::stringstream ss(api_path_s);
|
||||
while (ss.good()) {
|
||||
std::string path;
|
||||
getline(ss, path, ',');
|
||||
api_paths.push_back(path);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& api_path : api_paths) {
|
||||
dl_handle = dlopen(api_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
|
||||
if (dl_handle)
|
||||
break;
|
||||
}
|
||||
if (dl_handle == nullptr) {
|
||||
printf("dlopen failed: %s\n", dlerror());
|
||||
return -1;
|
||||
std::vector<std::string> api_paths;
|
||||
{
|
||||
std::stringstream ss(api_path_s);
|
||||
while (ss.good()) {
|
||||
std::string path;
|
||||
getline(ss, path, ',');
|
||||
api_paths.push_back(path);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& api_path : api_paths) {
|
||||
dl_handle = dlopen(api_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
|
||||
if (dl_handle)
|
||||
break;
|
||||
}
|
||||
|
||||
if (dl_handle == nullptr) {
|
||||
printf("dlopen failed: %s\n", dlerror());
|
||||
return -1;
|
||||
}
|
||||
|
||||
SET_API (fpgaGetProperties);
|
||||
SET_API (fpgaPropertiesSetObjectType);
|
||||
SET_API (fpgaPropertiesSetGUID);
|
||||
SET_API (fpgaDestroyProperties);
|
||||
SET_API (fpgaDestroyToken);
|
||||
SET_API (fpgaPropertiesGetLocalMemorySize);
|
||||
SET_API (fpgaEnumerate);
|
||||
SET_API (fpgaDestroyToken);
|
||||
SET_API (fpgaPropertiesGetLocalMemorySize);
|
||||
SET_API (fpgaEnumerate);
|
||||
SET_API (fpgaOpen);
|
||||
SET_API (fpgaClose);
|
||||
SET_API (fpgaPrepareBuffer);
|
||||
|
@ -83,9 +84,9 @@ int drv_init(opae_drv_api_t* opae_drv_funcs) {
|
|||
SET_API (fpgaGetIOAddress);
|
||||
SET_API (fpgaWriteMMIO64);
|
||||
SET_API (fpgaReadMMIO64);
|
||||
SET_API (fpgaErrStr);
|
||||
SET_API (fpgaErrStr);
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void drv_close() {
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,6 +1,6 @@
|
|||
include ../common.mk
|
||||
|
||||
DESTDIR ?= $(CURDIR)
|
||||
DESTDIR ?= $(CURDIR)/..
|
||||
|
||||
SRC_DIR := $(VORTEX_HOME)/runtime/rtlsim
|
||||
|
||||
|
@ -17,7 +17,7 @@ CXXFLAGS += $(CONFIGS)
|
|||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -L$(DESTDIR) -lrtlsim
|
||||
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
|
||||
SRCS := $(SRC_DIR)/vortex.cpp
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
|
@ -26,19 +26,24 @@ else
|
|||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
CXXFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
PROJECT := libvortex.so
|
||||
PROJECT := libvortex-rtlsim.so
|
||||
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS)
|
||||
driver: $(DESTDIR)/librtlsim.so
|
||||
|
||||
$(DESTDIR)/librtlsim.so:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/rtlsim $(DESTDIR)/librtlsim.so
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/librtlsim.so
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $@
|
||||
|
||||
clean:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/rtlsim clean
|
||||
rm -rf $(DESTDIR)/$(PROJECT) *.o
|
||||
clean-driver:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/rtlsim clean-lib
|
||||
|
||||
clean-runtime:
|
||||
rm -f $(DESTDIR)/$(PROJECT)
|
||||
|
||||
clean: clean-driver clean-runtime
|
||||
|
||||
.PHONY: all driver clean-driver clean-runtime clean
|
|
@ -11,6 +11,12 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <common.h>
|
||||
|
||||
#include <mem.h>
|
||||
#include <util.h>
|
||||
#include <processor.h>
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -20,261 +26,219 @@
|
|||
#include <list>
|
||||
#include <chrono>
|
||||
|
||||
#include <vortex.h>
|
||||
#include <malloc.h>
|
||||
#include <utils.h>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
#include <mem.h>
|
||||
#include <util.h>
|
||||
#include <processor.h>
|
||||
#include <unordered_map>
|
||||
#include <array>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
#ifndef NDEBUG
|
||||
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define DBGPRINT(format, ...) ((void)0)
|
||||
#endif
|
||||
|
||||
#define CHECK_ERR(_expr, _cleanup) \
|
||||
do { \
|
||||
auto err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: ram_(0, RAM_PAGE_SIZE)
|
||||
, global_mem_(ALLOC_BASE_ADDR, GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE)
|
||||
{
|
||||
processor_.attach_ram(&ram_);
|
||||
vx_device()
|
||||
: ram_(0, RAM_PAGE_SIZE)
|
||||
, global_mem_(ALLOC_BASE_ADDR,
|
||||
GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR,
|
||||
RAM_PAGE_SIZE,
|
||||
CACHE_BLOCK_SIZE)
|
||||
{
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
}
|
||||
|
||||
int init() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_caps(uint32_t caps_id, uint64_t *value) {
|
||||
uint64_t _value;
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
_value = IMPLEMENTATION_ID;
|
||||
break;
|
||||
case VX_CAPS_NUM_THREADS:
|
||||
_value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_NUM_WARPS:
|
||||
_value = NUM_WARPS;
|
||||
break;
|
||||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_GLOBAL_MEM_SIZE:
|
||||
_value = GLOBAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
_value = (1 << LMEM_LOG_SIZE);
|
||||
break;
|
||||
case VX_CAPS_ISA_FLAGS:
|
||||
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
*value = _value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
|
||||
uint64_t addr;
|
||||
CHECK_ERR(global_mem_.allocate(size, &addr), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(addr, size, flags), {
|
||||
global_mem_.release(addr);
|
||||
return err;
|
||||
});
|
||||
*dev_addr = addr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
CHECK_ERR(global_mem_.reserve(dev_addr, size), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(dev_addr, size, flags), {
|
||||
global_mem_.release(dev_addr);
|
||||
return err;
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_free(uint64_t dev_addr) {
|
||||
return global_mem_.release(dev_addr);
|
||||
}
|
||||
|
||||
int mem_access(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dev_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.set_acl(dev_addr, size, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
|
||||
if (mem_free)
|
||||
*mem_free = global_mem_.free();
|
||||
if (mem_used)
|
||||
*mem_used = global_mem_.allocated();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int upload(uint64_t dest_addr, const void* src, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.write((const uint8_t*)src, dest_addr, size);
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", dest_addr + i * CACHE_BLOCK_SIZE);
|
||||
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
|
||||
printf("%02x", *((uint8_t*)src + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
|
||||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.read((uint8_t*)dest, src_addr, size);
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*printf("VXDRV: download %ld bytes to 0x%lx:", size, uintptr_t((uint8_t*)dest));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", src_addr + i * CACHE_BLOCK_SIZE);
|
||||
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
|
||||
printf("%02x", *((uint8_t*)dest + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
|
||||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start(uint64_t krnl_addr, uint64_t args_addr) {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
profiling_remove(profiling_id_);
|
||||
// set kernel info
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
|
||||
// clear mpm cache
|
||||
mpm_cache_.clear();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ready_wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready)
|
||||
break;
|
||||
if (0 == timeout_sec--)
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int init() {
|
||||
CHECK_ERR(dcr_initialize(this), {
|
||||
return err;
|
||||
});
|
||||
profiling_id_ = profiling_add(this);
|
||||
return 0;
|
||||
int dcr_write(uint32_t addr, uint32_t value) {
|
||||
if (future_.valid()) {
|
||||
future_.wait(); // ensure prior run completed
|
||||
}
|
||||
processor_.dcr_write(addr, value);
|
||||
dcrs_.write(addr, value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_caps(uint32_t caps_id, uint64_t *value) {
|
||||
uint64_t _value;
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
_value = IMPLEMENTATION_ID;
|
||||
break;
|
||||
case VX_CAPS_NUM_THREADS:
|
||||
_value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_NUM_WARPS:
|
||||
_value = NUM_WARPS;
|
||||
break;
|
||||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_NUM_BARRIERS:
|
||||
_value = NUM_BARRIERS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_GLOBAL_MEM_SIZE:
|
||||
_value = GLOBAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
_value = (1 << LMEM_LOG_SIZE);
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_ADDR:
|
||||
_value = LMEM_BASE_ADDR;
|
||||
break;
|
||||
case VX_CAPS_ISA_FLAGS:
|
||||
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
*value = _value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
|
||||
uint64_t addr;
|
||||
CHECK_ERR(global_mem_.allocate(size, &addr), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(addr, size, flags), {
|
||||
global_mem_.release(addr);
|
||||
return err;
|
||||
});
|
||||
*dev_addr = addr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
CHECK_ERR(global_mem_.reserve(dev_addr, size), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(dev_addr, size, flags), {
|
||||
global_mem_.release(dev_addr);
|
||||
return err;
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_free(uint64_t dev_addr) {
|
||||
return global_mem_.release(dev_addr);
|
||||
}
|
||||
|
||||
int mem_access(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dev_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.set_acl(dev_addr, size, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
|
||||
if (mem_free)
|
||||
*mem_free = global_mem_.free();
|
||||
if (mem_used)
|
||||
*mem_used = global_mem_.allocated();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int upload(uint64_t dest_addr, const void* src, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.write((const uint8_t*)src, dest_addr, size);
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", dest_addr + i * CACHE_BLOCK_SIZE);
|
||||
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
|
||||
printf("%02x", *((uint8_t*)src + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
|
||||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.read((uint8_t*)dest, src_addr, size);
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*printf("VXDRV: download %ld bytes to 0x%lx:", size, uintptr_t((uint8_t*)dest));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", src_addr + i * CACHE_BLOCK_SIZE);
|
||||
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
|
||||
printf("%02x", *((uint8_t*)dest + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
|
||||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start(uint64_t krnl_addr, uint64_t args_addr) {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
|
||||
// set kernel info
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
|
||||
profiling_begin(profiling_id_);
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
|
||||
// clear mpm cache
|
||||
mpm_cache_.clear();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ready_wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready)
|
||||
break;
|
||||
if (0 == timeout_sec--)
|
||||
return -1;
|
||||
}
|
||||
profiling_end(profiling_id_);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int dcr_write(uint32_t addr, uint32_t value) {
|
||||
if (future_.valid()) {
|
||||
future_.wait(); // ensure prior run completed
|
||||
}
|
||||
processor_.dcr_write(addr, value);
|
||||
dcrs_.write(addr, value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int dcr_read(uint32_t addr, uint32_t* value) const {
|
||||
return dcrs_.read(addr, value);
|
||||
}
|
||||
|
||||
int mpm_query(uint32_t addr, uint32_t core_id, uint64_t* value) {
|
||||
uint32_t offset = addr - VX_CSR_MPM_BASE;
|
||||
if (offset > 31)
|
||||
return -1;
|
||||
if (mpm_cache_.count(core_id) == 0) {
|
||||
uint64_t mpm_mem_addr = IO_MPM_ADDR + core_id * 32 * sizeof(uint64_t);
|
||||
CHECK_ERR(this->download(mpm_cache_[core_id].data(), mpm_mem_addr, 32 * sizeof(uint64_t)), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
*value = mpm_cache_.at(core_id).at(offset);
|
||||
return 0;
|
||||
int dcr_read(uint32_t addr, uint32_t* value) const {
|
||||
return dcrs_.read(addr, value);
|
||||
}
|
||||
|
||||
int mpm_query(uint32_t addr, uint32_t core_id, uint64_t* value) {
|
||||
uint32_t offset = addr - VX_CSR_MPM_BASE;
|
||||
if (offset > 31)
|
||||
return -1;
|
||||
if (mpm_cache_.count(core_id) == 0) {
|
||||
uint64_t mpm_mem_addr = IO_MPM_ADDR + core_id * 32 * sizeof(uint64_t);
|
||||
CHECK_ERR(this->download(mpm_cache_[core_id].data(), mpm_mem_addr, 32 * sizeof(uint64_t)), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
*value = mpm_cache_.at(core_id).at(offset);
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
|
@ -570,4 +534,4 @@ extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, ui
|
|||
*value = _value;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
include ../common.mk
|
||||
|
||||
DESTDIR ?= $(CURDIR)
|
||||
DESTDIR ?= $(CURDIR)/..
|
||||
|
||||
SRC_DIR := $(VORTEX_HOME)/runtime/simx
|
||||
|
||||
|
@ -13,7 +13,7 @@ CXXFLAGS += -DXLEN_$(XLEN)
|
|||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -L$(DESTDIR) -lsimx
|
||||
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
|
||||
SRCS := $(SRC_DIR)/vortex.cpp
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
|
@ -22,14 +22,24 @@ else
|
|||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT := libvortex.so
|
||||
PROJECT := libvortex-simx.so
|
||||
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS)
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/simx $(DESTDIR)/libsimx.so
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
driver: $(DESTDIR)/libsimx.so
|
||||
|
||||
clean:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/simx clean
|
||||
rm -rf $(DESTDIR)/$(PROJECT) *.o
|
||||
$(DESTDIR)/libsimx.so:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/simx $(DESTDIR)/libsimx.so
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/libsimx.so
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $@
|
||||
|
||||
clean-driver:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/simx clean-lib
|
||||
|
||||
clean-runtime:
|
||||
rm -f $(DESTDIR)/$(PROJECT)
|
||||
|
||||
clean: clean-driver clean-runtime
|
||||
|
||||
.PHONY: all driver clean-driver clean-runtime clean
|
|
@ -11,6 +11,14 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <common.h>
|
||||
|
||||
#include <util.h>
|
||||
#include <processor.h>
|
||||
#include <arch.h>
|
||||
#include <mem.h>
|
||||
#include <constants.h>
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -19,257 +27,213 @@
|
|||
#include <future>
|
||||
#include <chrono>
|
||||
|
||||
#include <vortex.h>
|
||||
#include <utils.h>
|
||||
#include <malloc.h>
|
||||
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include <processor.h>
|
||||
#include <arch.h>
|
||||
#include <mem.h>
|
||||
#include <constants.h>
|
||||
#include <unordered_map>
|
||||
#include <array>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
#ifndef NDEBUG
|
||||
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define DBGPRINT(format, ...) ((void)0)
|
||||
#endif
|
||||
|
||||
#define CHECK_ERR(_expr, _cleanup) \
|
||||
do { \
|
||||
auto err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
|
||||
, ram_(0, RAM_PAGE_SIZE)
|
||||
, processor_(arch_)
|
||||
, global_mem_(ALLOC_BASE_ADDR, GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE)
|
||||
{
|
||||
// attach memory module
|
||||
processor_.attach_ram(&ram_);
|
||||
vx_device()
|
||||
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
|
||||
, ram_(0, RAM_PAGE_SIZE)
|
||||
, processor_(arch_)
|
||||
, global_mem_(ALLOC_BASE_ADDR,
|
||||
GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR,
|
||||
RAM_PAGE_SIZE,
|
||||
CACHE_BLOCK_SIZE)
|
||||
{
|
||||
// attach memory module
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
}
|
||||
|
||||
int init() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_caps(uint32_t caps_id, uint64_t *value) {
|
||||
uint64_t _value;
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
_value = IMPLEMENTATION_ID;
|
||||
break;
|
||||
case VX_CAPS_NUM_THREADS:
|
||||
_value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_NUM_WARPS:
|
||||
_value = NUM_WARPS;
|
||||
break;
|
||||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_GLOBAL_MEM_SIZE:
|
||||
_value = GLOBAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
_value = (1 << LMEM_LOG_SIZE);
|
||||
break;
|
||||
case VX_CAPS_ISA_FLAGS:
|
||||
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
*value = _value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
|
||||
uint64_t addr;
|
||||
CHECK_ERR(global_mem_.allocate(size, &addr), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(addr, size, flags), {
|
||||
global_mem_.release(addr);
|
||||
return err;
|
||||
});
|
||||
*dev_addr = addr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
CHECK_ERR(global_mem_.reserve(dev_addr, size), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(dev_addr, size, flags), {
|
||||
global_mem_.release(dev_addr);
|
||||
return err;
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_free(uint64_t dev_addr) {
|
||||
return global_mem_.release(dev_addr);
|
||||
}
|
||||
|
||||
int mem_access(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dev_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.set_acl(dev_addr, size, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
|
||||
if (mem_free)
|
||||
*mem_free = global_mem_.free();
|
||||
if (mem_used)
|
||||
*mem_used = global_mem_.allocated();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int upload(uint64_t dest_addr, const void* src, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.write((const uint8_t*)src, dest_addr, size);
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*DBGPRINT("upload %ld bytes to 0x%lx\n", size, dest_addr);
|
||||
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
|
||||
DBGPRINT(" 0x%lx <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + i));
|
||||
}*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.read((uint8_t*)dest, src_addr, size);
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*DBGPRINT("download %ld bytes from 0x%lx\n", size, src_addr);
|
||||
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
|
||||
DBGPRINT(" 0x%lx -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + i));
|
||||
}*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start(uint64_t krnl_addr, uint64_t args_addr) {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
profiling_remove(profiling_id_);
|
||||
// set kernel info
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
|
||||
// clear mpm cache
|
||||
mpm_cache_.clear();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ready_wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready)
|
||||
break;
|
||||
if (0 == timeout_sec--)
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int init() {
|
||||
CHECK_ERR(dcr_initialize(this), {
|
||||
return err;
|
||||
});
|
||||
profiling_id_ = profiling_add(this);
|
||||
return 0;
|
||||
int dcr_write(uint32_t addr, uint32_t value) {
|
||||
if (future_.valid()) {
|
||||
future_.wait(); // ensure prior run completed
|
||||
}
|
||||
processor_.dcr_write(addr, value);
|
||||
dcrs_.write(addr, value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_caps(uint32_t caps_id, uint64_t *value) {
|
||||
uint64_t _value;
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
_value = IMPLEMENTATION_ID;
|
||||
break;
|
||||
case VX_CAPS_NUM_THREADS:
|
||||
_value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_NUM_WARPS:
|
||||
_value = NUM_WARPS;
|
||||
break;
|
||||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_NUM_BARRIERS:
|
||||
_value = NUM_BARRIERS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_GLOBAL_MEM_SIZE:
|
||||
_value = GLOBAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
_value = (1 << LMEM_LOG_SIZE);
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_ADDR:
|
||||
_value = LMEM_BASE_ADDR;
|
||||
break;
|
||||
case VX_CAPS_ISA_FLAGS:
|
||||
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
*value = _value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
|
||||
uint64_t addr;
|
||||
CHECK_ERR(global_mem_.allocate(size, &addr), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(addr, size, flags), {
|
||||
global_mem_.release(addr);
|
||||
return err;
|
||||
});
|
||||
*dev_addr = addr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
CHECK_ERR(global_mem_.reserve(dev_addr, size), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(dev_addr, size, flags), {
|
||||
global_mem_.release(dev_addr);
|
||||
return err;
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_free(uint64_t dev_addr) {
|
||||
return global_mem_.release(dev_addr);
|
||||
}
|
||||
|
||||
int mem_access(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dev_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.set_acl(dev_addr, size, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
|
||||
if (mem_free)
|
||||
*mem_free = global_mem_.free();
|
||||
if (mem_used)
|
||||
*mem_used = global_mem_.allocated();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int upload(uint64_t dest_addr, const void* src, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.write((const uint8_t*)src, dest_addr, size);
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*DBGPRINT("upload %ld bytes to 0x%lx\n", size, dest_addr);
|
||||
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
|
||||
DBGPRINT(" 0x%lx <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + i));
|
||||
}*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.read((uint8_t*)dest, src_addr, size);
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*DBGPRINT("download %ld bytes from 0x%lx\n", size, src_addr);
|
||||
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
|
||||
DBGPRINT(" 0x%lx -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + i));
|
||||
}*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start(uint64_t krnl_addr, uint64_t args_addr) {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
|
||||
// set kernel info
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
|
||||
profiling_begin(profiling_id_);
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
|
||||
// clear mpm cache
|
||||
mpm_cache_.clear();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ready_wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready)
|
||||
break;
|
||||
if (0 == timeout_sec--)
|
||||
return -1;
|
||||
}
|
||||
profiling_end(profiling_id_);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int dcr_write(uint32_t addr, uint32_t value) {
|
||||
if (future_.valid()) {
|
||||
future_.wait(); // ensure prior run completed
|
||||
}
|
||||
processor_.dcr_write(addr, value);
|
||||
dcrs_.write(addr, value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int dcr_read(uint32_t addr, uint32_t* value) const {
|
||||
return dcrs_.read(addr, value);
|
||||
}
|
||||
|
||||
int mpm_query(uint32_t addr, uint32_t core_id, uint64_t* value) {
|
||||
uint32_t offset = addr - VX_CSR_MPM_BASE;
|
||||
if (offset > 31)
|
||||
return -1;
|
||||
if (mpm_cache_.count(core_id) == 0) {
|
||||
uint64_t mpm_mem_addr = IO_MPM_ADDR + core_id * 32 * sizeof(uint64_t);
|
||||
CHECK_ERR(this->download(mpm_cache_[core_id].data(), mpm_mem_addr, 32 * sizeof(uint64_t)), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
*value = mpm_cache_.at(core_id).at(offset);
|
||||
return 0;
|
||||
int dcr_read(uint32_t addr, uint32_t* value) const {
|
||||
return dcrs_.read(addr, value);
|
||||
}
|
||||
|
||||
int mpm_query(uint32_t addr, uint32_t core_id, uint64_t* value) {
|
||||
uint32_t offset = addr - VX_CSR_MPM_BASE;
|
||||
if (offset > 31)
|
||||
return -1;
|
||||
if (mpm_cache_.count(core_id) == 0) {
|
||||
uint64_t mpm_mem_addr = IO_MPM_ADDR + core_id * 32 * sizeof(uint64_t);
|
||||
CHECK_ERR(this->download(mpm_cache_[core_id].data(), mpm_mem_addr, 32 * sizeof(uint64_t)), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
*value = mpm_cache_.at(core_id).at(offset);
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
Arch arch_;
|
||||
|
@ -565,4 +529,4 @@ extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, ui
|
|||
*value = _value;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -1,23 +1,32 @@
|
|||
include ../common.mk
|
||||
|
||||
DESTDIR ?= $(CURDIR)
|
||||
DESTDIR ?= $(CURDIR)/..
|
||||
|
||||
SRC_DIR := $(VORTEX_HOME)/runtime/stub
|
||||
|
||||
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
CXXFLAGS += -I$(INC_DIR) -I$(COMMON_DIR) -I$(ROOT_DIR)/hw -I$(SIM_DIR)/common
|
||||
CXXFLAGS += -fPIC
|
||||
|
||||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -shared -pthread -ldl
|
||||
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(SRC_DIR)/utils.cpp
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT := libvortex.so
|
||||
|
||||
all: $(PROJECT)
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) obj_dir
|
||||
rm -f $(DESTDIR)/$(PROJECT)
|
||||
|
||||
.PHONY: all clean
|
|
@ -11,7 +11,8 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "utils.h"
|
||||
#include <common.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <list>
|
||||
|
@ -21,129 +22,30 @@
|
|||
#include <vortex.h>
|
||||
#include <assert.h>
|
||||
|
||||
#define RT_CHECK(_expr, _cleanup) \
|
||||
do { \
|
||||
int _ret = _expr; \
|
||||
if (0 == _ret) \
|
||||
break; \
|
||||
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return (size + alignment - 1) & ~(alignment - 1);
|
||||
}
|
||||
|
||||
bool is_aligned(uint64_t addr, uint64_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return 0 == (addr & (alignment - 1));
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class AutoPerfDump {
|
||||
class ProfilingMode {
|
||||
public:
|
||||
AutoPerfDump() : perf_class_(0) {
|
||||
ProfilingMode() : perf_class_(0) {
|
||||
auto profiling_s = getenv("VORTEX_PROFILING");
|
||||
if (profiling_s) {
|
||||
perf_class_ = std::atoi(profiling_s);
|
||||
}
|
||||
}
|
||||
|
||||
~AutoPerfDump() {}
|
||||
~ProfilingMode() {}
|
||||
|
||||
int add(vx_device_h hdevice) {
|
||||
int ret = devices_.size();
|
||||
devices_[ret] = hdevice;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void remove(int id) {
|
||||
devices_.erase(id);
|
||||
}
|
||||
|
||||
void begin(int id) {
|
||||
auto device = devices_.at(id);
|
||||
vx_dcr_write(device, VX_DCR_BASE_MPM_CLASS, perf_class_);
|
||||
}
|
||||
|
||||
void end(int id) {
|
||||
auto device = devices_.at(id);
|
||||
vx_dump_perf(device, stdout);
|
||||
}
|
||||
|
||||
int get_perf_class() const {
|
||||
int perf_class() const {
|
||||
return perf_class_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unordered_map<int, vx_device_h> devices_;
|
||||
int perf_class_;
|
||||
};
|
||||
|
||||
AutoPerfDump gAutoPerfDump;
|
||||
|
||||
int profiling_add(vx_device_h hdevice) {
|
||||
return gAutoPerfDump.add(hdevice);
|
||||
int get_profiling_mode() {
|
||||
static ProfilingMode gProfilingMode;
|
||||
return gProfilingMode.perf_class();
|
||||
}
|
||||
|
||||
void profiling_remove(int id) {
|
||||
gAutoPerfDump.remove(id);
|
||||
}
|
||||
|
||||
void profiling_begin(int id) {
|
||||
gAutoPerfDump.begin(id);
|
||||
}
|
||||
|
||||
void profiling_end(int id) {
|
||||
gAutoPerfDump.end(id);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void DeviceConfig::write(uint32_t addr, uint32_t value) {
|
||||
store_[addr] = value;
|
||||
}
|
||||
|
||||
int DeviceConfig::read(uint32_t addr, uint32_t* value) const {
|
||||
auto it = store_.find(addr);
|
||||
if (it == store_.end())
|
||||
return -1;
|
||||
*value = it->second;
|
||||
return 0;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int dcr_initialize(vx_device_h hdevice) {
|
||||
const uint64_t startup_addr(STARTUP_ADDR);
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, 0), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, 0), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice || nullptr == content || size <= 8 || nullptr == hbuffer)
|
||||
return -1;
|
||||
|
@ -152,35 +54,29 @@ extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint
|
|||
|
||||
auto min_vma = *bytes++;
|
||||
auto max_vma = *bytes++;
|
||||
auto bin_size = size - 16;
|
||||
auto bin_size = size - 2 * 8;
|
||||
auto runtime_size = (max_vma - min_vma);
|
||||
|
||||
vx_buffer_h _hbuffer;
|
||||
#ifndef NDEBUG
|
||||
RT_CHECK(vx_mem_reserve(hdevice, min_vma, runtime_size, 0, &_hbuffer), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mem_reserve(hdevice, min_vma, runtime_size, 0, &_hbuffer), {
|
||||
return err;
|
||||
});
|
||||
#else
|
||||
RT_CHECK(vx_mem_alloc(hdevice, runtime_size, 0, &_hbuffer), {
|
||||
return _ret;
|
||||
});
|
||||
#endif
|
||||
|
||||
// mask binary region as read-only
|
||||
RT_CHECK(vx_mem_access(_hbuffer, 0, bin_size, VX_MEM_READ), {
|
||||
CHECK_ERR(vx_mem_access(_hbuffer, 0, bin_size, VX_MEM_READ), {
|
||||
vx_mem_free(_hbuffer);
|
||||
return _ret;
|
||||
return err;
|
||||
});
|
||||
|
||||
// mark global variables region as read-write
|
||||
RT_CHECK(vx_mem_access(_hbuffer, bin_size, runtime_size - bin_size, VX_MEM_READ_WRITE), {
|
||||
CHECK_ERR(vx_mem_access(_hbuffer, bin_size, runtime_size - bin_size, VX_MEM_READ_WRITE), {
|
||||
vx_mem_free(_hbuffer);
|
||||
return _ret;
|
||||
return err;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(_hbuffer, bytes, 0, bin_size), {
|
||||
CHECK_ERR(vx_copy_to_dev(_hbuffer, bytes, 0, bin_size), {
|
||||
vx_mem_free(_hbuffer);
|
||||
return _ret;
|
||||
return err;
|
||||
});
|
||||
|
||||
*hbuffer = _hbuffer;
|
||||
|
@ -206,8 +102,8 @@ extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, vx_b
|
|||
ifs.read(content.data(), size);
|
||||
|
||||
// upload buffer
|
||||
RT_CHECK(vx_upload_kernel_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_upload_kernel_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return err;
|
||||
});
|
||||
|
||||
return 0;
|
||||
|
@ -219,13 +115,13 @@ extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t si
|
|||
|
||||
vx_buffer_h _hbuffer;
|
||||
|
||||
RT_CHECK(vx_mem_alloc(hdevice, size, VX_MEM_READ, &_hbuffer), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mem_alloc(hdevice, size, VX_MEM_READ, &_hbuffer), {
|
||||
return err;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(_hbuffer, content, 0, size), {
|
||||
CHECK_ERR(vx_copy_to_dev(_hbuffer, content, 0, size), {
|
||||
vx_mem_free(_hbuffer);
|
||||
return _ret;
|
||||
return err;
|
||||
});
|
||||
|
||||
*hbuffer = _hbuffer;
|
||||
|
@ -251,8 +147,8 @@ extern int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h
|
|||
ifs.read(content.data(), size);
|
||||
|
||||
// upload buffer
|
||||
RT_CHECK(vx_upload_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_upload_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return err;
|
||||
});
|
||||
|
||||
return 0;
|
||||
|
@ -265,8 +161,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t total_cycles = 0;
|
||||
uint64_t max_cycles = 0;
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
|
||||
auto calcRatio = [&](uint64_t part, uint64_t total)->int {
|
||||
if (total == 0)
|
||||
return 0;
|
||||
|
@ -283,8 +177,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
return int(caclAverage(part, total) * 100);
|
||||
};
|
||||
|
||||
auto perf_class = gAutoPerfDump.get_perf_class();
|
||||
|
||||
// PERF: pipeline stalls
|
||||
uint64_t sched_idles = 0;
|
||||
uint64_t sched_stalls = 0;
|
||||
|
@ -319,45 +211,44 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t mem_reads = 0;
|
||||
uint64_t mem_writes = 0;
|
||||
uint64_t mem_lat = 0;
|
||||
#endif
|
||||
|
||||
uint64_t num_cores;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
|
||||
return err;
|
||||
});
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
uint64_t isa_flags;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
|
||||
return err;
|
||||
});
|
||||
|
||||
bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE;
|
||||
bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE;
|
||||
bool l2cache_enable = isa_flags & VX_ISA_EXT_L2CACHE;
|
||||
bool l3cache_enable = isa_flags & VX_ISA_EXT_L3CACHE;
|
||||
bool lmem_enable = isa_flags & VX_ISA_EXT_LMEM;
|
||||
#endif
|
||||
|
||||
auto perf_class = get_profiling_mode();
|
||||
|
||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||
uint64_t cycles_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MCYCLE, core_id, &cycles_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MCYCLE, core_id, &cycles_per_core), {
|
||||
return err;
|
||||
});
|
||||
|
||||
uint64_t instrs_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MINSTRET, core_id, &instrs_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MINSTRET, core_id, &instrs_per_core), {
|
||||
return err;
|
||||
});
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
// PERF: pipeline
|
||||
// scheduler idles
|
||||
{
|
||||
uint64_t sched_idles_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ID, core_id, &sched_idles_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ID, core_id, &sched_idles_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
|
||||
|
@ -368,8 +259,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// scheduler stalls
|
||||
{
|
||||
uint64_t sched_stalls_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ST, core_id, &sched_stalls_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ST, core_id, &sched_stalls_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
|
||||
|
@ -380,8 +271,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// ibuffer_stalls
|
||||
{
|
||||
uint64_t ibuffer_stalls_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IBUF_ST, core_id, &ibuffer_stalls_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IBUF_ST, core_id, &ibuffer_stalls_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
|
||||
|
@ -392,24 +283,24 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// issue_stalls
|
||||
{
|
||||
uint64_t scrb_stalls_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_alu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ALU, core_id, &scrb_alu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ALU, core_id, &scrb_alu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_fpu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_FPU, core_id, &scrb_fpu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_FPU, core_id, &scrb_fpu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_lsu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_sfu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return err;
|
||||
});
|
||||
scrb_alu += scrb_alu_per_core;
|
||||
scrb_fpu += scrb_fpu_per_core;
|
||||
|
@ -428,16 +319,16 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// sfu_stalls
|
||||
{
|
||||
uint64_t scrb_sfu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_wctl_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_csrs_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core;
|
||||
|
@ -455,15 +346,15 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// ifetches
|
||||
{
|
||||
uint64_t ifetches_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCHES, core_id, &ifetches_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCHES, core_id, &ifetches_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
||||
ifetches += ifetches_per_core;
|
||||
|
||||
uint64_t ifetch_lat_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCH_LT, core_id, &ifetch_lat_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCH_LT, core_id, &ifetch_lat_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
||||
|
@ -474,15 +365,15 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// loads
|
||||
{
|
||||
uint64_t loads_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOADS, core_id, &loads_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LOADS, core_id, &loads_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||
loads += loads_per_core;
|
||||
|
||||
uint64_t load_lat_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOAD_LT, core_id, &load_lat_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LOAD_LT, core_id, &load_lat_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
||||
|
@ -493,8 +384,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// stores
|
||||
{
|
||||
uint64_t stores_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_STORES, core_id, &stores_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_STORES, core_id, &stores_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
|
||||
stores += stores_per_core;
|
||||
|
@ -504,16 +395,16 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
if (lmem_enable) {
|
||||
// PERF: lmem
|
||||
uint64_t lmem_reads;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_READS, core_id, &lmem_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_READS, core_id, &lmem_reads), {
|
||||
return err;
|
||||
});
|
||||
uint64_t lmem_writes;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_WRITES, core_id, &lmem_writes), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_WRITES, core_id, &lmem_writes), {
|
||||
return err;
|
||||
});
|
||||
uint64_t lmem_bank_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_BANK_ST, core_id, &lmem_bank_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_BANK_ST, core_id, &lmem_bank_stalls), {
|
||||
return err;
|
||||
});
|
||||
int lmem_bank_utilization = calcAvgPercent(lmem_reads + lmem_writes, lmem_reads + lmem_writes + lmem_bank_stalls);
|
||||
fprintf(stream, "PERF: core%d: lmem reads=%ld\n", core_id, lmem_reads);
|
||||
|
@ -524,16 +415,16 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
if (icache_enable) {
|
||||
// PERF: Icache
|
||||
uint64_t icache_reads;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_READS, core_id, &icache_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_READS, core_id, &icache_reads), {
|
||||
return err;
|
||||
});
|
||||
uint64_t icache_read_misses;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MISS_R, core_id, &icache_read_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MISS_R, core_id, &icache_read_misses), {
|
||||
return err;
|
||||
});
|
||||
uint64_t icache_mshr_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MSHR_ST, core_id, &icache_mshr_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MSHR_ST, core_id, &icache_mshr_stalls), {
|
||||
return err;
|
||||
});
|
||||
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
|
||||
int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls);
|
||||
|
@ -545,28 +436,28 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
if (dcache_enable) {
|
||||
// PERF: Dcache
|
||||
uint64_t dcache_reads;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_READS, core_id, &dcache_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_READS, core_id, &dcache_reads), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_writes;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_read_misses;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_write_misses;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_W, core_id, &dcache_write_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_W, core_id, &dcache_write_misses), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_bank_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_BANK_ST, core_id, &dcache_bank_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_BANK_ST, core_id, &dcache_bank_stalls), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_mshr_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MSHR_ST, core_id, &dcache_mshr_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MSHR_ST, core_id, &dcache_mshr_stalls), {
|
||||
return err;
|
||||
});
|
||||
int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
|
||||
int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
|
||||
|
@ -583,74 +474,73 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
if (l2cache_enable) {
|
||||
// PERF: L2cache
|
||||
uint64_t tmp;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_READS, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_READS, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_reads += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_WRITES, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_WRITES, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_writes += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_R, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_R, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_read_misses += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_W, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_W, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_write_misses += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_BANK_ST, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_BANK_ST, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_bank_stalls += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MSHR_ST, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MSHR_ST, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_mshr_stalls += tmp;
|
||||
}
|
||||
if (0 == core_id) {
|
||||
if (l3cache_enable) {
|
||||
// PERF: L3cache
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_READS, core_id, &l3cache_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_READS, core_id, &l3cache_reads), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_WRITES, core_id, &l3cache_writes), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_WRITES, core_id, &l3cache_writes), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_R, core_id, &l3cache_read_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_R, core_id, &l3cache_read_misses), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_W, core_id, &l3cache_write_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_W, core_id, &l3cache_write_misses), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_BANK_ST, core_id, &l3cache_bank_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_BANK_ST, core_id, &l3cache_bank_stalls), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MSHR_ST, core_id, &l3cache_mshr_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MSHR_ST, core_id, &l3cache_mshr_stalls), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
// PERF: memory
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_READS, core_id, &mem_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_READS, core_id, &mem_reads), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_WRITES, core_id, &mem_writes), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_WRITES, core_id, &mem_writes), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
||||
|
@ -659,7 +549,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
max_cycles = std::max<uint64_t>(cycles_per_core, max_cycles);
|
||||
}
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
|
||||
|
@ -728,7 +617,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
float IPC = (float)(double(total_instrs) / double(max_cycles));
|
||||
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
|
||||
|
@ -738,18 +626,18 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_barriers, uint32_t* max_localmem) {
|
||||
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_localmem) {
|
||||
// check group size
|
||||
uint64_t warps_per_core, threads_per_warp;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_WARPS, &warps_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_WARPS, &warps_per_core), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
|
||||
return err;
|
||||
});
|
||||
uint32_t threads_per_core = warps_per_core * threads_per_warp;
|
||||
if (group_size > threads_per_core) {
|
||||
printf("Error: device cannot schedule group size > (%d)\n", threads_per_core);
|
||||
printf("Error: cannot schedule kernel with group_size > threads_per_core (%d,%d)\n", group_size, threads_per_core);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -757,24 +645,11 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
|
|||
int warps_per_group = (group_size + threads_per_warp-1) / threads_per_warp;
|
||||
int groups_per_core = warps_per_core / warps_per_group;
|
||||
|
||||
// check barriers capacity
|
||||
if (max_barriers) {
|
||||
uint64_t num_barriers;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_BARRIERS, &num_barriers), {
|
||||
return _ret;
|
||||
});
|
||||
if (warps_per_group < 2) {
|
||||
*max_barriers = -1;
|
||||
} else {
|
||||
*max_barriers = num_barriers / groups_per_core;
|
||||
}
|
||||
}
|
||||
|
||||
// check local memory capacity
|
||||
if (max_localmem) {
|
||||
uint64_t local_mem_size;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size), {
|
||||
return err;
|
||||
});
|
||||
*max_localmem = local_mem_size / groups_per_core;
|
||||
}
|
|
@ -11,69 +11,156 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <vortex.h>
|
||||
#include <common.h>
|
||||
|
||||
extern int vx_dev_open(vx_device_h* /*hdevice*/) {
|
||||
return -1;
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <cstdlib>
|
||||
#include <dlfcn.h>
|
||||
#include <iostream>
|
||||
|
||||
int get_profiling_mode();
|
||||
|
||||
static int dcr_initialize(vx_device_h hdevice) {
|
||||
const uint64_t startup_addr(STARTUP_ADDR);
|
||||
|
||||
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
|
||||
return err;
|
||||
});
|
||||
|
||||
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
|
||||
return err;
|
||||
});
|
||||
|
||||
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, 0), {
|
||||
return err;
|
||||
});
|
||||
|
||||
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, 0), {
|
||||
return err;
|
||||
});
|
||||
|
||||
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
|
||||
return err;
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_close(vx_device_h /*hdevice*/) {
|
||||
return -1;
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static callbacks_t g_callbacks;
|
||||
static void* g_drv_handle = nullptr;
|
||||
|
||||
typedef int (*vx_dev_init_t)(callbacks_t*);
|
||||
|
||||
extern int vx_dev_open(vx_device_h* hdevice) {
|
||||
{
|
||||
const char* driverName = getenv("VORTEX_DRIVER");
|
||||
if (driverName == nullptr) {
|
||||
driverName = "simx";
|
||||
}
|
||||
std::string driverName_s(driverName);
|
||||
std::string libName = "libvortex-" + driverName_s + ".so";
|
||||
auto handle = dlopen(libName.c_str(), RTLD_LAZY);
|
||||
if (handle == nullptr) {
|
||||
std::cerr << "Cannot open library: " << dlerror() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto vx_dev_init = (vx_dev_init_t)dlsym(handle, "vx_dev_init");
|
||||
auto dlsym_error = dlerror();
|
||||
if (dlsym_error) {
|
||||
std::cerr << "Cannot load symbol 'vx_init': " << dlsym_error << std::endl;
|
||||
dlclose(handle);
|
||||
return 1;
|
||||
}
|
||||
|
||||
vx_dev_init(&g_callbacks);
|
||||
g_drv_handle = handle;
|
||||
}
|
||||
|
||||
vx_device_h _hdevice;
|
||||
|
||||
CHECK_ERR((g_callbacks.dev_open)(&_hdevice), {
|
||||
return err;
|
||||
});
|
||||
|
||||
CHECK_ERR(dcr_initialize(_hdevice), {
|
||||
return err;
|
||||
});
|
||||
|
||||
*hdevice = _hdevice;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t* /*value*/) {
|
||||
return -1;
|
||||
extern int vx_dev_close(vx_device_h hdevice) {
|
||||
vx_dump_perf(hdevice, stdout);
|
||||
int ret = (g_callbacks.dev_close)(hdevice);
|
||||
dlclose(g_drv_handle);
|
||||
return ret;
|
||||
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, int /*flags*/, vx_buffer_h* /*hbuffer*/) {
|
||||
return -1;
|
||||
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t* value) {
|
||||
return (g_callbacks.dev_caps)(hdevice, caps_id, value);
|
||||
}
|
||||
|
||||
extern int vx_mem_reserve(vx_device_h /*hdevice*/, uint64_t /*address*/, uint64_t /*size*/, int /*flags*/, vx_buffer_h* /*hbuffer*/) {
|
||||
return -1;
|
||||
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
return (g_callbacks.mem_alloc)(hdevice, size, flags, hbuffer);
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_buffer_h /*hbuffer*/) {
|
||||
return -1;
|
||||
extern int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
return (g_callbacks.mem_reserve)(hdevice, address, size, flags, hbuffer);
|
||||
}
|
||||
|
||||
extern int vx_mem_access(vx_buffer_h /*hbuffer*/, uint64_t /*offset*/, uint64_t /*size*/, int /*flags*/) {
|
||||
return -1;
|
||||
extern int vx_mem_free(vx_buffer_h hbuffer) {
|
||||
return (g_callbacks.mem_free)(hbuffer);
|
||||
}
|
||||
|
||||
extern int vx_mem_address(vx_buffer_h /*hbuffer*/, uint64_t* /*address*/) {
|
||||
return -1;
|
||||
extern int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) {
|
||||
return (g_callbacks.mem_access)(hbuffer, offset, size, flags);
|
||||
}
|
||||
|
||||
extern int vx_mem_info(vx_device_h /*hdevice*/, uint64_t* /*mem_free*/, uint64_t* /*mem_used*/) {
|
||||
return 0;
|
||||
extern int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address) {
|
||||
return (g_callbacks.mem_address)(hbuffer, address);
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, const void* /*host_ptr*/, uint64_t /*dst_offset*/, uint64_t /*size*/) {
|
||||
return -1;
|
||||
extern int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) {
|
||||
return (g_callbacks.mem_info)(hdevice, mem_free, mem_used);
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(void* /*host_ptr*/, vx_buffer_h /*hbuffer*/, uint64_t /*src_offset*/, uint64_t /*size*/) {
|
||||
return -1;
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) {
|
||||
return (g_callbacks.copy_to_dev)(hbuffer, host_ptr, dst_offset, size);
|
||||
}
|
||||
|
||||
extern int vx_start(vx_device_h /*hdevice*/, vx_buffer_h /*hkernel*/, vx_buffer_h /*harguments*/) {
|
||||
return -1;
|
||||
extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
|
||||
return (g_callbacks.copy_from_dev)(host_ptr, hbuffer, src_offset, size);
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) {
|
||||
return -1;
|
||||
extern int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
|
||||
int profiling_mode = get_profiling_mode();
|
||||
if (profiling_mode != 0) {
|
||||
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, profiling_mode), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
return (g_callbacks.start)(hdevice, hkernel, harguments);
|
||||
}
|
||||
|
||||
extern int vx_dcr_read(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t* /*value*/) {
|
||||
return -1;
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
return (g_callbacks.ready_wait)(hdevice, timeout);
|
||||
}
|
||||
|
||||
|
||||
extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t /*value*/) {
|
||||
return -1;
|
||||
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
|
||||
return (g_callbacks.dcr_read)(hdevice, addr, value);
|
||||
}
|
||||
|
||||
extern int vx_mpm_query(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t /*core_id*/, uint64_t* /*value*/) {
|
||||
return -1;
|
||||
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
|
||||
return (g_callbacks.dcr_write)(hdevice, addr, value);
|
||||
}
|
||||
|
||||
extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
|
||||
return (g_callbacks.mpm_query)(hdevice, addr, core_id, value);
|
||||
}
|
|
@ -2,7 +2,7 @@ include ../common.mk
|
|||
|
||||
TARGET ?= xrtsim
|
||||
|
||||
DESTDIR ?= $(CURDIR)
|
||||
DESTDIR ?= $(CURDIR)/..
|
||||
|
||||
SRC_DIR := $(VORTEX_HOME)/runtime/xrt
|
||||
|
||||
|
@ -13,7 +13,7 @@ CXXFLAGS += -fPIC
|
|||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -L$(XILINX_XRT)/lib
|
||||
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp $(SIM_DIR)/common/util.cpp
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(SIM_DIR)/common/util.cpp
|
||||
|
||||
# set up target types
|
||||
ifeq ($(TARGET), xrtsim)
|
||||
|
@ -24,7 +24,7 @@ else
|
|||
LDFLAGS += -luuid -lxrt_coreutil
|
||||
endif
|
||||
|
||||
PROJECT := libvortex.so
|
||||
PROJECT := libvortex-xrt.so
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
|
@ -41,13 +41,20 @@ endif
|
|||
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
||||
driver: $(DESTDIR)/libxrtsim.so
|
||||
|
||||
$(DESTDIR)/libxrtsim.so:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/xrtsim $(DESTDIR)/libxrtsim.so
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) $(XRTSIM)
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $@
|
||||
|
||||
clean:
|
||||
clean-driver:
|
||||
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/xrtsim clean
|
||||
rm -rf $(DESTDIR)/$(PROJECT)
|
||||
|
||||
clean-runtime:
|
||||
rm -f $(DESTDIR)/$(PROJECT)
|
||||
|
||||
clean: clean-driver clean-runtime
|
||||
|
||||
.PHONY: all driver clean-driver clean-runtime clean
|
File diff suppressed because it is too large
Load diff
|
@ -1,55 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <unordered_map>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class UUIDGenerator {
|
||||
public:
|
||||
UUIDGenerator() : ids_(0) {}
|
||||
virtual ~UUIDGenerator() {}
|
||||
|
||||
uint32_t get_uuid(uint64_t /*PC*/) {
|
||||
/*uint16_t id;
|
||||
uint16_t ref;
|
||||
auto it = uuid_map_.find(PC);
|
||||
if (it != uuid_map_.end()) {
|
||||
uint32_t value = it->second;
|
||||
ref = value & 0xffff;
|
||||
id = value >> 16;
|
||||
++ref;
|
||||
} else {
|
||||
ref = 0;
|
||||
id = ids_++;
|
||||
}
|
||||
uint32_t ret = (uint32_t(id) << 16) | ref;
|
||||
uuid_map_[PC] = ret;*/
|
||||
return ids_++;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
//uuid_map_.clear();
|
||||
ids_ = 0;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
//std::unordered_map<uint64_t, uint32_t> uuid_map_;
|
||||
uint16_t ids_;
|
||||
};
|
||||
|
||||
}
|
|
@ -122,7 +122,8 @@ $(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
|
|||
$(SCRIPT_DIR)/gen_config.py -i $^ -o $@
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
|
||||
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
|
||||
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@
|
||||
|
||||
clean:
|
||||
rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)
|
||||
rm -rf $(DESTDIR)/$(PROJECT).obj_dir
|
||||
rm -f $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)
|
||||
|
|
|
@ -67,7 +67,7 @@ VL_FLAGS += -j $(THREADS)
|
|||
ifdef DEBUG
|
||||
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
|
||||
CXXFLAGS += -g -O0 $(DBG_FLAGS)
|
||||
else
|
||||
else
|
||||
VL_FLAGS += -DNDEBUG
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
@ -83,10 +83,17 @@ PROJECT := rtlsim
|
|||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) $(SRC_DIR)/main.cpp
|
||||
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
|
||||
|
||||
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
|
||||
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
|
||||
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@
|
||||
|
||||
clean:
|
||||
rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
|
||||
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
|
||||
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' --Mdir $@.obj_dir -o $@
|
||||
|
||||
clean-lib:
|
||||
rm -rf $(DESTDIR)/lib$(PROJECT).so.obj_dir
|
||||
rm -f $(DESTDIR)/lib$(PROJECT).so
|
||||
|
||||
clean-exe:
|
||||
rm -rf $(DESTDIR)/$(PROJECT).obj_dir
|
||||
rm -f $(DESTDIR)/$(PROJECT)
|
||||
|
||||
clean: clean-lib clean-exe
|
|
@ -26,19 +26,15 @@
|
|||
using namespace vortex;
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Usage: [-r: riscv-test] [-h: help] <program>" << std::endl;
|
||||
std::cout << "Usage: [-h: help] <program>" << std::endl;
|
||||
}
|
||||
|
||||
bool riscv_test = false;
|
||||
const char* program = nullptr;
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "rh?")) != -1) {
|
||||
switch (c) {
|
||||
case 'r':
|
||||
riscv_test = true;
|
||||
break;
|
||||
case 'h':
|
||||
case '?':
|
||||
show_usage();
|
||||
|
@ -95,21 +91,10 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
|
||||
// run simulation
|
||||
exitcode = processor.run();
|
||||
processor.run();
|
||||
|
||||
if (riscv_test) {
|
||||
if (1 == exitcode) {
|
||||
std::cout << "Passed" << std::endl;
|
||||
exitcode = 0;
|
||||
} else {
|
||||
std::cout << "Failed" << std::endl;
|
||||
exitcode = 1;
|
||||
}
|
||||
} else {
|
||||
if (exitcode != 0) {
|
||||
std::cout << "*** error: exitcode=" << exitcode << std::endl;
|
||||
}
|
||||
}
|
||||
// read exitcode from @MPM.1
|
||||
ram.read(&exitcode, (IO_MPM_ADDR + 8), 4);
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
|
|
|
@ -182,8 +182,7 @@ public:
|
|||
ram_ = ram;
|
||||
}
|
||||
|
||||
int run() {
|
||||
int exitcode = 0;
|
||||
void run() {
|
||||
|
||||
#ifndef NDEBUG
|
||||
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
|
||||
|
@ -200,10 +199,6 @@ public:
|
|||
|
||||
// wait on device to go idle
|
||||
while (device_->busy) {
|
||||
if (get_ebreak()) {
|
||||
exitcode = (int)get_last_wb_value(3);
|
||||
break;
|
||||
}
|
||||
this->tick();
|
||||
}
|
||||
|
||||
|
@ -211,8 +206,6 @@ public:
|
|||
this->reset();
|
||||
|
||||
this->cout_flush();
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
|
||||
void dcr_write(uint32_t addr, uint32_t value) {
|
||||
|
@ -607,22 +600,6 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
bool get_ebreak() const {
|
||||
#ifdef AXI_BUS
|
||||
return (bool)device_->Vortex_axi->vortex->sim_ebreak;
|
||||
#else
|
||||
return (bool)device_->Vortex->sim_ebreak;
|
||||
#endif
|
||||
}
|
||||
|
||||
uint64_t get_last_wb_value(int reg) const {
|
||||
#ifdef AXI_BUS
|
||||
return ((Word*)device_->Vortex_axi->vortex->sim_wb_value.data())[reg];
|
||||
#else
|
||||
return ((Word*)device_->Vortex->sim_wb_value.data())[reg];
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
typedef struct {
|
||||
|
@ -675,8 +652,8 @@ void Processor::attach_ram(RAM* mem) {
|
|||
impl_->attach_ram(mem);
|
||||
}
|
||||
|
||||
int Processor::run() {
|
||||
return impl_->run();
|
||||
void Processor::run() {
|
||||
impl_->run();
|
||||
}
|
||||
|
||||
void Processor::dcr_write(uint32_t addr, uint32_t value) {
|
||||
|
|
|
@ -27,7 +27,7 @@ public:
|
|||
|
||||
void attach_ram(RAM* ram);
|
||||
|
||||
int run();
|
||||
void run();
|
||||
|
||||
void dcr_write(uint32_t addr, uint32_t value);
|
||||
|
||||
|
|
|
@ -22,14 +22,20 @@ SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(
|
|||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
|
||||
#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer
|
||||
else
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
VL_FLAGS += -DPERF_ENABLE
|
||||
CXXFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
PROJECT := simx
|
||||
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) $(SRC_DIR)/main.cpp
|
||||
$(CXX) $(CXXFLAGS) -DSTARTUP_ADDR=0x80000000 $^ $(LDFLAGS) -o $@
|
||||
|
||||
|
@ -39,5 +45,10 @@ $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
|
|||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
|
||||
clean-lib:
|
||||
rm -f $(DESTDIR)/lib$(PROJECT).so
|
||||
|
||||
clean-exe:
|
||||
rm -f $(DESTDIR)/$(PROJECT)
|
||||
|
||||
clean: clean-lib clean-exe
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -22,21 +22,22 @@
|
|||
|
||||
namespace vortex {
|
||||
|
||||
class Arch {
|
||||
class Arch {
|
||||
private:
|
||||
uint16_t num_threads_;
|
||||
uint16_t num_warps_;
|
||||
uint16_t num_cores_;
|
||||
uint16_t num_clusters_;
|
||||
uint16_t num_cores_;
|
||||
uint16_t num_clusters_;
|
||||
uint16_t socket_size_;
|
||||
uint16_t vsize_;
|
||||
uint16_t num_regs_;
|
||||
uint16_t num_csrs_;
|
||||
uint16_t num_barriers_;
|
||||
uint16_t ipdom_size_;
|
||||
|
||||
uint64_t local_mem_base_;
|
||||
|
||||
public:
|
||||
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
|
||||
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
|
||||
: num_threads_(num_threads)
|
||||
, num_warps_(num_warps)
|
||||
, num_cores_(num_cores)
|
||||
|
@ -47,10 +48,11 @@ public:
|
|||
, num_csrs_(4096)
|
||||
, num_barriers_(NUM_BARRIERS)
|
||||
, ipdom_size_((num_threads-1) * 2)
|
||||
, local_mem_base_(LMEM_BASE_ADDR)
|
||||
{}
|
||||
|
||||
uint16_t vsize() const {
|
||||
return vsize_;
|
||||
uint16_t vsize() const {
|
||||
return vsize_;
|
||||
}
|
||||
|
||||
uint16_t num_regs() const {
|
||||
|
@ -65,6 +67,10 @@ public:
|
|||
return num_barriers_;
|
||||
}
|
||||
|
||||
uint64_t local_mem_base() const {
|
||||
return local_mem_base_;
|
||||
}
|
||||
|
||||
uint16_t ipdom_size() const {
|
||||
return ipdom_size_;
|
||||
}
|
||||
|
@ -80,7 +86,7 @@ public:
|
|||
uint16_t num_cores() const {
|
||||
return num_cores_;
|
||||
}
|
||||
|
||||
|
||||
uint16_t num_clusters() const {
|
||||
return num_clusters_;
|
||||
}
|
||||
|
|
|
@ -44,12 +44,13 @@ Emulator::ipdom_entry_t::ipdom_entry_t(const ThreadMask &tmask)
|
|||
Emulator::warp_t::warp_t(const Arch& arch)
|
||||
: ireg_file(arch.num_threads(), std::vector<Word>(arch.num_regs()))
|
||||
, freg_file(arch.num_threads(), std::vector<uint64_t>(arch.num_regs()))
|
||||
, uuid(0)
|
||||
{}
|
||||
|
||||
void Emulator::warp_t::clear(uint64_t startup_addr) {
|
||||
this->PC = startup_addr;
|
||||
this->tmask.reset();
|
||||
this->uui_gen.reset();
|
||||
this->uuid = 0;
|
||||
this->fcsr = 0;
|
||||
|
||||
for (auto& reg_file : this->ireg_file) {
|
||||
|
@ -153,7 +154,7 @@ instr_trace_t* Emulator::step() {
|
|||
assert(warp.tmask.any());
|
||||
|
||||
#ifndef NDEBUG
|
||||
uint32_t instr_uuid = warp.uui_gen.get_uuid(warp.PC);
|
||||
uint32_t instr_uuid = warp.uuid++;
|
||||
uint32_t g_wid = core_->id() * arch_.num_warps() + scheduled_warp;
|
||||
uint64_t uuid = (uint64_t(g_wid) << 32) | instr_uuid;
|
||||
#else
|
||||
|
@ -286,7 +287,7 @@ void Emulator::dcache_write(const void* data, uint64_t addr, uint32_t size) {
|
|||
auto type = get_addr_type(addr);
|
||||
if (addr >= uint64_t(IO_COUT_ADDR)
|
||||
&& addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
this->writeToStdOut(data, addr, size);
|
||||
this->writeToStdOut(data, addr, size);
|
||||
} else {
|
||||
if (type == AddrType::Shared) {
|
||||
core_->local_mem()->write(data, addr, size);
|
||||
|
@ -357,6 +358,7 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_MTVEC:
|
||||
case VX_CSR_MEPC:
|
||||
case VX_CSR_MNSTATUS:
|
||||
case VX_CSR_MCAUSE:
|
||||
return 0;
|
||||
|
||||
case VX_CSR_FFLAGS: return warps_.at(wid).fcsr & 0x1F;
|
||||
|
@ -371,7 +373,7 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_NUM_THREADS:return arch_.num_threads();
|
||||
case VX_CSR_NUM_WARPS: return arch_.num_warps();
|
||||
case VX_CSR_NUM_CORES: return uint32_t(arch_.num_cores()) * arch_.num_clusters();
|
||||
case VX_CSR_NUM_BARRIERS:return arch_.num_barriers();
|
||||
case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base();
|
||||
case VX_CSR_MSCRATCH: return csr_mscratch_;
|
||||
CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
|
||||
CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);
|
||||
|
@ -480,6 +482,7 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_PMPCFG0:
|
||||
case VX_CSR_PMPADDR0:
|
||||
case VX_CSR_MNSTATUS:
|
||||
case VX_CSR_MCAUSE:
|
||||
break;
|
||||
default: {
|
||||
std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
|
||||
|
@ -497,12 +500,4 @@ void Emulator::update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid) {
|
|||
this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid);
|
||||
this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid);
|
||||
}
|
||||
}
|
||||
|
||||
void Emulator::trigger_ecall() {
|
||||
active_warps_.reset();
|
||||
}
|
||||
|
||||
void Emulator::trigger_ebreak() {
|
||||
active_warps_.reset();
|
||||
}
|
|
@ -75,7 +75,7 @@ private:
|
|||
std::vector<std::vector<uint64_t>>freg_file;
|
||||
std::stack<ipdom_entry_t> ipdom_stack;
|
||||
Byte fcsr;
|
||||
UUIDGenerator uui_gen;
|
||||
uint32_t uuid;
|
||||
};
|
||||
|
||||
struct wspawn_t {
|
||||
|
@ -110,10 +110,6 @@ private:
|
|||
|
||||
void update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid);
|
||||
|
||||
void trigger_ecall();
|
||||
|
||||
void trigger_ebreak();
|
||||
|
||||
const Arch& arch_;
|
||||
const DCRS& dcrs_;
|
||||
Core* core_;
|
||||
|
|
|
@ -829,17 +829,11 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
trace->alu_type = AluType::SYSCALL;
|
||||
trace->fetch_stall = true;
|
||||
switch (csr_addr) {
|
||||
case 0:
|
||||
// RV32I: ECALL
|
||||
this->trigger_ecall();
|
||||
break;
|
||||
case 1:
|
||||
// RV32I: EBREAK
|
||||
this->trigger_ebreak();
|
||||
break;
|
||||
case 0x002: // URET
|
||||
case 0x102: // SRET
|
||||
case 0x302: // MRET
|
||||
case 0x000: // RV32I: ECALL
|
||||
case 0x001: // RV32I: EBREAK
|
||||
case 0x002: // RV32I: URET
|
||||
case 0x102: // RV32I: SRET
|
||||
case 0x302: // RV32I: MRET
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
|
|
|
@ -29,14 +29,13 @@
|
|||
using namespace vortex;
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Usage: [-c <cores>] [-w <warps>] [-t <threads>] [-r: riscv-test] [-s: stats] [-h: help] <program>" << std::endl;
|
||||
std::cout << "Usage: [-c <cores>] [-w <warps>] [-t <threads>] [-s: stats] [-h: help] <program>" << std::endl;
|
||||
}
|
||||
|
||||
uint32_t num_threads = NUM_THREADS;
|
||||
uint32_t num_warps = NUM_WARPS;
|
||||
uint32_t num_cores = NUM_CORES;
|
||||
bool showStats = false;
|
||||
bool riscv_test = false;
|
||||
const char* program = nullptr;
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
|
@ -52,9 +51,6 @@ static void parse_args(int argc, char **argv) {
|
|||
case 'c':
|
||||
num_cores = atoi(optarg);
|
||||
break;
|
||||
case 'r':
|
||||
riscv_test = true;
|
||||
break;
|
||||
case 's':
|
||||
showStats = true;
|
||||
break;
|
||||
|
@ -118,14 +114,10 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
|
||||
// run simulation
|
||||
exitcode = processor.run();
|
||||
if (riscv_test) {
|
||||
exitcode = (1 - exitcode);
|
||||
}
|
||||
}
|
||||
processor.run();
|
||||
|
||||
if (exitcode != 0) {
|
||||
std::cout << "*** error: exitcode=" << exitcode << std::endl;
|
||||
// read exitcode from @MPM.1
|
||||
ram.read(&exitcode, (IO_MPM_ADDR + 8), 4);
|
||||
}
|
||||
|
||||
return exitcode;
|
||||
|
|
|
@ -83,12 +83,11 @@ void ProcessorImpl::attach_ram(RAM* ram) {
|
|||
}
|
||||
}
|
||||
|
||||
int ProcessorImpl::run() {
|
||||
void ProcessorImpl::run() {
|
||||
SimPlatform::instance().reset();
|
||||
this->reset();
|
||||
|
||||
bool done;
|
||||
int exitcode = 0;
|
||||
do {
|
||||
SimPlatform::instance().tick();
|
||||
done = true;
|
||||
|
@ -97,12 +96,9 @@ int ProcessorImpl::run() {
|
|||
done = false;
|
||||
continue;
|
||||
}
|
||||
exitcode |= cluster->get_exitcode();
|
||||
}
|
||||
perf_mem_latency_ += perf_mem_pending_reads_;
|
||||
} while (!done);
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
|
||||
void ProcessorImpl::reset() {
|
||||
|
@ -139,8 +135,8 @@ void Processor::attach_ram(RAM* mem) {
|
|||
impl_->attach_ram(mem);
|
||||
}
|
||||
|
||||
int Processor::run() {
|
||||
return impl_->run();
|
||||
void Processor::run() {
|
||||
impl_->run();
|
||||
}
|
||||
|
||||
void Processor::dcr_write(uint32_t addr, uint32_t value) {
|
||||
|
|
|
@ -28,7 +28,7 @@ public:
|
|||
|
||||
void attach_ram(RAM* mem);
|
||||
|
||||
int run();
|
||||
void run();
|
||||
|
||||
void dcr_write(uint32_t addr, uint32_t value);
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ public:
|
|||
|
||||
void attach_ram(RAM* mem);
|
||||
|
||||
int run();
|
||||
void run();
|
||||
|
||||
void dcr_write(uint32_t addr, uint32_t value);
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue