Merge branch 'master' into graphics

This commit is contained in:
Blaise Tine 2021-10-15 19:32:11 -07:00
commit e380ded5e1
542 changed files with 124552 additions and 18682 deletions

3
.gitmodules vendored
View file

@ -1,3 +1,6 @@
[submodule "hw/rtl/fp_cores/fpnew"]
path = hw/rtl/fp_cores/fpnew
url = https://github.com/pulp-platform/fpnew.git
[submodule "sim/common/softfloat"]
path = sim/common/softfloat
url = https://github.com/ucb-bar/berkeley-softfloat-3.git

View file

@ -30,22 +30,25 @@ jobs:
include:
- stage: test
name: coverage
script: cp -r $PWD ../build1 && cd ../build1 && ./ci/regression.sh -coverage
script: cp -r $PWD ../build1 && cd ../build1 && ./ci/travis_run.py ./ci/regression.sh -coverage
- stage: test
name: cluster
script: cp -r $PWD ../build2 && cd ../build2 && ./ci/regression.sh -cluster
script: cp -r $PWD ../build2 && cd ../build2 && ./ci/travis_run.py ./ci/regression.sh -cluster
- stage: test
name: debug
script: cp -r $PWD ../build3 && cd ../build3 && ./ci/regression.sh -debug
script: cp -r $PWD ../build3 && cd ../build3 && ./ci/travis_run.py ./ci/regression.sh -debug
- stage: test
name: config
script: cp -r $PWD ../build4 && cd ../build4 && ./ci/regression.sh -config
script: cp -r $PWD ../build4 && cd ../build4 && ./ci/travis_run.py ./ci/regression.sh -config
- stage: test
name: stress
script: cp -r $PWD ../build5 && cd ../build5 && ./ci/regression.sh -stress
name: stress0
script: cp -r $PWD ../build5 && cd ../build5 && ./ci/travis_run.py ./ci/regression.sh -stress0
- stage: test
name: stress1
script: cp -r $PWD ../build6 && cd ../build6 && ./ci/travis_run.py ./ci/regression.sh -stress1
- stage: test
name: compiler
script: cp -r $PWD ../build6 && cd ../build6 && ./ci/test_compiler.sh
script: cp -r $PWD ../build7 && cd ../build7 && ./ci/travis_run.py ./ci/test_compiler.sh
after_success:
# Gather code coverage

View file

@ -1,14 +1,13 @@
all:
$(MAKE) -C hw
$(MAKE) -C sim
$(MAKE) -C driver
$(MAKE) -C runtime
$(MAKE) -C simX
$(MAKE) -C tests
clean:
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C driver clean
$(MAKE) -C simX clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean

View file

@ -8,7 +8,11 @@ Vortex is a full-system RISCV-based GPGPU processor.
## Specifications
- Support RISC-V RV32IMF ISA
- Scalability: 1 to 32 cores with optional L2 and L3 caches
- Performance:
- 1024 total threads running at 250 MHz
- 128 Gflops of compute bandwidth
- 16 GB/s of memory bandwidth
- Scalability: up to 64 cores with optional L2 and L3 caches
- Software: OpenCL 1.2 Support
- Supported FPGAs:
- Intel Arria 10
@ -20,11 +24,11 @@ Vortex is a full-system RISCV-based GPGPU processor.
- `hw`: Hardware sources.
- `driver`: Host driver software.
- `driver`: Host drivers repository.
- `runtime`: Kernel Runtime software.
- `simX`: Cycle-approximate simulator.
- `sim`: Simulators repository.
- `tests`: Tests repository.

View file

@ -20,6 +20,7 @@ L3=0
DEBUG=0
SCOPE=0
HAS_ARGS=0
DEBUG_LEVEL=1
for i in "$@"
do
@ -88,28 +89,19 @@ done
case $DRIVER in
rtlsim)
DRIVER_PATH=$VORTEX_HOME/driver/rtlsim
DRIVER_EXTRA=
CLEAN_TOKEN=clean
;;
vlsim)
DRIVER_PATH=$VORTEX_HOME/driver/opae
DRIVER_EXTRA=vlsim
CLEAN_TOKEN=clean-vlsim
DRIVER_PATH=$VORTEX_HOME/driver/vlsim
;;
asesim)
DRIVER_PATH=$VORTEX_HOME/driver/opae
DRIVER_EXTRA=asesim
CLEAN_TOKEN=clean-asesim
DRIVER_PATH=$VORTEX_HOME/driver/asesim
;;
fpga)
DRIVER_PATH=$VORTEX_HOME/driver/opae
DRIVER_EXTRA=fpga
CLEAN_TOKEN=clean-fpga
DRIVER_PATH=$VORTEX_HOME/driver/fpga
;;
simx)
DRIVER_PATH=$VORTEX_HOME/driver/simx
DRIVER_EXTRA=
CLEAN_TOKEN=clean
DEBUG_LEVEL=3
;;
*)
echo "invalid driver: $DRIVER"
@ -132,7 +124,7 @@ CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_TH
echo "CONFIGS=$CONFIGS"
make -C $DRIVER_PATH $CLEAN_TOKEN
make -C $DRIVER_PATH clean
status=0
@ -140,9 +132,9 @@ if [ $DEBUG -eq 1 ]
then
if [ $SCOPE -eq 1 ]
then
DEBUG=1 SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH $DRIVER_EXTRA
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
else
DEBUG=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH $DRIVER_EXTRA
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
fi
if [ $HAS_ARGS -eq 1 ]
@ -161,9 +153,9 @@ then
else
if [ $SCOPE -eq 1 ]
then
SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH $DRIVER_EXTRA
SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
else
CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH $DRIVER_EXTRA
CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
fi
if [ $HAS_ARGS -eq 1 ]

View file

@ -3,9 +3,13 @@
# exit when any command fails
set -e
OS_DIR=ubuntu/bionic
SRCDIR=/opt
DESTDIR=.
OS_DIR=${OS_DIR:-'ubuntu/bionic'}
SRCDIR=${SRCDIR:-'/opt'}
DESTDIR=${DESTDIR:-'.'}
echo "OS_DIR=${OS_DIR}"
echo "SRCDIR=${SRCDIR}"
echo "DESTDIR=${DESTDIR}"
riscv()
{

View file

@ -3,7 +3,7 @@
# exit when any command fails
set -e
# build sources
# ensure build
make -s
coverage()
@ -27,17 +27,17 @@ cluster()
echo "begin clustering tests..."
# warp/threads configurations
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo
# cores clustering
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1"
# L2/L3
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
echo "clustering tests done!"
}
@ -46,9 +46,9 @@ debug()
{
echo "begin debugging tests..."
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1"
./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1"
./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1"
./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1"
echo "debugging tests done!"
}
@ -75,13 +75,21 @@ FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
# using FPNEW FPU core
FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
# using AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
# adjust l1 block size to match l2
CONFIGS="-DMEM_BLOCK_SIZE=16 -DL1_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
# test cache banking
CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
# test cache multi-porting
CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
CONFIGS="-DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --debug --args="-n1"
CONFIGS="-DL2_NUM_PORTS=2 -DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr
CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=io_addr
# test 128-bit MEM block
CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
@ -95,51 +103,70 @@ CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=
# test 128-bit DRAM block
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm
# test long memory latency
CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
echo "configuration tests done!"
}
stress()
stress0()
{
echo "begin stress tests..."
echo "begin stress0 tests..."
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --clusters=2 --l3cache --app=sgemm --args="-n256"
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=sgemm
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=sgemm
FPU_CORE=FPU_DEFAULT CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
FPU_CORE=FPU_DEFAULT CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=printf
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=printf
echo "stress tests done!"
echo "stress0 tests done!"
}
stress1()
{
echo "begin stress1 tests..."
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --clusters=2 --l3cache --app=sgemm --args="-n256"
echo "stress1 tests done!"
}
usage()
{
echo "usage: regression [-coverage] [-cluster] [-debug] [-config] [-stress] [-all] [-h|--help]"
echo "usage: regression [-coverage] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]"
}
while [ "$1" != "" ]; do
case $1 in
-coverage ) coverage
;;
;;
-cluster ) cluster
;;
;;
-debug ) debug
;;
;;
-config ) config
;;
-stress ) stress
-stress0 ) stress0
;;
-stress1 ) stress1
;;
-stress ) stress0
stress1
;;
-all ) coverage
cluster
debug
config
stress
;;
stress0
stress1
;;
-h | --help ) usage
exit
;;
;;
* ) usage
exit 1
esac

View file

@ -7,25 +7,30 @@ The directory/file layout of the Vortex codebase is as followed:
- `cache`: cache subsystem code
- `fp_cores`: floating point unit code
- `interfaces`: interfaces for inter-module communication
- `libs`: general-purpose modules (i.e., encoder, arbiter, ...)
- `libs`: general-purpose RTL modules
- `syn`: synthesis directory
- `opae`: OPAE synthesis scripts
- `quartus`: Quartus synthesis scripts
- `synopsys`: Synopsys synthesis scripts
- `modelsim`: Modelsim synthesis scripts
- `yosys`: Yosys synthesis scripts
- `simulate`: baseline RTL simulator (used by RTLSIM)
- `unit_tests`: unit tests for some hardware components
- `driver`: Host driver software
- `driver`: host drivers repository
- `include`: Vortex driver public headers
- `opae`: software driver that uses Intel OPAE
- `vlsim`: software driver that simulates Full RTL (include AFU)
- `rtlsim`: software driver that simulates processor RTL
- `stub`: Vortex stub driver library
- `fpga`: software driver that uses Intel OPAE FPGA
- `asesim`: software driver that uses Intel ASE simulator
- `vlsim`: software driver that uses vlsim simulator
- `rtlsim`: software driver that uses rtlsim simulator
- `simx`: software driver that uses simX simulator
- `runtime`: Kernel runtime software
- `runtime`: kernel runtime software
- `include`: Vortex runtime public headers
- `linker`: linker file for compiling kernels
- `src`: runtime implementation
- `simX`: cycle approximate simulator for vortex
- `sim`:
- `vlsim`: AFU RTL simulator
- `rtlsim`: processor RTL simulator
- `simX`: cycle approximate simulator for vortex
- `tests`: tests repository.
- `runtime`: runtime tests
- `regression`: regression tests

View file

@ -1,10 +1,16 @@
all: stub rtlsim simx opae
all: stub rtlsim simx vlsim
stub:
$(MAKE) -C stub
opae:
$(MAKE) -C opae
fpga:
$(MAKE) -C fpga
asesim:
$(MAKE) -C asesim
vlsim:
$(MAKE) -C vlsim
rtlsim:
$(MAKE) -C rtlsim
@ -14,8 +20,10 @@ simx:
clean:
$(MAKE) clean -C stub
$(MAKE) clean -C opae
$(MAKE) clean -C fpga
$(MAKE) clean -C asesim
$(MAKE) clean -C vlsim
$(MAKE) clean -C rtlsim
$(MAKE) clean -C simx
.PHONY: all stub opae rtlsim simx clean
.PHONY: all stub fpga asesim vlsim rtlsim simx clean

74
driver/asesim/Makefile Normal file
View file

@ -0,0 +1,74 @@
OPAE_HOME ?= /tools/opae/1.4.0
RTL_DIR=../../hw/rtl
SCRIPT_DIR=../../hw/scripts
OPAE_SYN_DIR=../../hw/syn/opae
CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors
#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I. -I../include -I../../hw -I$(OPAE_HOME)/include -I$(OPAE_SYN_DIR)
LDFLAGS += -L$(OPAE_HOME)/lib -luuid -lopae-c-ase
# stack execution protection
LDFLAGS +=-z noexecstack
# data relocation and projection
LDFLAGS +=-z relro -z now
# stack buffer overrun detection
CXXFLAGS +=-fstack-protector
# Position independent code
CXXFLAGS += -fPIC
# Add external configuration
CXXFLAGS += $(CONFIGS)
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared
PROJECT = libvortex.so
SRCS = ../common/opae.cpp ../common/vx_utils.cpp
# Enable scope analyzer
ifdef SCOPE
CXXFLAGS += -DSCOPE
SRCS += ../common/vx_scope.cpp
SCOPE_H = scope-defs.h
endif
# Enable perf counters
ifdef PERF
CXXFLAGS += -DPERF_ENABLE
endif
all: $(PROJECT)
$(OPAE_SYN_DIR)/vortex_afu.h:
$(MAKE) -C $(OPAE_SYN_DIR) vortex_afu.h
scope-defs.h: $(SCRIPT_DIR)/scope.json
$(SCRIPT_DIR)/scope.py $(CONFIGS) -cc scope-defs.h -vl $(RTL_DIR)/scope-defs.vh $(SCRIPT_DIR)/scope.json
# generate scope data
scope: scope-defs.h
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) -o $(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean:
rm -rf $(PROJECT) *.o .depend scope-defs.h
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View file

@ -8,12 +8,13 @@
#include <cmath>
#include <sstream>
#include <unordered_map>
#include <list>
#if defined(USE_FPGA) || defined(USE_ASE)
#include <opae/fpga.h>
#include <uuid/uuid.h>
#elif defined(USE_VLSIM)
#include "vlsim/fpga.h"
#include <fpga.h>
#endif
#include <vortex.h>
@ -78,6 +79,34 @@ inline bool is_aligned(size_t addr, size_t alignment) {
///////////////////////////////////////////////////////////////////////////////
#ifdef DUMP_PERF_STATS
class AutoPerfDump {
private:
std::list<vx_device_h> devices_;
public:
AutoPerfDump() {}
~AutoPerfDump() {
for (auto device : devices_) {
vx_dump_perf(device, stdout);
}
}
void add_device(vx_device_h device) {
devices_.push_back(device);
}
void remove_device(vx_device_h device) {
devices_.remove(device);
}
};
AutoPerfDump gAutoPerfDump;
#endif
///////////////////////////////////////////////////////////////////////////////
extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
if (nullptr == hdevice)
return -1;
@ -223,6 +252,10 @@ extern int vx_dev_open(vx_device_h* hdevice) {
*hdevice = device;
#ifdef DUMP_PERF_STATS
gAutoPerfDump.add_device(*hdevice);
#endif
return 0;
}
@ -237,7 +270,8 @@ extern int vx_dev_close(vx_device_h hdevice) {
#endif
#ifdef DUMP_PERF_STATS
vx_dump_perf(device, stdout);
gAutoPerfDump.remove_device(hdevice);
vx_dump_perf(hdevice, stdout);
#endif
fpgaClose(device->fpga);

View file

@ -1,3 +1,4 @@
#include "vx_scope.h"
#include <iostream>
#include <fstream>
#include <thread>
@ -7,17 +8,9 @@
#include <chrono>
#include <thread>
#include <mutex>
#ifdef USE_VLSIM
#include "vlsim/fpga.h"
#else
#include <opae/fpga.h>
#endif
#include <VX_config.h>
#include "vx_scope.h"
#include "vortex_afu.h"
#include "scope-defs.h"
#include <vortex_afu.h>
#include <scope-defs.h>
#define FRAME_FLUSH_SIZE 100

View file

@ -1,5 +1,13 @@
#pragma once
#include <cstdint>
#ifdef USE_VLSIM
#include <fpga.h>
#else
#include <opae/fpga.h>
#endif
#if defined(USE_FPGA)
#define HANG_TIMEOUT 60
#else

76
driver/fpga/Makefile Normal file
View file

@ -0,0 +1,76 @@
OPAE_HOME ?= /tools/opae/1.4.0
RTL_DIR=../../hw/rtl
SCRIPT_DIR=../../hw/scripts
OPAE_SYN_DIR=../../hw/syn/opae
CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors
#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I. -I../include -I../../hw -I$(OPAE_HOME)/include -I$(OPAE_SYN_DIR)
LDFLAGS += -L$(OPAE_HOME)/lib -luuid -lopae-c
#SCOPE=1
# stack execution protection
LDFLAGS +=-z noexecstack
# data relocation and projection
LDFLAGS +=-z relro -z now
# stack buffer overrun detection
CXXFLAGS +=-fstack-protector
# Position independent code
CXXFLAGS += -fPIC
# Add external configuration
CXXFLAGS += $(CONFIGS)
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared
PROJECT = libvortex.so
SRCS = ../common/opae.cpp ../common/vx_utils.cpp
# Enable scope analyzer
ifdef SCOPE
CXXFLAGS += -DSCOPE
SRCS += ../common/vx_scope.cpp
SCOPE_H = scope-defs.h
endif
# Enable perf counters
ifdef PERF
CXXFLAGS += -DPERF_ENABLE
endif
all: $(PROJECT)
$(OPAE_SYN_DIR)/vortex_afu.h:
$(MAKE) -C $(OPAE_SYN_DIR) vortex_afu.h
scope-defs.h: $(SCRIPT_DIR)/scope.json
$(SCRIPT_DIR)/scope.py $(CONFIGS) -cc scope-defs.h -vl $(RTL_DIR)/scope-defs.vh $(SCRIPT_DIR)/scope.json
# generate scope data
scope: scope-defs.h
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) -o $(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean:
rm -rf $(PROJECT) *.o .depend scope-defs.h
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View file

@ -1,116 +0,0 @@
OPAE_HOME ?= /tools/opae/1.4.0
CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors
#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../include -I$(OPAE_HOME)/include -I../../hw
LDFLAGS += -L$(OPAE_HOME)/lib
#SCOPE=1
# stack execution protection
LDFLAGS +=-z noexecstack
# data relocation and projection
LDFLAGS +=-z relro -z now
# stack buffer overrun detection
CXXFLAGS +=-fstack-protector
# Position independent code
CXXFLAGS += -fPIC
# Add external configuration
CXXFLAGS += $(CONFIGS)
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared
FPGA_LIBS += -luuid -lopae-c
ASE_LIBS += -luuid -lopae-c-ase
VLSIM_LIBS += -lopae-c-vlsim
ASE_DIR = ase
VLSIM_DIR = vlsim
RTL_DIR=../../hw/rtl
SCRIPT_DIR=../../hw/scripts
PROJECT = libvortex.so
PROJECT_ASE = $(ASE_DIR)/libvortex.so
PROJECT_VLSIM = $(VLSIM_DIR)/libvortex.so
AFU_JSON_INFO = vortex_afu.h
SRCS = vortex.cpp ../common/vx_utils.cpp
# Enable scope analyzer
ifdef SCOPE
CXXFLAGS += -DSCOPE
SRCS += vx_scope.cpp
SCOPE_ENABLE = SCOPE=1
SCOPE_H = scope-defs.h
endif
# Enable perf counters
ifdef PERF
CXXFLAGS += -DPERF_ENABLE
PERF_ENABLE = PERF=1
endif
all: vlsim
# AFU info from JSON file, including AFU UUID
json: ../../hw/opae/vortex_afu.json
afu_json_mgr json-info --afu-json=$^ --c-hdr=$@
scope-defs.h: $(SCRIPT_DIR)/scope.json
$(SCRIPT_DIR)/scope.py $(RTL_INCLUDE) $(CONFIGS) -cc scope-defs.h -vl $(RTL_DIR)/scope-defs.vh $(SCRIPT_DIR)/scope.json
# generate scope data
scope: scope-defs.h
vlsim-hw: $(SCOPE_H)
$(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C vlsim
fpga: $(SRCS) $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) $(FPGA_LIBS) -o $(PROJECT)
asesim: $(SRCS) $(ASE_DIR) $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $(PROJECT_ASE)
vlsim: $(SRCS) vlsim-hw
$(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -L./vlsim $(VLSIM_LIBS) -o $(PROJECT_VLSIM)
vortex.o: vortex.cpp
$(CXX) $(CXXFLAGS) -c vortex.cpp -o $@
$(ASE_DIR):
mkdir -p ase
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean-fpga:
rm -rf $(PROJECT) *.o .depend
clean-asesim:
rm -rf $(PROJECT_ASE) *.o .depend
clean-vlsim:
$(MAKE) -C vlsim clean
clean: clean-fpga clean-asesim clean-vlsim
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View file

@ -1,100 +0,0 @@
CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
CFLAGS += -DUSE_VLSIM -fPIC -Wno-maybe-uninitialized
CFLAGS += -I../../../../hw
# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
CFLAGS += $(CONFIGS)
CFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
# LDFLAGS += -dynamiclib -pthread
TOP = vortex_afu_shim
RTL_DIR=../../../hw/rtl
DPI_DIR=../../../hw/dpi
SRCS = fpga.cpp opae_sim.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
#VL_FLAGS += --threads $(THREADS)
# Debugigng
ifdef DEBUG
VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS)
CFLAGS += -DVCD_OUTPUT $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CFLAGS += -DNDEBUG
endif
# Enable scope analyzer
ifdef SCOPE
VL_FLAGS += -DSCOPE
CFLAGS += -DSCOPE
endif
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CFLAGS += -DPERF_ENABLE
endif
# use our OPAE shim
VL_FLAGS += -DNOPAE
CFLAGS += -DNOPAE
# ALU backend
VL_FLAGS += -DIMUL_DPI
VL_FLAGS += -DIDIV_DPI
# FPU backend
FPU_CORE ?= FPU_DPI
VL_FLAGS += -D$(FPU_CORE)
PROJECT = libopae-c-vlsim.so
all: $(PROJECT)
vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
../../../hw/scripts/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h
$(PROJECT): $(SRCS) vortex_afu.h
verilator --exe --cc $(TOP) --top-module $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
make -j -C obj_dir -f V$(TOP).mk
clean:
rm -rf $(PROJECT) obj_dir ../scope-defs.h $(RTL_DIR)/scope-defs.vh vortex_afu.h

View file

@ -1,64 +0,0 @@
#pragma once
#include <stdio.h>
#include <stdint.h>
class RAM {
private:
mutable uint8_t *mem_[(1 << 12)];
uint8_t *get(uint32_t address) const {
uint32_t block_addr = address >> 20;
uint32_t block_offset = address & 0x000FFFFF;
if (mem_[block_addr] == NULL) {
mem_[block_addr] = new uint8_t[(1 << 20)];
}
return mem_[block_addr] + block_offset;
}
public:
RAM() {
for (uint32_t i = 0; i < (1 << 12); i++) {
mem_[i] = NULL;
}
}
~RAM() {
this->clear();
}
size_t size() const {
return (1ull << 32);
}
void clear() {
for (uint32_t i = 0; i < (1 << 12); i++) {
if (mem_[i]) {
delete [] mem_[i];
mem_[i] = NULL;
}
}
}
void read(uint32_t address, uint32_t length, uint8_t *data) const {
for (unsigned i = 0; i < length; i++) {
data[i] = *this->get(address + i);
}
}
void write(uint32_t address, uint32_t length, const uint8_t *data) {
for (unsigned i = 0; i < length; i++) {
*this->get(address + i) = data[i];
}
}
uint8_t& operator[](uint32_t address) {
return *get(address);
}
const uint8_t& operator[](uint32_t address) const {
return *get(address);
}
};

View file

@ -1,24 +0,0 @@
//
// Generated by afu_json_mgr from ../../hw/opae/vortex_afu.json
//
#ifndef __AFU_JSON_INFO__
#define __AFU_JSON_INFO__
#define AFU_ACCEL_NAME "vortex_afu"
#define AFU_ACCEL_UUID "35F9452B-25C2-434C-93D5-6F8C60DB361C"
#define AFU_IMAGE_CMD_MEM_READ 1
#define AFU_IMAGE_CMD_MEM_WRITE 2
#define AFU_IMAGE_CMD_RUN 3
#define AFU_IMAGE_MMIO_CMD_TYPE 10
#define AFU_IMAGE_MMIO_DATA_SIZE 16
#define AFU_IMAGE_MMIO_IO_ADDR 12
#define AFU_IMAGE_MMIO_MEM_ADDR 14
#define AFU_IMAGE_MMIO_SCOPE_READ 20
#define AFU_IMAGE_MMIO_SCOPE_WRITE 22
#define AFU_IMAGE_MMIO_DEV_CAPS 24
#define AFU_IMAGE_MMIO_STATUS 18
#define AFU_IMAGE_POWER 0
#define AFU_TOP_IFC "ccip_std_afu_avalon_mm"
#endif // __AFU_JSON_INFO__

View file

@ -1,87 +1,38 @@
CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
RTLSIM_DIR = ../../sim/rtlsim
CFLAGS += -DUSE_RTLSIM -fPIC -Wno-maybe-uninitialized
CFLAGS += -I../../include -I../../../hw/simulate -I../../../hw
CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors
#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
CXXFLAGS += -I../include -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
LDFLAGS += $(RTLSIM_DIR)/librtlsim.a
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
# Position independent code
CXXFLAGS += -fPIC
CFLAGS += $(CONFIGS)
CFLAGS += -DDUMP_PERF_STATS
# Add external configuration
CXXFLAGS += $(CONFIGS)
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
#LDFLAGS += -dynamiclib -pthread
TOP = Vortex
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
#VL_FLAGS += --threads $(THREADS)
# Debugigng
ifdef DEBUG
VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS)
CFLAGS += -DVCD_OUTPUT $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CFLAGS += -DNDEBUG
endif
SRCS = vortex.cpp ../common/vx_utils.cpp
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CFLAGS += -DPERF_ENABLE
CXXFLAGS += -DPERF_ENABLE
endif
# ALU backend
VL_FLAGS += -DIMUL_DPI
VL_FLAGS += -DIDIV_DPI
# FPU backend
FPU_CORE ?= FPU_DPI
VL_FLAGS += -D$(FPU_CORE)
PROJECT = libvortex.so
# PROJECT = libvortex.dylib
all: $(PROJECT)
$(PROJECT): $(SRCS)
verilator --exe --cc $(TOP) --top-module $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
make -j -C obj_dir -f V$(TOP).mk
$(MAKE) -C $(RTLSIM_DIR) static
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT)
clean:
rm -rf $(PROJECT) obj_dir
$(MAKE) -C $(RTLSIM_DIR) clean-static
rm -rf $(PROJECT) *.o .depend

View file

@ -1,64 +0,0 @@
#pragma once
#include <stdio.h>
#include <stdint.h>
class RAM {
private:
mutable uint8_t *mem_[(1 << 12)];
uint8_t *get(uint32_t address) const {
uint32_t block_addr = address >> 20;
uint32_t block_offset = address & 0x000FFFFF;
if (mem_[block_addr] == NULL) {
mem_[block_addr] = new uint8_t[(1 << 20)];
}
return mem_[block_addr] + block_offset;
}
public:
RAM() {
for (uint32_t i = 0; i < (1 << 12); i++) {
mem_[i] = NULL;
}
}
~RAM() {
this->clear();
}
size_t size() const {
return (1ull << 32);
}
void clear() {
for (uint32_t i = 0; i < (1 << 12); i++) {
if (mem_[i]) {
delete [] mem_[i];
mem_[i] = NULL;
}
}
}
void read(uint32_t address, uint32_t length, uint8_t *data) const {
for (unsigned i = 0; i < length; i++) {
data[i] = *this->get(address + i);
}
}
void write(uint32_t address, uint32_t length, const uint8_t *data) {
for (unsigned i = 0; i < length; i++) {
*this->get(address + i) = data[i];
}
}
uint8_t& operator[](uint32_t address) {
return *get(address);
}
const uint8_t& operator[](uint32_t address) const {
return *get(address);
}
};

View file

@ -8,20 +8,15 @@
#include <vortex.h>
#include <VX_config.h>
#include <ram.h>
#include <mem.h>
#include <util.h>
#include <simulator.h>
///////////////////////////////////////////////////////////////////////////////
inline size_t align_size(size_t size, size_t alignment) {
assert(0 == (alignment & (alignment - 1)));
return (size + alignment - 1) & ~(alignment - 1);
}
using namespace vortex;
///////////////////////////////////////////////////////////////////////////////
class vx_device;
class vx_buffer {
public:
vx_buffer(size_t size, vx_device* device)
@ -59,7 +54,7 @@ private:
class vx_device {
public:
vx_device() {
vx_device() : ram_((1<<12), (1<<20)) {
mem_allocation_ = ALLOC_BASE_ADDR;
}
@ -84,16 +79,16 @@ public:
if (dest_addr + asize > ram_.size())
return -1;
/*printf("VXDRV: upload %d bytes from 0x%lx to 0x%lx", size, (uint8_t*)src + src_offset, dest_addr);
if (size <= 1024) {
printf(": ");
for (int i = asize-1; i >= 0; --i) {
printf("%x", *((uint8_t*)src + src_offset + i));
/*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src + src_offset));
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
printf("\n0x%08lx=", dest_addr + i * CACHE_BLOCK_SIZE);
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
printf("%02x", *((uint8_t*)src + src_offset + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
}
}
printf("\n");*/
ram_.write(dest_addr, asize, (const uint8_t*)src + src_offset);
ram_.write((const uint8_t*)src + src_offset, dest_addr, asize);
return 0;
}
@ -102,13 +97,13 @@ public:
if (src_addr + asize > ram_.size())
return -1;
ram_.read(src_addr, asize, (uint8_t*)dest + dest_offset);
ram_.read((uint8_t*)dest + dest_offset, src_addr, asize);
/*printf("VXDRV: download %d bytes from 0x%lx to 0x%lx", size, src_addr, (uint8_t*)dest + dest_offset);
if (size <= 1024) {
printf(": ");
for (int i = asize-1; i >= 0; --i) {
printf("%x", *((uint8_t*)dest + dest_offset + i));
/*printf("VXDRV: download %ld bytes to 0x%lx:", size, uintptr_t((uint8_t*)dest + dest_offset));
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
printf("\n0x%08lx=", src_addr + i * CACHE_BLOCK_SIZE);
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
printf("%02x", *((uint8_t*)dest + dest_offset + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
}
}
printf("\n");*/
@ -154,6 +149,34 @@ private:
///////////////////////////////////////////////////////////////////////////////
#ifdef DUMP_PERF_STATS
class AutoPerfDump {
private:
std::list<vx_device_h> devices_;
public:
AutoPerfDump() {}
~AutoPerfDump() {
for (auto device : devices_) {
vx_dump_perf(device, stdout);
}
}
void add_device(vx_device_h device) {
devices_.push_back(device);
}
void remove_device(vx_device_h device) {
devices_.remove(device);
}
};
AutoPerfDump gAutoPerfDump;
#endif
///////////////////////////////////////////////////////////////////////////////
extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
if (nullptr == hdevice)
return -1;
@ -198,6 +221,10 @@ extern int vx_dev_open(vx_device_h* hdevice) {
*hdevice = new vx_device();
#ifdef DUMP_PERF_STATS
gAutoPerfDump.add_device(*hdevice);
#endif
return 0;
}
@ -208,7 +235,8 @@ extern int vx_dev_close(vx_device_h hdevice) {
vx_device *device = ((vx_device*)hdevice);
#ifdef DUMP_PERF_STATS
vx_dump_perf(device, stdout);
gAutoPerfDump.remove_device(hdevice);
vx_dump_perf(hdevice, stdout);
#endif
delete device;

View file

@ -1,37 +1,29 @@
PROJECT = libvortex.so
#PROJECT = libvortex.dylib
SIMX_DIR = ../../simX
SIMX_DIR = ../../sim/simX
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
#CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -DUSE_SIMX -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../include -I../../hw -I$(SIMX_DIR)
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../include -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common
CXXFLAGS += $(CONFIGS)
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
#LDFLAGS += -dynamiclib -pthread
LDFLAGS += $(SIMX_DIR)/libsimX.a
SRCS = vortex.cpp ../common/vx_utils.cpp
SRCS += $(SIMX_DIR)/util.cpp $(SIMX_DIR)/args.cpp $(SIMX_DIR)/mem.cpp $(SIMX_DIR)/pipeline.cpp $(SIMX_DIR)/warp.cpp $(SIMX_DIR)/core.cpp $(SIMX_DIR)/decode.cpp $(SIMX_DIR)/execute.cpp
# Debugigng
ifndef DEBUG
CXXFLAGS += -DNDEBUG
endif
PROJECT = libvortex.so
all: $(PROJECT)
$(PROJECT): $(SRCS)
$(MAKE) -C $(SIMX_DIR) static
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
$(MAKE) -C $(SIMX_DIR) clean-static
rm -rf $(PROJECT) *.o .depend

View file

@ -10,15 +10,11 @@
#include <vortex.h>
#include <core.h>
#include <VX_config.h>
#include <util.h>
#define PAGE_SIZE 4096
#define PAGE_SIZE 4096
///////////////////////////////////////////////////////////////////////////////
inline size_t align_size(size_t size, size_t alignment) {
assert(0 == (alignment & (alignment - 1)));
return (size + alignment - 1) & ~(alignment - 1);
}
using namespace vortex;
///////////////////////////////////////////////////////////////////////////////
@ -74,7 +70,7 @@ public:
mem_allocation_ = ALLOC_BASE_ADDR;
mmu_.attach(ram_, 0, 0xffffffff);
for (int i = 0; i < arch_.num_cores(); ++i) {
cores_[i] = std::make_shared<vortex::Core>(arch_, decoder_, mmu_, i);
cores_[i] = std::make_shared<Core>(arch_, decoder_, mmu_, i);
}
}
@ -101,7 +97,7 @@ public:
if (dest_addr + asize > ram_.size())
return -1;
ram_.write(dest_addr, (const uint8_t*)src + src_offset, asize);
ram_.write((const uint8_t*)src + src_offset, dest_addr, asize);
/*printf("VXDRV: upload %d bytes to 0x%x\n", size, dest_addr);
for (int i = 0; i < size; i += 4) {
@ -116,7 +112,7 @@ public:
if (src_addr + asize > ram_.size())
return -1;
ram_.read(src_addr, (uint8_t*)dest + dest_offset, asize);
ram_.read((uint8_t*)dest + dest_offset, src_addr, asize);
/*printf("VXDRV: download %d bytes from 0x%x\n", size, src_addr);
for (int i = 0; i < size; i += 4) {
@ -209,25 +205,57 @@ private:
device->thread_proc();
}
vortex::ArchDef arch_;
vortex::Decoder decoder_;
vortex::MemoryUnit mmu_;
std::vector<std::shared_ptr<vortex::Core>> cores_;
ArchDef arch_;
Decoder decoder_;
MemoryUnit mmu_;
std::vector<std::shared_ptr<Core>> cores_;
bool is_done_;
bool is_running_;
size_t mem_allocation_;
std::thread thread_;
vortex::RAM ram_;
RAM ram_;
std::mutex mutex_;
};
///////////////////////////////////////////////////////////////////////////////
#ifdef DUMP_PERF_STATS
class AutoPerfDump {
private:
std::list<vx_device_h> devices_;
public:
AutoPerfDump() {}
~AutoPerfDump() {
for (auto device : devices_) {
vx_dump_perf(device, stdout);
}
}
void add_device(vx_device_h device) {
devices_.push_back(device);
}
void remove_device(vx_device_h device) {
devices_.remove(device);
}
};
AutoPerfDump gAutoPerfDump;
#endif
///////////////////////////////////////////////////////////////////////////////
extern int vx_dev_open(vx_device_h* hdevice) {
if (nullptr == hdevice)
return -1;
*hdevice = new vx_device();
*hdevice = new vx_device();
#ifdef DUMP_PERF_STATS
gAutoPerfDump.add_device(*hdevice);
#endif
return 0;
}
@ -239,7 +267,8 @@ extern int vx_dev_close(vx_device_h hdevice) {
vx_device *device = ((vx_device*)hdevice);
#ifdef DUMP_PERF_STATS
vx_dump_perf(device, stdout);
gAutoPerfDump.remove_device(hdevice);
vx_dump_perf(hdevice, stdout);
#endif
delete device;

62
driver/vlsim/Makefile Normal file
View file

@ -0,0 +1,62 @@
VLSIM_DIR = ../../sim/vlsim
RTL_DIR=../../hw/rtl
SCRIPT_DIR=../../hw/scripts
CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors
#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I. -I../include -I../../hw -I$(VLSIM_DIR)
LDFLAGS += $(VLSIM_DIR)/libopae-c-vlsim.a
# Position independent code
CXXFLAGS += -fPIC
# Add external configuration
CXXFLAGS += $(CONFIGS)
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
SRCS = ../common/opae.cpp ../common/vx_utils.cpp
# Enable scope analyzer
ifdef SCOPE
CXXFLAGS += -DSCOPE
SRCS += ../common/vx_scope.cpp
SCOPE_H = scope-defs.h
endif
# Enable perf counters
ifdef PERF
CXXFLAGS += -DPERF_ENABLE
endif
PROJECT = libvortex.so
all: $(PROJECT)
scope-defs.h: $(SCRIPT_DIR)/scope.json
$(SCRIPT_DIR)/scope.py $(CONFIGS) -cc scope-defs.h -vl $(RTL_DIR)/scope-defs.vh $(SCRIPT_DIR)/scope.json
# generate scope data
scope: scope-defs.h
$(PROJECT): $(SRCS) $(SCOPE_H)
$(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static
$(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean:
$(MAKE) -C $(VLSIM_DIR) clean-static
rm -rf $(PROJECT) *.o .depend scope-defs.h
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View file

@ -1,9 +1,12 @@
.PHONY: build_config
RTL_DIR=./rtl
SCRIPT_DIR=./scripts
build_config: ./rtl/VX_config.vh
./scripts/gen_config.py -i ./rtl/VX_config.vh -o ./VX_config.h
$(MAKE) -C simulate
all: VX_config.h
VX_config.h: $(RTL_DIR)/VX_config.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_config.vh -o VX_config.h
clean:
rm -f ./VX_config.h
$(MAKE) -C simulate clean
rm -f VX_config.h
.PHONY: VX_config.h

View file

@ -1,2 +0,0 @@
*.v
*.sh

View file

View file

@ -4,293 +4,168 @@
#include <vector>
#include <mutex>
#include <iostream>
#include <rvfloats.h>
#include "svdpi.h"
#include "verilated_vpi.h"
#include "VX_config.h"
extern "C" {
void dpi_fadd(int a, int b, int frm, int* result, int* fflags);
void dpi_fsub(int a, int b, int frm, int* result, int* fflags);
void dpi_fmul(int a, int b, int frm, int* result, int* fflags);
void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags);
void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags);
void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags);
void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags);
void dpi_fadd(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fsub(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fmul(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fnmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fnmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fdiv(int a, int b, int frm, int* result, int* fflags);
void dpi_fsqrt(int a, int frm, int* result, int* fflags);
void dpi_fdiv(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fsqrt(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_ftoi(int a, int frm, int* result, int* fflags);
void dpi_ftou(int a, int frm, int* result, int* fflags);
void dpi_itof(int a, int frm, int* result, int* fflags);
void dpi_utof(int a, int frm, int* result, int* fflags);
void dpi_ftoi(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_ftou(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_itof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_utof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fclss(int a, int* result);
void dpi_fsgnj(int a, int b, int* result);
void dpi_fsgnjn(int a, int b, int* result);
void dpi_fsgnjx(int a, int b, int* result);
void dpi_fclss(bool enable, int a, int* result);
void dpi_fsgnj(bool enable, int a, int b, int* result);
void dpi_fsgnjn(bool enable, int a, int b, int* result);
void dpi_fsgnjx(bool enable, int a, int b, int* result);
void dpi_flt(int a, int b, int* result, int* fflags);
void dpi_fle(int a, int b, int* result, int* fflags);
void dpi_feq(int a, int b, int* result, int* fflags);
void dpi_fmin(int a, int b, int* result, int* fflags);
void dpi_fmax(int a, int b, int* result, int* fflags);
void dpi_flt(bool enable, int a, int b, int* result, svBitVecVal* fflags);
void dpi_fle(bool enable, int a, int b, int* result, svBitVecVal* fflags);
void dpi_feq(bool enable, int a, int b, int* result, svBitVecVal* fflags);
void dpi_fmin(bool enable, int a, int b, int* result, svBitVecVal* fflags);
void dpi_fmax(bool enable, int a, int b, int* result, svBitVecVal* fflags);
}
union Float_t {
float f;
int i;
struct {
uint32_t man : 23;
uint32_t exp : 8;
uint32_t sign : 1;
} parts;
};
void dpi_fadd(int a, int b, int frm, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f + fb.f;
*result = fr.i;
*fflags = 0;
void dpi_fadd(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fadd(a, b, (*frm & 0x7), fflags);
}
void dpi_fsub(int a, int b, int frm, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f - fb.f;
*result = fr.i;
*fflags = 0;
void dpi_fsub(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fsub(a, b, (*frm & 0x7), fflags);
}
void dpi_fmul(int a, int b, int frm, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f * fb.f;
*result = fr.i;
*fflags = 0;
void dpi_fmul(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fmul(a, b, (*frm & 0x7), fflags);
}
void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags) {
Float_t fa, fb, fc, fr;
fa.i = a;
fb.i = b;
fc.i = c;
fr.f = fa.f * fb.f + fc.f;
*result = fr.i;
*fflags = 0;
void dpi_fmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fmadd(a, b, c, (*frm & 0x7), fflags);
}
void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags) {
Float_t fa, fb, fc, fr;
fa.i = a;
fb.i = b;
fc.i = c;
fr.f = fa.f * fb.f - fc.f;
*result = fr.i;
*fflags = 0;
void dpi_fmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fmsub(a, b, c, (*frm & 0x7), fflags);
}
void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags) {
Float_t fa, fb, fc, fr;
fa.i = a;
fb.i = b;
fc.i = c;
fr.f = -(fa.f * fb.f + fc.f);
*result = fr.i;
*fflags = 0;
void dpi_fnmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fnmadd(a, b, c, (*frm & 0x7), fflags);
}
void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags) {
Float_t fa, fb, fc, fr;
fa.i = a;
fb.i = b;
fc.i = c;
fr.f = -(fa.f * fb.f - fc.f);
*result = fr.i;
*fflags = 0;
void dpi_fnmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fnmsub(a, b, c, (*frm & 0x7), fflags);
}
void dpi_fdiv(int a, int b, int frm, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f / fb.f;
*result = fr.i;
*fflags = 0;
void dpi_fdiv(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fdiv(a, b, (*frm & 0x7), fflags);
}
void dpi_fsqrt(int a, int frm, int* result, int* fflags) {
Float_t fa, fr;
fa.i = a;
fr.f = sqrtf(fa.f);
*result = fr.i;
*fflags = 0;
void dpi_fsqrt(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fsqrt(a, (*frm & 0x7), fflags);
}
void dpi_ftoi(int a, int frm, int* result, int* fflags) {
Float_t fa, fr;
fa.i = a;
fr.i = int(fa.f);
*result = fr.i;
*fflags = 0;
void dpi_ftoi(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_ftoi(a, (*frm & 0x7), fflags);
}
void dpi_ftou(int a, int frm, int* result, int* fflags) {
Float_t fa, fr;
fa.i = a;
fr.i = unsigned(fa.f);
*result = fr.i;
*fflags = 0;
void dpi_ftou(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_ftou(a, (*frm & 0x7), fflags);
}
void dpi_itof(int a, int frm, int* result, int* fflags) {
Float_t fa, fr;
fr.f = (float)a;
*result = fr.i;
*fflags = 0;
void dpi_itof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_itof(a, (*frm & 0x7), fflags);
}
void dpi_utof(int a, int frm, int* result, int* fflags) {
Float_t fa, fr;
unsigned ua = a;
fr.f = (float)ua;
*result = fr.i;
*fflags = 0;
void dpi_utof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_utof(a, (*frm & 0x7), fflags);
}
void dpi_flt(int a, int b, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f < fb.f;
*result = fr.i;
*fflags = 0;
void dpi_flt(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_flt(a, b, fflags);
}
void dpi_fle(int a, int b, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f <= fb.f;
*result = fr.i;
*fflags = 0;
void dpi_fle(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fle(a, b, fflags);
}
void dpi_feq(int a, int b, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f == fb.f;
*result = fr.i;
*fflags = 0;
void dpi_feq(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_feq(a, b, fflags);
}
void dpi_fmin(int a, int b, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = std::min<float>(fa.f, fb.f);
*result = fr.i;
*fflags = 0;
void dpi_fmin(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fmin(a, b, fflags);
}
void dpi_fmax(int a, int b, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = std::max<float>(fa.f, fb.f);
*result = fr.i;
*fflags = 0;
void dpi_fmax(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fmax(a, b, fflags);
}
void dpi_fclss(int a, int* result) {
int r = 0; // clear all bits
bool fsign = (a >> 31);
uint32_t expo = (a >> 23) & 0xFF;
uint32_t fraction = a & 0x7FFFFF;
if ((expo == 0) && (fraction == 0)) {
r = fsign ? (1 << 3) : (1 << 4); // +/- 0
} else if ((expo == 0) && (fraction != 0)) {
r = fsign ? (1 << 2) : (1 << 5); // +/- subnormal
} else if ((expo == 0xFF) && (fraction == 0)) {
r = fsign ? (1<<0) : (1<<7); // +/- infinity
} else if ((expo == 0xFF ) && (fraction != 0)) {
if (!fsign && (fraction == 0x00400000)) {
r = (1 << 9); // quiet NaN
} else {
r = (1 << 8); // signaling NaN
}
} else {
r = fsign ? (1 << 1) : (1 << 6); // +/- normal
}
*result = r;
void dpi_fclss(bool enable, int a, int* result) {
if (!enable)
return;
*result = rv_fclss(a);
}
void dpi_fsgnj(int a, int b, int* result) {
int sign = b & 0x80000000;
int r = sign | (a & 0x7FFFFFFF);
*result = r;
void dpi_fsgnj(bool enable, int a, int b, int* result) {
if (!enable)
return;
*result = rv_fsgnj(a, b);
}
void dpi_fsgnjn(int a, int b, int* result) {
int sign = ~b & 0x80000000;
int r = sign | (a & 0x7FFFFFFF);
*result = r;
void dpi_fsgnjn(bool enable, int a, int b, int* result) {
if (!enable)
return;
*result = rv_fsgnjn(a, b);
}
void dpi_fsgnjx(int a, int b, int* result) {
int sign1 = a & 0x80000000;
int sign2 = b & 0x80000000;
int r = (sign1 ^ sign2) | (a & 0x7FFFFFFF);
*result = r;
void dpi_fsgnjx(bool enable, int a, int b, int* result) {
if (!enable)
return;
*result = rv_fsgnjx(a, b);
}

View file

@ -1,31 +1,31 @@
`ifndef FLOAT_DPI
`define FLOAT_DPI
import "DPI-C" context function void dpi_fadd(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fsub(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmul(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fnmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fnmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fadd(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsub(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmul(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmadd(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmsub(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fnmadd(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fnmsub(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fdiv(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fsqrt(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fdiv(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsqrt(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_ftoi(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_ftou(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_itof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_utof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_ftoi(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_ftou(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_itof(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_utof(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fclss(input int a, output int result);
import "DPI-C" context function void dpi_fsgnj(input int a, input int b, output int result);
import "DPI-C" context function void dpi_fsgnjn(input int a, input int b, output int result);
import "DPI-C" context function void dpi_fsgnjx(input int a, input int b, output int result);
import "DPI-C" function void dpi_fclss(input logic enable, input int a, output int result);
import "DPI-C" function void dpi_fsgnj(input logic enable, input int a, input int b, output int result);
import "DPI-C" function void dpi_fsgnjn(input logic enable, input int a, input int b, output int result);
import "DPI-C" function void dpi_fsgnjx(input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_flt(input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fle(input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_feq(input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmin(input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmax(input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_flt(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fle(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_feq(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmin(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmax(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
`endif

View file

@ -9,13 +9,20 @@
#include "VX_config.h"
extern "C" {
void dpi_imul(int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth);
void dpi_idiv(int a, int b, bool is_signed, int* quotient, int* remainder);
void dpi_imul(bool enable, int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth);
void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* remainder);
int dpi_register();
void dpi_assert(int inst, bool cond, int delay);
void dpi_trace(const char* format, ...);
void dpi_trace_start();
void dpi_trace_stop();
}
bool sim_trace_enabled();
void sim_trace_enable(bool enable);
class ShiftRegister {
public:
ShiftRegister() : init_(false), depth_(0) {}
@ -86,15 +93,18 @@ void dpi_assert(int inst, bool cond, int delay) {
}
}
void dpi_imul(int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth) {
uint64_t first = a;
uint64_t second = b;
void dpi_imul(bool enable, int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth) {
if (!enable)
return;
if (is_signed_a && (a & 0x80000000)) {
uint64_t first = *(uint32_t*)&a;
uint64_t second = *(uint32_t*)&b;
if (is_signed_a && (first & 0x80000000)) {
first |= 0xFFFFFFFF00000000;
}
if (is_signed_b && (b & 0x80000000)) {
if (is_signed_b && (second & 0x80000000)) {
second |= 0xFFFFFFFF00000000;
}
@ -109,9 +119,12 @@ void dpi_imul(int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, in
*resulth = (result >> 32) & 0xFFFFFFFF;
}
void dpi_idiv(int a, int b, bool is_signed, int* quotient, int* remainder) {
uint32_t dividen = a;
uint32_t divisor = b;
void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* remainder) {
if (!enable)
return;
uint32_t dividen = *(uint32_t*)&a;
uint32_t divisor = *(uint32_t*)&b;
if (is_signed) {
if (b == 0) {
@ -133,4 +146,21 @@ void dpi_idiv(int a, int b, bool is_signed, int* quotient, int* remainder) {
*remainder = dividen % divisor;
}
}
}
void dpi_trace(const char* format, ...) {
if (!sim_trace_enabled())
return;
va_list va;
va_start(va, format);
vprintf(format, va);
va_end(va);
}
void dpi_trace_start() {
sim_trace_enable(true);
}
void dpi_trace_stop() {
sim_trace_enable(false);
}

View file

@ -1,10 +1,14 @@
`ifndef UTIL_DPI
`define UTIL_DPI
import "DPI-C" context function void dpi_imul(input int a, input int b, input logic is_signed_a, input logic is_signed_b, output int resultl, output int resulth);
import "DPI-C" context function void dpi_idiv(input int a, input int b, input logic is_signed, output int quotient, output int remainder);
import "DPI-C" function void dpi_imul(input logic enable, input int a, input int b, input logic is_signed_a, input logic is_signed_b, output int resultl, output int resulth);
import "DPI-C" function void dpi_idiv(input logic enable, input int a, input int b, input logic is_signed, output int quotient, output int remainder);
import "DPI-C" context function int dpi_register();
import "DPI-C" context function void dpi_assert(int inst, input logic cond, input int delay);
import "DPI-C" function int dpi_register();
import "DPI-C" function void dpi_assert(int inst, input logic cond, input int delay);
import "DPI-C" function void dpi_trace(input string format /*verilator sformat*/);
import "DPI-C" function void dpi_trace_start();
import "DPI-C" function void dpi_trace_stop();
`endif

View file

@ -3,15 +3,15 @@
module VX_alu_unit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// Inputs
VX_alu_req_if alu_req_if,
VX_alu_req_if.slave alu_req_if,
// Outputs
VX_branch_ctl_if branch_ctl_if,
VX_commit_if alu_commit_if
VX_branch_ctl_if.master branch_ctl_if,
VX_commit_if.master alu_commit_if
);
`UNUSED_PARAM (CORE_ID)
@ -22,15 +22,15 @@ module VX_alu_unit #(
wire [`NUM_THREADS-1:0][31:0] shr_result;
reg [`NUM_THREADS-1:0][31:0] msc_result;
wire stall_in, stall_out;
wire ready_in;
`UNUSED_VAR (alu_req_if.op_mod)
wire is_br_op = `ALU_IS_BR(alu_req_if.op_mod);
wire [`ALU_BITS-1:0] alu_op = `ALU_OP(alu_req_if.op_type);
wire [`BR_BITS-1:0] br_op = `BR_OP(alu_req_if.op_type);
wire alu_signed = `ALU_SIGNED(alu_op);
wire [1:0] alu_op_class = `ALU_OP_CLASS(alu_op);
wire is_sub = (alu_op == `ALU_SUB);
wire is_br_op = `INST_ALU_IS_BR(alu_req_if.op_mod);
wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(alu_req_if.op_type);
wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(alu_req_if.op_type);
wire alu_signed = `INST_ALU_SIGNED(alu_op);
wire [1:0] alu_op_class = `INST_ALU_OP_CLASS(alu_op);
wire is_sub = (alu_op == `INST_ALU_SUB);
wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data;
wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data;
@ -57,10 +57,10 @@ module VX_alu_unit #(
for (genvar i = 0; i < `NUM_THREADS; i++) begin
always @(*) begin
case (alu_op)
`ALU_AND: msc_result[i] = alu_in1[i] & alu_in2_imm[i];
`ALU_OR: msc_result[i] = alu_in1[i] | alu_in2_imm[i];
`ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i];
//`ALU_SLL,
`INST_ALU_AND: msc_result[i] = alu_in1[i] & alu_in2_imm[i];
`INST_ALU_OR: msc_result[i] = alu_in1[i] | alu_in2_imm[i];
`INST_ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i];
//`INST_ALU_SLL,
default: msc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0];
endcase
end
@ -81,7 +81,7 @@ module VX_alu_unit #(
// branch
wire is_jal = is_br_op && (br_op == `BR_JAL || br_op == `BR_JALR);
wire is_jal = is_br_op && (br_op == `INST_BR_JAL || br_op == `INST_BR_JALR);
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result;
wire [31:0] br_dest = add_result[alu_req_if.tid];
@ -90,24 +90,51 @@ module VX_alu_unit #(
wire is_less = cmp_result[32];
wire is_equal = ~(| cmp_result[31:0]);
wire br_neg = `BR_NEG(br_op);
wire br_less = `BR_LESS(br_op);
wire br_static = `BR_STATIC(br_op);
wire br_taken = ((br_less ? is_less : is_equal) ^ br_neg) | br_static;
// output
wire result_valid;
wire [`NW_BITS-1:0] result_wid;
wire [`NUM_THREADS-1:0] result_tmask;
wire [31:0] result_PC;
wire [`NR_BITS-1:0] result_rd;
wire result_wb;
wire [`NUM_THREADS-1:0][31:0] result_data;
wire result_is_br;
wire alu_valid_in;
wire alu_ready_in;
wire alu_valid_out;
wire alu_ready_out;
wire [`NW_BITS-1:0] alu_wid;
wire [`NUM_THREADS-1:0] alu_tmask;
wire [31:0] alu_PC;
wire [`NR_BITS-1:0] alu_rd;
wire alu_wb;
wire [`NUM_THREADS-1:0][31:0] alu_data;
wire [`INST_BR_BITS-1:0] br_op_r;
wire [31:0] br_dest_r;
wire is_less_r;
wire is_equal_r;
wire is_br_op_r;
assign alu_ready_in = alu_ready_out || ~alu_valid_out;
VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (alu_ready_in),
.data_in ({alu_valid_in, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}),
.data_out ({alu_valid_out, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r})
);
`UNUSED_VAR (br_op_r)
wire br_neg = `INST_BR_NEG(br_op_r);
wire br_less = `INST_BR_LESS(br_op_r);
wire br_static = `INST_BR_STATIC(br_op_r);
assign branch_ctl_if.valid = alu_valid_out && alu_ready_out && is_br_op_r;
assign branch_ctl_if.taken = ((br_less ? is_less_r : is_equal_r) ^ br_neg) | br_static;
assign branch_ctl_if.wid = alu_wid;
assign branch_ctl_if.dest = br_dest_r;
`ifdef EXT_M_ENABLE
wire mul_valid_in;
wire mul_ready_in;
wire mul_valid_out;
wire mul_ready_out;
@ -118,14 +145,14 @@ module VX_alu_unit #(
wire mul_wb;
wire [`NUM_THREADS-1:0][31:0] mul_data;
wire is_mul_op = `ALU_IS_MUL(alu_req_if.op_mod);
wire [`INST_MUL_BITS-1:0] mul_op = `INST_MUL_BITS'(alu_req_if.op_type);
VX_muldiv muldiv (
.clk (clk),
.reset (reset),
// Inputs
.alu_op (`MUL_OP(alu_req_if.op_type)),
.alu_op (mul_op),
.wid_in (alu_req_if.wid),
.tmask_in (alu_req_if.tmask),
.PC_in (alu_req_if.PC),
@ -143,69 +170,58 @@ module VX_alu_unit #(
.data_out (mul_data),
// handshake
.valid_in (alu_req_if.valid && is_mul_op),
.valid_in (mul_valid_in),
.ready_in (mul_ready_in),
.valid_out (mul_valid_out),
.ready_out (mul_ready_out)
);
assign stall_in = (is_mul_op && ~mul_ready_in)
|| (~is_mul_op && (mul_valid_out || stall_out));
assign mul_ready_out = ~stall_out;
wire is_mul_op = `INST_ALU_IS_MUL(alu_req_if.op_mod);
assign result_valid = mul_valid_out || (alu_req_if.valid && ~is_mul_op);
assign result_wid = mul_valid_out ? mul_wid : alu_req_if.wid;
assign result_tmask = mul_valid_out ? mul_tmask : alu_req_if.tmask;
assign result_PC = mul_valid_out ? mul_PC : alu_req_if.PC;
assign result_rd = mul_valid_out ? mul_rd : alu_req_if.rd;
assign result_wb = mul_valid_out ? mul_wb : alu_req_if.wb;
assign result_data = mul_valid_out ? mul_data : alu_jal_result;
assign result_is_br = ~mul_valid_out && is_br_op;
assign ready_in = is_mul_op ? mul_ready_in : alu_ready_in;
assign alu_valid_in = alu_req_if.valid && ~is_mul_op;
assign mul_valid_in = alu_req_if.valid && is_mul_op;
assign alu_commit_if.valid = alu_valid_out || mul_valid_out;
assign alu_commit_if.wid = alu_valid_out ? alu_wid : mul_wid;
assign alu_commit_if.tmask = alu_valid_out ? alu_tmask : mul_tmask;
assign alu_commit_if.PC = alu_valid_out ? alu_PC : mul_PC;
assign alu_commit_if.rd = alu_valid_out ? alu_rd : mul_rd;
assign alu_commit_if.wb = alu_valid_out ? alu_wb : mul_wb;
assign alu_commit_if.data = alu_valid_out ? alu_data : mul_data;
assign alu_ready_out = alu_commit_if.ready;
assign mul_ready_out = alu_commit_if.ready & ~alu_valid_out; // ALU takes priority
`else
assign stall_in = stall_out;
assign ready_in = alu_ready_in;
assign result_valid = alu_req_if.valid;
assign result_wid = alu_req_if.wid;
assign result_tmask = alu_req_if.tmask;
assign result_PC = alu_req_if.PC;
assign result_rd = alu_req_if.rd;
assign result_wb = alu_req_if.wb;
assign result_data = alu_jal_result;
assign result_is_br = is_br_op;
assign alu_valid_in = alu_req_if.valid;
assign alu_commit_if.valid = alu_valid_out;
assign alu_commit_if.wid = alu_wid;
assign alu_commit_if.tmask = alu_tmask;
assign alu_commit_if.PC = alu_PC;
assign alu_commit_if.rd = alu_rd;
assign alu_commit_if.wb = alu_wb;
assign alu_commit_if.data = alu_data;
assign alu_ready_out = alu_commit_if.ready;
`endif
wire is_br_op_r;
assign stall_out = ~alu_commit_if.ready && alu_commit_if.valid;
VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + 1 + 32),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_out),
.data_in ({result_valid, result_wid, result_tmask, result_PC, result_rd, result_wb, result_data, result_is_br, br_taken, br_dest}),
.data_out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.tmask, alu_commit_if.PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_r, branch_ctl_if.taken, branch_ctl_if.dest})
);
assign alu_commit_if.eop = 1'b1;
assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_r;
assign branch_ctl_if.wid = alu_commit_if.wid;
// can accept new request?
assign alu_req_if.ready = ~stall_in;
assign alu_req_if.ready = ready_in;
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (branch_ctl_if.valid) begin
$display("%t: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h", $time, CORE_ID,
branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest);
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n",
$time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest);
end
end
`endif

View file

@ -12,16 +12,16 @@ module VX_cluster #(
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`L2MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`L2MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`L2MEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`L2MEM_TAG_WIDTH-1:0] mem_req_tag,
output wire [`L2_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`L2_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`L2_MEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`L2_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`L2MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`L2MEM_TAG_WIDTH-1:0] mem_rsp_tag,
input wire [`L2_MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`L2_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// Status
@ -31,15 +31,15 @@ module VX_cluster #(
wire [`NUM_CORES-1:0] per_core_mem_req_valid;
wire [`NUM_CORES-1:0] per_core_mem_req_rw;
wire [`NUM_CORES-1:0][`DMEM_BYTEEN_WIDTH-1:0] per_core_mem_req_byteen;
wire [`NUM_CORES-1:0][`DMEM_ADDR_WIDTH-1:0] per_core_mem_req_addr;
wire [`NUM_CORES-1:0][`DMEM_DATA_WIDTH-1:0] per_core_mem_req_data;
wire [`NUM_CORES-1:0][`XMEM_TAG_WIDTH-1:0] per_core_mem_req_tag;
wire [`NUM_CORES-1:0][`DCACHE_MEM_BYTEEN_WIDTH-1:0] per_core_mem_req_byteen;
wire [`NUM_CORES-1:0][`DCACHE_MEM_ADDR_WIDTH-1:0] per_core_mem_req_addr;
wire [`NUM_CORES-1:0][`DCACHE_MEM_DATA_WIDTH-1:0] per_core_mem_req_data;
wire [`NUM_CORES-1:0][`L1_MEM_TAG_WIDTH-1:0] per_core_mem_req_tag;
wire [`NUM_CORES-1:0] per_core_mem_req_ready;
wire [`NUM_CORES-1:0] per_core_mem_rsp_valid;
wire [`NUM_CORES-1:0][`DMEM_DATA_WIDTH-1:0] per_core_mem_rsp_data;
wire [`NUM_CORES-1:0][`XMEM_TAG_WIDTH-1:0] per_core_mem_rsp_tag;
wire [`NUM_CORES-1:0][`DCACHE_MEM_DATA_WIDTH-1:0] per_core_mem_rsp_data;
wire [`NUM_CORES-1:0][`L1_MEM_TAG_WIDTH-1:0] per_core_mem_rsp_tag;
wire [`NUM_CORES-1:0] per_core_mem_rsp_ready;
wire [`NUM_CORES-1:0] per_core_busy;
@ -69,7 +69,7 @@ module VX_cluster #(
.mem_rsp_tag (per_core_mem_rsp_tag [i]),
.mem_rsp_ready (per_core_mem_rsp_ready[i]),
.busy (per_core_busy [i])
.busy (per_core_busy [i])
);
end
@ -83,21 +83,22 @@ module VX_cluster #(
`RESET_RELAY (l2_reset);
VX_cache #(
.CACHE_ID (`L2CACHE_ID),
.CACHE_SIZE (`L2CACHE_SIZE),
.CACHE_LINE_SIZE (`L2CACHE_LINE_SIZE),
.NUM_BANKS (`L2NUM_BANKS),
.WORD_SIZE (`L2WORD_SIZE),
.NUM_REQS (`L2NUM_REQS),
.CREQ_SIZE (`L2CREQ_SIZE),
.CRSQ_SIZE (`L2CRSQ_SIZE),
.MSHR_SIZE (`L2MSHR_SIZE),
.MRSQ_SIZE (`L2MRSQ_SIZE),
.MREQ_SIZE (`L2MREQ_SIZE),
.CACHE_ID (`L2_CACHE_ID),
.CACHE_SIZE (`L2_CACHE_SIZE),
.CACHE_LINE_SIZE (`L2_CACHE_LINE_SIZE),
.NUM_BANKS (`L2_NUM_BANKS),
.NUM_PORTS (`L2_NUM_PORTS),
.WORD_SIZE (`L2_WORD_SIZE),
.NUM_REQS (`L2_NUM_REQS),
.CREQ_SIZE (`L2_CREQ_SIZE),
.CRSQ_SIZE (`L2_CRSQ_SIZE),
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
.MREQ_SIZE (`L2_MREQ_SIZE),
.WRITE_ENABLE (1),
.CORE_TAG_WIDTH (`XMEM_TAG_WIDTH),
.CORE_TAG_WIDTH (`L1_MEM_TAG_WIDTH),
.CORE_TAG_ID_BITS (0),
.MEM_TAG_WIDTH (`L2MEM_TAG_WIDTH),
.MEM_TAG_WIDTH (`L2_MEM_TAG_WIDTH),
.NC_ENABLE (1)
) l2cache (
`SCOPE_BIND_VX_cluster_l2cache
@ -143,16 +144,20 @@ module VX_cluster #(
end else begin
`RESET_RELAY (mem_arb_reset);
VX_mem_arb #(
.NUM_REQS (`NUM_CORES),
.DATA_WIDTH (`L2MEM_DATA_WIDTH),
.ADDR_WIDTH (`L2MEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`XMEM_TAG_WIDTH),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
.NUM_REQS (`NUM_CORES),
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
.ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`L1_MEM_TAG_WIDTH),
.TYPE ("R"),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) mem_arb (
.clk (clk),
.reset (reset),
.reset (mem_arb_reset),
// Core request
.req_valid_in (per_core_mem_req_valid),

130
hw/rtl/VX_commit.sv Normal file
View file

@ -0,0 +1,130 @@
`include "VX_define.vh"
module VX_commit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if.slave alu_commit_if,
VX_commit_if.slave ld_commit_if,
VX_commit_if.slave st_commit_if,
VX_commit_if.slave csr_commit_if,
`ifdef EXT_F_ENABLE
VX_commit_if.slave fpu_commit_if,
`endif
VX_commit_if.slave gpu_commit_if,
// outputs
VX_writeback_if.master writeback_if,
VX_cmt_to_csr_if.master cmt_to_csr_if
);
// CSRs update
wire alu_commit_fire = alu_commit_if.valid && alu_commit_if.ready;
wire ld_commit_fire = ld_commit_if.valid && ld_commit_if.ready;
wire st_commit_fire = st_commit_if.valid && st_commit_if.ready;
wire csr_commit_fire = csr_commit_if.valid && csr_commit_if.ready;
`ifdef EXT_F_ENABLE
wire fpu_commit_fire = fpu_commit_if.valid && fpu_commit_if.ready;
`endif
wire gpu_commit_fire = gpu_commit_if.valid && gpu_commit_if.ready;
wire commit_fire = alu_commit_fire
|| ld_commit_fire
|| st_commit_fire
|| csr_commit_fire
`ifdef EXT_F_ENABLE
|| fpu_commit_fire
`endif
|| gpu_commit_fire;
wire [`NUM_THREADS-1:0] commit_tmask;
assign commit_tmask = alu_commit_fire ? alu_commit_if.tmask:
ld_commit_fire ? ld_commit_if.tmask:
st_commit_fire ? st_commit_if.tmask:
csr_commit_fire ? csr_commit_if.tmask:
`ifdef EXT_F_ENABLE
fpu_commit_fire ? fpu_commit_if.tmask:
`endif
/*gpu_commit_fire ?*/ gpu_commit_if.tmask;
wire [$clog2(`NUM_THREADS+1)-1:0] commit_cnt;
`POP_COUNT(commit_cnt, commit_tmask);
VX_pipe_register #(
.DATAW (1 + $clog2(`NUM_THREADS+1)),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({commit_fire, commit_cnt}),
.data_out ({cmt_to_csr_if.valid, cmt_to_csr_if.commit_size})
);
// Writeback
VX_writeback #(
.CORE_ID(CORE_ID)
) writeback (
.clk (clk),
.reset (reset),
.alu_commit_if (alu_commit_if),
.ld_commit_if (ld_commit_if),
.csr_commit_if (csr_commit_if),
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if),
`endif
.writeback_if (writeback_if)
);
// store and gpu commits don't writeback
assign st_commit_if.ready = 1'b1;
assign gpu_commit_if.ready = 1'b1;
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (alu_commit_if.valid && alu_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd);
`TRACE_ARRAY1D(alu_commit_if.data, `NUM_THREADS);
dpi_trace("\n");
end
if (ld_commit_if.valid && ld_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.wb, ld_commit_if.rd);
`TRACE_ARRAY1D(ld_commit_if.data, `NUM_THREADS);
dpi_trace("\n");
end
if (st_commit_if.valid && st_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd);
end
if (csr_commit_if.valid && csr_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.wb, csr_commit_if.rd);
`TRACE_ARRAY1D(csr_commit_if.data, `NUM_THREADS);
dpi_trace("\n");
end
`ifdef EXT_F_ENABLE
if (fpu_commit_if.valid && fpu_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.wb, fpu_commit_if.rd);
`TRACE_ARRAY1D(fpu_commit_if.data, `NUM_THREADS);
dpi_trace("\n");
end
`endif
if (gpu_commit_if.valid && gpu_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.wb, gpu_commit_if.rd);
`TRACE_ARRAY1D(gpu_commit_if.data, `NUM_THREADS);
dpi_trace("\n");
end
end
`endif
endmodule

View file

@ -1,127 +0,0 @@
`include "VX_define.vh"
module VX_commit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if alu_commit_if,
VX_commit_if ld_commit_if,
VX_commit_if st_commit_if,
VX_commit_if csr_commit_if,
VX_commit_if fpu_commit_if,
VX_commit_if gpu_commit_if,
// outputs
VX_writeback_if writeback_if,
VX_cmt_to_csr_if cmt_to_csr_if
);
localparam CMTW = $clog2(3*`NUM_THREADS+1);
// CSRs update
wire alu_commit_fire = alu_commit_if.valid && alu_commit_if.ready;
wire ld_commit_fire = ld_commit_if.valid && ld_commit_if.ready;
wire st_commit_fire = st_commit_if.valid && st_commit_if.ready;
wire csr_commit_fire = csr_commit_if.valid && csr_commit_if.ready;
wire fpu_commit_fire = fpu_commit_if.valid && fpu_commit_if.ready;
wire gpu_commit_fire = gpu_commit_if.valid && gpu_commit_if.ready;
wire commit_fire = alu_commit_fire
|| ld_commit_fire
|| st_commit_fire
|| csr_commit_fire
|| fpu_commit_fire
|| gpu_commit_fire;
wire [`NUM_THREADS-1:0] commit_tmask1, commit_tmask2, commit_tmask3;
assign commit_tmask1 = alu_commit_fire ? alu_commit_if.tmask:
ld_commit_fire ? ld_commit_if.tmask:
csr_commit_fire ? csr_commit_if.tmask:
fpu_commit_fire ? fpu_commit_if.tmask:
0;
assign commit_tmask2 = st_commit_fire ? st_commit_if.tmask : 0;
assign commit_tmask3 = gpu_commit_fire ? gpu_commit_if.tmask : 0;
wire [CMTW-1:0] commit_size;
assign commit_size = $countones({commit_tmask3, commit_tmask2, commit_tmask1});
VX_pipe_register #(
.DATAW (1 + CMTW),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({commit_fire, commit_size}),
.data_out ({cmt_to_csr_if.valid, cmt_to_csr_if.commit_size})
);
// Writeback
VX_writeback #(
.CORE_ID(CORE_ID)
) writeback (
.clk (clk),
.reset (reset),
.alu_commit_if (alu_commit_if),
.ld_commit_if (ld_commit_if),
.csr_commit_if (csr_commit_if),
.fpu_commit_if (fpu_commit_if),
.gpu_commit_if (gpu_commit_if),
.writeback_if (writeback_if)
);
// store doesn't writeback
assign st_commit_if.ready = 1'b1;
// assign gpu_commit_if.ready = 1'b1;
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (alu_commit_if.valid && alu_commit_if.ready) begin
$write("%t: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd);
`PRINT_ARRAY1D(alu_commit_if.data, `NUM_THREADS);
$write("\n");
end
if (ld_commit_if.valid && ld_commit_if.ready) begin
$write("%t: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.wb, ld_commit_if.rd);
`PRINT_ARRAY1D(ld_commit_if.data, `NUM_THREADS);
$write("\n");
end
if (st_commit_if.valid && st_commit_if.ready) begin
$display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd);
end
if (csr_commit_if.valid && csr_commit_if.ready) begin
$write("%t: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.wb, csr_commit_if.rd);
`PRINT_ARRAY1D(csr_commit_if.data, `NUM_THREADS);
$write("\n");
end
if (fpu_commit_if.valid && fpu_commit_if.ready) begin
$write("%t: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.wb, fpu_commit_if.rd);
`PRINT_ARRAY1D(fpu_commit_if.data, `NUM_THREADS);
$write("\n");
end
if (gpu_commit_if.valid && gpu_commit_if.ready) begin
$write("%t: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.wb, gpu_commit_if.rd);
`PRINT_ARRAY1D(gpu_commit_if.data, `NUM_THREADS);
$write("\n");
end
end
`else
`UNUSED_VAR (fpu_commit_if.PC)
`endif
endmodule

View file

@ -38,7 +38,7 @@
`endif
`ifndef L1_BLOCK_SIZE
`define L1_BLOCK_SIZE (`NUM_THREADS * 4)
`define L1_BLOCK_SIZE ((`L2_ENABLE || `L3_ENABLE) ? 16 : `MEM_BLOCK_SIZE)
`endif
`ifndef STARTUP_ADDR
@ -227,6 +227,7 @@
`define CSR_LWID 12'hCC3
`define CSR_GWID `CSR_MHARTID
`define CSR_GCID 12'hCC5
`define CSR_TMASK 12'hCC4
// Machine SIMT CSRs
`define CSR_NT 12'hFC0
@ -250,6 +251,11 @@
// Pipeline Queues ////////////////////////////////////////////////////////////
// Size of Instruction Buffer
`ifndef IBUF_SIZE
`define IBUF_SIZE 2
`endif
// Size of LSU Request Queue
`ifndef LSUQ_SIZE
`define LSUQ_SIZE (`NUM_WARPS * 2)
@ -268,28 +274,28 @@
`endif
// Core Request Queue Size
`ifndef ICREQ_SIZE
`define ICREQ_SIZE 0
`ifndef ICACHE_CREQ_SIZE
`define ICACHE_CREQ_SIZE 0
`endif
// Core Response Queue Size
`ifndef ICRSQ_SIZE
`define ICRSQ_SIZE 2
`ifndef ICACHE_CRSQ_SIZE
`define ICACHE_CRSQ_SIZE 2
`endif
// Miss Handling Register Size
`ifndef IMSHR_SIZE
`define IMSHR_SIZE `NUM_WARPS
`ifndef ICACHE_MSHR_SIZE
`define ICACHE_MSHR_SIZE `NUM_WARPS
`endif
// Memory Request Queue Size
`ifndef IMREQ_SIZE
`define IMREQ_SIZE 4
`ifndef ICACHE_MREQ_SIZE
`define ICACHE_MREQ_SIZE 4
`endif
// Memory Response Queue Size
`ifndef IMRSQ_SIZE
`define IMRSQ_SIZE 0
`ifndef ICACHE_MRSQ_SIZE
`define ICACHE_MRSQ_SIZE 0
`endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
@ -300,38 +306,38 @@
`endif
// Number of banks
`ifndef DNUM_BANKS
`define DNUM_BANKS `NUM_THREADS
`ifndef DCACHE_NUM_BANKS
`define DCACHE_NUM_BANKS `NUM_THREADS
`endif
// Number of bank ports
`ifndef DNUM_PORTS
`define DNUM_PORTS 1
// Number of ports per bank
`ifndef DCACHE_NUM_PORTS
`define DCACHE_NUM_PORTS 1
`endif
// Core Request Queue Size
`ifndef DCREQ_SIZE
`define DCREQ_SIZE 0
`ifndef DCACHE_CREQ_SIZE
`define DCACHE_CREQ_SIZE 0
`endif
// Core Response Queue Size
`ifndef DCRSQ_SIZE
`define DCRSQ_SIZE 2
`ifndef DCACHE_CRSQ_SIZE
`define DCACHE_CRSQ_SIZE 2
`endif
// Miss Handling Register Size
`ifndef DMSHR_SIZE
`define DMSHR_SIZE `LSUQ_SIZE
`ifndef DCACHE_MSHR_SIZE
`define DCACHE_MSHR_SIZE `LSUQ_SIZE
`endif
// Memory Request Queue Size
`ifndef DMREQ_SIZE
`define DMREQ_SIZE 4
`ifndef DCACHE_MREQ_SIZE
`define DCACHE_MREQ_SIZE 4
`endif
// Memory Response Queue Size
`ifndef DMRSQ_SIZE
`define DMRSQ_SIZE 0
`ifndef DCACHE_MRSQ_SIZE
`define DCACHE_MRSQ_SIZE 0
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
@ -348,92 +354,102 @@
`endif
// Number of banks
`ifndef SNUM_BANKS
`define SNUM_BANKS `NUM_THREADS
`ifndef SMEM_NUM_BANKS
`define SMEM_NUM_BANKS `NUM_THREADS
`endif
// Core Request Queue Size
`ifndef SCREQ_SIZE
`define SCREQ_SIZE 2
`ifndef SMEM_CREQ_SIZE
`define SMEM_CREQ_SIZE 2
`endif
// Core Response Queue Size
`ifndef SCRSQ_SIZE
`define SCRSQ_SIZE 2
`ifndef SMEM_CRSQ_SIZE
`define SMEM_CRSQ_SIZE 2
`endif
// L2cache Configurable Knobs /////////////////////////////////////////////////
// Size of cache in bytes
`ifndef L2CACHE_SIZE
`define L2CACHE_SIZE 131072
`ifndef L2_CACHE_SIZE
`define L2_CACHE_SIZE 131072
`endif
// Number of banks
`ifndef L2NUM_BANKS
`define L2NUM_BANKS `MIN(`NUM_CORES, 4)
`ifndef L2_NUM_BANKS
`define L2_NUM_BANKS `MIN(`NUM_CORES, 4)
`endif
// Number of ports per bank
`ifndef L2_NUM_PORTS
`define L2_NUM_PORTS 1
`endif
// Core Request Queue Size
`ifndef L2CREQ_SIZE
`define L2CREQ_SIZE 0
`ifndef L2_CREQ_SIZE
`define L2_CREQ_SIZE 0
`endif
// Core Response Queue Size
`ifndef L2CRSQ_SIZE
`define L2CRSQ_SIZE 2
`ifndef L2_CRSQ_SIZE
`define L2_CRSQ_SIZE 2
`endif
// Miss Handling Register Size
`ifndef L2MSHR_SIZE
`define L2MSHR_SIZE 16
`ifndef L2_MSHR_SIZE
`define L2_MSHR_SIZE 16
`endif
// Memory Request Queue Size
`ifndef L2MREQ_SIZE
`define L2MREQ_SIZE 4
`ifndef L2_MREQ_SIZE
`define L2_MREQ_SIZE 4
`endif
// Memory Response Queue Size
`ifndef L2MRSQ_SIZE
`define L2MRSQ_SIZE 0
`ifndef L2_MRSQ_SIZE
`define L2_MRSQ_SIZE 0
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Size of cache in bytes
`ifndef L3CACHE_SIZE
`define L3CACHE_SIZE 1048576
`ifndef L3_CACHE_SIZE
`define L3_CACHE_SIZE 1048576
`endif
// Number of banks
`ifndef L3NUM_BANKS
`define L3NUM_BANKS `MIN(`NUM_CLUSTERS, 4)
`ifndef L3_NUM_BANKS
`define L3_NUM_BANKS `MIN(`NUM_CLUSTERS, 4)
`endif
// Number of ports per bank
`ifndef L3_NUM_PORTS
`define L3_NUM_PORTS 1
`endif
// Core Request Queue Size
`ifndef L3CREQ_SIZE
`define L3CREQ_SIZE 0
`ifndef L3_CREQ_SIZE
`define L3_CREQ_SIZE 0
`endif
// Core Response Queue Size
`ifndef L3CRSQ_SIZE
`define L3CRSQ_SIZE 2
`ifndef L3_CRSQ_SIZE
`define L3_CRSQ_SIZE 2
`endif
// Miss Handling Register Size
`ifndef L3MSHR_SIZE
`define L3MSHR_SIZE 16
`ifndef L3_MSHR_SIZE
`define L3_MSHR_SIZE 16
`endif
// Memory Request Queue Size
`ifndef L3MREQ_SIZE
`define L3MREQ_SIZE 4
`ifndef L3_MREQ_SIZE
`define L3_MREQ_SIZE 4
`endif
// Memory Response Queue Size
`ifndef L3MRSQ_SIZE
`define L3MRSQ_SIZE 0
`ifndef L3_MRSQ_SIZE
`define L3_MRSQ_SIZE 0
`endif
`endif

View file

@ -12,16 +12,16 @@ module VX_core #(
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`DMEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`DMEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`DMEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`XMEM_TAG_WIDTH-1:0] mem_req_tag,
output wire [`DCACHE_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`DCACHE_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`DCACHE_MEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`L1_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory reponse
input wire mem_rsp_valid,
input wire [`DMEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`XMEM_TAG_WIDTH-1:0] mem_rsp_tag,
input wire [`DCACHE_MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`L1_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// Status
@ -32,14 +32,14 @@ module VX_core #(
`endif
VX_mem_req_if #(
.DATA_WIDTH (`DMEM_DATA_WIDTH),
.ADDR_WIDTH (`DMEM_ADDR_WIDTH),
.TAG_WIDTH (`XMEM_TAG_WIDTH)
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
.ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
.TAG_WIDTH (`L1_MEM_TAG_WIDTH)
) mem_req_if();
VX_mem_rsp_if #(
.DATA_WIDTH (`DMEM_DATA_WIDTH),
.TAG_WIDTH (`XMEM_TAG_WIDTH)
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
.TAG_WIDTH (`L1_MEM_TAG_WIDTH)
) mem_rsp_if();
assign mem_req_valid = mem_req_if.valid;
@ -58,25 +58,25 @@ module VX_core #(
//--
VX_dcache_req_if #(
.NUM_REQS (`DNUM_REQS),
.WORD_SIZE (`DWORD_SIZE),
.TAG_WIDTH (`DCORE_TAG_WIDTH)
.NUM_REQS (`DCACHE_NUM_REQS),
.WORD_SIZE (`DCACHE_WORD_SIZE),
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
) dcache_req_if();
VX_dcache_rsp_if #(
.NUM_REQS (`DNUM_REQS),
.WORD_SIZE (`DWORD_SIZE),
.TAG_WIDTH (`DCORE_TAG_WIDTH)
.NUM_REQS (`DCACHE_NUM_REQS),
.WORD_SIZE (`DCACHE_WORD_SIZE),
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
) dcache_rsp_if();
VX_icache_req_if #(
.WORD_SIZE (`IWORD_SIZE),
.TAG_WIDTH (`ICORE_TAG_WIDTH)
.WORD_SIZE (`ICACHE_WORD_SIZE),
.TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
) icache_req_if();
VX_icache_rsp_if #(
.WORD_SIZE (`IWORD_SIZE),
.TAG_WIDTH (`ICORE_TAG_WIDTH)
.WORD_SIZE (`ICACHE_WORD_SIZE),
.TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
) icache_rsp_if();
VX_pipeline #(

View file

@ -7,15 +7,18 @@ module VX_csr_data #(
input wire reset,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if,
VX_perf_memsys_if.slave perf_memsys_if,
VX_perf_pipeline_if.slave perf_pipeline_if,
`endif
VX_cmt_to_csr_if cmt_to_csr_if,
VX_fpu_to_csr_if fpu_to_csr_if,
VX_cmt_to_csr_if.slave cmt_to_csr_if,
VX_fetch_to_csr_if.slave fetch_to_csr_if,
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if,
`endif
`ifdef EXT_TEX_ENABLE
VX_tex_csr_if tex_csr_if,
VX_tex_csr_if.slave tex_csr_if,
`endif
input wire read_enable,
@ -30,6 +33,8 @@ module VX_csr_data #(
input wire busy
);
import fpu_types::*;
reg [`CSR_WIDTH-1:0] csr_satp;
reg [`CSR_WIDTH-1:0] csr_mstatus;
reg [`CSR_WIDTH-1:0] csr_medeleg;
@ -42,38 +47,40 @@ module VX_csr_data #(
reg [63:0] csr_cycle;
reg [63:0] csr_instret;
reg [`NUM_WARPS-1:0][`FRM_BITS+`FFG_BITS-1:0] fcsr;
always @(posedge clk) begin
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr;
always @(posedge clk) begin
`ifdef EXT_F_ENABLE
if (reset) begin
fcsr <= '0;
end
end
if (fpu_to_csr_if.write_enable) begin
fcsr[fpu_to_csr_if.write_wid][`FFG_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFG_BITS-1:0]
| fpu_to_csr_if.write_fflags;
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
| fpu_to_csr_if.write_fflags;
end
`endif
if (write_enable) begin
case (write_addr)
`CSR_FFLAGS: fcsr[write_wid][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0];
`CSR_FRM: fcsr[write_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0];
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0];
`CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0];
`CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0];
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0];
`CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0];
`CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0];
`CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0];
`CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0];
`CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0];
`CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0];
`CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0];
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
`CSR_SATP: csr_satp <= write_data;
`CSR_MSTATUS: csr_mstatus <= write_data;
`CSR_MEDELEG: csr_medeleg <= write_data;
`CSR_MIDELEG: csr_mideleg <= write_data;
`CSR_MIE: csr_mie <= write_data;
`CSR_MTVEC: csr_mtvec <= write_data;
`CSR_MEPC: csr_mepc <= write_data;
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data;
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data;
default: begin
assert (write_addr >= `CSR_TEX_BEGIN(0) && write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))
else $error("%t: invalid CSR write address: %0h", $time, write_addr);
else `ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, write_addr));
end
endcase
end
@ -109,8 +116,8 @@ module VX_csr_data #(
read_data_r = 'x;
read_addr_valid_r = 1;
case (read_addr)
`CSR_FFLAGS : read_data_r = 32'(fcsr[read_wid][`FFG_BITS-1:0]);
`CSR_FRM : read_data_r = 32'(fcsr[read_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS]);
`CSR_FFLAGS : read_data_r = 32'(fcsr[read_wid][`FFLAGS_BITS-1:0]);
`CSR_FRM : read_data_r = 32'(fcsr[read_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS]);
`CSR_FCSR : read_data_r = 32'(fcsr[read_wid]);
`CSR_WTID ,
@ -120,6 +127,9 @@ module VX_csr_data #(
/*`CSR_MHARTID ,*/
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
`CSR_GCID : read_data_r = CORE_ID;
`CSR_TMASK : read_data_r = 32'(fetch_to_csr_if.thread_masks[read_wid]);
`CSR_NT : read_data_r = `NUM_THREADS;
`CSR_NW : read_data_r = `NUM_WARPS;
`CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS;
@ -223,7 +233,9 @@ module VX_csr_data #(
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("invalid CSR read address: %0h", read_addr))
assign read_data = read_data_r;
assign fpu_to_csr_if.read_frm = fcsr[fpu_to_csr_if.read_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS];
`ifdef EXT_F_ENABLE
assign fpu_to_csr_if.read_frm = fcsr[fpu_to_csr_if.read_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS];
`endif
endmodule

View file

@ -3,28 +3,29 @@
module VX_csr_unit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if,
VX_perf_memsys_if.slave perf_memsys_if,
VX_perf_pipeline_if.slave perf_pipeline_if,
`endif
VX_cmt_to_csr_if cmt_to_csr_if,
VX_fpu_to_csr_if fpu_to_csr_if,
`ifdef EXT_TEX_ENABLE
VX_tex_csr_if tex_csr_if,
`endif
VX_cmt_to_csr_if.slave cmt_to_csr_if,
VX_fetch_to_csr_if.slave fetch_to_csr_if,
VX_csr_req_if.slave csr_req_if,
VX_commit_if.master csr_commit_if,
VX_csr_req_if csr_req_if,
VX_commit_if csr_commit_if,
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if,
input wire[`NUM_WARPS-1:0] fpu_pending,
`endif
`ifdef EXT_TEX_ENABLE
VX_tex_csr_if.slave tex_csr_if,
`endif
input wire busy,
input wire[`NUM_WARPS-1:0] fpu_pending,
output wire[`NUM_WARPS-1:0] pending
output wire[`NUM_WARPS-1:0] pending,
input wire busy
);
wire csr_we_s1;
wire [`CSR_ADDR_BITS-1:0] csr_addr_s1;
@ -33,7 +34,7 @@ module VX_csr_unit #(
wire write_enable = csr_commit_if.valid && csr_we_s1;
wire [31:0] csr_req_data = csr_req_if.use_imm ? 32'(csr_req_if.rs1) : csr_req_if.rs1_data;
wire [31:0] csr_req_data = csr_req_if.use_imm ? 32'(csr_req_if.imm) : csr_req_if.rs1_data;
VX_csr_data #(
.CORE_ID(CORE_ID)
@ -45,6 +46,8 @@ module VX_csr_unit #(
.perf_pipeline_if (perf_pipeline_if),
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if),
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
`ifdef EXT_TEX_ENABLE
.tex_csr_if (tex_csr_if),
@ -72,21 +75,25 @@ module VX_csr_unit #(
always @(*) begin
csr_we_s0_unqual = (csr_req_data != 0);
case (csr_req_if.op_type)
`CSR_RW: begin
`INST_CSR_RW: begin
csr_updated_data = csr_req_data;
csr_we_s0_unqual = 1;
end
`CSR_RS: begin
`INST_CSR_RS: begin
csr_updated_data = csr_read_data_qual | csr_req_data;
end
//`CSR_RC
//`INST_CSR_RC
default: begin
csr_updated_data = csr_read_data_qual & ~csr_req_data;
end
endcase
end
`ifdef EXT_F_ENABLE
wire stall_in = fpu_pending[csr_req_if.wid];
`else
wire stall_in = 0;
`endif
wire csr_req_valid = csr_req_if.valid && !stall_in;

View file

@ -1,16 +1,17 @@
`include "VX_define.vh"
`ifdef DBG_PRINT_PIPELINE
`include "VX_print_instr.vh"
`endif
`ifdef EXT_F_ENABLE
`define USED_IREG(r) \
used_regs[{1'b0, r}] = 1
`define USED_IREG(r) \
r``_r = {1'b0, ``r}
`define USED_FREG(r) \
r``_r[5] = 1; \
used_regs[{1'b1, r}] = 1
`define USED_FREG(r) \
r``_r = {1'b1, ``r}
`else
`define USED_IREG(r) \
used_regs[r] = 1
r``_r = ``r
`endif
module VX_decode #(
@ -20,25 +21,24 @@ module VX_decode #(
input wire reset,
// inputs
VX_ifetch_rsp_if ifetch_rsp_if,
VX_ifetch_rsp_if.slave ifetch_rsp_if,
// outputs
VX_decode_if decode_if,
VX_wstall_if wstall_if,
VX_join_if join_if
VX_decode_if.master decode_if,
VX_wstall_if.master wstall_if,
VX_join_if.master join_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
reg [`EX_BITS-1:0] ex_type;
reg [`OP_BITS-1:0] op_type;
reg [`MOD_BITS-1:0] op_mod;
reg [`INST_OP_BITS-1:0] op_type;
reg [`INST_MOD_BITS-1:0] op_mod;
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [31:0] imm;
reg use_rd, use_PC, use_imm;
reg is_join, is_wstall;
reg [`NUM_REGS-1:0] used_regs;
wire [31:0] instr = ifetch_rsp_if.data;
wire [6:0] opcode = instr[6:0];
@ -58,36 +58,37 @@ module VX_decode #(
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
wire [11:0] jalr_imm = {func7, rs2};
`UNUSED_VAR (rs3)
always @(*) begin
ex_type = 0;
op_type = 'x;
op_mod = 0;
rd_r = `NR_BITS'(rd);
rs1_r = `NR_BITS'(rs1);
rs2_r = `NR_BITS'(rs2);
rs3_r = `NR_BITS'(rs3);
rd_r = 0;
rs1_r = 0;
rs2_r = 0;
rs3_r = 0;
imm = 'x;
use_imm = 0;
use_PC = 0;
use_rd = 0;
is_join = 0;
is_wstall = 0;
used_regs = 0;
case (opcode)
`INST_I: begin
ex_type = `EX_ALU;
case (func3)
3'h0: op_type = `OP_BITS'(`ALU_ADD);
3'h1: op_type = `OP_BITS'(`ALU_SLL);
3'h2: op_type = `OP_BITS'(`ALU_SLT);
3'h3: op_type = `OP_BITS'(`ALU_SLTU);
3'h4: op_type = `OP_BITS'(`ALU_XOR);
3'h5: op_type = (func7[5]) ? `OP_BITS'(`ALU_SRA) : `OP_BITS'(`ALU_SRL);
3'h6: op_type = `OP_BITS'(`ALU_OR);
3'h7: op_type = `OP_BITS'(`ALU_AND);
3'h0: op_type = `INST_OP_BITS'(`INST_ALU_ADD);
3'h1: op_type = `INST_OP_BITS'(`INST_ALU_SLL);
3'h2: op_type = `INST_OP_BITS'(`INST_ALU_SLT);
3'h3: op_type = `INST_OP_BITS'(`INST_ALU_SLTU);
3'h4: op_type = `INST_OP_BITS'(`INST_ALU_XOR);
3'h5: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SRA) : `INST_OP_BITS'(`INST_ALU_SRL);
3'h6: op_type = `INST_OP_BITS'(`INST_ALU_OR);
3'h7: op_type = `INST_OP_BITS'(`INST_ALU_AND);
default:;
endcase
use_rd = 1;
@ -101,14 +102,14 @@ module VX_decode #(
`ifdef EXT_F_ENABLE
if (func7[0]) begin
case (func3)
3'h0: op_type = `OP_BITS'(`MUL_MUL);
3'h1: op_type = `OP_BITS'(`MUL_MULH);
3'h2: op_type = `OP_BITS'(`MUL_MULHSU);
3'h3: op_type = `OP_BITS'(`MUL_MULHU);
3'h4: op_type = `OP_BITS'(`MUL_DIV);
3'h5: op_type = `OP_BITS'(`MUL_DIVU);
3'h6: op_type = `OP_BITS'(`MUL_REM);
3'h7: op_type = `OP_BITS'(`MUL_REMU);
3'h0: op_type = `INST_OP_BITS'(`INST_MUL_MUL);
3'h1: op_type = `INST_OP_BITS'(`INST_MUL_MULH);
3'h2: op_type = `INST_OP_BITS'(`INST_MUL_MULHSU);
3'h3: op_type = `INST_OP_BITS'(`INST_MUL_MULHU);
3'h4: op_type = `INST_OP_BITS'(`INST_MUL_DIV);
3'h5: op_type = `INST_OP_BITS'(`INST_MUL_DIVU);
3'h6: op_type = `INST_OP_BITS'(`INST_MUL_REM);
3'h7: op_type = `INST_OP_BITS'(`INST_MUL_REMU);
default:;
endcase
op_mod = 2;
@ -116,14 +117,14 @@ module VX_decode #(
`endif
begin
case (func3)
3'h0: op_type = (func7[5]) ? `OP_BITS'(`ALU_SUB) : `OP_BITS'(`ALU_ADD);
3'h1: op_type = `OP_BITS'(`ALU_SLL);
3'h2: op_type = `OP_BITS'(`ALU_SLT);
3'h3: op_type = `OP_BITS'(`ALU_SLTU);
3'h4: op_type = `OP_BITS'(`ALU_XOR);
3'h5: op_type = (func7[5]) ? `OP_BITS'(`ALU_SRA) : `OP_BITS'(`ALU_SRL);
3'h6: op_type = `OP_BITS'(`ALU_OR);
3'h7: op_type = `OP_BITS'(`ALU_AND);
3'h0: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SUB) : `INST_OP_BITS'(`INST_ALU_ADD);
3'h1: op_type = `INST_OP_BITS'(`INST_ALU_SLL);
3'h2: op_type = `INST_OP_BITS'(`INST_ALU_SLT);
3'h3: op_type = `INST_OP_BITS'(`INST_ALU_SLTU);
3'h4: op_type = `INST_OP_BITS'(`INST_ALU_XOR);
3'h5: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SRA) : `INST_OP_BITS'(`INST_ALU_SRL);
3'h6: op_type = `INST_OP_BITS'(`INST_ALU_OR);
3'h7: op_type = `INST_OP_BITS'(`INST_ALU_AND);
default:;
endcase
end
@ -134,7 +135,7 @@ module VX_decode #(
end
`INST_LUI: begin
ex_type = `EX_ALU;
op_type = `OP_BITS'(`ALU_LUI);
op_type = `INST_OP_BITS'(`INST_ALU_LUI);
use_rd = 1;
use_imm = 1;
imm = {upper_imm, 12'(0)};
@ -143,7 +144,7 @@ module VX_decode #(
end
`INST_AUIPC: begin
ex_type = `EX_ALU;
op_type = `OP_BITS'(`ALU_AUIPC);
op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
use_rd = 1;
use_imm = 1;
use_PC = 1;
@ -152,7 +153,7 @@ module VX_decode #(
end
`INST_JAL: begin
ex_type = `EX_ALU;
op_type = `OP_BITS'(`BR_JAL);
op_type = `INST_OP_BITS'(`INST_BR_JAL);
op_mod = 1;
use_rd = 1;
use_imm = 1;
@ -163,7 +164,7 @@ module VX_decode #(
end
`INST_JALR: begin
ex_type = `EX_ALU;
op_type = `OP_BITS'(`BR_JALR);
op_type = `INST_OP_BITS'(`INST_BR_JALR);
op_mod = 1;
use_rd = 1;
use_imm = 1;
@ -175,12 +176,12 @@ module VX_decode #(
`INST_B: begin
ex_type = `EX_ALU;
case (func3)
3'h0: op_type = `OP_BITS'(`BR_EQ);
3'h1: op_type = `OP_BITS'(`BR_NE);
3'h4: op_type = `OP_BITS'(`BR_LT);
3'h5: op_type = `OP_BITS'(`BR_GE);
3'h6: op_type = `OP_BITS'(`BR_LTU);
3'h7: op_type = `OP_BITS'(`BR_GEU);
3'h0: op_type = `INST_OP_BITS'(`INST_BR_EQ);
3'h1: op_type = `INST_OP_BITS'(`INST_BR_NE);
3'h4: op_type = `INST_OP_BITS'(`INST_BR_LT);
3'h5: op_type = `INST_OP_BITS'(`INST_BR_GE);
3'h6: op_type = `INST_OP_BITS'(`INST_BR_LTU);
3'h7: op_type = `INST_OP_BITS'(`INST_BR_GEU);
default:;
endcase
op_mod = 1;
@ -193,35 +194,37 @@ module VX_decode #(
end
`INST_F: begin
ex_type = `EX_LSU;
op_mod = `MOD_BITS'(!func3[0]); // data fence
op_type = `INST_OP_BITS'(func3[0]);
op_mod = `INST_MOD_BITS'(1);
end
`INST_SYS : begin
if (func3[1:0] != 0) begin
ex_type = `EX_CSR;
op_type = `OP_BITS'(func3[1:0]);
op_type = `INST_OP_BITS'(func3[1:0]);
use_rd = 1;
use_imm = func3[2];
imm = 32'(u_12); // addr
imm[`CSR_ADDR_BITS-1:0] = u_12; // addr
`USED_IREG (rd);
if (func3[2]) begin
rs1_r = `NR_BITS'(rs1); // imm
imm[`CSR_ADDR_BITS +: `NRI_BITS] = rs1; // imm
end else begin
`USED_IREG (rs1);
end
end else begin
ex_type = `EX_ALU;
case (u_12)
12'h000: op_type = `OP_BITS'(`BR_ECALL);
12'h001: op_type = `OP_BITS'(`BR_EBREAK);
12'h302: op_type = `OP_BITS'(`BR_MRET);
12'h102: op_type = `OP_BITS'(`BR_SRET);
12'h7B2: op_type = `OP_BITS'(`BR_DRET);
12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL);
12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK);
12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET);
12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET);
12'h7B2: op_type = `INST_OP_BITS'(`INST_BR_DRET);
default:;
endcase
op_mod = 1;
use_rd = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1;
imm = 32'd4;
`USED_IREG (rd);
end
@ -231,7 +234,7 @@ module VX_decode #(
`endif
`INST_L: begin
ex_type = `EX_LSU;
op_type = `OP_BITS'({1'b0, func3});
op_type = `INST_OP_BITS'({1'b0, func3});
use_rd = 1;
imm = {{20{u_12[11]}}, u_12};
`ifdef EXT_F_ENABLE
@ -247,7 +250,7 @@ module VX_decode #(
`endif
`INST_S: begin
ex_type = `EX_LSU;
op_type = `OP_BITS'({1'b1, func3});
op_type = `INST_OP_BITS'({1'b1, func3});
imm = {{20{s_imm[11]}}, s_imm};
`USED_IREG (rs1);
`ifdef EXT_F_ENABLE
@ -263,7 +266,7 @@ module VX_decode #(
`INST_FNMSUB,
`INST_FNMADD: begin
ex_type = `EX_FPU;
op_type = `OP_BITS'(opcode[3:0]);
op_type = `INST_OP_BITS'(opcode[3:0]);
op_mod = func3;
use_rd = 1;
`USED_FREG (rd);
@ -280,35 +283,35 @@ module VX_decode #(
7'h04, // FSUB
7'h08, // FMUL
7'h0C: begin // FDIV
op_type = `OP_BITS'(func7[3:0]);
op_type = `INST_OP_BITS'(func7[3:0]);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
7'h2C: begin
op_type = `OP_BITS'(`FPU_SQRT);
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
`USED_FREG (rd);
`USED_FREG (rs1);
end
7'h50: begin
op_type = `OP_BITS'(`FPU_CMP);
op_type = `INST_OP_BITS'(`INST_FPU_CMP);
`USED_IREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
7'h60: begin
op_type = (instr[20]) ? `OP_BITS'(`FPU_CVTWUS) : `OP_BITS'(`FPU_CVTWS);
op_type = (instr[20]) ? `INST_OP_BITS'(`INST_FPU_CVTWUS) : `INST_OP_BITS'(`INST_FPU_CVTWS);
`USED_IREG (rd);
`USED_FREG (rs1);
end
7'h68: begin
op_type = (instr[20]) ? `OP_BITS'(`FPU_CVTSWU) : `OP_BITS'(`FPU_CVTSW);
op_type = (instr[20]) ? `INST_OP_BITS'(`INST_FPU_CVTSWU) : `INST_OP_BITS'(`INST_FPU_CVTSW);
`USED_FREG (rd);
`USED_IREG (rs1);
end
7'h10: begin
// FSGNJ=0, FSGNJN=1, FSGNJX=2
op_type = `OP_BITS'(`FPU_MISC);
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = {1'b0, func3[1:0]};
`USED_FREG (rd);
`USED_FREG (rs1);
@ -316,7 +319,7 @@ module VX_decode #(
end
7'h14: begin
// FMIN=3, FMAX=4
op_type = `OP_BITS'(`FPU_MISC);
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = func3[0] ? 4 : 3;
`USED_FREG (rd);
`USED_FREG (rs1);
@ -325,10 +328,10 @@ module VX_decode #(
7'h70: begin
if (func3[0]) begin
// FCLASS
op_type = `OP_BITS'(`FPU_CLASS);
op_type = `INST_OP_BITS'(`INST_FPU_CLASS);
end else begin
// FMV.X.W=5
op_type = `OP_BITS'(`FPU_MISC);
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 5;
end
`USED_IREG (rd);
@ -336,7 +339,7 @@ module VX_decode #(
end
7'h78: begin
// FMV.W.X=6
op_type = `OP_BITS'(`FPU_MISC);
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 6;
`USED_FREG (rd);
`USED_IREG (rs1);
@ -349,26 +352,26 @@ module VX_decode #(
ex_type = `EX_GPU;
case (func3)
3'h0: begin
op_type = `OP_BITS'(`GPU_TMC);
op_type = rs2[0] ? `INST_OP_BITS'(`INST_GPU_PRED) : `INST_OP_BITS'(`INST_GPU_TMC);
is_wstall = 1;
`USED_IREG (rs1);
end
3'h1: begin
op_type = `OP_BITS'(`GPU_WSPAWN);
op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h2: begin
op_type = `OP_BITS'(`GPU_SPLIT);
op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
is_wstall = 1;
`USED_IREG (rs1);
end
3'h3: begin
op_type = `OP_BITS'(`GPU_JOIN);
op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
is_join = 1;
end
3'h4: begin
op_type = `OP_BITS'(`GPU_BAR);
op_type = `INST_OP_BITS'(`INST_GPU_BAR);
is_wstall = 1;
`USED_IREG (rs1);
`USED_IREG (rs2);
@ -410,7 +413,6 @@ module VX_decode #(
assign decode_if.imm = imm;
assign decode_if.use_PC = use_PC;
assign decode_if.use_imm = use_imm;
assign decode_if.used_regs = used_regs;
///////////////////////////////////////////////////////////////////////////
@ -419,19 +421,20 @@ module VX_decode #(
assign join_if.valid = ifetch_rsp_fire && is_join;
assign join_if.wid = ifetch_rsp_if.wid;
assign wstall_if.valid = ifetch_rsp_fire && is_wstall;
assign wstall_if.valid = ifetch_rsp_fire;
assign wstall_if.wid = ifetch_rsp_if.wid;
assign wstall_if.stalled = is_wstall;
assign ifetch_rsp_if.ready = decode_if.ready;
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin
$write("%t: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.PC);
dpi_trace("%d: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.PC);
print_ex_type(decode_if.ex_type);
$write(", op=");
dpi_trace(", op=");
print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
$write(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_regs=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.used_regs);
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm);
end
end
`endif

View file

@ -14,14 +14,16 @@
`define NB_BITS `LOG2UP(`NUM_BARRIERS)
`define REQS_BITS `LOG2UP(NUM_REQS)
`define NUM_IREGS 32
`define NRI_BITS `LOG2UP(`NUM_IREGS)
`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS)
`ifdef EXT_F_ENABLE
`define NUM_REGS 64
`define NUM_REGS (2 * `NUM_IREGS)
`else
`define NUM_REGS 32
`define NUM_REGS `NUM_IREGS
`endif
`define NR_BITS `LOG2UP(`NUM_REGS)
@ -34,6 +36,16 @@
///////////////////////////////////////////////////////////////////////////////
`define EX_NOP 3'h0
`define EX_ALU 3'h1
`define EX_LSU 3'h2
`define EX_CSR 3'h3
`define EX_FPU 3'h4
`define EX_GPU 3'h5
`define EX_BITS 3
///////////////////////////////////////////////////////////////////////////////
`define INST_LUI 7'b0110111
`define INST_AUIPC 7'b0010111
`define INST_JAL 7'b1101111
@ -60,139 +72,126 @@
///////////////////////////////////////////////////////////////////////////////
`define FRM_RNE 3'b000 // round to nearest even
`define FRM_RTZ 3'b001 // round to zero
`define FRM_RDN 3'b010 // round to -inf
`define FRM_RUP 3'b011 // round to +inf
`define FRM_RMM 3'b100 // round to nearest max magnitude
`define FRM_DYN 3'b111 // dynamic mode
`define FRM_BITS 3
`define INST_FRM_RNE 3'b000 // round to nearest even
`define INST_FRM_RTZ 3'b001 // round to zero
`define INST_FRM_RDN 3'b010 // round to -inf
`define INST_FRM_RUP 3'b011 // round to +inf
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
`define INST_FRM_DYN 3'b111 // dynamic mode
`define INST_FRM_BITS 3
///////////////////////////////////////////////////////////////////////////////
`define EX_NOP 3'h0
`define EX_ALU 3'h1
`define EX_LSU 3'h2
`define EX_CSR 3'h3
`define EX_FPU 3'h4
`define EX_GPU 3'h5
`define EX_BITS 3
`define NUM_EXS 6
`define NE_BITS `LOG2UP(`NUM_EXS)
`define INST_OP_BITS 4
`define INST_MOD_BITS 3
///////////////////////////////////////////////////////////////////////////////
`define OP_BITS 4
`define MOD_BITS 3
`define INST_ALU_ADD 4'b0000
`define INST_ALU_LUI 4'b0010
`define INST_ALU_AUIPC 4'b0011
`define INST_ALU_SLTU 4'b0100
`define INST_ALU_SLT 4'b0101
`define INST_ALU_SRL 4'b1000
`define INST_ALU_SRA 4'b1001
`define INST_ALU_SUB 4'b1011
`define INST_ALU_AND 4'b1100
`define INST_ALU_OR 4'b1101
`define INST_ALU_XOR 4'b1110
`define INST_ALU_SLL 4'b1111
`define INST_ALU_OTHER 4'b0111
`define INST_ALU_BITS 4
`define INST_ALU_OP(x) x[`INST_ALU_BITS-1:0]
`define INST_ALU_OP_CLASS(x) x[3:2]
`define INST_ALU_SIGNED(x) x[0]
`define INST_ALU_IS_BR(x) x[0]
`define INST_ALU_IS_MUL(x) x[1]
`define ALU_ADD 4'b0000
`define ALU_LUI 4'b0010
`define ALU_AUIPC 4'b0011
`define ALU_SLTU 4'b0100
`define ALU_SLT 4'b0101
`define ALU_SRL 4'b1000
`define ALU_SRA 4'b1001
`define ALU_SUB 4'b1011
`define ALU_AND 4'b1100
`define ALU_OR 4'b1101
`define ALU_XOR 4'b1110
`define ALU_SLL 4'b1111
`define ALU_OTHER 4'b0111
`define ALU_BITS 4
`define ALU_OP(x) x[`ALU_BITS-1:0]
`define ALU_OP_CLASS(x) x[3:2]
`define ALU_SIGNED(x) x[0]
`define ALU_IS_BR(x) x[0]
`define ALU_IS_MUL(x) x[1]
`define INST_BR_EQ 4'b0000
`define INST_BR_NE 4'b0010
`define INST_BR_LTU 4'b0100
`define INST_BR_GEU 4'b0110
`define INST_BR_LT 4'b0101
`define INST_BR_GE 4'b0111
`define INST_BR_JAL 4'b1000
`define INST_BR_JALR 4'b1001
`define INST_BR_ECALL 4'b1010
`define INST_BR_EBREAK 4'b1011
`define INST_BR_MRET 4'b1100
`define INST_BR_SRET 4'b1101
`define INST_BR_DRET 4'b1110
`define INST_BR_OTHER 4'b1111
`define INST_BR_BITS 4
`define INST_BR_NEG(x) x[1]
`define INST_BR_LESS(x) x[2]
`define INST_BR_STATIC(x) x[3]
`define BR_EQ 4'b0000
`define BR_NE 4'b0010
`define BR_LTU 4'b0100
`define BR_GEU 4'b0110
`define BR_LT 4'b0101
`define BR_GE 4'b0111
`define BR_JAL 4'b1000
`define BR_JALR 4'b1001
`define BR_ECALL 4'b1010
`define BR_EBREAK 4'b1011
`define BR_MRET 4'b1100
`define BR_SRET 4'b1101
`define BR_DRET 4'b1110
`define BR_OTHER 4'b1111
`define BR_BITS 4
`define BR_OP(x) x[`BR_BITS-1:0]
`define BR_NEG(x) x[1]
`define BR_LESS(x) x[2]
`define BR_STATIC(x) x[3]
`define INST_MUL_MUL 3'h0
`define INST_MUL_MULH 3'h1
`define INST_MUL_MULHSU 3'h2
`define INST_MUL_MULHU 3'h3
`define INST_MUL_DIV 3'h4
`define INST_MUL_DIVU 3'h5
`define INST_MUL_REM 3'h6
`define INST_MUL_REMU 3'h7
`define INST_MUL_BITS 3
`define INST_MUL_IS_DIV(x) x[2]
`define MUL_MUL 3'h0
`define MUL_MULH 3'h1
`define MUL_MULHSU 3'h2
`define MUL_MULHU 3'h3
`define MUL_DIV 3'h4
`define MUL_DIVU 3'h5
`define MUL_REM 3'h6
`define MUL_REMU 3'h7
`define MUL_BITS 3
`define MUL_OP(x) x[`MUL_BITS-1:0]
`define MUL_IS_DIV(x) x[2]
`define INST_FMT_B 3'b000
`define INST_FMT_H 3'b001
`define INST_FMT_W 3'b010
`define INST_FMT_BU 3'b100
`define INST_FMT_HU 3'b101
`define FMT_B 3'b000
`define FMT_H 3'b001
`define FMT_W 3'b010
`define FMT_BU 3'b100
`define FMT_HU 3'b101
`define INST_LSU_LB 4'b0000
`define INST_LSU_LH 4'b0001
`define INST_LSU_LW 4'b0010
`define INST_LSU_LBU 4'b0100
`define INST_LSU_LHU 4'b0101
`define INST_LSU_SB 4'b1000
`define INST_LSU_SH 4'b1001
`define INST_LSU_SW 4'b1010
`define INST_LSU_BITS 4
`define INST_LSU_FMT(x) x[2:0]
`define INST_LSU_WSIZE(x) x[1:0]
`define INST_LSU_IS_FENCE(x) x[0]
`define LSU_LB 4'b0000
`define LSU_LH 4'b0001
`define LSU_LW 4'b0010
`define LSU_LBU 4'b0100
`define LSU_LHU 4'b0101
`define LSU_SB 4'b1000
`define LSU_SH 4'b1001
`define LSU_SW 4'b1010
`define LSU_BITS 4
`define LSU_FMT(x) x[2:0]
`define LSU_WSIZE(x) x[1:0]
`define LSU_OP(x) x[`LSU_BITS-1:0]
`define LSU_IS_FENCE(x) x[0]
`define INST_FENCE_BITS 1
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
`define CSR_RW 2'h1
`define CSR_RS 2'h2
`define CSR_RC 2'h3
`define CSR_OTHER 2'h0
`define CSR_BITS 2
`define CSR_OP(x) x[`CSR_BITS-1:0]
`define INST_CSR_RW 2'h1
`define INST_CSR_RS 2'h2
`define INST_CSR_RC 2'h3
`define INST_CSR_OTHER 2'h0
`define INST_CSR_BITS 2
`define FPU_ADD 4'h0
`define FPU_SUB 4'h4
`define FPU_MUL 4'h8
`define FPU_DIV 4'hC
`define FPU_CVTWS 4'h1 // FCVT.W.S
`define FPU_CVTWUS 4'h5 // FCVT.WU.S
`define FPU_CVTSW 4'h9 // FCVT.S.W
`define FPU_CVTSWU 4'hD // FCVT.S.WU
`define FPU_SQRT 4'h2
`define FPU_CLASS 4'h6
`define FPU_CMP 4'hA
`define FPU_MISC 4'hE // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX
`define FPU_MADD 4'h3
`define FPU_MSUB 4'h7
`define FPU_NMSUB 4'hB
`define FPU_NMADD 4'hF
`define FPU_BITS 4
`define FPU_OP(x) x[`FPU_BITS-1:0]
`define INST_FPU_ADD 4'h0
`define INST_FPU_SUB 4'h4
`define INST_FPU_MUL 4'h8
`define INST_FPU_DIV 4'hC
`define INST_FPU_CVTWS 4'h1 // FCVT.W.S
`define INST_FPU_CVTWUS 4'h5 // FCVT.WU.S
`define INST_FPU_CVTSW 4'h9 // FCVT.S.W
`define INST_FPU_CVTSWU 4'hD // FCVT.S.WU
`define INST_FPU_SQRT 4'h2
`define INST_FPU_CLASS 4'h6
`define INST_FPU_CMP 4'hA
`define INST_FPU_MISC 4'hE // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX
`define INST_FPU_MADD 4'h3
`define INST_FPU_MSUB 4'h7
`define INST_FPU_NMSUB 4'hB
`define INST_FPU_NMADD 4'hF
`define INST_FPU_BITS 4
`define GPU_TMC 3'h0
`define GPU_WSPAWN 3'h1
`define GPU_SPLIT 3'h2
`define GPU_JOIN 3'h3
`define GPU_BAR 3'h4
`define GPU_TEX 3'h5
`define GPU_OTHER 3'h7
`define GPU_BITS 3
`define GPU_OP(x) x[`GPU_BITS-1:0]
`define INST_GPU_TMC 3'h0
`define INST_GPU_WSPAWN 3'h1
`define INST_GPU_SPLIT 3'h2
`define INST_GPU_JOIN 3'h3
`define INST_GPU_BAR 3'h4
`define INST_GPU_PRED 3'h5
`define INST_GPU_TEX 3'h6
`define INST_GPU_BITS 3
///////////////////////////////////////////////////////////////////////////////
@ -244,59 +243,44 @@
`endif
// non-cacheable address bit
`define NC_ADDR_BITS 1
`define NC_FLAG_BITS 1
////////////////////////// Icache Configurable Knobs //////////////////////////
// Cache ID
`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0)
// Block size in bytes
`define ICACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE)
// Word size in bytes
`define IWORD_SIZE 4
`define ICACHE_WORD_SIZE 4
// Number of banks
`define INUM_BANKS 1
// Core request address bits
`define ICORE_ADDR_WIDTH (32-`CLOG2(`IWORD_SIZE))
// Core request byte enable bits
`define ICORE_BYTEEN_WIDTH `DWORD_SIZE
// Block size in bytes
`define ICACHE_LINE_SIZE `L1_BLOCK_SIZE
// TAG sharing enable
`define ICORE_TAG_ID_BITS `NW_BITS
`define ICACHE_CORE_TAG_ID_BITS `NW_BITS
// Core request tag bits
`define ICORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICORE_TAG_ID_BITS)
`define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICACHE_CORE_TAG_ID_BITS)
// Memory request data bits
`define IMEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8)
`define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8)
// Memory request address bits
`define IMEM_ADDR_WIDTH (32 - `CLOG2(`ICACHE_LINE_SIZE))
// Memory byte enable bits
`define IMEM_BYTEEN_WIDTH `ICACHE_LINE_SIZE
`define ICACHE_MEM_ADDR_WIDTH (32 - `CLOG2(`ICACHE_LINE_SIZE))
// Memory request tag bits
`define IMEM_TAG_WIDTH `IMEM_ADDR_WIDTH
`define ICACHE_MEM_TAG_WIDTH `CLOG2(`ICACHE_MSHR_SIZE)
////////////////////////// Dcache Configurable Knobs //////////////////////////
// Cache ID
`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1)
// Block size in bytes
`define DCACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE)
// Word size in bytes
`define DWORD_SIZE 4
`define DCACHE_WORD_SIZE 4
// Core request address bits
`define DCORE_ADDR_WIDTH (32-`CLOG2(`DWORD_SIZE))
// Block size in bytes
`define DCACHE_LINE_SIZE `L1_BLOCK_SIZE
// Core request tag bits
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
@ -304,126 +288,127 @@
`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_ADDR_BITS + `SM_ENABLE)
`define TEX_TAG_ID_BITS (2)
`define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS)
`define DCORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + 1)
`define DCACHE_DCORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `NC_FLAG_BITS)
`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS)
`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS)
`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS)
`else
`define DCORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_ADDR_BITS + `SM_ENABLE)
`define DCACHE_DCORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_ADDR_BITS + `SM_ENABLE)
`endif
`define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS)
`define DCACHE_DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS)
// Memory request data bits
`define DMEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8)
`define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8)
// Memory request address bits
`define DMEM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE))
`define DCACHE_MEM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE))
// Memory byte enable bits
`define DMEM_BYTEEN_WIDTH `DCACHE_LINE_SIZE
`define DCACHE_MEM_BYTEEN_WIDTH `DCACHE_LINE_SIZE
// Input request size
`define DNUM_REQS `NUM_THREADS
`define DCACHE_NUM_REQS `NUM_THREADS
// Memory request tag bits
`define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DWORD_SIZE)
`define _DNC_MEM_TAG_WIDTH ($clog2(`DNUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCORE_TAG_WIDTH)
`define DMEM_TAG_WIDTH `MAX((`DMEM_ADDR_WIDTH + `NC_ADDR_BITS), `_DNC_MEM_TAG_WIDTH)
`define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DCACHE_WORD_SIZE)
`define _DNC_MEM_TAG_WIDTH ($clog2(`DCACHE_NUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCACHE_CORE_TAG_WIDTH)
`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_FLAG_BITS), `_DNC_MEM_TAG_WIDTH)
// Merged D-cache/I-cache memory tag
`define L1_MEM_TAG_WIDTH (`MAX(`ICACHE_MEM_TAG_WIDTH, `DCACHE_MEM_TAG_WIDTH) + `CLOG2(2))
////////////////////////// SM Configurable Knobs //////////////////////////////
// Cache ID
`define SCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2)
`define SMEM_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2)
// Word size in bytes
`define SWORD_SIZE 4
`define SMEM_WORD_SIZE 4
// bank address offset
`define SBANK_ADDR_OFFSET `CLOG2(`STACK_SIZE / `SWORD_SIZE)
`define SMEM_BANK_ADDR_OFFSET `CLOG2(`STACK_SIZE / `SMEM_WORD_SIZE)
// Input request size
`define SNUM_REQS `NUM_THREADS
`define SMEM_NUM_REQS `NUM_THREADS
////////////////////////// L2cache Configurable Knobs /////////////////////////
// Cache ID
`define L2CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID)
// Block size in bytes
`define L2CACHE_LINE_SIZE `MEM_BLOCK_SIZE
`define L2_CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID)
// Word size in bytes
`define L2WORD_SIZE `DCACHE_LINE_SIZE
`define L2_WORD_SIZE `DCACHE_LINE_SIZE
// Block size in bytes
`define L2_CACHE_LINE_SIZE ((`L2_ENABLE) ? `MEM_BLOCK_SIZE : `L2_WORD_SIZE)
// Input request tag bits
`define L2CORE_TAG_WIDTH (`DCORE_TAG_WIDTH + `CLOG2(`NUM_CORES))
`define L2_CORE_TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH + `CLOG2(`NUM_CORES))
// Memory request data bits
`define L2MEM_DATA_WIDTH (`L2CACHE_LINE_SIZE * 8)
`define L2_MEM_DATA_WIDTH (`L2_CACHE_LINE_SIZE * 8)
// Memory request address bits
`define L2MEM_ADDR_WIDTH (32 - `CLOG2(`L2CACHE_LINE_SIZE))
`define L2_MEM_ADDR_WIDTH (32 - `CLOG2(`L2_CACHE_LINE_SIZE))
// Memory byte enable bits
`define L2MEM_BYTEEN_WIDTH `L2CACHE_LINE_SIZE
`define L2_MEM_BYTEEN_WIDTH `L2_CACHE_LINE_SIZE
// Input request size
`define L2NUM_REQS `NUM_CORES
`define L2_NUM_REQS `NUM_CORES
// Memory request tag bits
`define _L2MEM_ADDR_RATIO_W $clog2(`L2CACHE_LINE_SIZE / `L2WORD_SIZE)
`define _L2NC_MEM_TAG_WIDTH ($clog2(`L2NUM_REQS) + `_L2MEM_ADDR_RATIO_W + `XMEM_TAG_WIDTH)
`define _L2MEM_TAG_WIDTH `MAX((`L2MEM_ADDR_WIDTH + `NC_ADDR_BITS), `_L2NC_MEM_TAG_WIDTH)
`define L2MEM_TAG_WIDTH (`L2_ENABLE ? `_L2MEM_TAG_WIDTH : (`XMEM_TAG_WIDTH + `CLOG2(`L2NUM_REQS)))
`define _L2_MEM_ADDR_RATIO_W $clog2(`L2_CACHE_LINE_SIZE / `L2_WORD_SIZE)
`define _L2_NC_MEM_TAG_WIDTH ($clog2(`L2_NUM_REQS) + `_L2_MEM_ADDR_RATIO_W + `L1_MEM_TAG_WIDTH)
`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_FLAG_BITS), `_L2_NC_MEM_TAG_WIDTH)
`define L2_MEM_TAG_WIDTH ((`L2_ENABLE) ? `_L2_MEM_TAG_WIDTH : (`L1_MEM_TAG_WIDTH + `CLOG2(`L2_NUM_REQS)))
////////////////////////// L3cache Configurable Knobs /////////////////////////
// Cache ID
`define L3CACHE_ID 0
// Block size in bytes
`define L3CACHE_LINE_SIZE `MEM_BLOCK_SIZE
`define L3_CACHE_ID 0
// Word size in bytes
`define L3WORD_SIZE `L2CACHE_LINE_SIZE
`define L3_WORD_SIZE `L2_CACHE_LINE_SIZE
// Block size in bytes
`define L3_CACHE_LINE_SIZE ((`L3_ENABLE) ? `MEM_BLOCK_SIZE : `L3_WORD_SIZE)
// Input request tag bits
`define L3CORE_TAG_WIDTH (`L2CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS))
`define L3_CORE_TAG_WIDTH (`L2_CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS))
// Memory request data bits
`define L3MEM_DATA_WIDTH (`L3CACHE_LINE_SIZE * 8)
`define L3_MEM_DATA_WIDTH (`L3_CACHE_LINE_SIZE * 8)
// Memory request address bits
`define L3MEM_ADDR_WIDTH (32 - `CLOG2(`L3CACHE_LINE_SIZE))
`define L3_MEM_ADDR_WIDTH (32 - `CLOG2(`L3_CACHE_LINE_SIZE))
// Memory byte enable bits
`define L3MEM_BYTEEN_WIDTH `L3CACHE_LINE_SIZE
`define L3_MEM_BYTEEN_WIDTH `L3_CACHE_LINE_SIZE
// Input request size
`define L3NUM_REQS `NUM_CLUSTERS
`define L3_NUM_REQS `NUM_CLUSTERS
// Memory request tag bits
`define _L3MEM_ADDR_RATIO_W $clog2(`L3CACHE_LINE_SIZE / `L3WORD_SIZE)
`define _L3NC_MEM_TAG_WIDTH ($clog2(`L3NUM_REQS) + `_L3MEM_ADDR_RATIO_W + `L2MEM_TAG_WIDTH)
`define _L3MEM_TAG_WIDTH `MAX((`L3MEM_ADDR_WIDTH + `NC_ADDR_BITS), `_L3NC_MEM_TAG_WIDTH)
`define L3MEM_TAG_WIDTH (`L3_ENABLE ? `_L3MEM_TAG_WIDTH : (`L2MEM_TAG_WIDTH + `CLOG2(`L3NUM_REQS)))
`define _L3_MEM_ADDR_RATIO_W $clog2(`L3_CACHE_LINE_SIZE / `L3_WORD_SIZE)
`define _L3_NC_MEM_TAG_WIDTH ($clog2(`L3_NUM_REQS) + `_L3_MEM_ADDR_RATIO_W + `L2_MEM_TAG_WIDTH)
`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_FLAG_BITS), `_L3_NC_MEM_TAG_WIDTH)
`define L3_MEM_TAG_WIDTH ((`L3_ENABLE) ? `_L3_MEM_TAG_WIDTH : (`L2_MEM_TAG_WIDTH + `CLOG2(`L3_NUM_REQS)))
///////////////////////////////////////////////////////////////////////////////
`define VX_MEM_BYTEEN_WIDTH `L3MEM_BYTEEN_WIDTH
`define VX_MEM_ADDR_WIDTH `L3MEM_ADDR_WIDTH
`define VX_MEM_DATA_WIDTH `L3MEM_DATA_WIDTH
`define VX_MEM_TAG_WIDTH `L3MEM_TAG_WIDTH
`define VX_CORE_TAG_WIDTH `L3CORE_TAG_WIDTH
`define VX_MEM_BYTEEN_WIDTH `L3_MEM_BYTEEN_WIDTH
`define VX_MEM_ADDR_WIDTH `L3_MEM_ADDR_WIDTH
`define VX_MEM_DATA_WIDTH `L3_MEM_DATA_WIDTH
`define VX_MEM_TAG_WIDTH `L3_MEM_TAG_WIDTH
`define VX_CORE_TAG_WIDTH `L3_CORE_TAG_WIDTH
`define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES)
`define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)}
// Merged D-cache/I-cache memory tag
`define XMEM_TAG_WIDTH (`DMEM_TAG_WIDTH + `CLOG2(2))
///////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////
`include "VX_types.vh"
`include "VX_fpu_types.vh"
`include "VX_gpu_types.vh"
`endif

View file

@ -9,35 +9,42 @@ module VX_execute #(
input wire reset,
// Dcache interface
VX_dcache_req_if dcache_req_if,
VX_dcache_rsp_if dcache_rsp_if,
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
// commit status
VX_cmt_to_csr_if cmt_to_csr_if,
// commit interface
VX_cmt_to_csr_if.slave cmt_to_csr_if,
// fetch interface
VX_fetch_to_csr_if.slave fetch_to_csr_if,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if,
VX_perf_memsys_if.slave perf_memsys_if,
VX_perf_pipeline_if.slave perf_pipeline_if,
`endif
// inputs
VX_alu_req_if alu_req_if,
VX_lsu_req_if lsu_req_if,
VX_csr_req_if csr_req_if,
VX_fpu_req_if fpu_req_if,
VX_gpu_req_if gpu_req_if,
VX_alu_req_if.slave alu_req_if,
VX_lsu_req_if.slave lsu_req_if,
VX_csr_req_if.slave csr_req_if,
`ifdef EXT_F_ENABLE
VX_fpu_req_if.slave fpu_req_if,
`endif
VX_gpu_req_if.slave gpu_req_if,
// outputs
VX_branch_ctl_if branch_ctl_if,
VX_warp_ctl_if warp_ctl_if,
VX_commit_if alu_commit_if,
VX_commit_if ld_commit_if,
VX_commit_if st_commit_if,
VX_commit_if csr_commit_if,
VX_commit_if fpu_commit_if,
VX_commit_if gpu_commit_if,
VX_branch_ctl_if.master branch_ctl_if,
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master alu_commit_if,
VX_commit_if.master ld_commit_if,
VX_commit_if.master st_commit_if,
VX_commit_if.master csr_commit_if,
`ifdef EXT_F_ENABLE
VX_commit_if.master fpu_commit_if,
`endif
VX_commit_if.master gpu_commit_if,
input wire busy
input wire busy
);
VX_fpu_to_csr_if fpu_to_csr_if();
@ -178,8 +185,8 @@ module VX_execute #(
.clk (clk),
.reset (csr_reset),
`ifdef PERF_ENABLE
.perf_memsys_if (perf_memsys_if),
.perf_pipeline_if (perf_pipeline_if),
.perf_memsys_if (perf_memsys_if),
.perf_pipeline_if(perf_pipeline_if),
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fpu_to_csr_if (fpu_to_csr_if),
@ -188,8 +195,13 @@ module VX_execute #(
`endif
.csr_req_if (csr_req_if),
.csr_commit_if (csr_commit_if),
.fpu_pending (fpu_pending),
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
.fpu_pending (fpu_pending),
.pending (csr_pending),
`else
`UNUSED_PIN (pending),
`endif
.busy (busy)
);
@ -207,22 +219,6 @@ module VX_execute #(
.csr_pending (csr_pending),
.pending (fpu_pending)
);
`else
`UNUSED_VAR (csr_pending)
`UNUSED_VAR (fpu_to_csr_if.read_frm)
assign fpu_req_if.ready = 0;
assign fpu_commit_if.valid = 0;
assign fpu_commit_if.wid = 0;
assign fpu_commit_if.PC = 0;
assign fpu_commit_if.tmask = 0;
assign fpu_commit_if.wb = 0;
assign fpu_commit_if.rd = 0;
assign fpu_commit_if.data = 0;
assign fpu_to_csr_if.write_enable = 0;
assign fpu_to_csr_if.write_wid = 0;
assign fpu_to_csr_if.write_fflags = 0;
assign fpu_to_csr_if.read_wid = 0;
assign fpu_pending = 0;
`endif
VX_gpu_unit #(
@ -244,8 +240,8 @@ module VX_execute #(
// special workaround to get RISC-V tests Pass/Fail status
wire ebreak /* verilator public */;
assign ebreak = alu_req_if.valid && alu_req_if.ready
&& `ALU_IS_BR(alu_req_if.op_mod)
&& (`BR_OP(alu_req_if.op_type) == `BR_EBREAK
|| `BR_OP(alu_req_if.op_type) == `BR_ECALL);
&& `INST_ALU_IS_BR(alu_req_if.op_mod)
&& (`INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_EBREAK
|| `INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_ECALL);
endmodule

View file

@ -9,19 +9,23 @@ module VX_fetch #(
input wire reset,
// Icache interface
VX_icache_req_if icache_req_if,
VX_icache_rsp_if icache_rsp_if,
VX_icache_req_if.master icache_req_if,
VX_icache_rsp_if.slave icache_rsp_if,
// inputs
VX_wstall_if wstall_if,
VX_join_if join_if,
VX_branch_ctl_if branch_ctl_if,
VX_warp_ctl_if warp_ctl_if,
VX_wstall_if.slave wstall_if,
VX_join_if.slave join_if,
VX_branch_ctl_if.slave branch_ctl_if,
VX_warp_ctl_if.slave warp_ctl_if,
// outputs
VX_ifetch_rsp_if ifetch_rsp_if,
VX_ifetch_rsp_if.master ifetch_rsp_if,
output wire busy
// csr interface
VX_fetch_to_csr_if.master fetch_to_csr_if,
// busy status
output wire busy
);
VX_ifetch_req_if ifetch_req_if();
@ -32,13 +36,17 @@ module VX_fetch #(
`SCOPE_BIND_VX_fetch_warp_sched
.clk (clk),
.reset (reset),
.reset (reset),
.warp_ctl_if (warp_ctl_if),
.wstall_if (wstall_if),
.join_if (join_if),
.branch_ctl_if (branch_ctl_if),
.ifetch_req_if (ifetch_req_if),
.ifetch_rsp_if (ifetch_rsp_if),
.fetch_to_csr_if (fetch_to_csr_if),
.busy (busy)
);

View file

@ -3,21 +3,18 @@
module VX_fpu_unit #(
parameter CORE_ID = 0
) (
// inputs
input wire clk,
input wire reset,
// inputs
VX_fpu_req_if fpu_req_if,
// outputs
VX_fpu_to_csr_if fpu_to_csr_if,
VX_commit_if fpu_commit_if,
VX_fpu_req_if.slave fpu_req_if,
VX_fpu_to_csr_if.master fpu_to_csr_if,
VX_commit_if.master fpu_commit_if,
input wire[`NUM_WARPS-1:0] csr_pending,
output wire[`NUM_WARPS-1:0] pending
);
import fpu_types::*;
`UNUSED_PARAM (CORE_ID)
localparam FPUQ_BITS = `LOG2UP(`FPUQ_SIZE);
@ -65,7 +62,7 @@ module VX_fpu_unit #(
// resolve dynamic FRM from CSR
assign fpu_to_csr_if.read_wid = fpu_req_if.wid;
wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;
wire [`INST_FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `INST_FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;
`ifdef FPU_DPI
@ -183,7 +180,7 @@ module VX_fpu_unit #(
wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid;
VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFG_BITS),
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS),
.RESETW (1)
) pipe_reg (
.clk (clk),

View file

@ -1,37 +0,0 @@
`include "VX_define.vh"
`TRACING_OFF
module VX_gpr_ram_f #(
parameter DATAW = 1,
parameter DEPTH = 1,
parameter ADDRW = $clog2(DEPTH)
) (
input wire clk,
input wire wren,
input wire [ADDRW-1:0] waddr,
input wire [DATAW-1:0] wdata,
input wire [ADDRW-1:0] raddr1,
input wire [ADDRW-1:0] raddr2,
input wire [ADDRW-1:0] raddr3,
output wire [DATAW-1:0] rdata1,
output wire [DATAW-1:0] rdata2,
output wire [DATAW-1:0] rdata3
);
reg [DATAW-1:0] mem [DEPTH-1:0];
initial mem = '{default: 0};
always @(posedge clk) begin
if (wren) begin
mem [waddr] <= wdata;
end
end
assign rdata1 = mem [raddr1];
assign rdata2 = mem [raddr2];
assign rdata3 = mem [raddr3];
endmodule
`TRACING_ON

View file

@ -1,34 +0,0 @@
`include "VX_define.vh"
`TRACING_OFF
module VX_gpr_ram_i #(
parameter DATAW = 1,
parameter DEPTH = 1,
parameter ADDRW = $clog2(DEPTH)
) (
input wire clk,
input wire wren,
input wire [ADDRW-1:0] waddr,
input wire [DATAW-1:0] wdata,
input wire [ADDRW-1:0] raddr1,
input wire [ADDRW-1:0] raddr2,
output wire [DATAW-1:0] rdata1,
output wire [DATAW-1:0] rdata2
);
reg [DATAW-1:0] mem [DEPTH-1:0];
initial mem = '{default: 0};
always @(posedge clk) begin
if (wren) begin
mem [waddr] <= wdata;
end
end
assign rdata1 = mem [raddr1];
assign rdata2 = mem [raddr2];
endmodule
`TRACING_ON

91
hw/rtl/VX_gpr_stage.sv Normal file
View file

@ -0,0 +1,91 @@
`include "VX_define.vh"
module VX_gpr_stage #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_writeback_if.slave writeback_if,
VX_gpr_req_if.slave gpr_req_if,
// outputs
VX_gpr_rsp_if.master gpr_rsp_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (reset)
localparam RAM_SIZE = `NUM_WARPS * `NUM_REGS;
// ensure r0 never gets written, which can happen before the reset
wire write_enable = writeback_if.valid && (writeback_if.rd != 0);
wire [`NUM_THREADS-1:0] wren;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign wren[i] = write_enable && writeback_if.tmask[i];
end
wire [$clog2(RAM_SIZE)-1:0] waddr, raddr1, raddr2;
assign waddr = {writeback_if.wid, writeback_if.rd};
assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1};
assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2};
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
VX_dp_ram #(
.DATAW (32),
.SIZE (RAM_SIZE),
.INIT_ENABLE (1),
.INIT_VALUE (0)
) dp_ram1 (
.clk (clk),
.wren (wren[i]),
.waddr (waddr),
.wdata (writeback_if.data[i]),
.raddr (raddr1),
.rdata (gpr_rsp_if.rs1_data[i])
);
VX_dp_ram #(
.DATAW (32),
.SIZE (RAM_SIZE),
.INIT_ENABLE (1),
.INIT_VALUE (0)
) dp_ram2 (
.clk (clk),
.wren (wren[i]),
.waddr (waddr),
.wdata (writeback_if.data[i]),
.raddr (raddr2),
.rdata (gpr_rsp_if.rs2_data[i])
);
end
`ifdef EXT_F_ENABLE
wire [$clog2(RAM_SIZE)-1:0] raddr3;
assign raddr3 = {gpr_req_if.wid, gpr_req_if.rs3};
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
VX_dp_ram #(
.DATAW (32),
.SIZE (RAM_SIZE),
.INIT_ENABLE (1),
.INIT_VALUE (0)
) dp_ram3 (
.clk (clk),
.wren (wren[i]),
.waddr (waddr),
.wdata (writeback_if.data[i]),
.raddr (raddr3),
.rdata (gpr_rsp_if.rs3_data[i])
);
end
`else
`UNUSED_VAR (gpr_req_if.rs3)
assign gpr_rsp_if.rs3_data = 'x;
`endif
assign writeback_if.ready = 1'b1;
endmodule

View file

@ -1,87 +0,0 @@
`include "VX_define.vh"
module VX_gpr_stage #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_writeback_if writeback_if,
VX_gpr_req_if gpr_req_if,
// outputs
VX_gpr_rsp_if gpr_rsp_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (reset)
// ensure r0 never gets written, which can happen before the reset
wire write_enable = writeback_if.valid && (writeback_if.rd != 0);
`ifdef EXT_F_ENABLE
localparam RAM_DEPTH = `NUM_WARPS * `NUM_REGS;
wire [`NUM_THREADS-1:0][31:0] rdata1, rdata2, rdata3;
wire [$clog2(RAM_DEPTH)-1:0] waddr, raddr1, raddr2, raddr3;
assign waddr = {writeback_if.wid, writeback_if.rd};
assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1};
assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2};
assign raddr3 = {gpr_req_if.wid, gpr_req_if.rs3};
for (genvar i = 0; i < `NUM_THREADS; i++) begin
VX_gpr_ram_f #(
.DATAW (32),
.DEPTH (RAM_DEPTH)
) gpr_ram_f (
.clk (clk),
.wren (write_enable && writeback_if.tmask[i]),
.waddr (waddr),
.wdata (writeback_if.data[i]),
.raddr1 (raddr1),
.raddr2 (raddr2),
.raddr3 (raddr3),
.rdata1 (rdata1[i]),
.rdata2 (rdata2[i]),
.rdata3 (rdata3[i])
);
end
assign gpr_rsp_if.rs1_data = rdata1;
assign gpr_rsp_if.rs2_data = rdata2;
assign gpr_rsp_if.rs3_data = rdata3;
`else
localparam RAM_DEPTH = `NUM_WARPS * `NUM_REGS;
wire [`NUM_THREADS-1:0][31:0] rdata1, rdata2;
wire [$clog2(RAM_DEPTH)-1:0] waddr, raddr1, raddr2;
assign waddr = {writeback_if.wid, writeback_if.rd};
assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1};
assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2};
`UNUSED_VAR (gpr_req_if.rs3)
for (genvar i = 0; i < `NUM_THREADS; i++) begin
VX_gpr_ram_i #(
.DATAW (32),
.DEPTH (RAM_DEPTH)
) gpr_ram_i (
.clk (clk),
.wren (write_enable && writeback_if.tmask[i]),
.waddr (waddr),
.wdata (writeback_if.data[i]),
.raddr1 (raddr1),
.raddr2 (raddr2),
.rdata1 (rdata1[i]),
.rdata2 (rdata2[i])
);
end
assign gpr_rsp_if.rs1_data = rdata1;
assign gpr_rsp_if.rs2_data = rdata2;
assign gpr_rsp_if.rs3_data = 0;
`endif
assign writeback_if.ready = 1'b1;
endmodule

43
hw/rtl/VX_gpu_types.vh Normal file
View file

@ -0,0 +1,43 @@
`ifndef VX_GPU_TYPES
`define VX_GPU_TYPES
`include "VX_define.vh"
package gpu_types;
typedef struct packed {
logic valid;
logic [`NUM_THREADS-1:0] tmask;
} gpu_tmc_t;
`define GPU_TMC_BITS $bits(gpu_types::gpu_tmc_t)
typedef struct packed {
logic valid;
logic [`NUM_WARPS-1:0] wmask;
logic [31:0] pc;
} gpu_wspawn_t;
`define GPU_WSPAWN_BITS $bits(gpu_types::gpu_wspawn_t)
typedef struct packed {
logic valid;
logic diverged;
logic [`NUM_THREADS-1:0] then_tmask;
logic [`NUM_THREADS-1:0] else_tmask;
logic [31:0] pc;
} gpu_split_t;
`define GPU_SPLIT_BITS $bits(gpu_types::gpu_split_t)
typedef struct packed {
logic valid;
logic [`NB_BITS-1:0] id;
logic [`NW_BITS-1:0] size_m1;
} gpu_barrier_t;
`define GPU_BARRIER_BITS $bits(gpu_types::gpu_barrier_t)
endpackage
`endif

View file

@ -5,11 +5,11 @@ module VX_gpu_unit #(
) (
`SCOPE_IO_VX_gpu_unit
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// Inputs
VX_gpu_req_if gpu_req_if,
VX_gpu_req_if.slave gpu_req_if,
`ifdef EXT_TEX_ENABLE
VX_tex_csr_if tex_csr_if,
@ -19,9 +19,10 @@ module VX_gpu_unit #(
`endif
// Outputs
VX_warp_ctl_if warp_ctl_if,
VX_commit_if gpu_commit_if
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master gpu_commit_if
);
import gpu_types::*;
`UNUSED_PARAM (CORE_ID)
@ -47,26 +48,32 @@ module VX_gpu_unit #(
wire stall_in, stall_out;
wire is_wspawn = (gpu_req_if.op_type == `GPU_WSPAWN);
wire is_tmc = (gpu_req_if.op_type == `GPU_TMC);
wire is_split = (gpu_req_if.op_type == `GPU_SPLIT);
wire is_bar = (gpu_req_if.op_type == `GPU_BAR);
wire is_wspawn = (gpu_req_if.op_type == `INST_GPU_WSPAWN);
wire is_tmc = (gpu_req_if.op_type == `INST_GPU_TMC);
wire is_split = (gpu_req_if.op_type == `INST_GPU_SPLIT);
wire is_bar = (gpu_req_if.op_type == `INST_GPU_BAR);
// tmc
wire [`NUM_THREADS-1:0] tmc_new_mask;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign tmc_new_mask[i] = (i < gpu_req_if.rs1_data[0]);
end
assign tmc.valid = is_tmc;
assign tmc.tmask = tmc_new_mask;
wire taken = (gpu_req_if.rs1_data[i] != 0);
assign taken_tmask[i] = gpu_req_if.tmask[i] & taken;
assign not_taken_tmask[i] = gpu_req_if.tmask[i] & ~taken;
end
// tmc
wire [`NUM_THREADS-1:0] pred_mask = (taken_tmask != 0) ? taken_tmask : gpu_req_if.tmask;
assign tmc.valid = is_tmc || is_pred;
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
// wspawn
wire [31:0] wspawn_pc = gpu_req_if.rs2_data[0];
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; i++) begin
assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]);
assign wspawn_wmask[i] = (i < rs1_data);
end
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
@ -74,20 +81,11 @@ module VX_gpu_unit #(
// split
wire [`NUM_THREADS-1:0] split_then_mask;
wire [`NUM_THREADS-1:0] split_else_mask;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire taken = gpu_req_if.rs1_data[i][0];
assign split_then_mask[i] = gpu_req_if.tmask[i] & taken;
assign split_else_mask[i] = gpu_req_if.tmask[i] & ~taken;
end
assign split.valid = is_split;
assign split.diverged = (| split_then_mask) && (| split_else_mask);
assign split.then_mask = split_then_mask;
assign split.else_mask = split_else_mask;
assign split.pc = gpu_req_if.next_PC;
assign split.valid = is_split;
assign split.diverged = (| taken_tmask) && (| not_taken_tmask);
assign split.then_tmask = taken_tmask;
assign split.else_tmask = not_taken_tmask;
assign split.pc = gpu_req_if.next_PC;
// barrier
@ -200,9 +198,9 @@ module VX_gpu_unit #(
`SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid);
`SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid);
`SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc);
`SCOPE_ASSIGN (gpu_rsp_wspawn, warp_ctl_if.wspawn);
`SCOPE_ASSIGN (gpu_rsp_split, warp_ctl_if.split);
`SCOPE_ASSIGN (gpu_rsp_barrier, warp_ctl_if.barrier);
`SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc.valid);
`SCOPE_ASSIGN (gpu_rsp_wspawn, warp_ctl_if.wspawn.valid);
`SCOPE_ASSIGN (gpu_rsp_split, warp_ctl_if.split.valid);
`SCOPE_ASSIGN (gpu_rsp_barrier, warp_ctl_if.barrier.valid);
endmodule

View file

@ -7,17 +7,16 @@ module VX_ibuffer #(
input wire reset,
// inputs
VX_decode_if decode_if,
VX_decode_if.slave decode_if,
// outputs
VX_ibuffer_if ibuffer_if
VX_ibuffer_if.master ibuffer_if
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `OP_BITS + `FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + `NUM_REGS;
localparam SIZE = 3;
localparam ADDRW = $clog2(SIZE);
localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1;
localparam ADDRW = $clog2(`IBUF_SIZE+1);
localparam NWARPSW = $clog2(`NUM_WARPS+1);
reg [`NUM_WARPS-1:0][ADDRW-1:0] used_r;
@ -36,14 +35,16 @@ module VX_ibuffer #(
wire writing = enq_fire && (i == decode_if.wid);
wire reading = deq_fire && (i == ibuffer_if.wid);
wire is_head_ptr = empty_r[i] || (alm_empty_r[i] && reading);
wire going_empty = empty_r[i] || (alm_empty_r[i] && reading);
VX_skid_buffer #(
.DATAW (DATAW)
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUT_REG (1)
) queue (
.clk (clk),
.reset (reset),
.valid_in (writing && !is_head_ptr),
.valid_in (writing && !going_empty),
.data_in (q_data_in),
.ready_out(reading),
.data_out (q_data_prev[i]),
@ -63,7 +64,7 @@ module VX_ibuffer #(
empty_r[i] <= 0;
if (used_r[i] == 1)
alm_empty_r[i] <= 0;
if (used_r[i] == ADDRW'(SIZE-1))
if (used_r[i] == ADDRW'(`IBUF_SIZE))
full_r[i] <= 1;
end
end else if (reading) begin
@ -76,7 +77,7 @@ module VX_ibuffer #(
used_r[i] <= used_r[i] + ADDRW'($signed(2'(writing) - 2'(reading)));
end
if (writing && is_head_ptr) begin
if (writing && going_empty) begin
q_data_out[i] <= q_data_in;
end else if (reading) begin
q_data_out[i] <= q_data_prev[i];
@ -97,6 +98,8 @@ module VX_ibuffer #(
reg [DATAW-1:0] deq_instr, deq_instr_n;
reg [NWARPSW-1:0] num_warps;
`UNUSED_VAR (deq_instr)
// calculate valid table
always @(*) begin
valid_table_n = valid_table;
@ -114,11 +117,11 @@ module VX_ibuffer #(
) rr_arbiter (
.clk (clk),
.reset (reset),
.enable (ibuffer_if.ready),
.requests (valid_table_n),
.grant_index (deq_wid_rr_n),
`UNUSED_PIN (grant_onehot),
`UNUSED_PIN (grant_valid)
`UNUSED_PIN (grant_valid),
`UNUSED_PIN (grant_onehot),
`UNUSED_PIN (enable)
);
// schedule the next instruction to issue
@ -146,11 +149,10 @@ module VX_ibuffer #(
valid_table <= 0;
deq_valid <= 0;
num_warps <= 0;
deq_wid_rr <= 0;
end else begin
valid_table <= valid_table_n;
deq_valid <= deq_valid_n;
deq_wid_rr <= deq_wid_rr_n;
if (warp_added && !warp_removed) begin
num_warps <= num_warps + NWARPSW'(1);
@ -159,42 +161,48 @@ module VX_ibuffer #(
end
end
deq_wid <= deq_wid_n;
deq_instr <= deq_instr_n;
deq_wid <= deq_wid_n;
deq_wid_rr <= deq_wid_rr_n;
deq_instr <= deq_instr_n;
end
assign decode_if.ready = ~q_full[decode_if.wid];
assign q_data_in = {decode_if.tmask,
decode_if.PC,
decode_if.ex_type,
decode_if.op_type,
decode_if.op_mod,
decode_if.wb,
decode_if.wb,
decode_if.use_PC,
decode_if.use_imm,
decode_if.imm,
decode_if.rd,
decode_if.rs1,
decode_if.rs2,
decode_if.rs3,
decode_if.imm,
decode_if.use_PC,
decode_if.use_imm,
decode_if.used_regs};
decode_if.rs3};
assign ibuffer_if.valid = deq_valid;
assign ibuffer_if.wid = deq_wid;
assign ibuffer_if.wid_n = deq_wid_n;
assign {ibuffer_if.tmask,
ibuffer_if.PC,
ibuffer_if.ex_type,
ibuffer_if.op_type,
ibuffer_if.op_mod,
ibuffer_if.wb,
ibuffer_if.wb,
ibuffer_if.use_PC,
ibuffer_if.use_imm,
ibuffer_if.imm,
ibuffer_if.rd,
ibuffer_if.rs1,
ibuffer_if.rs2,
ibuffer_if.rs3,
ibuffer_if.imm,
ibuffer_if.use_PC,
ibuffer_if.use_imm,
ibuffer_if.used_regs} = deq_instr;
ibuffer_if.rs3} = deq_instr;
// scoreboard forwarding
assign ibuffer_if.wid_n = deq_wid_n;
assign ibuffer_if.rd_n = deq_instr_n[3*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs1_n = deq_instr_n[2*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs2_n = deq_instr_n[1*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs3_n = deq_instr_n[0*`NR_BITS +: `NR_BITS];
endmodule

View file

@ -5,24 +5,24 @@ module VX_icache_stage #(
) (
`SCOPE_IO_VX_icache_stage
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// Icache interface
VX_icache_req_if icache_req_if,
VX_icache_rsp_if icache_rsp_if,
VX_icache_req_if.master icache_req_if,
VX_icache_rsp_if.slave icache_rsp_if,
// request
VX_ifetch_req_if ifetch_req_if,
VX_ifetch_req_if.slave ifetch_req_if,
// reponse
VX_ifetch_rsp_if ifetch_rsp_if
VX_ifetch_rsp_if.master ifetch_rsp_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (reset)
localparam OUTPUT_REG = 0;
localparam OUT_REG = 0;
wire icache_req_fire = icache_req_if.valid && icache_req_if.ready;
@ -33,20 +33,21 @@ module VX_icache_stage #(
wire [`NUM_THREADS-1:0] rsp_tmask;
VX_dp_ram #(
.DATAW(32 + `NUM_THREADS),
.SIZE(`NUM_WARPS),
.FASTRAM(1)
.DATAW (32 + `NUM_THREADS),
.SIZE (`NUM_WARPS),
.LUTRAM (1)
) req_metadata (
.clk(clk),
.waddr(req_tag),
.raddr(rsp_tag),
.wren(icache_req_fire),
.byteen(1'b1),
.rden(ifetch_rsp_if.valid),
.din({ifetch_req_if.PC, ifetch_req_if.tmask}),
.dout({rsp_PC, rsp_tmask})
.clk (clk),
.wren (icache_req_fire),
.waddr (req_tag),
.wdata ({ifetch_req_if.PC, ifetch_req_if.tmask}),
.raddr (rsp_tag),
.rdata ({rsp_PC, rsp_tmask})
);
`RUNTIME_ASSERT((!ifetch_req_if.valid || ifetch_req_if.PC >= `STARTUP_ADDR),
("invalid PC=%0h, wid=%0d, tmask=%b", ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask))
// Icache Request
assign icache_req_if.valid = ifetch_req_if.valid;
assign icache_req_if.addr = ifetch_req_if.PC[31:2];
@ -62,12 +63,12 @@ module VX_icache_stage #(
wire [`NW_BITS-1:0] rsp_wid = rsp_tag;
wire stall_out = ~ifetch_rsp_if.ready && (0 == OUTPUT_REG && ifetch_rsp_if.valid);
wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid);
VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32),
.RESETW (1),
.DEPTH (OUTPUT_REG)
.DEPTH (OUT_REG)
) pipe_reg (
.clk (clk),
.reset (reset),
@ -90,10 +91,10 @@ module VX_icache_stage #(
`ifdef DBG_PRINT_CORE_ICACHE
always @(posedge clk) begin
if (icache_req_if.valid && icache_req_if.ready) begin
$display("%t: I$%0d req: wid=%0d, PC=%0h", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC);
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC);
end
if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin
$display("%t: I$%0d rsp: wid=%0d, PC=%0h, data=%0h", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data);
dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data);
end
end
`endif

159
hw/rtl/VX_instr_demux.sv Normal file
View file

@ -0,0 +1,159 @@
`include "VX_define.vh"
module VX_instr_demux (
input wire clk,
input wire reset,
// inputs
VX_ibuffer_if.slave ibuffer_if,
VX_gpr_rsp_if.slave gpr_rsp_if,
// outputs
VX_alu_req_if.master alu_req_if,
VX_lsu_req_if.master lsu_req_if,
VX_csr_req_if.master csr_req_if,
`ifdef EXT_F_ENABLE
VX_fpu_req_if.master fpu_req_if,
`endif
VX_gpu_req_if.master gpu_req_if
);
wire [`NT_BITS-1:0] tid;
wire alu_req_ready;
wire lsu_req_ready;
wire csr_req_ready;
`ifdef EXT_F_ENABLE
wire fpu_req_ready;
`endif
wire gpu_req_ready;
VX_lzc #(
.N (`NUM_THREADS)
) tid_select (
.in_i (ibuffer_if.tmask),
.cnt_o (tid),
`UNUSED_PIN (valid_o)
);
wire [31:0] next_PC = ibuffer_if.PC + 4;
// ALU unit
wire alu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_ALU);
wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
.OUT_REG (1)
) alu_buffer (
.clk (clk),
.reset (reset),
.valid_in (alu_req_valid),
.ready_in (alu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
.valid_out (alu_req_if.valid),
.ready_out (alu_req_if.ready)
);
// lsu unit
wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU);
wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type);
wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)),
.OUT_REG (1)
) lsu_buffer (
.clk (clk),
.reset (reset),
.valid_in (lsu_req_valid),
.ready_in (lsu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}),
.valid_out (lsu_req_if.valid),
.ready_out (lsu_req_if.ready)
);
// csr unit
wire csr_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_CSR);
wire [`INST_CSR_BITS-1:0] csr_op_type = `INST_CSR_BITS'(ibuffer_if.op_type);
wire [`CSR_ADDR_BITS-1:0] csr_addr = ibuffer_if.imm[`CSR_ADDR_BITS-1:0];
wire [`NRI_BITS-1:0] csr_imm = ibuffer_if.imm[`CSR_ADDR_BITS +: `NRI_BITS];
wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid];
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32),
.OUT_REG (1)
) csr_buffer (
.clk (clk),
.reset (reset),
.valid_in (csr_req_valid),
.ready_in (csr_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}),
.data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}),
.valid_out (csr_req_if.valid),
.ready_out (csr_req_if.ready)
);
// fpu unit
`ifdef EXT_F_ENABLE
wire fpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_FPU);
wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
.OUT_REG (1)
) fpu_buffer (
.clk (clk),
.reset (reset),
.valid_in (fpu_req_valid),
.ready_in (fpu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
.data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
.valid_out (fpu_req_if.valid),
.ready_out (fpu_req_if.ready)
);
`else
`UNUSED_VAR (gpr_rsp_if.rs3_data)
`endif
// gpu unit
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type);
wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid];
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)),
.OUT_REG (1)
) gpu_buffer (
.clk (clk),
.reset (reset),
.valid_in (gpu_req_valid),
.ready_in (gpu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpu_rs2_data}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.valid_out (gpu_req_if.valid),
.ready_out (gpu_req_if.ready)
);
// can take next request?
reg ready_r;
always @(*) begin
case (ibuffer_if.ex_type)
`EX_ALU: ready_r = alu_req_ready;
`EX_LSU: ready_r = lsu_req_ready;
`EX_CSR: ready_r = csr_req_ready;
`ifdef EXT_F_ENABLE
`EX_FPU: ready_r = fpu_req_ready;
`endif
`EX_GPU: ready_r = gpu_req_ready;
default: ready_r = 1'b1; // ignore NOPs
endcase
end
assign ibuffer_if.ready = ready_r;
endmodule

View file

@ -1,146 +0,0 @@
`include "VX_define.vh"
module VX_instr_demux (
input wire clk,
input wire reset,
// inputs
VX_ibuffer_if ibuffer_if,
VX_gpr_rsp_if gpr_rsp_if,
// outputs
VX_alu_req_if alu_req_if,
VX_lsu_req_if lsu_req_if,
VX_csr_req_if csr_req_if,
VX_fpu_req_if fpu_req_if,
VX_gpu_req_if gpu_req_if
);
wire [`NT_BITS-1:0] tid;
wire alu_req_ready;
wire lsu_req_ready;
wire csr_req_ready;
wire fpu_req_ready;
wire gpu_req_ready;
VX_priority_encoder #(
.N (`NUM_THREADS)
) tid_select (
.data_in (ibuffer_if.tmask),
.index (tid),
`UNUSED_PIN (onehot),
`UNUSED_PIN (valid_out)
);
wire [31:0] next_PC = ibuffer_if.PC + 4;
// ALU unit
wire alu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_ALU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
) alu_buffer (
.clk (clk),
.reset (reset),
.valid_in (alu_req_valid),
.ready_in (alu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `ALU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
.valid_out (alu_req_if.valid),
.ready_out (alu_req_if.ready)
);
// lsu unit
wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU);
wire lsu_is_fence = `LSU_IS_FENCE(ibuffer_if.op_mod);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
) lsu_buffer (
.clk (clk),
.reset (reset),
.valid_in (lsu_req_valid),
.ready_in (lsu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `LSU_OP(ibuffer_if.op_type), lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}),
.valid_out (lsu_req_if.valid),
.ready_out (lsu_req_if.ready)
);
// csr unit
wire csr_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_CSR);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32),
.OUTPUT_REG (1)
) csr_buffer (
.clk (clk),
.reset (reset),
.valid_in (csr_req_valid),
.ready_in (csr_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `CSR_OP(ibuffer_if.op_type), ibuffer_if.imm[`CSR_ADDR_BITS-1:0], ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, ibuffer_if.rs1, gpr_rsp_if.rs1_data[0]}),
.data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.rs1, csr_req_if.rs1_data}),
.valid_out (csr_req_if.valid),
.ready_out (csr_req_if.ready)
);
// fpu unit
`ifdef EXT_F_ENABLE
wire fpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_FPU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
) fpu_buffer (
.clk (clk),
.reset (reset),
.valid_in (fpu_req_valid),
.ready_in (fpu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, `FPU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
.data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
.valid_out (fpu_req_if.valid),
.ready_out (fpu_req_if.ready)
);
`else
`UNUSED_VAR (gpr_rsp_if.rs3_data)
assign fpu_req_ready = 0;
`endif
// gpu unit
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
) gpu_buffer (
.clk (clk),
.reset (reset),
.valid_in (gpu_req_valid),
.ready_in (gpu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
.valid_out (gpu_req_if.valid),
.ready_out (gpu_req_if.ready)
);
// can take next request?
reg ready_r;
always @(*) begin
case (ibuffer_if.ex_type)
`EX_ALU: ready_r = alu_req_ready;
`EX_LSU: ready_r = lsu_req_ready;
`EX_CSR: ready_r = csr_req_ready;
`EX_FPU: ready_r = fpu_req_ready;
`EX_GPU: ready_r = gpu_req_ready;
default: ready_r = 1'b1; // ignore NOPs
endcase
end
assign ibuffer_if.ready = ready_r;
endmodule

View file

@ -6,11 +6,13 @@ module VX_ipdom_stack #(
) (
input wire clk,
input wire reset,
input wire pair,
input wire [WIDTH - 1:0] q1,
input wire [WIDTH - 1:0] q2,
output wire [WIDTH - 1:0] d,
input wire push,
input wire pop,
output wire index,
output wire empty,
output wire full
);
@ -38,32 +40,29 @@ module VX_ipdom_stack #(
end
VX_dp_ram #(
.DATAW(WIDTH * 2),
.SIZE(DEPTH),
.RWCHECK(1),
.FASTRAM(1)
.DATAW (WIDTH * 2),
.SIZE (DEPTH),
.LUTRAM (1)
) store (
.clk(clk),
.waddr(wr_ptr),
.raddr(rd_ptr),
.wren(push),
.byteen(1'b1),
.rden(pop),
.din({q2, q1}),
.dout({d2, d1})
.clk (clk),
.wren (push),
.waddr (wr_ptr),
.wdata ({q2, q1}),
.raddr (rd_ptr),
.rdata ({d2, d1})
);
always @(posedge clk) begin
if (push) begin
is_part[wr_ptr] <= 0;
is_part[wr_ptr] <= ~pair;
end else if (pop) begin
is_part[rd_ptr] <= 1;
end
end
wire p = is_part[rd_ptr];
assign d = p ? d1 : d2;
assign empty = ~(| wr_ptr);
assign index = is_part[rd_ptr];
assign d = index ? d1 : d2;
assign empty = (ADDRW'(0) == wr_ptr);
assign full = (ADDRW'(DEPTH-1) == wr_ptr);
endmodule

View file

@ -9,30 +9,76 @@ module VX_issue #(
input wire reset,
`ifdef PERF_ENABLE
VX_perf_pipeline_if perf_pipeline_if,
VX_perf_pipeline_if.master perf_pipeline_if,
`endif
VX_decode_if decode_if,
VX_writeback_if writeback_if,
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if,
VX_alu_req_if alu_req_if,
VX_lsu_req_if lsu_req_if,
VX_csr_req_if csr_req_if,
VX_fpu_req_if fpu_req_if,
VX_gpu_req_if gpu_req_if
VX_alu_req_if.master alu_req_if,
VX_lsu_req_if.master lsu_req_if,
VX_csr_req_if.master csr_req_if,
`ifdef EXT_F_ENABLE
VX_fpu_req_if.master fpu_req_if,
`endif
VX_gpu_req_if.master gpu_req_if
);
VX_ibuffer_if ibuffer_if();
VX_ibuffer_if execute_if();
VX_gpr_req_if gpr_req_if();
VX_gpr_rsp_if gpr_rsp_if();
wire scoreboard_delay;
VX_gpr_req_if gpr_req_if();
assign gpr_req_if.wid = ibuffer_if.wid;
assign gpr_req_if.rs1 = ibuffer_if.rs1;
assign gpr_req_if.rs2 = ibuffer_if.rs2;
assign gpr_req_if.rs3 = ibuffer_if.rs3;
VX_writeback_if sboard_wb_if();
assign sboard_wb_if.valid = writeback_if.valid;
assign sboard_wb_if.wid = writeback_if.wid;
assign sboard_wb_if.PC = writeback_if.PC;
assign sboard_wb_if.rd = writeback_if.rd;
assign sboard_wb_if.eop = writeback_if.eop;
assign sboard_wb_if.ready = writeback_if.ready;
VX_ibuffer_if sboard_ib_if();
assign sboard_ib_if.valid = ibuffer_if.valid && idmux_ib_if.ready;
assign sboard_ib_if.wid = ibuffer_if.wid;
assign sboard_ib_if.PC = ibuffer_if.PC;
assign sboard_ib_if.wb = ibuffer_if.wb;
assign sboard_ib_if.rd = ibuffer_if.rd;
assign sboard_ib_if.rd_n = ibuffer_if.rd_n;
assign sboard_ib_if.rs1_n = ibuffer_if.rs1_n;
assign sboard_ib_if.rs2_n = ibuffer_if.rs2_n;
assign sboard_ib_if.rs3_n = ibuffer_if.rs3_n;
assign sboard_ib_if.wid_n = ibuffer_if.wid_n;
VX_ibuffer_if idmux_ib_if();
assign idmux_ib_if.valid = ibuffer_if.valid && sboard_ib_if.ready;
assign idmux_ib_if.wid = ibuffer_if.wid;
assign idmux_ib_if.tmask = ibuffer_if.tmask;
assign idmux_ib_if.PC = ibuffer_if.PC;
assign idmux_ib_if.ex_type = ibuffer_if.ex_type;
assign idmux_ib_if.op_type = ibuffer_if.op_type;
assign idmux_ib_if.op_mod = ibuffer_if.op_mod;
assign idmux_ib_if.wb = ibuffer_if.wb;
assign idmux_ib_if.rd = ibuffer_if.rd;
assign idmux_ib_if.rs1 = ibuffer_if.rs1;
assign idmux_ib_if.imm = ibuffer_if.imm;
assign idmux_ib_if.use_PC = ibuffer_if.use_PC;
assign idmux_ib_if.use_imm = ibuffer_if.use_imm;
// issue the instruction
assign ibuffer_if.ready = sboard_ib_if.ready && idmux_ib_if.ready;
`RESET_RELAY (ibuf_reset);
`RESET_RELAY (gpr_reset);
`RESET_RELAY (demux_reset);
VX_ibuffer #(
.CORE_ID(CORE_ID)
) ibuffer (
.clk (clk),
.reset (reset),
.reset (ibuf_reset),
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
@ -42,54 +88,33 @@ module VX_issue #(
) scoreboard (
.clk (clk),
.reset (reset),
.ibuffer_if (ibuffer_if),
.writeback_if(writeback_if),
.delay (scoreboard_delay)
.ibuffer_if (sboard_ib_if),
.writeback_if(sboard_wb_if)
);
assign gpr_req_if.wid = ibuffer_if.wid;
assign gpr_req_if.rs1 = ibuffer_if.rs1;
assign gpr_req_if.rs2 = ibuffer_if.rs2;
assign gpr_req_if.rs3 = ibuffer_if.rs3;
VX_gpr_stage #(
.CORE_ID(CORE_ID)
) gpr_stage (
.clk (clk),
.reset (reset),
.reset (gpr_reset),
.writeback_if (writeback_if),
.gpr_req_if (gpr_req_if),
.gpr_rsp_if (gpr_rsp_if)
);
assign execute_if.valid = ibuffer_if.valid && ~scoreboard_delay;
assign execute_if.wid = ibuffer_if.wid;
assign execute_if.tmask = ibuffer_if.tmask;
assign execute_if.PC = ibuffer_if.PC;
assign execute_if.ex_type = ibuffer_if.ex_type;
assign execute_if.op_type = ibuffer_if.op_type;
assign execute_if.op_mod = ibuffer_if.op_mod;
assign execute_if.wb = ibuffer_if.wb;
assign execute_if.rd = ibuffer_if.rd;
assign execute_if.rs1 = ibuffer_if.rs1;
assign execute_if.imm = ibuffer_if.imm;
assign execute_if.use_PC = ibuffer_if.use_PC;
assign execute_if.use_imm = ibuffer_if.use_imm;
VX_instr_demux instr_demux (
.clk (clk),
.reset (reset),
.ibuffer_if (execute_if),
.reset (demux_reset),
.ibuffer_if (idmux_ib_if),
.gpr_rsp_if (gpr_rsp_if),
.alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
`ifdef EXT_F_ENABLE
.fpu_req_if (fpu_req_if),
`endif
.gpu_req_if (gpu_req_if)
);
// issue the instruction
assign ibuffer_if.ready = !scoreboard_delay && execute_if.ready;
);
`SCOPE_ASSIGN (issue_fire, ibuffer_if.valid && ibuffer_if.ready);
`SCOPE_ASSIGN (issue_wid, ibuffer_if.wid);
@ -106,8 +131,8 @@ module VX_issue #(
`SCOPE_ASSIGN (issue_imm, ibuffer_if.imm);
`SCOPE_ASSIGN (issue_use_pc, ibuffer_if.use_PC);
`SCOPE_ASSIGN (issue_use_imm, ibuffer_if.use_imm);
`SCOPE_ASSIGN (scoreboard_delay, scoreboard_delay);
`SCOPE_ASSIGN (execute_delay, ~execute_if.ready);
`SCOPE_ASSIGN (scoreboard_delay, !sboard_wb_if.ready);
`SCOPE_ASSIGN (execute_delay, !idmux_ib_if.ready);
`SCOPE_ASSIGN (gpr_rsp_a, gpr_rsp_if.rs1_data);
`SCOPE_ASSIGN (gpr_rsp_b, gpr_rsp_if.rs2_data);
`SCOPE_ASSIGN (gpr_rsp_c, gpr_rsp_if.rs3_data);
@ -145,7 +170,7 @@ module VX_issue #(
if (decode_if.valid & !decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
end
if (ibuffer_if.valid & scoreboard_delay) begin
if (ibuffer_if.valid & !sboard_wb_if.ready) begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
end
if (alu_req_if.valid & !alu_req_if.ready) begin
@ -182,46 +207,48 @@ module VX_issue #(
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (alu_req_if.valid && alu_req_if.ready) begin
$write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=",
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=",
$time, CORE_ID, alu_req_if.wid, alu_req_if.PC, alu_req_if.tmask, alu_req_if.rd);
`PRINT_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS);
$write(", rs2_data=");
`PRINT_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS);
$write("\n");
`TRACE_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS);
dpi_trace(", rs2_data=");
`TRACE_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS);
dpi_trace("\n");
end
if (lsu_req_if.valid && lsu_req_if.ready) begin
$write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=",
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=",
$time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, lsu_req_if.tmask, lsu_req_if.rd, lsu_req_if.offset);
`PRINT_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS);
$write(", data=");
`PRINT_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS);
$write("\n");
`TRACE_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS);
dpi_trace(", data=");
`TRACE_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS);
dpi_trace("\n");
end
if (csr_req_if.valid && csr_req_if.ready) begin
$write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=",
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=",
$time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr);
`PRINT_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS);
$write("\n");
`TRACE_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS);
dpi_trace("\n");
end
`ifdef EXT_F_ENABLE
if (fpu_req_if.valid && fpu_req_if.ready) begin
$write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rd=%0d, rs1_data=",
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rd=%0d, rs1_data=",
$time, CORE_ID, fpu_req_if.wid, fpu_req_if.PC, fpu_req_if.tmask, fpu_req_if.rd);
`PRINT_ARRAY1D(fpu_req_if.rs1_data, `NUM_THREADS);
$write(", rs2_data=");
`PRINT_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS);
$write(", rs3_data=");
`PRINT_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS);
$write("\n");
`TRACE_ARRAY1D(fpu_req_if.rs1_data, `NUM_THREADS);
dpi_trace(", rs2_data=");
`TRACE_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS);
dpi_trace(", rs3_data=");
`TRACE_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS);
dpi_trace("\n");
end
`endif
if (gpu_req_if.valid && gpu_req_if.ready) begin
$write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rd=%0d, rs1_data=",
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rd=%0d, rs1_data=",
$time, CORE_ID, gpu_req_if.wid, gpu_req_if.PC, gpu_req_if.tmask, gpu_req_if.rd);
`PRINT_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS);
$write(", rs2_data=");
`PRINT_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
$write(", rs3_data=");
`PRINT_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
$write("\n");
`TRACE_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS);
dpi_trace(", rs2_data=");
`TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
dpi_trace(", rs3_data=");
`TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
dpi_trace("\n");
end
end
`endif

View file

@ -5,27 +5,26 @@ module VX_lsu_unit #(
) (
`SCOPE_IO_VX_lsu_unit
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// Dcache interface
VX_dcache_req_if dcache_req_if,
VX_dcache_rsp_if dcache_rsp_if,
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
// inputs
VX_lsu_req_if lsu_req_if,
VX_lsu_req_if.slave lsu_req_if,
// outputs
VX_commit_if ld_commit_if,
VX_commit_if st_commit_if
VX_commit_if.master ld_commit_if,
VX_commit_if.master st_commit_if
);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = 32 - MEM_ASHIFT;
localparam REQ_ASHIFT = `CLOG2(`DWORD_SIZE);
localparam REQ_ADDRW = 32 - REQ_ASHIFT;
localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE);
localparam ADDR_TYPEW = `NC_ADDR_BITS + `SM_ENABLE;
localparam ADDR_TYPEW = `NC_FLAG_BITS + `SM_ENABLE;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
@ -34,7 +33,7 @@ module VX_lsu_unit #(
wire req_valid;
wire [`NUM_THREADS-1:0] req_tmask;
wire [`NUM_THREADS-1:0][31:0] req_addr;
wire [`LSU_BITS-1:0] req_type;
wire [`INST_LSU_BITS-1:0] req_type;
wire [`NUM_THREADS-1:0][31:0] req_data;
wire [`NR_BITS-1:0] req_rd;
wire req_wb;
@ -52,9 +51,9 @@ module VX_lsu_unit #(
end
// detect duplicate addresses
wire [`NUM_THREADS-1:0] addr_matches;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign addr_matches[i] = (full_addr[0] == full_addr[i]) || ~lsu_req_if.tmask[i];
wire [`NUM_THREADS-2:0] addr_matches;
for (genvar i = 0; i < (`NUM_THREADS-1); i++) begin
assign addr_matches[i] = (lsu_req_if.base_addr[i+1] == lsu_req_if.base_addr[0]) || ~lsu_req_if.tmask[i+1];
end
wire lsu_is_dup = lsu_req_if.tmask[0] && (& addr_matches);
@ -81,7 +80,7 @@ module VX_lsu_unit #(
wire lsu_valid = lsu_req_if.valid && ~fence_wait;
VX_pipe_register #(
.DATAW (1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
.DATAW (1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
.RESETW (1)
) req_pipe_reg (
.clk (clk),
@ -98,7 +97,7 @@ module VX_lsu_unit #(
wire [31:0] rsp_pc;
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire [`LSU_BITS-1:0] rsp_type;
wire [`INST_LSU_BITS-1:0] rsp_type;
wire rsp_is_dup;
`UNUSED_VAR (rsp_type)
@ -120,11 +119,12 @@ module VX_lsu_unit #(
wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
wire dcache_req_fire_any = (| dcache_req_fire);
wire dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
wire mbuf_push = dcache_req_fire_any
wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1};
wire mbuf_push = ~mbuf_full
&& (| ({`NUM_THREADS{req_valid}} & req_tmask_dup & dcache_req_if.ready))
&& is_req_start // first submission only
&& req_wb; // loads only
@ -134,8 +134,8 @@ module VX_lsu_unit #(
`UNUSED_VAR (dcache_rsp_if.tag)
VX_index_buffer #(
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1),
.SIZE (`LSUQ_SIZE)
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1),
.SIZE (`LSUQ_SIZE)
) req_metadata (
.clk (clk),
.reset (reset),
@ -150,9 +150,7 @@ module VX_lsu_unit #(
.empty (mbuf_empty)
);
wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1};
wire req_ready_all = &(dcache_req_if.ready | req_sent_mask | ~req_tmask_dup);
wire dcache_req_ready = &(dcache_req_if.ready | req_sent_mask | ~req_tmask_dup);
wire [`NUM_THREADS-1:0] req_sent_mask_n = req_sent_mask | dcache_req_fire;
@ -161,7 +159,7 @@ module VX_lsu_unit #(
req_sent_mask <= 0;
is_req_start <= 1;
end else begin
if (req_ready_all) begin
if (dcache_req_ready) begin
req_sent_mask <= 0;
is_req_start <= 1;
end else begin
@ -204,7 +202,7 @@ module VX_lsu_unit #(
always @(*) begin
mem_req_byteen = {4{req_wb}};
case (`LSU_WSIZE(req_type))
case (`INST_LSU_WSIZE(req_type))
0: mem_req_byteen[req_offset[i]] = 1;
1: begin
mem_req_byteen[req_offset[i]] = 1;
@ -237,11 +235,11 @@ module VX_lsu_unit #(
`endif
end
assign ready_in = req_dep_ready && req_ready_all;
assign ready_in = req_dep_ready && dcache_req_ready;
// send store commit
wire is_store_rsp = req_valid && ~req_wb && req_ready_all;
wire is_store_rsp = req_valid && ~req_wb && dcache_req_ready;
assign st_commit_if.valid = is_store_rsp;
assign st_commit_if.wid = req_wid;
@ -263,11 +261,11 @@ module VX_lsu_unit #(
wire [7:0] rsp_data8 = rsp_offset[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
always @(*) begin
case (`LSU_FMT(rsp_type))
`FMT_B: rsp_data[i] = 32'(signed'(rsp_data8));
`FMT_H: rsp_data[i] = 32'(signed'(rsp_data16));
`FMT_BU: rsp_data[i] = 32'(unsigned'(rsp_data8));
`FMT_HU: rsp_data[i] = 32'(unsigned'(rsp_data16));
case (`INST_LSU_FMT(rsp_type))
`INST_FMT_B: rsp_data[i] = 32'(signed'(rsp_data8));
`INST_FMT_H: rsp_data[i] = 32'(signed'(rsp_data16));
`INST_FMT_BU: rsp_data[i] = 32'(unsigned'(rsp_data8));
`INST_FMT_HU: rsp_data[i] = 32'(unsigned'(rsp_data16));
default: rsp_data[i] = rsp_data32;
endcase
end
@ -307,7 +305,7 @@ module VX_lsu_unit #(
`SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr);
`ifndef SYNTHESIS
reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + 64 + 1)-1:0] pending_reqs;
reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 1)-1:0] pending_reqs;
wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
always @(posedge clk) begin
@ -315,7 +313,7 @@ module VX_lsu_unit #(
pending_reqs <= '0;
end begin
if (mbuf_push) begin
pending_reqs[mbuf_waddr] <= {req_wid, req_pc, $time, 1'b1};
pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, $time, 1'b1};
end
if (mbuf_pop) begin
pending_reqs[mbuf_raddr] <= '0;
@ -324,40 +322,42 @@ module VX_lsu_unit #(
for (integer i = 0; i < `LSUQ_SIZE; ++i) begin
if (pending_reqs[i][0]) begin
assert(($time - pending_reqs[i][1 +: 64]) < delay_timeout) else
$error("%t: *** D$%0d response timeout: wid=%0d, PC=%0h", $time, CORE_ID, pending_reqs[i][1+64+32 +: `NW_BITS], pending_reqs[i][1+64 +: 32]);
`ASSERT(($time - pending_reqs[i][1 +: 64]) < delay_timeout,
("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d",
$time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+32+`NR_BITS +: `NW_BITS], pending_reqs[i][1+64+`NR_BITS +: 32], pending_reqs[i][1+64 +: `NR_BITS]));
end
end
end
`endif
`ifdef DBG_PRINT_CORE_DCACHE
always @(posedge clk) begin
if (lsu_req_if.valid && fence_wait) begin
$display("%t: *** D$%0d fence wait", $time, CORE_ID);
wire dcache_req_fire_any = (| dcache_req_fire);
always @(posedge clk) begin
if (lsu_req_if.valid && fence_wait) begin
dpi_trace("%d: *** D$%0d fence wait\n", $time, CORE_ID);
end
if (dcache_req_fire_any) begin
if (dcache_req_if.rw[0]) begin
$write("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`PRINT_ARRAY1D(req_addr, `NUM_THREADS);
$write(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`PRINT_ARRAY1D(req_addr_type, `NUM_THREADS);
$write(", data=");
`PRINT_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
$write("\n");
dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
dpi_trace(", data=");
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
dpi_trace("\n");
end else begin
$write("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`PRINT_ARRAY1D(req_addr, `NUM_THREADS);
$write(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`PRINT_ARRAY1D(req_addr_type, `NUM_THREADS);
$write(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup);
dpi_trace("%d: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
dpi_trace(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup);
end
end
if (dcache_rsp_fire) begin
$write("%t: D$%0d Rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
dpi_trace("%d: D$%0d Rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
$time, CORE_ID, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd);
`PRINT_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
$write(", is_dup=%b\n", rsp_is_dup);
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
dpi_trace(", is_dup=%b\n", rsp_is_dup);
end
end
`endif

View file

@ -8,11 +8,11 @@ module VX_mem_arb #(
parameter TAG_SEL_IDX = 0,
parameter BUFFERED_REQ = 0,
parameter BUFFERED_RSP = 0,
parameter TYPE = "R",
parameter TYPE = "P",
localparam DATA_SIZE = (DATA_WIDTH / 8),
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS),
localparam TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
parameter DATA_SIZE = (DATA_WIDTH / 8),
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
parameter TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
) (
input wire clk,
input wire reset,

View file

@ -5,24 +5,24 @@ module VX_mem_unit # (
) (
`SCOPE_IO_VX_mem_unit
input wire clk,
input wire reset,
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_memsys_if.master perf_memsys_if,
`endif
// Core <-> Dcache
VX_dcache_req_if dcache_req_if,
VX_dcache_rsp_if dcache_rsp_if,
VX_dcache_req_if.slave dcache_req_if,
VX_dcache_rsp_if.master dcache_rsp_if,
// Core <-> Icache
VX_icache_req_if icache_req_if,
VX_icache_rsp_if icache_rsp_if,
VX_icache_req_if.slave icache_req_if,
VX_icache_rsp_if.master icache_rsp_if,
// Memory
VX_mem_req_if mem_req_if,
VX_mem_rsp_if mem_rsp_if
VX_mem_req_if.master mem_req_if,
VX_mem_rsp_if.slave mem_rsp_if
);
`ifdef PERF_ENABLE
@ -30,58 +30,59 @@ module VX_mem_unit # (
`endif
VX_mem_req_if #(
.DATA_WIDTH (`IMEM_DATA_WIDTH),
.ADDR_WIDTH (`IMEM_ADDR_WIDTH),
.TAG_WIDTH (`IMEM_TAG_WIDTH)
.DATA_WIDTH (`ICACHE_MEM_DATA_WIDTH),
.ADDR_WIDTH (`ICACHE_MEM_ADDR_WIDTH),
.TAG_WIDTH (`ICACHE_MEM_TAG_WIDTH)
) icache_mem_req_if();
VX_mem_rsp_if #(
.DATA_WIDTH (`IMEM_DATA_WIDTH),
.TAG_WIDTH (`IMEM_TAG_WIDTH)
.DATA_WIDTH (`ICACHE_MEM_DATA_WIDTH),
.TAG_WIDTH (`ICACHE_MEM_TAG_WIDTH)
) icache_mem_rsp_if();
VX_mem_req_if #(
.DATA_WIDTH (`DMEM_DATA_WIDTH),
.ADDR_WIDTH (`DMEM_ADDR_WIDTH),
.TAG_WIDTH (`DMEM_TAG_WIDTH)
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
.ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
.TAG_WIDTH (`DCACHE_MEM_TAG_WIDTH)
) dcache_mem_req_if();
VX_mem_rsp_if #(
.DATA_WIDTH (`DMEM_DATA_WIDTH),
.TAG_WIDTH (`DMEM_TAG_WIDTH)
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
.TAG_WIDTH (`DCACHE_MEM_TAG_WIDTH)
) dcache_mem_rsp_if();
VX_dcache_req_if #(
.NUM_REQS (`DNUM_REQS),
.WORD_SIZE (`DWORD_SIZE),
.TAG_WIDTH (`DCORE_TAG_WIDTH-`SM_ENABLE)
.NUM_REQS (`DCACHE_NUM_REQS),
.WORD_SIZE (`DCACHE_WORD_SIZE),
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
) dcache_req_tmp_if();
VX_dcache_rsp_if #(
.NUM_REQS (`DNUM_REQS),
.WORD_SIZE (`DWORD_SIZE),
.TAG_WIDTH (`DCORE_TAG_WIDTH-`SM_ENABLE)
.NUM_REQS (`DCACHE_NUM_REQS),
.WORD_SIZE (`DCACHE_WORD_SIZE),
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
) dcache_rsp_tmp_if();
`RESET_RELAY (icache_reset);
`RESET_RELAY (dcache_reset);
`RESET_RELAY (mem_arb_reset);
VX_cache #(
.CACHE_ID (`ICACHE_ID),
.CACHE_SIZE (`ICACHE_SIZE),
.CACHE_LINE_SIZE (`ICACHE_LINE_SIZE),
.NUM_BANKS (`INUM_BANKS),
.WORD_SIZE (`IWORD_SIZE),
.NUM_BANKS (1),
.WORD_SIZE (`ICACHE_WORD_SIZE),
.NUM_REQS (1),
.CREQ_SIZE (`ICREQ_SIZE),
.CRSQ_SIZE (`ICRSQ_SIZE),
.MSHR_SIZE (`IMSHR_SIZE),
.MRSQ_SIZE (`IMRSQ_SIZE),
.MREQ_SIZE (`IMREQ_SIZE),
.CREQ_SIZE (`ICACHE_CREQ_SIZE),
.CRSQ_SIZE (`ICACHE_CRSQ_SIZE),
.MSHR_SIZE (`ICACHE_MSHR_SIZE),
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
.MREQ_SIZE (`ICACHE_MREQ_SIZE),
.WRITE_ENABLE (0),
.CORE_TAG_WIDTH (`ICORE_TAG_WIDTH),
.CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS),
.MEM_TAG_WIDTH (`IMEM_TAG_WIDTH)
.CORE_TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH),
.CORE_TAG_ID_BITS (`ICACHE_CORE_TAG_ID_BITS),
.MEM_TAG_WIDTH (`ICACHE_MEM_TAG_WIDTH)
) icache (
`SCOPE_BIND_VX_mem_unit_icache
@ -91,7 +92,7 @@ module VX_mem_unit # (
// Core request
.core_req_valid (icache_req_if.valid),
.core_req_rw (1'b0),
.core_req_byteen ({`IWORD_SIZE{1'b1}}),
.core_req_byteen ('b0),
.core_req_addr (icache_req_if.addr),
.core_req_data ('x),
.core_req_tag (icache_req_if.tag),
@ -128,19 +129,19 @@ module VX_mem_unit # (
.CACHE_ID (`DCACHE_ID),
.CACHE_SIZE (`DCACHE_SIZE),
.CACHE_LINE_SIZE (`DCACHE_LINE_SIZE),
.NUM_BANKS (`DNUM_BANKS),
.NUM_PORTS (`DNUM_PORTS),
.WORD_SIZE (`DWORD_SIZE),
.NUM_REQS (`DNUM_REQS),
.CREQ_SIZE (`DCREQ_SIZE),
.CRSQ_SIZE (`DCRSQ_SIZE),
.MSHR_SIZE (`DMSHR_SIZE),
.MRSQ_SIZE (`DMRSQ_SIZE),
.MREQ_SIZE (`DMREQ_SIZE),
.NUM_BANKS (`DCACHE_NUM_BANKS),
.NUM_PORTS (`DCACHE_NUM_PORTS),
.WORD_SIZE (`DCACHE_WORD_SIZE),
.NUM_REQS (`DCACHE_NUM_REQS),
.CREQ_SIZE (`DCACHE_CREQ_SIZE),
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
.WRITE_ENABLE (1),
.CORE_TAG_WIDTH (`DCORE_TAG_WIDTH-`SM_ENABLE),
.CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS-`SM_ENABLE),
.MEM_TAG_WIDTH (`DMEM_TAG_WIDTH),
.CORE_TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE),
.CORE_TAG_ID_BITS (`DCACHE_CORE_TAG_ID_BITS-`SM_ENABLE),
.MEM_TAG_WIDTH (`DCACHE_MEM_TAG_WIDTH),
.NC_ENABLE (1)
) dcache (
`SCOPE_BIND_VX_mem_unit_dcache
@ -186,28 +187,31 @@ module VX_mem_unit # (
if (`SM_ENABLE) begin
VX_dcache_req_if #(
.NUM_REQS (`DNUM_REQS),
.WORD_SIZE (`DWORD_SIZE),
.TAG_WIDTH (`DCORE_TAG_WIDTH-1)
.NUM_REQS (`DCACHE_NUM_REQS),
.WORD_SIZE (`DCACHE_WORD_SIZE),
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
) smem_req_if();
VX_dcache_rsp_if #(
.NUM_REQS (`DNUM_REQS),
.WORD_SIZE (`DWORD_SIZE),
.TAG_WIDTH (`DCORE_TAG_WIDTH-1)
.NUM_REQS (`DCACHE_NUM_REQS),
.WORD_SIZE (`DCACHE_WORD_SIZE),
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
) smem_rsp_if();
VX_smem_arb #(
`RESET_RELAY (smem_arb_reset);
`RESET_RELAY (smem_reset);
VX_smem_arb #(
.NUM_REQS (2),
.LANES (`NUM_THREADS),
.DATA_SIZE (4),
.TAG_IN_WIDTH (`DCORE_TAG_WIDTH),
.TYPE ("X"),
.TAG_IN_WIDTH (`DCACHE_CORE_TAG_WIDTH),
.TYPE ("P"),
.BUFFERED_REQ (2),
.BUFFERED_RSP (1)
) smem_arb (
.clk (clk),
.reset (reset),
.reset (smem_arb_reset),
// input request
.req_valid_in (dcache_req_if.valid),
@ -242,19 +246,17 @@ module VX_mem_unit # (
.rsp_ready_out (dcache_rsp_if.ready)
);
`RESET_RELAY (smem_reset);
VX_shared_mem #(
.CACHE_ID (`SCACHE_ID),
.CACHE_ID (`SMEM_ID),
.CACHE_SIZE (`SMEM_SIZE),
.NUM_BANKS (`SNUM_BANKS),
.WORD_SIZE (`SWORD_SIZE),
.NUM_REQS (`SNUM_REQS),
.CREQ_SIZE (`SCREQ_SIZE),
.CRSQ_SIZE (`SCRSQ_SIZE),
.CORE_TAG_WIDTH (`DCORE_TAG_WIDTH-1),
.CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS-1),
.BANK_ADDR_OFFSET (`SBANK_ADDR_OFFSET)
.NUM_BANKS (`SMEM_NUM_BANKS),
.WORD_SIZE (`SMEM_WORD_SIZE),
.NUM_REQS (`SMEM_NUM_REQS),
.CREQ_SIZE (`SMEM_CREQ_SIZE),
.CRSQ_SIZE (`SMEM_CRSQ_SIZE),
.CORE_TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE),
.CORE_TAG_ID_BITS (`DCACHE_CORE_TAG_ID_BITS-`SM_ENABLE),
.BANK_ADDR_OFFSET (`SMEM_BANK_ADDR_OFFSET)
) smem (
.clk (clk),
.reset (smem_reset),
@ -281,13 +283,20 @@ module VX_mem_unit # (
);
end else begin
// core to D-cache request
assign dcache_req_tmp_if.valid = dcache_req_if.valid;
assign dcache_req_tmp_if.addr = dcache_req_if.addr;
assign dcache_req_tmp_if.rw = dcache_req_if.rw;
assign dcache_req_tmp_if.byteen = dcache_req_if.byteen;
assign dcache_req_tmp_if.data = dcache_req_if.data;
assign dcache_req_tmp_if.tag = dcache_req_if.tag;
assign dcache_req_if.ready = dcache_req_tmp_if.ready;
for (genvar i = 0; i < `DCACHE_NUM_REQS; ++i) begin
VX_skid_buffer #(
.DATAW ((32-`CLOG2(`DCACHE_WORD_SIZE)) + 1 + `DCACHE_WORD_SIZE + (8*`DCACHE_WORD_SIZE) + `DCACHE_CORE_TAG_WIDTH)
) req_buf (
.clk (clk),
.reset (reset),
.valid_in (dcache_req_if.valid[i]),
.data_in ({dcache_req_if.addr[i], dcache_req_if.rw[i], dcache_req_if.byteen[i], dcache_req_if.data[i], dcache_req_if.tag[i]}),
.ready_in (dcache_req_if.ready[i]),
.valid_out (dcache_req_tmp_if.valid[i]),
.data_out ({dcache_req_tmp_if.addr[i], dcache_req_tmp_if.rw[i], dcache_req_tmp_if.byteen[i], dcache_req_tmp_if.data[i], dcache_req_tmp_if.tag[i]}),
.ready_out (dcache_req_tmp_if.ready[i])
);
end
// D-cache to core reponse
assign dcache_rsp_if.valid = dcache_rsp_tmp_if.valid;
@ -297,21 +306,23 @@ module VX_mem_unit # (
assign dcache_rsp_tmp_if.ready = dcache_rsp_if.ready;
end
wire [`DMEM_TAG_WIDTH-1:0] icache_mem_req_tag = `DMEM_TAG_WIDTH'(icache_mem_req_if.tag);
wire [`DMEM_TAG_WIDTH-1:0] icache_mem_rsp_tag;
assign icache_mem_rsp_if.tag = icache_mem_rsp_tag[`IMEM_TAG_WIDTH-1:0];
wire [`DCACHE_MEM_TAG_WIDTH-1:0] icache_mem_req_tag = `DCACHE_MEM_TAG_WIDTH'(icache_mem_req_if.tag);
wire [`DCACHE_MEM_TAG_WIDTH-1:0] icache_mem_rsp_tag;
assign icache_mem_rsp_if.tag = icache_mem_rsp_tag[`ICACHE_MEM_TAG_WIDTH-1:0];
`UNUSED_VAR (icache_mem_rsp_tag)
VX_mem_arb #(
.NUM_REQS (2),
.DATA_WIDTH (`DMEM_DATA_WIDTH),
.ADDR_WIDTH (`DMEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`DMEM_TAG_WIDTH),
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
.ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`DCACHE_MEM_TAG_WIDTH),
.TYPE ("R"),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.BUFFERED_REQ (1),
.BUFFERED_RSP (2)
) mem_arb (
.clk (clk),
.reset (reset),
.reset (mem_arb_reset),
// Source request
.req_valid_in ({dcache_mem_req_if.valid, icache_mem_req_if.valid}),

View file

@ -1,15 +1,11 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "util_dpi.vh"
`endif
module VX_muldiv (
input wire clk,
input wire reset,
// Inputs
input wire [`MUL_BITS-1:0] alu_op,
input wire [`INST_MUL_BITS-1:0] alu_op,
input wire [`NW_BITS-1:0] wid_in,
input wire [`NUM_THREADS-1:0] tmask_in,
input wire [31:0] PC_in,
@ -33,7 +29,7 @@ module VX_muldiv (
input wire ready_out
);
wire is_div_op = `MUL_IS_DIV(alu_op);
wire is_div_op = `INST_MUL_IS_DIV(alu_op);
wire [`NUM_THREADS-1:0][31:0] mul_result;
wire [`NW_BITS-1:0] mul_wid_out;
@ -48,18 +44,20 @@ module VX_muldiv (
wire mul_valid_in = valid_in && !is_div_op;
wire mul_ready_in = ~stall_out || ~mul_valid_out;
wire is_mulh_in = (alu_op != `MUL_MUL);
wire is_signed_mul_a = (alu_op != `MUL_MULHU);
wire is_signed_mul_b = (alu_op != `MUL_MULHU && alu_op != `MUL_MULHSU);
wire is_mulh_in = (alu_op != `INST_MUL_MUL);
wire is_signed_mul_a = (alu_op != `INST_MUL_MULHU);
wire is_signed_mul_b = (alu_op != `INST_MUL_MULHU && alu_op != `INST_MUL_MULHSU);
`ifdef IMUL_DPI
wire [`NUM_THREADS-1:0][31:0] mul_result_tmp;
wire [`NUM_THREADS-1:0][31:0] mul_result_tmp;
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [31:0] mul_resultl, mul_resulth;
always @(*) begin
dpi_imul (alu_in1[i], alu_in2[i], is_signed_mul_a, is_signed_mul_b, mul_resultl, mul_resulth);
dpi_imul (mul_fire_in, alu_in1[i], alu_in2[i], is_signed_mul_a, is_signed_mul_b, mul_resultl, mul_resulth);
end
assign mul_result_tmp[i] = is_mulh_in ? mul_resulth : mul_resultl;
end
@ -83,9 +81,9 @@ module VX_muldiv (
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [32:0] mul_in1 = {is_signed_mul_a & alu_in1[i][31], alu_in1[i]};
wire [32:0] mul_in2 = {is_signed_mul_b & alu_in2[i][31], alu_in2[i]};
`IGNORE_WARNINGS_BEGIN
`IGNORE_UNUSED_BEGIN
wire [65:0] mul_result_tmp;
`IGNORE_WARNINGS_END
`IGNORE_UNUSED_END
VX_multiplier #(
.WIDTHA (33),
@ -127,8 +125,8 @@ module VX_muldiv (
wire [`NR_BITS-1:0] div_rd_out;
wire div_wb_out;
wire is_rem_op_in = (alu_op == `MUL_REM) || (alu_op == `MUL_REMU);
wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM);
wire is_rem_op_in = (alu_op == `INST_MUL_REM) || (alu_op == `INST_MUL_REMU);
wire is_signed_div = (alu_op == `INST_MUL_DIV) || (alu_op == `INST_MUL_REM);
wire div_valid_in = valid_in && is_div_op;
wire div_ready_out = ~stall_out && ~mul_valid_out; // arbitration prioritizes MUL
wire div_ready_in;
@ -137,11 +135,13 @@ module VX_muldiv (
`ifdef IDIV_DPI
wire [`NUM_THREADS-1:0][31:0] div_result_tmp;
wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [31:0] div_quotient, div_remainder;
always @(*) begin
dpi_idiv (alu_in1[i], alu_in2[i], is_signed_div, div_quotient, div_remainder);
dpi_idiv (div_fire_in, alu_in1[i], alu_in2[i], is_signed_div, div_quotient, div_remainder);
end
assign div_result_tmp[i] = is_rem_op_in ? div_remainder : div_quotient;
end

View file

@ -15,30 +15,30 @@ module VX_pipeline #(
output wire [`NUM_THREADS-1:0][3:0] dcache_req_byteen,
output wire [`NUM_THREADS-1:0][29:0] dcache_req_addr,
output wire [`NUM_THREADS-1:0][31:0] dcache_req_data,
output wire [`NUM_THREADS-1:0][`DCORE_TAG_WIDTH-1:0] dcache_req_tag,
output wire [`NUM_THREADS-1:0][`DCACHE_CORE_TAG_WIDTH-1:0] dcache_req_tag,
input wire [`NUM_THREADS-1:0] dcache_req_ready,
// Dcache core reponse
input wire dcache_rsp_valid,
input wire [`NUM_THREADS-1:0] dcache_rsp_tmask,
input wire [`NUM_THREADS-1:0][31:0] dcache_rsp_data,
input wire [`DCORE_TAG_WIDTH-1:0] dcache_rsp_tag,
input wire [`DCACHE_CORE_TAG_WIDTH-1:0] dcache_rsp_tag,
output wire dcache_rsp_ready,
// Icache core request
output wire icache_req_valid,
output wire [29:0] icache_req_addr,
output wire [`ICORE_TAG_WIDTH-1:0] icache_req_tag,
output wire [`ICACHE_CORE_TAG_WIDTH-1:0] icache_req_tag,
input wire icache_req_ready,
// Icache core response
input wire icache_rsp_valid,
input wire [31:0] icache_rsp_data,
input wire [`ICORE_TAG_WIDTH-1:0] icache_rsp_tag,
input wire [`ICACHE_CORE_TAG_WIDTH-1:0] icache_rsp_tag,
output wire icache_rsp_ready,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_memsys_if.slave perf_memsys_if,
`endif
// Status
@ -51,7 +51,7 @@ module VX_pipeline #(
VX_dcache_req_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`DCORE_TAG_WIDTH)
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
) dcache_req_if();
assign dcache_req_valid = dcache_req_if.valid;
@ -69,7 +69,7 @@ module VX_pipeline #(
VX_dcache_rsp_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`DCORE_TAG_WIDTH)
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
) dcache_rsp_if();
assign dcache_rsp_if.valid = dcache_rsp_valid;
@ -84,7 +84,7 @@ module VX_pipeline #(
VX_icache_req_if #(
.WORD_SIZE (4),
.TAG_WIDTH (`ICORE_TAG_WIDTH)
.TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
) icache_req_if();
assign icache_req_valid = icache_req_if.valid;
@ -98,7 +98,7 @@ module VX_pipeline #(
VX_icache_rsp_if #(
.WORD_SIZE (4),
.TAG_WIDTH (`ICORE_TAG_WIDTH)
.TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
) icache_rsp_if();
assign icache_rsp_if.valid = icache_rsp_valid;
@ -108,18 +108,18 @@ module VX_pipeline #(
///////////////////////////////////////////////////////////////////////////
VX_cmt_to_csr_if #(
.SIZE ($clog2(3*`NUM_THREADS+1))
) cmt_to_csr_if();
VX_fetch_to_csr_if fetch_to_csr_if();
VX_cmt_to_csr_if cmt_to_csr_if();
VX_decode_if decode_if();
VX_branch_ctl_if branch_ctl_if();
VX_warp_ctl_if warp_ctl_if();
VX_ifetch_rsp_if ifetch_rsp_if();
VX_alu_req_if alu_req_if();
VX_lsu_req_if lsu_req_if();
VX_csr_req_if csr_req_if();
VX_csr_req_if csr_req_if();
`ifdef EXT_F_ENABLE
VX_fpu_req_if fpu_req_if();
`endif
VX_gpu_req_if gpu_req_if();
VX_writeback_if writeback_if();
VX_wstall_if wstall_if();
@ -128,7 +128,9 @@ module VX_pipeline #(
VX_commit_if ld_commit_if();
VX_commit_if st_commit_if();
VX_commit_if csr_commit_if();
`ifdef EXT_F_ENABLE
VX_commit_if fpu_commit_if();
`endif
VX_commit_if gpu_commit_if();
`ifdef PERF_ENABLE
@ -154,6 +156,7 @@ module VX_pipeline #(
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.ifetch_rsp_if (ifetch_rsp_if),
.fetch_to_csr_if(fetch_to_csr_if),
.busy (busy)
);
@ -186,7 +189,9 @@ module VX_pipeline #(
.alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
`ifdef EXT_F_ENABLE
.fpu_req_if (fpu_req_if),
`endif
.gpu_req_if (gpu_req_if)
);
@ -206,12 +211,15 @@ module VX_pipeline #(
.dcache_req_if (dcache_req_if),
.dcache_rsp_if (dcache_rsp_if),
.cmt_to_csr_if (cmt_to_csr_if),
.cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if),
.alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
`ifdef EXT_F_ENABLE
.fpu_req_if (fpu_req_if),
`endif
.gpu_req_if (gpu_req_if),
.warp_ctl_if (warp_ctl_if),
@ -220,7 +228,9 @@ module VX_pipeline #(
.ld_commit_if (ld_commit_if),
.st_commit_if (st_commit_if),
.csr_commit_if (csr_commit_if),
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if),
`endif
.gpu_commit_if (gpu_commit_if),
.busy (busy)
@ -236,7 +246,9 @@ module VX_pipeline #(
.ld_commit_if (ld_commit_if),
.st_commit_if (st_commit_if),
.csr_commit_if (csr_commit_if),
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if),
`endif
.gpu_commit_if (gpu_commit_if),
.writeback_if (writeback_if),

View file

@ -1,10 +1,16 @@
`ifndef VX_PLATFORM
`define VX_PLATFORM
`ifndef SYNTHESIS
`include "util_dpi.vh"
`endif
`include "VX_scope.vh"
///////////////////////////////////////////////////////////////////////////////
`ifndef SYNTHESIS
`ifndef NDEBUG
`define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \
x \
@ -13,9 +19,9 @@
`define DEBUG_BLOCK(x)
`endif
`define DEBUG_BEGIN /* verilator lint_off UNUSED */
`define IGNORE_UNUSED_BEGIN /* verilator lint_off UNUSED */
`define DEBUG_END /* verilator lint_on UNUSED */
`define IGNORE_UNUSED_END /* verilator lint_on UNUSED */
`define IGNORE_WARNINGS_BEGIN /* verilator lint_off UNUSED */ \
/* verilator lint_off PINCONNECTEMPTY */ \
@ -23,7 +29,8 @@
/* verilator lint_off UNOPTFLAT */ \
/* verilator lint_off UNDRIVEN */ \
/* verilator lint_off DECLFILENAME */ \
/* verilator lint_off IMPLICIT */
/* verilator lint_off IMPLICIT */ \
/* verilator lint_off IMPORTSTAR */
`define IGNORE_WARNINGS_END /* verilator lint_on UNUSED */ \
/* verilator lint_on PINCONNECTEMPTY */ \
@ -31,7 +38,8 @@
/* verilator lint_on UNOPTFLAT */ \
/* verilator lint_on UNDRIVEN */ \
/* verilator lint_on DECLFILENAME */ \
/* verilator lint_on IMPLICIT */
/* verilator lint_on IMPLICIT */ \
/* verilator lint_on IMPORTSTAR */
`define UNUSED_PARAM(x) /* verilator lint_off UNUSED */ \
localparam __``x = x; \
@ -43,7 +51,11 @@
. x () \
/* verilator lint_on PINCONNECTEMPTY */
`define STRINGIFY(x) `"x`"
`define ERROR(msg) \
$error msg
`define ASSERT(cond, msg) \
assert(cond) else $error msg
`define STATIC_ASSERT(cond, msg) \
generate \
@ -51,41 +63,61 @@
endgenerate
`define RUNTIME_ASSERT(cond, msg) \
always @(posedge clk) \
always @(posedge clk) begin \
assert(cond) else $error msg; \
end
`define TRACING_ON /* verilator tracing_on */
`define TRACING_OFF /* verilator tracing_off */
`define RESET_RELAY(signal) \
wire signal; \
VX_reset_relay __``signal ( \
.clk (clk), \
.reset (reset), \
.reset_o (signal) \
)
`else // SYNTHESIS
`define DEBUG_BLOCK(x)
`define IGNORE_UNUSED_BEGIN
`define IGNORE_UNUSED_END
`define IGNORE_WARNINGS_BEGIN
`define IGNORE_WARNINGS_END
`define UNUSED_PARAM(x)
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define ERROR(msg)
`define ASSERT(cond, msg) if (cond);
`define STATIC_ASSERT(cond, msg)
`define RUNTIME_ASSERT(cond, msg)
`define TRACING_ON
`define TRACING_OFF
`endif // SYNTHESIS
///////////////////////////////////////////////////////////////////////////////
`ifdef QUARTUS
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
`define DISABLE_BRAM (* ramstyle = "logic" *)
`define PRESERVE_REG (* preserve *)
`else
`define USE_FAST_BRAM
`define NO_RW_RAM_CHECK
`define DISABLE_BRAM
`define PRESERVE_REG
`endif
///////////////////////////////////////////////////////////////////////////////
`define STRINGIFY(x) `"x`"
`define CLOG2(x) $clog2(x)
`define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > (x)) ? 1 : 0))
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)
`define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
`define ABS(x) (($signed(x) < 0) ? (-$signed(x)) : x);
`define ABS(x) (($signed(x) < 0) ? (-$signed(x)) : (x));
`define MIN(x, y) ((x < y) ? (x) : (y))
`define MAX(x, y) ((x > y) ? (x) : (y))
`define MIN(x, y) (((x) < (y)) ? (x) : (y))
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
`define UP(x) (((x) > 0) ? x : 1)
`define SAFE_RNG(h, l) `MAX(h,l) : l
`define UP(x) (((x) > 0) ? (x) : 1)
`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)]
@ -112,25 +144,41 @@
end \
$write("}")
`define PRINT_ARRAY1D(a, m) \
$write("{"); \
`define TRACE_ARRAY1D(a, m) \
dpi_trace("{"); \
for (integer i = (m-1); i >= 0; --i) begin \
if (i != (m-1)) $write(", "); \
$write("0x%0h", a[i]); \
if (i != (m-1)) dpi_trace(", "); \
dpi_trace("0x%0h", a[i]); \
end \
$write("}"); \
dpi_trace("}"); \
`define PRINT_ARRAY2D(a, m, n) \
$write("{"); \
`define TRACE_ARRAY2D(a, m, n) \
dpi_trace("{"); \
for (integer i = n-1; i >= 0; --i) begin \
if (i != (n-1)) $write(", "); \
$write("{"); \
if (i != (n-1)) dpi_trace(", "); \
dpi_trace("{"); \
for (integer j = (m-1); j >= 0; --j) begin \
if (j != (m-1)) $write(", "); \
$write("0x%0h", a[i][j]); \
if (j != (m-1)) dpi_trace(", "); \
dpi_trace("0x%0h", a[i][j]); \
end \
$write("}"); \
dpi_trace("}"); \
end \
$write("}")
dpi_trace("}")
`define RESET_RELAY(signal) \
wire signal; \
VX_reset_relay __``signal ( \
.clk (clk), \
.reset (reset), \
.reset_o (signal) \
)
`define POP_COUNT(out, in) \
VX_popcount #( \
.N ($bits(in)) \
) __``out ( \
.in_i (in), \
.cnt_o (out) \
)
`endif

View file

@ -7,132 +7,141 @@ task print_ex_type (
input [`EX_BITS-1:0] ex_type
);
case (ex_type)
`EX_ALU: $write("ALU");
`EX_LSU: $write("LSU");
`EX_CSR: $write("CSR");
`EX_FPU: $write("FPU");
`EX_GPU: $write("GPU");
default: $write("NOP");
`EX_ALU: dpi_trace("ALU");
`EX_LSU: dpi_trace("LSU");
`EX_CSR: dpi_trace("CSR");
`EX_FPU: dpi_trace("FPU");
`EX_GPU: dpi_trace("GPU");
default: dpi_trace("NOP");
endcase
endtask
task print_ex_op (
input [`EX_BITS-1:0] ex_type,
input [`OP_BITS-1:0] op_type,
input [`MOD_BITS-1:0] op_mod
input [`INST_OP_BITS-1:0] op_type,
input [`INST_MOD_BITS-1:0] op_mod
);
case (ex_type)
`EX_ALU: begin
if (`ALU_IS_BR(op_mod)) begin
case (`BR_BITS'(op_type))
`BR_EQ: $write("BEQ");
`BR_NE: $write("BNE");
`BR_LT: $write("BLT");
`BR_GE: $write("BGE");
`BR_LTU: $write("BLTU");
`BR_GEU: $write("BGEU");
`BR_JAL: $write("JAL");
`BR_JALR: $write("JALR");
`BR_ECALL: $write("ECALL");
`BR_EBREAK:$write("EBREAK");
`BR_MRET: $write("MRET");
`BR_SRET: $write("SRET");
`BR_DRET: $write("DRET");
default: $write("?");
if (`INST_ALU_IS_BR(op_mod)) begin
case (`INST_BR_BITS'(op_type))
`INST_BR_EQ: dpi_trace("BEQ");
`INST_BR_NE: dpi_trace("BNE");
`INST_BR_LT: dpi_trace("BLT");
`INST_BR_GE: dpi_trace("BGE");
`INST_BR_LTU: dpi_trace("BLTU");
`INST_BR_GEU: dpi_trace("BGEU");
`INST_BR_JAL: dpi_trace("JAL");
`INST_BR_JALR: dpi_trace("JALR");
`INST_BR_ECALL: dpi_trace("ECALL");
`INST_BR_EBREAK:dpi_trace("EBREAK");
`INST_BR_MRET: dpi_trace("MRET");
`INST_BR_SRET: dpi_trace("SRET");
`INST_BR_DRET: dpi_trace("DRET");
default: dpi_trace("?");
endcase
end else if (`ALU_IS_MUL(op_mod)) begin
case (`MUL_BITS'(op_type))
`MUL_MUL: $write("MUL");
`MUL_MULH: $write("MULH");
`MUL_MULHSU:$write("MULHSU");
`MUL_MULHU: $write("MULHU");
`MUL_DIV: $write("DIV");
`MUL_DIVU: $write("DIVU");
`MUL_REM: $write("REM");
`MUL_REMU: $write("REMU");
default: $write("?");
end else if (`INST_ALU_IS_MUL(op_mod)) begin
case (`INST_MUL_BITS'(op_type))
`INST_MUL_MUL: dpi_trace("MUL");
`INST_MUL_MULH: dpi_trace("MULH");
`INST_MUL_MULHSU:dpi_trace("MULHSU");
`INST_MUL_MULHU: dpi_trace("MULHU");
`INST_MUL_DIV: dpi_trace("DIV");
`INST_MUL_DIVU: dpi_trace("DIVU");
`INST_MUL_REM: dpi_trace("REM");
`INST_MUL_REMU: dpi_trace("REMU");
default: dpi_trace("?");
endcase
end else begin
case (`ALU_BITS'(op_type))
`ALU_ADD: $write("ADD");
`ALU_SUB: $write("SUB");
`ALU_SLL: $write("SLL");
`ALU_SRL: $write("SRL");
`ALU_SRA: $write("SRA");
`ALU_SLT: $write("SLT");
`ALU_SLTU: $write("SLTU");
`ALU_XOR: $write("XOR");
`ALU_OR: $write("OR");
`ALU_AND: $write("AND");
`ALU_LUI: $write("LUI");
`ALU_AUIPC: $write("AUIPC");
default: $write("?");
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: dpi_trace("ADD");
`INST_ALU_SUB: dpi_trace("SUB");
`INST_ALU_SLL: dpi_trace("SLL");
`INST_ALU_SRL: dpi_trace("SRL");
`INST_ALU_SRA: dpi_trace("SRA");
`INST_ALU_SLT: dpi_trace("SLT");
`INST_ALU_SLTU: dpi_trace("SLTU");
`INST_ALU_XOR: dpi_trace("XOR");
`INST_ALU_OR: dpi_trace("OR");
`INST_ALU_AND: dpi_trace("AND");
`INST_ALU_LUI: dpi_trace("LUI");
`INST_ALU_AUIPC: dpi_trace("AUIPC");
default: dpi_trace("?");
endcase
end
end
`EX_LSU: begin
case (`LSU_BITS'(op_type))
`LSU_LB: $write("LB");
`LSU_LH: $write("LH");
`LSU_LW: $write("LW");
`LSU_LBU:$write("LBU");
`LSU_LHU:$write("LHU");
`LSU_SB: $write("SB");
`LSU_SH: $write("SH");
`LSU_SW: $write("SW");
default: $write("?");
endcase
if (op_mod == 0) begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LB: dpi_trace("LB");
`INST_LSU_LH: dpi_trace("LH");
`INST_LSU_LW: dpi_trace("LW");
`INST_LSU_LBU:dpi_trace("LBU");
`INST_LSU_LHU:dpi_trace("LHU");
`INST_LSU_SB: dpi_trace("SB");
`INST_LSU_SH: dpi_trace("SH");
`INST_LSU_SW: dpi_trace("SW");
default: dpi_trace("?");
endcase
end else if (op_mod == 1) begin
case (`INST_FENCE_BITS'(op_type))
`INST_FENCE_D: dpi_trace("DFENCE");
`INST_FENCE_I: dpi_trace("IFENCE");
default: dpi_trace("?");
endcase
end
end
`EX_CSR: begin
case (`CSR_BITS'(op_type))
`CSR_RW: $write("CSRW");
`CSR_RS: $write("CSRS");
`CSR_RC: $write("CSRC");
default: $write("?");
case (`INST_CSR_BITS'(op_type))
`INST_CSR_RW: dpi_trace("CSRW");
`INST_CSR_RS: dpi_trace("CSRS");
`INST_CSR_RC: dpi_trace("CSRC");
default: dpi_trace("?");
endcase
end
`EX_FPU: begin
case (`FPU_BITS'(op_type))
`FPU_ADD: $write("ADD");
`FPU_SUB: $write("SUB");
`FPU_MUL: $write("MUL");
`FPU_DIV: $write("DIV");
`FPU_SQRT: $write("SQRT");
`FPU_MADD: $write("MADD");
`FPU_NMSUB: $write("NMSUB");
`FPU_NMADD: $write("NMADD");
`FPU_CVTWS: $write("CVTWS");
`FPU_CVTWUS:$write("CVTWUS");
`FPU_CVTSW: $write("CVTSW");
`FPU_CVTSWU:$write("CVTSWU");
`FPU_CLASS: $write("CLASS");
`FPU_CMP: $write("CMP");
`FPU_MISC: begin
case (`INST_FPU_BITS'(op_type))
`INST_FPU_ADD: dpi_trace("ADD");
`INST_FPU_SUB: dpi_trace("SUB");
`INST_FPU_MUL: dpi_trace("MUL");
`INST_FPU_DIV: dpi_trace("DIV");
`INST_FPU_SQRT: dpi_trace("SQRT");
`INST_FPU_MADD: dpi_trace("MADD");
`INST_FPU_NMSUB: dpi_trace("NMSUB");
`INST_FPU_NMADD: dpi_trace("NMADD");
`INST_FPU_CVTWS: dpi_trace("CVTWS");
`INST_FPU_CVTWUS:dpi_trace("CVTWUS");
`INST_FPU_CVTSW: dpi_trace("CVTSW");
`INST_FPU_CVTSWU:dpi_trace("CVTSWU");
`INST_FPU_CLASS: dpi_trace("CLASS");
`INST_FPU_CMP: dpi_trace("CMP");
`INST_FPU_MISC: begin
case (op_mod)
0: $write("SGNJ");
1: $write("SGNJN");
2: $write("SGNJX");
3: $write("MIN");
4: $write("MAX");
5: $write("MVXW");
6: $write("MVWX");
0: dpi_trace("SGNJ");
1: dpi_trace("SGNJN");
2: dpi_trace("SGNJX");
3: dpi_trace("MIN");
4: dpi_trace("MAX");
5: dpi_trace("MVXW");
6: dpi_trace("MVWX");
endcase
end
default: $write("?");
default: dpi_trace("?");
endcase
end
`EX_GPU: begin
case (`GPU_BITS'(op_type))
`GPU_TMC: $write("TMC");
`GPU_WSPAWN:$write("WSPAWN");
`GPU_SPLIT: $write("SPLIT");
`GPU_JOIN: $write("JOIN");
`GPU_BAR: $write("BAR");
`GPU_TEX: $write("TEX");
default: $write("?");
case (`INST_GPU_BITS'(op_type))
`INST_GPU_TMC: dpi_trace("TMC");
`INST_GPU_WSPAWN:dpi_trace("WSPAWN");
`INST_GPU_SPLIT: dpi_trace("SPLIT");
`INST_GPU_JOIN: dpi_trace("JOIN");
`INST_GPU_BAR: dpi_trace("BAR");
`INST_GPU_PRED: dpi_trace("PRED");
`INST_GPU_TEX: dpi_trace("TEX");
default: dpi_trace("?");
endcase
end
default: $write("?");
default: dpi_trace("?");
endcase
endtask

84
hw/rtl/VX_scoreboard.sv Normal file
View file

@ -0,0 +1,84 @@
`include "VX_define.vh"
module VX_scoreboard #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
VX_ibuffer_if.scoreboard ibuffer_if,
VX_writeback_if.scoreboard writeback_if
);
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
wire reserve_reg = ibuffer_if.valid && ibuffer_if.ready && ibuffer_if.wb;
wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop;
always @(*) begin
inuse_regs_n = inuse_regs;
if (reserve_reg) begin
inuse_regs_n[ibuffer_if.wid][ibuffer_if.rd] = 1;
end
if (release_reg) begin
inuse_regs_n[writeback_if.wid][writeback_if.rd] = 0;
end
end
always @(posedge clk) begin
if (reset) begin
inuse_regs <= '0;
end else begin
inuse_regs <= inuse_regs_n;
end
end
reg deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3;
always @(posedge clk) begin
deq_inuse_rd <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rd_n];
deq_inuse_rs1 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs1_n];
deq_inuse_rs2 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs2_n];
deq_inuse_rs3 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs3_n];
end
assign writeback_if.ready = 1'b1;
assign ibuffer_if.ready = ~(deq_inuse_rd
| deq_inuse_rs1
| deq_inuse_rs2
| deq_inuse_rs3);
`UNUSED_VAR (writeback_if.PC)
reg [31:0] deadlock_ctr;
wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
always @(posedge clk) begin
if (reset) begin
deadlock_ctr <= 0;
end else begin
`ifdef DBG_PRINT_PIPELINE
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3);
end
`endif
if (release_reg) begin
`ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d",
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd));
end
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
deadlock_ctr <= deadlock_ctr + 1;
`ASSERT(deadlock_ctr < deadlock_timeout,
("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3));
end else if (ibuffer_if.valid && ibuffer_if.ready) begin
deadlock_ctr <= 0;
end
end
end
endmodule

View file

@ -1,71 +0,0 @@
`include "VX_define.vh"
module VX_scoreboard #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
VX_ibuffer_if ibuffer_if,
VX_writeback_if writeback_if,
output wire delay
);
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
reg [`NUM_REGS-1:0] deq_inuse_regs;
assign delay = |(deq_inuse_regs & ibuffer_if.used_regs);
wire reserve_reg = ibuffer_if.valid && ibuffer_if.ready && ibuffer_if.wb;
wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop;
always @(*) begin
inuse_regs_n = inuse_regs;
if (reserve_reg) begin
inuse_regs_n[ibuffer_if.wid][ibuffer_if.rd] = 1;
end
if (release_reg) begin
inuse_regs_n[writeback_if.wid][writeback_if.rd] = 0;
end
end
always @(posedge clk) begin
if (reset) begin
inuse_regs <= '0;
end else begin
inuse_regs <= inuse_regs_n;
end
deq_inuse_regs <= inuse_regs_n[ibuffer_if.wid_n];
end
reg [31:0] deadlock_ctr;
wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
always @(posedge clk) begin
if (reset) begin
deadlock_ctr <= 0;
end else begin
`ifdef DBG_PRINT_PIPELINE
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
$display("%t: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
deq_inuse_regs[ibuffer_if.rd], deq_inuse_regs[ibuffer_if.rs1], deq_inuse_regs[ibuffer_if.rs2], deq_inuse_regs[ibuffer_if.rs3]);
end
`endif
if (release_reg) begin
assert(inuse_regs[writeback_if.wid][writeback_if.rd] != 0)
else $error("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d",
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd);
end
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
deadlock_ctr <= deadlock_ctr + 1;
assert(deadlock_ctr < deadlock_timeout) else $error("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
deq_inuse_regs[ibuffer_if.rd], deq_inuse_regs[ibuffer_if.rs1], deq_inuse_regs[ibuffer_if.rs2], deq_inuse_regs[ibuffer_if.rs3]);
end else if (ibuffer_if.valid && ibuffer_if.ready) begin
deadlock_ctr <= 0;
end
end
end
endmodule

View file

@ -8,12 +8,12 @@ module VX_smem_arb #(
parameter TAG_SEL_IDX = 0,
parameter BUFFERED_REQ = 0,
parameter BUFFERED_RSP = 0,
parameter TYPE = "R",
parameter TYPE = "P",
localparam ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)),
localparam DATA_WIDTH = (8 * DATA_SIZE),
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS),
localparam TAG_OUT_WIDTH = TAG_IN_WIDTH - LOG_NUM_REQS
parameter ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)),
parameter DATA_WIDTH = (8 * DATA_SIZE),
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
parameter TAG_OUT_WIDTH = TAG_IN_WIDTH - LOG_NUM_REQS
) (
input wire clk,
input wire reset,

View file

@ -1,59 +0,0 @@
`ifndef VX_TYPES
`define VX_TYPES
`include "VX_define.vh"
typedef struct packed {
logic is_normal;
logic is_zero;
logic is_subnormal;
logic is_inf;
logic is_nan;
logic is_quiet;
logic is_signaling;
} fp_type_t;
typedef struct packed {
logic NV; // 4-Invalid
logic DZ; // 3-Divide by zero
logic OF; // 2-Overflow
logic UF; // 1-Underflow
logic NX; // 0-Inexact
} fflags_t;
`define FFG_BITS $bits(fflags_t)
typedef struct packed {
logic valid;
logic [`NUM_THREADS-1:0] tmask;
} gpu_tmc_t;
`define GPU_TMC_BITS (1+`NUM_THREADS)
typedef struct packed {
logic valid;
logic [`NUM_WARPS-1:0] wmask;
logic [31:0] pc;
} gpu_wspawn_t;
`define GPU_WSPAWN_BITS (1+`NUM_WARPS+32)
typedef struct packed {
logic valid;
logic diverged;
logic [`NUM_THREADS-1:0] then_mask;
logic [`NUM_THREADS-1:0] else_mask;
logic [31:0] pc;
} gpu_split_t;
`define GPU_SPLIT_BITS (1+1+`NUM_THREADS+`NUM_THREADS+32)
typedef struct packed {
logic valid;
logic [`NB_BITS-1:0] id;
logic [`NW_BITS-1:0] size_m1;
} gpu_barrier_t;
`define GPU_BARRIER_BITS (1+`NB_BITS+`NW_BITS)
`endif

246
hw/rtl/VX_warp_sched.sv Normal file
View file

@ -0,0 +1,246 @@
`include "VX_define.vh"
module VX_warp_sched #(
parameter CORE_ID = 0
) (
`SCOPE_IO_VX_warp_sched
input wire clk,
input wire reset,
VX_warp_ctl_if.slave warp_ctl_if,
VX_wstall_if.slave wstall_if,
VX_join_if.slave join_if,
VX_branch_ctl_if.slave branch_ctl_if,
VX_ifetch_req_if.master ifetch_req_if,
VX_fetch_to_csr_if.master fetch_to_csr_if,
output wire busy
);
`UNUSED_PARAM (CORE_ID)
wire join_else;
wire [31:0] join_pc;
wire [`NUM_THREADS-1:0] join_tmask;
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // real active warps (updated when a warp is activated or disabled)
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks;
reg [`NUM_WARPS-1:0][31:0] warp_pcs;
// barriers
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks; // warps waiting on barrier
wire reached_barrier_limit; // the expected number of warps reached the barrier
// wspawn
reg [31:0] wspawn_pc;
reg [`NUM_WARPS-1:0] use_wspawn;
wire [`NW_BITS-1:0] schedule_wid;
wire [`NUM_THREADS-1:0] schedule_tmask;
wire [31:0] schedule_pc;
wire schedule_valid;
wire warp_scheduled;
wire ifetch_req_fire = ifetch_req_if.valid && ifetch_req_if.ready;
wire tmc_active = (warp_ctl_if.tmc.tmask != 0);
always @(*) begin
active_warps_n = active_warps;
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps_n = warp_ctl_if.wspawn.wmask;
end
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
active_warps_n[warp_ctl_if.wid] = tmc_active;
end
end
always @(posedge clk) begin
if (reset) begin
barrier_masks <= 0;
use_wspawn <= 0;
stalled_warps <= 0;
warp_pcs <= '0;
active_warps <= '0;
thread_masks <= '0;
// activate first warp
warp_pcs[0] <= `STARTUP_ADDR;
active_warps[0] <= '1;
thread_masks[0] <= '1;
end else begin
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));
wspawn_pc <= warp_ctl_if.wspawn.pc;
end
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (reached_barrier_limit) begin
barrier_masks[warp_ctl_if.barrier.id] <= 0;
end else begin
barrier_masks[warp_ctl_if.barrier.id][warp_ctl_if.wid] <= 1;
end
end
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.tmask;
stalled_warps[warp_ctl_if.wid] <= 0;
end
if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (warp_ctl_if.split.diverged) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_tmask;
end
end
// Branch
if (branch_ctl_if.valid) begin
if (branch_ctl_if.taken) begin
warp_pcs[branch_ctl_if.wid] <= branch_ctl_if.dest;
end
stalled_warps[branch_ctl_if.wid] <= 0;
end
if (warp_scheduled) begin
// stall the warp until decode stage
stalled_warps[schedule_wid] <= 1;
// release wspawn
use_wspawn[schedule_wid] <= 0;
if (use_wspawn[schedule_wid]) begin
thread_masks[schedule_wid] <= 1;
end
end
if (ifetch_req_fire) begin
warp_pcs[ifetch_req_if.wid] <= ifetch_req_if.PC + 4;
end
if (wstall_if.valid) begin
stalled_warps[wstall_if.wid] <= wstall_if.stalled;
end
// join handling
if (join_if.valid) begin
if (join_else) begin
warp_pcs[join_if.wid] <= join_pc;
end
thread_masks[join_if.wid] <= join_tmask;
end
active_warps <= active_warps_n;
end
end
// export thread mask register
assign fetch_to_csr_if.thread_masks = thread_masks;
// calculate active barrier status
`IGNORE_UNUSED_BEGIN
wire [`NW_BITS:0] active_barrier_count;
`IGNORE_UNUSED_END
wire [`NUM_WARPS-1:0] barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
`POP_COUNT(active_barrier_count, barrier_mask);
assign reached_barrier_limit = (active_barrier_count[`NW_BITS-1:0] == warp_ctl_if.barrier.size_m1);
reg [`NUM_WARPS-1:0] barrier_stalls;
always @(*) begin
barrier_stalls = barrier_masks[0];
for (integer i = 1; i < `NUM_BARRIERS; ++i) begin
barrier_stalls |= barrier_masks[i];
end
end
// split/join stack management
wire [(32+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire ipdom_index [`NUM_WARPS-1:0];
for (genvar i = 0; i < `NUM_WARPS; i++) begin
wire push = warp_ctl_if.valid
&& warp_ctl_if.split.valid
&& (i == warp_ctl_if.wid);
wire pop = join_if.valid && (i == join_if.wid);
wire [`NUM_THREADS-1:0] else_tmask = warp_ctl_if.split.else_tmask;
wire [`NUM_THREADS-1:0] orig_tmask = thread_masks[warp_ctl_if.wid];
wire [(32+`NUM_THREADS)-1:0] q_else = {warp_ctl_if.split.pc, else_tmask};
wire [(32+`NUM_THREADS)-1:0] q_end = {32'b0, orig_tmask};
VX_ipdom_stack #(
.WIDTH (32+`NUM_THREADS),
.DEPTH (2 ** (`NT_BITS+1))
) ipdom_stack (
.clk (clk),
.reset (reset),
.push (push),
.pop (pop),
.pair (warp_ctl_if.split.diverged),
.q1 (q_end),
.q2 (q_else),
.d (ipdom_data[i]),
.index (ipdom_index[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (full)
);
end
assign {join_pc, join_tmask} = ipdom_data[join_if.wid];
assign join_else = ~ipdom_index[join_if.wid];
// schedule the next ready warp
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
VX_lzc #(
.N (`NUM_WARPS)
) wid_select (
.in_i (ready_warps),
.cnt_o (schedule_wid),
.valid_o (schedule_valid)
);
wire [`NUM_WARPS-1:0][(`NUM_THREADS + 32)-1:0] schedule_data;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign schedule_data[i] = {(use_wspawn[i] ? `NUM_THREADS'(1) : thread_masks[i]),
(use_wspawn[i] ? wspawn_pc : warp_pcs[i])};
end
assign {schedule_tmask, schedule_pc} = schedule_data[schedule_wid];
wire stall_out = ~ifetch_req_if.ready && ifetch_req_if.valid;
assign warp_scheduled = schedule_valid && ~stall_out;
VX_pipe_register #(
.DATAW (1 + `NUM_THREADS + 32 + `NW_BITS),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_out),
.data_in ({schedule_valid, schedule_tmask, schedule_pc, schedule_wid}),
.data_out ({ifetch_req_if.valid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid})
);
assign busy = (active_warps != 0);
`SCOPE_ASSIGN (wsched_scheduled, warp_scheduled);
`SCOPE_ASSIGN (wsched_active_warps, active_warps);
`SCOPE_ASSIGN (wsched_stalled_warps, stalled_warps);
`SCOPE_ASSIGN (wsched_schedule_wid, schedule_wid);
`SCOPE_ASSIGN (wsched_schedule_tmask, schedule_tmask);
`SCOPE_ASSIGN (wsched_schedule_pc, schedule_pc);
endmodule

View file

@ -1,254 +0,0 @@
`include "VX_define.vh"
module VX_warp_sched #(
parameter CORE_ID = 0
) (
`SCOPE_IO_VX_warp_sched
input wire clk,
input wire reset,
VX_warp_ctl_if warp_ctl_if,
VX_wstall_if wstall_if,
VX_join_if join_if,
VX_branch_ctl_if branch_ctl_if,
VX_ifetch_rsp_if ifetch_rsp_if,
VX_ifetch_req_if ifetch_req_if,
output wire busy
);
`UNUSED_PARAM (CORE_ID)
wire join_else;
wire [31:0] join_pc;
wire [`NUM_THREADS-1:0] join_tm;
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // real active warps (updated when a warp is activated or disabled)
reg [`NUM_WARPS-1:0] schedule_table, schedule_table_n; // enforces round-robin, barrier, and non-speculating branches
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
// Lock warp until instruction decode to resolve branches
reg [`NUM_WARPS-1:0] fetch_lock;
reg [`NUM_THREADS-1:0] thread_masks [`NUM_WARPS-1:0];
reg [31:0] warp_pcs [`NUM_WARPS-1:0];
// barriers
reg [`NUM_WARPS-1:0] barrier_stall_mask [`NUM_BARRIERS-1:0]; // warps waiting on barrier
wire reached_barrier_limit; // the expected number of warps reached the barrier
// wspawn
reg [31:0] use_wspawn_pc;
reg [`NUM_WARPS-1:0] use_wspawn;
reg [`NW_BITS-1:0] scheduled_warp;
wire warp_scheduled;
wire ifetch_rsp_fire = ifetch_rsp_if.valid && ifetch_rsp_if.ready;
always @(*) begin
active_warps_n = active_warps;
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps_n = warp_ctl_if.wspawn.wmask;
end
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
active_warps_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
end
end
always @(*) begin
schedule_table_n = schedule_table;
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
schedule_table_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
end
if (warp_scheduled) begin // remove scheduled warp (round-robin)
schedule_table_n[scheduled_warp] = 0;
end
end
always @(posedge clk) begin
if (reset) begin
for (integer i = 0; i < `NUM_BARRIERS; i++) begin
barrier_stall_mask[i] <= 0;
end
use_wspawn_pc <= 0;
use_wspawn <= 0;
warp_pcs[0] <= `STARTUP_ADDR;
active_warps[0] <= 1; // Activating first warp
schedule_table[0] <= 1; // set first warp as ready
thread_masks[0] <= 1; // Activating first thread in first warp
stalled_warps <= 0;
fetch_lock <= 0;
for (integer i = 1; i < `NUM_WARPS; i++) begin
warp_pcs[i] <= 0;
active_warps[i] <= 0;
schedule_table[i] <= 0;
thread_masks[i] <= 0;
end
end else begin
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));
use_wspawn_pc <= warp_ctl_if.wspawn.pc;
end
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (reached_barrier_limit) begin
barrier_stall_mask[warp_ctl_if.barrier.id] <= 0;
end else begin
barrier_stall_mask[warp_ctl_if.barrier.id][warp_ctl_if.wid] <= 1;
end
end else if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.tmask;
stalled_warps[warp_ctl_if.wid] <= 0;
end else if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (warp_ctl_if.split.diverged) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_mask;
end
end
if (use_wspawn[scheduled_warp] && warp_scheduled) begin
use_wspawn[scheduled_warp] <= 0;
thread_masks[scheduled_warp] <= 1;
end
// Stalling the scheduling of warps
if (wstall_if.valid) begin
stalled_warps[wstall_if.wid] <= 1;
end
// Branch
if (branch_ctl_if.valid) begin
if (branch_ctl_if.taken) begin
warp_pcs[branch_ctl_if.wid] <= branch_ctl_if.dest;
end
stalled_warps[branch_ctl_if.wid] <= 0;
end
// Lock warp until instruction decode to resolve branches
if (warp_scheduled) begin
fetch_lock[scheduled_warp] <= 1;
end
if (ifetch_rsp_fire) begin
fetch_lock[ifetch_rsp_if.wid] <= 0;
warp_pcs[ifetch_rsp_if.wid] <= ifetch_rsp_if.PC + 4;
end
// join handling
if (join_if.valid) begin
if (join_else) begin
warp_pcs[join_if.wid] <= join_pc;
end
thread_masks[join_if.wid] <= join_tm;
end
active_warps <= active_warps_n;
// reset 'schedule_table' when it goes to zero
schedule_table <= (| schedule_table_n) ? schedule_table_n : active_warps_n;
end
end
// calculate active barrier status
`IGNORE_WARNINGS_BEGIN
wire [`NW_BITS:0] active_barrier_count;
`IGNORE_WARNINGS_END
assign active_barrier_count = $countones(barrier_stall_mask[warp_ctl_if.barrier.id]);
assign reached_barrier_limit = (active_barrier_count[`NW_BITS-1:0] == warp_ctl_if.barrier.size_m1);
reg [`NUM_WARPS-1:0] total_barrier_stall;
always @(*) begin
total_barrier_stall = barrier_stall_mask[0];
for (integer i = 1; i < `NUM_BARRIERS; ++i) begin
total_barrier_stall |= barrier_stall_mask[i];
end
end
// split/join stack management
wire [(1+32+`NUM_THREADS-1):0] ipdom [`NUM_WARPS-1:0];
for (genvar i = 0; i < `NUM_WARPS; i++) begin
wire push = warp_ctl_if.valid
&& warp_ctl_if.split.valid
&& (i == warp_ctl_if.wid);
wire pop = join_if.valid && (i == join_if.wid);
wire [`NUM_THREADS-1:0] else_mask = warp_ctl_if.split.diverged ? warp_ctl_if.split.else_mask : thread_masks[warp_ctl_if.wid];
wire [(1+32+`NUM_THREADS-1):0] q_end = {1'b0, 32'b0, thread_masks[warp_ctl_if.wid]};
wire [(1+32+`NUM_THREADS-1):0] q_else = {1'b1, warp_ctl_if.split.pc, else_mask};
VX_ipdom_stack #(
.WIDTH (1+32+`NUM_THREADS),
.DEPTH (2 ** (`NT_BITS+1))
) ipdom_stack (
.clk (clk),
.reset (reset),
.push (push),
.pop (pop),
.q1 (q_end),
.q2 (q_else),
.d (ipdom[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (full)
);
end
assign {join_else, join_pc, join_tm} = ipdom [join_if.wid];
// calculate next warp schedule
reg [`NUM_THREADS-1:0] thread_mask;
reg schedule_valid;
reg [31:0] warp_pc;
wire [`NUM_WARPS-1:0] schedule_ready = schedule_table & ~(stalled_warps | total_barrier_stall | fetch_lock);
always @(*) begin
schedule_valid = 0;
thread_mask = 'x;
warp_pc = 'x;
scheduled_warp = 'x;
for (integer i = 0; i < `NUM_WARPS; ++i) begin
if (schedule_ready[i]) begin
schedule_valid = 1;
thread_mask = use_wspawn[i] ? `NUM_THREADS'(1) : thread_masks[i];
warp_pc = use_wspawn[i] ? use_wspawn_pc : warp_pcs[i];
scheduled_warp = `NW_BITS'(i);
break;
end
end
end
wire stall_out = ~ifetch_req_if.ready && ifetch_req_if.valid;
assign warp_scheduled = schedule_valid && ~stall_out;
VX_pipe_register #(
.DATAW (1 + `NUM_THREADS + 32 + `NW_BITS),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_out),
.data_in ({warp_scheduled, thread_mask, warp_pc, scheduled_warp}),
.data_out ({ifetch_req_if.valid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid})
);
assign busy = (active_warps != 0);
`SCOPE_ASSIGN (wsched_scheduled_warp, warp_scheduled);
`SCOPE_ASSIGN (wsched_active_warps, active_warps);
`SCOPE_ASSIGN (wsched_schedule_table, schedule_table);
`SCOPE_ASSIGN (wsched_schedule_ready, schedule_ready);
`SCOPE_ASSIGN (wsched_warp_to_schedule, scheduled_warp);
`SCOPE_ASSIGN (wsched_warp_pc, warp_pc);
endmodule

130
hw/rtl/VX_writeback.sv Normal file
View file

@ -0,0 +1,130 @@
`include "VX_define.vh"
module VX_writeback #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if.slave alu_commit_if,
VX_commit_if.slave ld_commit_if,
VX_commit_if.slave csr_commit_if,
VX_commit_if.slave csr_commit_if,
`ifdef EXT_F_ENABLE
VX_commit_if.slave fpu_commit_if,
`endif
VX_commit_if.slave gpu_commit_if,
// outputs
VX_writeback_if.master writeback_if
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1;
`ifdef EXT_F_ENABLE
`ifdef EXT_TEX_ENABLE
localparam NUM_RSPS = 5;
`else
localparam NUM_RSPS = 4;
`endif
`else
`ifdef EXT_TEX_ENABLE
localparam NUM_RSPS = 4;
`else
localparam NUM_RSPS = 3;
`endif
`endif
wire wb_valid;
wire [`NW_BITS-1:0] wb_wid;
wire [31:0] wb_PC;
wire [`NUM_THREADS-1:0] wb_tmask;
wire [`NR_BITS-1:0] wb_rd;
wire [`NUM_THREADS-1:0][31:0] wb_data;
wire wb_eop;
wire [NUM_RSPS-1:0] rsp_valid;
wire [NUM_RSPS-1:0][DATAW-1:0] rsp_data;
wire [NUM_RSPS-1:0] rsp_ready;
wire stall;
assign rsp_valid = {
csr_commit_if.valid && csr_commit_if.wb,
alu_commit_if.valid && alu_commit_if.wb,
`ifdef EXT_F_ENABLE
fpu_commit_if.valid && fpu_commit_if.wb,
`endif
ld_commit_if.valid && ld_commit_if.wb,
`ifdef EXT_TEX_ENABLE
gpu_commit_if.valid && gpu_commit_if.wb,
`ifend
};
assign rsp_data = {
{csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop},
{alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop},
`ifdef EXT_F_ENABLE
{fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop},
`endif
{ ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop},
`ifdef EXT_TEX_ENABLE
{gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop},
`endif
};
VX_stream_arbiter #(
.NUM_REQS (NUM_RSPS),
.DATAW (DATAW),
.TYPE ("P")
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_valid),
.data_in (rsp_data),
.ready_in (rsp_ready),
.valid_out (wb_valid),
.data_out ({wb_wid, wb_PC, wb_tmask, wb_rd, wb_data, wb_eop}),
.ready_out (~stall)
);
assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb;
`ifdef EXT_F_ENABLE
assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb;
assign alu_commit_if.ready = rsp_ready[2] || ~alu_commit_if.wb;
assign csr_commit_if.ready = rsp_ready[3] || ~csr_commit_if.wb;
`ifdef EXT_TEX_ENABLE
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
`endif
`else
assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb;
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
`ifdef EXT_TEX_ENABLE
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
`endif
`endif
assign stall = ~writeback_if.ready && writeback_if.valid;
VX_pipe_register #(
.DATAW (1 + DATAW),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({wb_valid, wb_wid, wb_PC, wb_tmask, wb_rd, wb_data, wb_eop}),
.data_out ({writeback_if.valid, writeback_if.wid, writeback_if.PC, writeback_if.tmask, writeback_if.rd, writeback_if.data, writeback_if.eop})
);
// special workaround to get RISC-V tests Pass/Fail status
reg [31:0] last_wb_value [`NUM_REGS-1:0] /* verilator public */;
always @(posedge clk) begin
if (writeback_if.valid && writeback_if.ready) begin
last_wb_value[writeback_if.rd] <= writeback_if.data[0];
end
end
endmodule

View file

@ -1,90 +0,0 @@
`include "VX_define.vh"
module VX_writeback #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if alu_commit_if,
VX_commit_if ld_commit_if,
VX_commit_if csr_commit_if,
VX_commit_if fpu_commit_if,
VX_commit_if gpu_commit_if,
// outputs
VX_writeback_if writeback_if
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1;
wire wb_valid;
wire [`NW_BITS-1:0] wb_wid;
wire [31:0] wb_PC;
wire [`NUM_THREADS-1:0] wb_tmask;
wire [`NR_BITS-1:0] wb_rd;
wire [`NUM_THREADS-1:0][31:0] wb_data;
wire wb_eop;
wire [4:0][DATAW-1:0] rsp_data;
wire [4:0] rsp_ready;
wire stall;
wire ld_valid = ld_commit_if.valid && ld_commit_if.wb;
wire fpu_valid = fpu_commit_if.valid && fpu_commit_if.wb;
wire csr_valid = csr_commit_if.valid && csr_commit_if.wb;
wire alu_valid = alu_commit_if.valid && alu_commit_if.wb;
wire gpu_valid = gpu_commit_if.valid && gpu_commit_if.wb;
assign rsp_data[0] = { ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop};
assign rsp_data[1] = {fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop};
assign rsp_data[2] = {csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop};
assign rsp_data[3] = {alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop};
assign rsp_data[4] = {gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop};
VX_stream_arbiter #(
.NUM_REQS (5),
.DATAW (DATAW),
.TYPE ("X")
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in ({gpu_valid, alu_valid, csr_valid, fpu_valid, ld_valid}),
.data_in (rsp_data),
.ready_in (rsp_ready),
.valid_out (wb_valid),
.data_out ({wb_wid, wb_PC, wb_tmask, wb_rd, wb_data, wb_eop}),
.ready_out (~stall)
);
assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb;
assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb;
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
assign alu_commit_if.ready = rsp_ready[3] || ~alu_commit_if.wb;
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
assign stall = ~writeback_if.ready && writeback_if.valid;
VX_pipe_register #(
.DATAW (1 + DATAW),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({wb_valid, wb_wid, wb_PC, wb_tmask, wb_rd, wb_data, wb_eop}),
.data_out ({writeback_if.valid, writeback_if.wid, writeback_if.PC, writeback_if.tmask, writeback_if.rd, writeback_if.data, writeback_if.eop})
);
// special workaround to get RISC-V tests Pass/Fail status
reg [31:0] last_wb_value [`NUM_REGS-1:0] /* verilator public */;
always @(posedge clk) begin
if (writeback_if.valid && writeback_if.ready) begin
last_wb_value[writeback_if.rd] <= writeback_if.data[0];
end
end
endmodule

View file

@ -29,15 +29,15 @@ module Vortex (
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_valid;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_rw;
wire [`NUM_CLUSTERS-1:0][`L2MEM_BYTEEN_WIDTH-1:0] per_cluster_mem_req_byteen;
wire [`NUM_CLUSTERS-1:0][`L2MEM_ADDR_WIDTH-1:0] per_cluster_mem_req_addr;
wire [`NUM_CLUSTERS-1:0][`L2MEM_DATA_WIDTH-1:0] per_cluster_mem_req_data;
wire [`NUM_CLUSTERS-1:0][`L2MEM_TAG_WIDTH-1:0] per_cluster_mem_req_tag;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_BYTEEN_WIDTH-1:0] per_cluster_mem_req_byteen;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_ADDR_WIDTH-1:0] per_cluster_mem_req_addr;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_DATA_WIDTH-1:0] per_cluster_mem_req_data;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_TAG_WIDTH-1:0] per_cluster_mem_req_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_valid;
wire [`NUM_CLUSTERS-1:0][`L2MEM_DATA_WIDTH-1:0] per_cluster_mem_rsp_data;
wire [`NUM_CLUSTERS-1:0][`L2MEM_TAG_WIDTH-1:0] per_cluster_mem_rsp_tag;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_DATA_WIDTH-1:0] per_cluster_mem_rsp_data;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_TAG_WIDTH-1:0] per_cluster_mem_rsp_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
@ -81,21 +81,22 @@ module Vortex (
`RESET_RELAY (l3_reset);
VX_cache #(
.CACHE_ID (`L3CACHE_ID),
.CACHE_SIZE (`L3CACHE_SIZE),
.CACHE_LINE_SIZE (`L3CACHE_LINE_SIZE),
.NUM_BANKS (`L3NUM_BANKS),
.WORD_SIZE (`L3WORD_SIZE),
.NUM_REQS (`L3NUM_REQS),
.CREQ_SIZE (`L3CREQ_SIZE),
.CRSQ_SIZE (`L3CRSQ_SIZE),
.MSHR_SIZE (`L3MSHR_SIZE),
.MRSQ_SIZE (`L3MRSQ_SIZE),
.MREQ_SIZE (`L3MREQ_SIZE),
.CACHE_ID (`L3_CACHE_ID),
.CACHE_SIZE (`L3_CACHE_SIZE),
.CACHE_LINE_SIZE (`L3_CACHE_LINE_SIZE),
.NUM_BANKS (`L3_NUM_BANKS),
.NUM_PORTS (`L3_NUM_PORTS),
.WORD_SIZE (`L3_WORD_SIZE),
.NUM_REQS (`L3_NUM_REQS),
.CREQ_SIZE (`L3_CREQ_SIZE),
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
.MREQ_SIZE (`L3_MREQ_SIZE),
.WRITE_ENABLE (1),
.CORE_TAG_WIDTH (`L2MEM_TAG_WIDTH),
.CORE_TAG_WIDTH (`L2_MEM_TAG_WIDTH),
.CORE_TAG_ID_BITS (0),
.MEM_TAG_WIDTH (`L3MEM_TAG_WIDTH),
.MEM_TAG_WIDTH (`L3_MEM_TAG_WIDTH),
.NC_ENABLE (1)
) l3cache (
`SCOPE_BIND_Vortex_l3cache
@ -141,16 +142,19 @@ module Vortex (
end else begin
`RESET_RELAY (mem_arb_reset);
VX_mem_arb #(
.NUM_REQS (`NUM_CLUSTERS),
.DATA_WIDTH (`L3MEM_DATA_WIDTH),
.ADDR_WIDTH (`L3MEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`L2MEM_TAG_WIDTH),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
.NUM_REQS (`NUM_CLUSTERS),
.DATA_WIDTH (`L3_MEM_DATA_WIDTH),
.ADDR_WIDTH (`L3_MEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`L2_MEM_TAG_WIDTH),
.TYPE ("R"),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) mem_arb (
.clk (clk),
.reset (reset),
.reset (mem_arb_reset),
// Core request
.req_valid_in (per_cluster_mem_req_valid),
@ -201,12 +205,12 @@ module Vortex (
always @(posedge clk) begin
if (mem_req_valid && mem_req_ready) begin
if (mem_req_rw)
$display("%t: MEM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data);
dpi_trace("%d: MEM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data);
else
$display("%t: MEM Rd Req: addr=%0h, tag=%0h, byteen=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen);
dpi_trace("%d: MEM Rd Req: addr=%0h, tag=%0h, byteen=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen);
end
if (mem_rsp_valid && mem_rsp_ready) begin
$display("%t: MEM Rsp: tag=%0h, data=%0h", $time, mem_rsp_tag, mem_rsp_data);
dpi_trace("%d: MEM Rsp: tag=%0h, data=%0h\n", $time, mem_rsp_tag, mem_rsp_data);
end
end
`endif

165
hw/rtl/Vortex_axi.sv Normal file
View file

@ -0,0 +1,165 @@
`include "VX_define.vh"
module Vortex_axi #(
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
parameter AXI_ADDR_WIDTH = 32,
parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH,
parameter AXI_STROBE_WIDTH = (`VX_MEM_DATA_WIDTH / 8)
)(
// Clock
input wire clk,
input wire reset,
// AXI write request address channel
output wire [AXI_TID_WIDTH-1:0] m_axi_awid,
output wire [AXI_ADDR_WIDTH-1:0] m_axi_awaddr,
output wire [7:0] m_axi_awlen,
output wire [2:0] m_axi_awsize,
output wire [1:0] m_axi_awburst,
output wire m_axi_awlock,
output wire [3:0] m_axi_awcache,
output wire [2:0] m_axi_awprot,
output wire [3:0] m_axi_awqos,
output wire m_axi_awvalid,
input wire m_axi_awready,
// AXI write request data channel
output wire [AXI_DATA_WIDTH-1:0] m_axi_wdata,
output wire [AXI_STROBE_WIDTH-1:0] m_axi_wstrb,
output wire m_axi_wlast,
output wire m_axi_wvalid,
input wire m_axi_wready,
// AXI write response channel
input wire [AXI_TID_WIDTH-1:0] m_axi_bid,
input wire [1:0] m_axi_bresp,
input wire m_axi_bvalid,
output wire m_axi_bready,
// AXI read request channel
output wire [AXI_TID_WIDTH-1:0] m_axi_arid,
output wire [AXI_ADDR_WIDTH-1:0] m_axi_araddr,
output wire [7:0] m_axi_arlen,
output wire [2:0] m_axi_arsize,
output wire [1:0] m_axi_arburst,
output wire m_axi_arlock,
output wire [3:0] m_axi_arcache,
output wire [2:0] m_axi_arprot,
output wire [3:0] m_axi_arqos,
output wire m_axi_arvalid,
input wire m_axi_arready,
// AXI read response channel
input wire [AXI_TID_WIDTH-1:0] m_axi_rid,
input wire [AXI_DATA_WIDTH-1:0] m_axi_rdata,
input wire [1:0] m_axi_rresp,
input wire m_axi_rlast,
input wire m_axi_rvalid,
output wire m_axi_rready,
// Status
output wire busy
);
wire mem_req_valid;
wire mem_req_rw;
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen;
wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr;
wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data;
wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data;
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_ready;
VX_axi_adapter #(
.VX_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.VX_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
.VX_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.VX_BYTEEN_WIDTH (AXI_STROBE_WIDTH),
.AXI_DATA_WIDTH (AXI_DATA_WIDTH),
.AXI_ADDR_WIDTH (AXI_ADDR_WIDTH),
.AXI_TID_WIDTH (AXI_TID_WIDTH),
.AXI_STROBE_WIDTH (AXI_STROBE_WIDTH)
) axi_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready),
.m_axi_awid (m_axi_awid),
.m_axi_awaddr (m_axi_awaddr),
.m_axi_awlen (m_axi_awlen),
.m_axi_awsize (m_axi_awsize),
.m_axi_awburst (m_axi_awburst),
.m_axi_awlock (m_axi_awlock),
.m_axi_awcache (m_axi_awcache),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_wdata (m_axi_wdata),
.m_axi_wstrb (m_axi_wstrb),
.m_axi_wlast (m_axi_wlast),
.m_axi_wvalid (m_axi_wvalid),
.m_axi_wready (m_axi_wready),
.m_axi_bid (m_axi_bid),
.m_axi_bresp (m_axi_bresp),
.m_axi_bvalid (m_axi_bvalid),
.m_axi_bready (m_axi_bready),
.m_axi_arid (m_axi_arid),
.m_axi_araddr (m_axi_araddr),
.m_axi_arlen (m_axi_arlen),
.m_axi_arsize (m_axi_arsize),
.m_axi_arburst (m_axi_arburst),
.m_axi_arlock (m_axi_arlock),
.m_axi_arcache (m_axi_arcache),
.m_axi_arprot (m_axi_arprot),
.m_axi_arqos (m_axi_arqos),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_rid (m_axi_rid),
.m_axi_rdata (m_axi_rdata),
.m_axi_rresp (m_axi_rresp),
.m_axi_rlast (m_axi_rlast),
.m_axi_rvalid (m_axi_rvalid),
.m_axi_rready (m_axi_rready)
);
Vortex vortex (
.clk (clk),
.reset (reset),
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready),
.busy (busy)
);
endmodule

View file

@ -1,17 +1,15 @@
`include "VX_define.vh"
module VX_avs_wrapper #(
parameter NUM_BANKS = 1,
module VX_avs_wrapper #(
parameter AVS_DATA_WIDTH = 1,
parameter AVS_ADDR_WIDTH = 1,
parameter AVS_BURST_WIDTH = 1,
parameter AVS_BANKS = 1,
parameter AVS_BANKS = 1,
parameter REQ_TAG_WIDTH = 1,
parameter RD_QUEUE_SIZE = 1,
parameter AVS_BYTEENW = (AVS_DATA_WIDTH / 8),
parameter RD_QUEUE_ADDR_WIDTH = $clog2(RD_QUEUE_SIZE+1),
parameter AVS_BANKS_BITS = $clog2(AVS_BANKS)
parameter RD_QUEUE_ADDR_WIDTH = $clog2(RD_QUEUE_SIZE+1)
) (
input wire clk,
input wire reset,
@ -32,47 +30,46 @@ module VX_avs_wrapper #(
input wire mem_rsp_ready,
// AVS bus
output wire [AVS_DATA_WIDTH-1:0] avs_writedata [NUM_BANKS],
input wire [AVS_DATA_WIDTH-1:0] avs_readdata [NUM_BANKS],
output wire [AVS_ADDR_WIDTH-1:0] avs_address [NUM_BANKS],
input wire avs_waitrequest [NUM_BANKS],
output wire avs_write [NUM_BANKS],
output wire avs_read [NUM_BANKS],
output wire [AVS_BYTEENW-1:0] avs_byteenable [NUM_BANKS],
output wire [AVS_BURST_WIDTH-1:0] avs_burstcount [NUM_BANKS],
input avs_readdatavalid [NUM_BANKS]
output wire [AVS_DATA_WIDTH-1:0] avs_writedata [AVS_BANKS],
input wire [AVS_DATA_WIDTH-1:0] avs_readdata [AVS_BANKS],
output wire [AVS_ADDR_WIDTH-1:0] avs_address [AVS_BANKS],
input wire avs_waitrequest [AVS_BANKS],
output wire avs_write [AVS_BANKS],
output wire avs_read [AVS_BANKS],
output wire [AVS_BYTEENW-1:0] avs_byteenable [AVS_BANKS],
output wire [AVS_BURST_WIDTH-1:0] avs_burstcount [AVS_BANKS],
input avs_readdatavalid [AVS_BANKS]
);
localparam BANK_ADDRW = `LOG2UP(NUM_BANKS);
localparam OUTPUT_REG = (NUM_BANKS > 2);
localparam BANK_ADDRW = `LOG2UP(AVS_BANKS);
// Requests handling
wire [NUM_BANKS-1:0] avs_reqq_push, avs_reqq_pop, avs_reqq_ready;
wire [NUM_BANKS-1:0][REQ_TAG_WIDTH-1:0] avs_reqq_tag_out;
wire [NUM_BANKS-1:0] req_queue_going_full;
wire [NUM_BANKS-1:0][RD_QUEUE_ADDR_WIDTH-1:0] req_queue_size;
wire [AVS_BANKS-1:0] avs_reqq_push, avs_reqq_pop, avs_reqq_ready;
wire [AVS_BANKS-1:0][REQ_TAG_WIDTH-1:0] avs_reqq_tag_out;
wire [AVS_BANKS-1:0] req_queue_going_full;
wire [AVS_BANKS-1:0][RD_QUEUE_ADDR_WIDTH-1:0] req_queue_size;
wire [BANK_ADDRW-1:0] req_bank_sel;
if (NUM_BANKS >= 2) begin
if (AVS_BANKS >= 2) begin
assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0];
end else begin
assign req_bank_sel = 0;
end
for (genvar i = 0; i < NUM_BANKS; i++) begin
for (genvar i = 0; i < AVS_BANKS; i++) begin
assign avs_reqq_ready[i] = !req_queue_going_full[i] && !avs_waitrequest[i];
assign avs_reqq_push[i] = mem_req_valid && !mem_req_rw && avs_reqq_ready[i] && (req_bank_sel == i);
end
for (genvar i = 0; i < NUM_BANKS; i++) begin
for (genvar i = 0; i < AVS_BANKS; i++) begin
VX_pending_size #(
.SIZE (RD_QUEUE_SIZE)
) pending_size (
.clk (clk),
.reset (reset),
.push (avs_reqq_push[i]),
.pop (avs_reqq_pop[i]),
.incr (avs_reqq_push[i]),
.decr (avs_reqq_pop[i]),
.full (req_queue_going_full[i]),
.size (req_queue_size[i]),
`UNUSED_PIN (empty)
@ -80,9 +77,8 @@ module VX_avs_wrapper #(
`UNUSED_VAR (req_queue_size)
VX_fifo_queue #(
.DATAW (REQ_TAG_WIDTH),
.SIZE (RD_QUEUE_SIZE),
.OUTPUT_REG (!OUTPUT_REG)
.DATAW (REQ_TAG_WIDTH),
.SIZE (RD_QUEUE_SIZE)
) rd_req_queue (
.clk (clk),
.reset (reset),
@ -98,7 +94,7 @@ module VX_avs_wrapper #(
);
end
for (genvar i = 0; i < NUM_BANKS; i++) begin
for (genvar i = 0; i < AVS_BANKS; i++) begin
assign avs_read[i] = mem_req_valid && !mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
assign avs_write[i] = mem_req_valid && mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
assign avs_address[i] = mem_req_addr;
@ -107,7 +103,7 @@ module VX_avs_wrapper #(
assign avs_burstcount[i] = AVS_BURST_WIDTH'(1);
end
if (NUM_BANKS >= 2) begin
if (AVS_BANKS >= 2) begin
assign mem_req_ready = avs_reqq_ready[req_bank_sel];
end else begin
assign mem_req_ready = avs_reqq_ready;
@ -115,18 +111,17 @@ module VX_avs_wrapper #(
// Responses handling
wire [NUM_BANKS-1:0] rsp_arb_valid_in;
wire [NUM_BANKS-1:0][AVS_DATA_WIDTH+REQ_TAG_WIDTH-1:0] rsp_arb_data_in;
wire [NUM_BANKS-1:0] rsp_arb_ready_in;
wire [AVS_BANKS-1:0] rsp_arb_valid_in;
wire [AVS_BANKS-1:0][AVS_DATA_WIDTH+REQ_TAG_WIDTH-1:0] rsp_arb_data_in;
wire [AVS_BANKS-1:0] rsp_arb_ready_in;
wire [NUM_BANKS-1:0][AVS_DATA_WIDTH-1:0] avs_rspq_data_out;
wire [NUM_BANKS-1:0] avs_rspq_empty;
wire [AVS_BANKS-1:0][AVS_DATA_WIDTH-1:0] avs_rspq_data_out;
wire [AVS_BANKS-1:0] avs_rspq_empty;
for (genvar i = 0; i < NUM_BANKS; i++) begin
for (genvar i = 0; i < AVS_BANKS; i++) begin
VX_fifo_queue #(
.DATAW (AVS_DATA_WIDTH),
.SIZE (RD_QUEUE_SIZE),
.OUTPUT_REG (!OUTPUT_REG)
.DATAW (AVS_DATA_WIDTH),
.SIZE (RD_QUEUE_SIZE)
) rd_rsp_queue (
.clk (clk),
.reset (reset),
@ -140,18 +135,18 @@ module VX_avs_wrapper #(
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
end
for (genvar i = 0; i < NUM_BANKS; i++) begin
for (genvar i = 0; i < AVS_BANKS; i++) begin
assign rsp_arb_valid_in[i] = !avs_rspq_empty[i];
assign rsp_arb_data_in[i] = {avs_rspq_data_out[i], avs_reqq_tag_out[i]};
assign avs_reqq_pop[i] = rsp_arb_valid_in[i] && rsp_arb_ready_in[i];
end
VX_stream_arbiter #(
.NUM_REQS (NUM_BANKS),
.NUM_REQS (AVS_BANKS),
.DATAW (AVS_DATA_WIDTH + REQ_TAG_WIDTH),
.BUFFERED (OUTPUT_REG ? 1 : 0)
.TYPE ("R")
) rsp_arb (
.clk (clk),
.reset (reset),
@ -166,13 +161,14 @@ module VX_avs_wrapper #(
`ifdef DBG_PRINT_AVS
always @(posedge clk) begin
if (mem_req_valid && mem_req_ready) begin
if (mem_req_rw)
$display("%t: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, mem_req_data);
else
$display("%t: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, req_queue_size);
if (mem_req_rw) begin
dpi_trace("%d: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, mem_req_data);
end else begin
dpi_trace("%d: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, req_queue_size);
end
end
if (mem_rsp_valid && mem_rsp_ready) begin
$display("%t: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d", $time, mem_rsp_tag, mem_rsp_data, req_queue_size);
dpi_trace("%d: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d\n", $time, mem_rsp_tag, mem_rsp_data, req_queue_size);
end
end
`endif

View file

@ -47,9 +47,10 @@ localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE);
localparam AVS_RD_QUEUE_SIZE = 4;
localparam _AVS_REQ_TAGW_VX = `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, _AVS_REQ_TAGW_VX);
localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH));
localparam AVS_REQ_TAGW_VX_ = `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, AVS_REQ_TAGW_VX_);
localparam AVS_REQ_TAGW_CCI_ = CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH);
localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, AVS_REQ_TAGW_CCI_);
localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI);
localparam CCI_RD_WINDOW_SIZE = 8;
@ -94,7 +95,7 @@ localparam STATE_WIDTH = $clog2(STATE_MAX_VALUE);
wire [127:0] afu_id = `AFU_ACCEL_UUID;
wire [63:0] dev_caps = {16'(`NUM_THREADS), 16'(`NUM_WARPS), 16'(`NUM_CORES), 16'(`IMPLEMENTATION_ID)};
wire [63:0] dev_caps = {16'(`NUM_THREADS), 16'(`NUM_WARPS), 16'(`NUM_CORES * `NUM_CLUSTERS), 16'(`IMPLEMENTATION_ID)};
reg [STATE_WIDTH-1:0] state;
@ -131,9 +132,9 @@ wire cmd_scope_write;
// MMIO controller ////////////////////////////////////////////////////////////
`IGNORE_WARNINGS_BEGIN
`IGNORE_UNUSED_BEGIN
t_ccip_c0_ReqMmioHdr mmio_hdr;
`IGNORE_WARNINGS_END
`IGNORE_UNUSED_END
assign mmio_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr);
`STATIC_ASSERT(($bits(t_ccip_c0_ReqMmioHdr)-$bits(mmio_hdr.address)) == 12, ("Oops!"))
@ -150,21 +151,6 @@ assign cmd_scope_write = cp2af_sRxPort.c0.mmioWrValid && (MMIO_SCOPE_WRITE == mm
wire [COUT_QUEUE_DATAW-1:0] cout_q_dout;
wire cout_q_full, cout_q_empty;
/*
`DEBUG_BEGIN
wire cp2af_sRxPort_c0_mmioWrValid = cp2af_sRxPort.c0.mmioWrValid;
wire cp2af_sRxPort_c0_mmioRdValid = cp2af_sRxPort.c0.mmioRdValid;
wire cp2af_sRxPort_c0_rspValid = cp2af_sRxPort.c0.rspValid;
wire cp2af_sRxPort_c1_rspValid = cp2af_sRxPort.c1.rspValid;
wire cp2af_sRxPort_c0TxAlmFull = cp2af_sRxPort.c0TxAlmFull;
wire cp2af_sRxPort_c1TxAlmFull = cp2af_sRxPort.c1TxAlmFull;
wire[$bits(mmio_hdr.address)-1:0] mmio_hdr_address = mmio_hdr.address;
wire[$bits(mmio_hdr.length)-1:0] mmio_hdr_length = mmio_hdr.length;
wire[$bits(mmio_hdr.tid)-1:0] mmio_hdr_tid = mmio_hdr.tid;
wire[$bits(cp2af_sRxPort.c0.hdr.mdata)-1:0] cp2af_sRxPort_c0_hdr_mdata = cp2af_sRxPort.c0.hdr.mdata;
`DEBUG_END
*/
wire [2:0] cmd_type = (cp2af_sRxPort.c0.mmioWrValid
&& (MMIO_CMD_TYPE == mmio_hdr.address)) ? 3'(cp2af_sRxPort.c0.data) : 3'h0;
@ -201,36 +187,36 @@ always @(posedge clk) begin
MMIO_IO_ADDR: begin
cmd_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_IO_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, t_ccip_clAddr'(cp2af_sRxPort.c0.data));
dpi_trace("%d: MMIO_IO_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, t_ccip_clAddr'(cp2af_sRxPort.c0.data));
`endif
end
MMIO_MEM_ADDR: begin
cmd_mem_addr <= $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_MEM_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data));
dpi_trace("%d: MMIO_MEM_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data));
`endif
end
MMIO_DATA_SIZE: begin
cmd_data_size <= $bits(cmd_data_size)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_DATA_SIZE: addr=%0h, data=%0d", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
dpi_trace("%d: MMIO_DATA_SIZE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
`endif
end
MMIO_CMD_TYPE: begin
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_CMD_TYPE: addr=%0h, data=%0d", $time, mmio_hdr.address, $bits(cmd_type)'(cp2af_sRxPort.c0.data));
dpi_trace("%d: MMIO_CMD_TYPE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_type)'(cp2af_sRxPort.c0.data));
`endif
end
`ifdef SCOPE
MMIO_SCOPE_WRITE: begin
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_SCOPE_WRITE: addr=%0h, data=%0h", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data));
dpi_trace("%d: MMIO_SCOPE_WRITE: addr=%0h, data=%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data));
`endif
end
`endif
default: begin
`ifdef DBG_PRINT_OPAE
$display("%t: Unknown MMIO Wr: addr=%0h, data=%0h", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
dpi_trace("%d: Unknown MMIO Wr: addr=%0h, data=%0h\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
`endif
end
endcase
@ -258,7 +244,7 @@ always @(posedge clk) begin
mmio_tx.data <= 64'({cout_q_dout, !cout_q_empty, 8'(state)});
`ifdef DBG_PRINT_OPAE
if (state != STATE_WIDTH'(mmio_tx.data)) begin
$display("%t: MMIO_STATUS: addr=%0h, state=%0d", $time, mmio_hdr.address, state);
dpi_trace("%d: MMIO_STATUS: addr=%0h, state=%0d\n", $time, mmio_hdr.address, state);
end
`endif
end
@ -266,20 +252,20 @@ always @(posedge clk) begin
MMIO_SCOPE_READ: begin
mmio_tx.data <= cmd_scope_rdata;
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_SCOPE_READ: addr=%0h, data=%0h", $time, mmio_hdr.address, cmd_scope_rdata);
dpi_trace("%d: MMIO_SCOPE_READ: addr=%0h, data=%0h\n", $time, mmio_hdr.address, cmd_scope_rdata);
`endif
end
`endif
MMIO_DEV_CAPS: begin
mmio_tx.data <= dev_caps;
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_DEV_CAPS: addr=%0h, data=%0h", $time, mmio_hdr.address, dev_caps);
dpi_trace("%d: MMIO_DEV_CAPS: addr=%0h, data=%0h\n", $time, mmio_hdr.address, dev_caps);
`endif
end
default: begin
mmio_tx.data <= 64'h0;
`ifdef DBG_PRINT_OPAE
$display("%t: Unknown MMIO Rd: addr=%0h", $time, mmio_hdr.address);
dpi_trace("%d: Unknown MMIO Rd: addr=%0h\n", $time, mmio_hdr.address);
`endif
end
endcase
@ -313,19 +299,19 @@ always @(posedge clk) begin
case (cmd_type)
CMD_MEM_READ: begin
`ifdef DBG_PRINT_OPAE
$display("%t: STATE READ: ia=%0h addr=%0h size=%0d", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
dpi_trace("%d: STATE READ: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
`endif
state <= STATE_READ;
end
CMD_MEM_WRITE: begin
`ifdef DBG_PRINT_OPAE
$display("%t: STATE WRITE: ia=%0h addr=%0h size=%0d", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
dpi_trace("%d: STATE WRITE: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
`endif
state <= STATE_WRITE;
end
CMD_RUN: begin
`ifdef DBG_PRINT_OPAE
$display("%t: STATE START", $time);
dpi_trace("%d: STATE START\n", $time);
`endif
vx_reset <= 1;
state <= STATE_START;
@ -340,7 +326,7 @@ always @(posedge clk) begin
if (cmd_read_done) begin
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
$display("%t: STATE IDLE", $time);
dpi_trace("%d: STATE IDLE\n", $time);
`endif
end
end
@ -349,7 +335,7 @@ always @(posedge clk) begin
if (cmd_write_done) begin
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
$display("%t: STATE IDLE", $time);
dpi_trace("%d: STATE IDLE\n", $time);
`endif
end
end
@ -361,7 +347,7 @@ always @(posedge clk) begin
vx_started <= 0;
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
$display("%t: STATE IDLE", $time);
dpi_trace("%d: STATE IDLE\n", $time);
`endif
end
end else begin
@ -527,17 +513,19 @@ t_local_mem_data mem_rsp_data;
wire [AVS_REQ_TAGW:0] mem_rsp_tag;
wire mem_rsp_ready;
`RESET_RELAY (mem_arb_reset);
VX_mem_arb #(
.NUM_REQS (2),
.DATA_WIDTH (LMEM_DATA_WIDTH),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.TAG_IN_WIDTH (AVS_REQ_TAGW),
.BUFFERED_REQ (0),
.BUFFERED_RSP (0),
.TYPE ("X")
.TYPE ("P"),
.BUFFERED_REQ (2),
.BUFFERED_RSP (2)
) mem_arb (
.clk (clk),
.reset (reset),
.reset (mem_arb_reset),
// Source request
.req_valid_in ({vx_mem_req_arb_valid, cci_mem_req_arb_valid}),
@ -572,8 +560,9 @@ VX_mem_arb #(
//--
`RESET_RELAY (avs_wrapper_reset);
VX_avs_wrapper #(
.NUM_BANKS (NUM_LOCAL_MEM_BANKS),
.AVS_DATA_WIDTH (LMEM_DATA_WIDTH),
.AVS_ADDR_WIDTH (LMEM_ADDR_WIDTH),
.AVS_BURST_WIDTH (LMEM_BURST_CTRW),
@ -582,7 +571,7 @@ VX_avs_wrapper #(
.RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE)
) avs_wrapper (
.clk (clk),
.reset (reset),
.reset (avs_wrapper_reset),
// Memory request
.mem_req_valid (mem_req_valid),
@ -657,8 +646,8 @@ VX_pending_size #(
) cci_rd_pending_size (
.clk (clk),
.reset (reset),
.push (cci_rd_req_fire),
.pop (cci_rdq_pop),
.incr (cci_rd_req_fire),
.decr (cci_rdq_pop),
.full (cci_pending_reads_full),
.size (cci_pending_reads),
`UNUSED_PIN (empty)
@ -712,7 +701,7 @@ always @(posedge clk) begin
cci_rd_req_addr <= cci_rd_req_addr + 1;
cci_rd_req_ctr <= cci_rd_req_ctr + 1;
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads);
dpi_trace("%d: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads);
`endif
end
@ -722,13 +711,13 @@ always @(posedge clk) begin
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE);
end
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
dpi_trace("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
`endif
end
if (cci_rdq_pop) begin
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Queue Pop: pending=%0d", $time, cci_pending_reads);
dpi_trace("%d: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads);
`endif
end
@ -740,13 +729,15 @@ always @(posedge clk) begin
end
end
`RESET_RELAY (cci_rdq_reset);
VX_fifo_queue #(
.DATAW (CCI_RD_QUEUE_DATAW),
.SIZE (CCI_RD_QUEUE_SIZE),
.OUTPUT_REG (1)
.DATAW (CCI_RD_QUEUE_DATAW),
.SIZE (CCI_RD_QUEUE_SIZE),
.OUT_REG (1)
) cci_rd_req_queue (
.clk (clk),
.reset (reset),
.reset (cci_rdq_reset),
.push (cci_rdq_push),
.pop (cci_rdq_pop),
.data_in (cci_rdq_din),
@ -814,8 +805,8 @@ VX_pending_size #(
) cci_wr_pending_size (
.clk (clk),
.reset (reset),
.push (cci_mem_rd_rsp_fire),
.pop (cci_wr_rsp_fire),
.incr (cci_mem_rd_rsp_fire),
.decr (cci_wr_rsp_fire),
.empty (cci_pending_writes_empty),
.full (cci_pending_writes_full),
.size (cci_pending_writes)
@ -861,19 +852,19 @@ begin
cci_wr_req_data <= t_ccip_clData'(cci_mem_rsp_data);
if (cci_wr_req_fire) begin
assert(cci_wr_req_ctr != 0);
`ASSERT(cci_wr_req_ctr != 0, ("runtime error"));
cci_wr_req_ctr <= cci_wr_req_ctr - CCI_ADDR_WIDTH'(1);
if (cci_wr_req_ctr == CCI_ADDR_WIDTH'(1)) begin
cci_wr_req_done <= 1;
end
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
dpi_trace("%d: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
`endif
end
if (cci_wr_rsp_fire) begin
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Wr Rsp: pending=%0d", $time, cci_pending_writes);
dpi_trace("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes);
`endif
end
end
@ -890,11 +881,11 @@ assign cci_mem_req_tag = cci_mem_req_rw ? cci_mem_wr_req_ctr : cci_mem_rd_req_
assign cmd_run_done = !vx_busy;
Vortex #() vortex (
Vortex vortex (
`SCOPE_BIND_afu_vortex
.clk (clk),
.reset (reset | vx_reset),
.reset (reset || vx_reset),
// Memory request
.mem_req_valid (vx_mem_req_valid),
@ -1013,6 +1004,8 @@ VX_fifo_queue #(
wire scope_changed = `SCOPE_TRIGGER;
`RESET_RELAY (scope_reset);
VX_scope #(
.DATAW ($bits({`SCOPE_DATA_LIST,`SCOPE_UPDATE_LIST})),
.BUSW (64),
@ -1020,7 +1013,7 @@ VX_scope #(
.UPDW ($bits({`SCOPE_UPDATE_LIST}))
) scope (
.clk (clk),
.reset (reset),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.changed (scope_changed),

548
hw/rtl/cache/VX_bank.sv vendored Normal file
View file

@ -0,0 +1,548 @@
`include "VX_cache_define.vh"
module VX_bank #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of bankS
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Core Request Queue Size
parameter CREQ_SIZE = 1,
// Core Response Queue Size
parameter CRSQ_SIZE = 1,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 1,
// Memory Request Queue Size
parameter MREQ_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// core request tag size
parameter CORE_TAG_WIDTH = 1,
// size of tag id in core request tag
parameter CORE_TAG_ID_BITS = 0,
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0,
parameter MSHR_ADDR_WIDTH = $clog2(MSHR_SIZE),
parameter WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS)
) (
`SCOPE_IO_VX_bank
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output wire perf_read_misses,
output wire perf_write_misses,
output wire perf_mshr_stalls,
output wire perf_pipe_stalls,
`endif
// Core Request
input wire core_req_valid,
input wire [NUM_PORTS-1:0] core_req_pmask,
input wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] core_req_wsel,
input wire [NUM_PORTS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_req_data,
input wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_req_tid,
input wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag,
input wire core_req_rw,
input wire [`LINE_ADDR_WIDTH-1:0] core_req_addr,
output wire core_req_ready,
// Core Response
output wire core_rsp_valid,
output wire [NUM_PORTS-1:0] core_rsp_pmask,
output wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_rsp_tid,
output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
output wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
input wire core_rsp_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [NUM_PORTS-1:0] mem_req_pmask,
output wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen,
output wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel,
output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id,
output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data,
output wire mem_rsp_ready,
// flush
input wire flush_enable,
input wire [`LINE_SELECT_BITS-1:0] flush_addr
);
`UNUSED_PARAM (CORE_TAG_ID_BITS)
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_UNUSED_BEGIN
wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1;
wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1;
`IGNORE_UNUSED_END
`endif
wire [NUM_PORTS-1:0] creq_pmask;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] creq_byteen;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] creq_tid;
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] creq_tag;
wire creq_rw;
wire [`LINE_ADDR_WIDTH-1:0] creq_addr;
wire creq_valid, creq_ready;
VX_elastic_buffer #(
.DATAW (1 + `LINE_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH)),
.SIZE (CREQ_SIZE)
) core_req_queue (
.clk (clk),
.reset (reset),
.ready_in (core_req_ready),
.valid_in (core_req_valid),
.data_in ({core_req_rw, core_req_addr, core_req_pmask, core_req_wsel, core_req_byteen, core_req_data, core_req_tid, core_req_tag}),
.data_out ({creq_rw, creq_addr, creq_pmask, creq_wsel, creq_byteen, creq_data, creq_tid, creq_tag}),
.ready_out (creq_ready),
.valid_out (creq_valid)
);
wire mreq_alm_full;
wire [`LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
wire crsq_valid, crsq_ready;
wire crsq_stall;
wire mshr_valid;
wire mshr_ready;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id;
wire mshr_alm_full;
wire [MSHR_ADDR_WIDTH-1:0] mshr_dequeue_id;
wire [`LINE_ADDR_WIDTH-1:0] mshr_addr;
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] mshr_tag;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mshr_wsel;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] mshr_tid;
wire [NUM_PORTS-1:0] mshr_pmask;
wire [`LINE_ADDR_WIDTH-1:0] addr_st0, addr_st1;
wire is_read_st0, is_read_st1;
wire is_write_st0, is_write_st1;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] wsel_st0, wsel_st1;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] byteen_st0, byteen_st1;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] req_tid_st0, req_tid_st1;
wire [NUM_PORTS-1:0] pmask_st0, pmask_st1;
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] tag_st0, tag_st1;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] rdata_st1;
wire [`CACHE_LINE_WIDTH-1:0] wdata_st0, wdata_st1;
wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1;
wire valid_st0, valid_st1;
wire is_fill_st0, is_fill_st1;
wire is_mshr_st0, is_mshr_st1;
wire miss_st0, miss_st1;
wire is_flush_st0;
wire mshr_pending_st0, mshr_pending_st1;
// prevent read-during-write hazard when accessing tags/data block RAMs
wire rdw_fill_hazard = valid_st0 && is_fill_st0;
wire rdw_write_hazard = valid_st0 && is_write_st0 && ~creq_rw;
// determine which queue to pop next in priority order
wire mshr_grant = !flush_enable;
wire mshr_enable = mshr_grant && mshr_valid;
wire mrsq_grant = !flush_enable && !mshr_enable;
wire mrsq_enable = mrsq_grant && mem_rsp_valid;
wire creq_grant = !flush_enable && !mshr_enable && !mrsq_enable;
wire creq_enable = creq_grant && creq_valid;
assign mshr_ready = mshr_grant
&& !rdw_fill_hazard // prevent read-during-write hazard
&& !crsq_stall; // ensure core_rsp_queue not full
assign mem_rsp_ready = mrsq_grant
&& !crsq_stall; // ensure core_rsp_queue not full
assign creq_ready = creq_grant
&& !rdw_write_hazard // prevent read-during-write hazard
&& !mreq_alm_full // ensure mem_req_queue not full
&& !mshr_alm_full // ensure mshr not full
&& !crsq_stall; // ensure core_rsp_queue not full
wire flush_fire = flush_enable;
wire mshr_fire = mshr_valid && mshr_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire creq_fire = creq_valid && creq_ready;
`ifdef DBG_CACHE_REQ_INFO
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[0][`CACHE_REQ_INFO_RNG] : creq_tag[0][`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_wid_sel, debug_pc_sel} = 0;
end
`endif
wire [`CACHE_LINE_WIDTH-1:0] wdata_sel;
assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data;
for (genvar i = NUM_PORTS * `WORD_WIDTH; i < `CACHE_LINE_WIDTH; ++i) begin
assign wdata_sel[i] = mem_rsp_data[i];
end
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (!crsq_stall),
.data_in ({
flush_fire || mshr_fire || mem_rsp_fire || creq_fire,
flush_enable,
mshr_enable,
mrsq_enable,
creq_enable && ~creq_rw,
creq_enable && creq_rw,
flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : (mshr_valid ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : creq_addr)),
wdata_sel,
mshr_valid ? mshr_wsel : creq_wsel,
creq_byteen,
mshr_valid ? mshr_tid : creq_tid,
mshr_valid ? mshr_pmask : creq_pmask,
mshr_valid ? mshr_tag : creq_tag,
mshr_valid ? mshr_dequeue_id : mem_rsp_id
}),
.data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0})
);
`ifdef DBG_CACHE_REQ_INFO
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_st0, debug_pc_st0} = tag_st0[0][`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_wid_st0, debug_pc_st0} = 0;
end
`endif
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_flush_st0);
wire tag_match_st0;
VX_tag_access #(
.BANK_ID (BANK_ID),
.CACHE_ID (CACHE_ID),
.CACHE_SIZE (CACHE_SIZE),
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE (WORD_SIZE),
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET)
) tag_access (
.clk (clk),
.reset (reset),
`ifdef DBG_CACHE_REQ_INFO
.debug_pc (debug_pc_st0),
.debug_wid (debug_wid_st0),
`endif
.stall (crsq_stall),
// read/Fill
.lookup (do_lookup_st0),
.addr (addr_st0),
.fill (do_fill_st0),
.flush (do_flush_st0),
.tag_match (tag_match_st0)
);
// we have a core request hit
assign miss_st0 = (is_read_st0 || is_write_st0) && ~tag_match_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_id_a_st0 = (is_read_st0 || is_write_st0) ? mshr_alloc_id : mshr_id_st0;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (!crsq_stall),
.data_in ({valid_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, miss_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_a_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_mshr_st1, is_fill_st1, is_read_st1, is_write_st1, miss_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1})
);
`ifdef DBG_CACHE_REQ_INFO
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_st1, debug_pc_st1} = tag_st1[0][`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_wid_st1, debug_pc_st1} = 0;
end
`endif
wire do_read_st0 = valid_st0 && is_read_st0;
wire do_read_st1 = valid_st1 && is_read_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_write_st1 = valid_st1 && is_write_st1;
wire do_mshr_st1 = valid_st1 && is_mshr_st1;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data_st1 = wdata_st1[0 +: NUM_PORTS * `WORD_WIDTH];
`UNUSED_VAR (wdata_st1)
VX_data_access #(
.BANK_ID (BANK_ID),
.CACHE_ID (CACHE_ID),
.CACHE_SIZE (CACHE_SIZE),
.CACHE_LINE_SIZE(CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_PORTS (NUM_PORTS),
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE)
) data_access (
.clk (clk),
.reset (reset),
`ifdef DBG_CACHE_REQ_INFO
.debug_pc (debug_pc_st1),
.debug_wid (debug_wid_st1),
`endif
.stall (crsq_stall),
.read (do_read_st1 || do_mshr_st1),
.fill (do_fill_st1),
.write (do_write_st1 && !miss_st1),
.addr (addr_st1),
.wsel (wsel_st1),
.pmask (pmask_st1),
.byteen (byteen_st1),
.fill_data (wdata_st1),
.write_data (creq_data_st1),
.read_data (rdata_st1)
);
wire mshr_allocate = do_read_st0 && !crsq_stall;
wire mshr_replay = do_fill_st0 && !crsq_stall;
wire mshr_lookup = mshr_allocate;
wire mshr_release = do_read_st1 && !miss_st1 && !crsq_stall;
VX_pending_size #(
.SIZE (MSHR_SIZE)
) mshr_pending_size (
.clk (clk),
.reset (reset),
.incr (creq_fire && ~creq_rw),
.decr (mshr_fire || mshr_release),
.full (mshr_alm_full),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
);
VX_miss_resrv #(
.BANK_ID (BANK_ID),
.CACHE_ID (CACHE_ID),
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_PORTS (NUM_PORTS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MSHR_SIZE (MSHR_SIZE),
.CORE_TAG_WIDTH (CORE_TAG_WIDTH)
) miss_resrv (
.clk (clk),
.reset (reset),
`ifdef DBG_CACHE_REQ_INFO
.deq_debug_pc (debug_pc_sel),
.deq_debug_wid (debug_wid_sel),
.lkp_debug_pc (debug_pc_st0),
.lkp_debug_wid (debug_wid_st0),
.rel_debug_pc (debug_pc_st1),
.rel_debug_wid (debug_wid_st1),
`endif
// allocate
.allocate_valid (mshr_allocate),
.allocate_addr (addr_st0),
.allocate_data ({wsel_st0, tag_st0, req_tid_st0, pmask_st0}),
.allocate_id (mshr_alloc_id),
`UNUSED_PIN (allocate_ready),
// lookup
.lookup_valid (mshr_lookup),
.lookup_replay (mshr_replay),
.lookup_id (mshr_alloc_id),
.lookup_addr (addr_st0),
.lookup_match (mshr_pending_st0),
// fill
.fill_valid (mem_rsp_fire),
.fill_id (mem_rsp_id),
.fill_addr (mem_rsp_addr),
// dequeue
.dequeue_valid (mshr_valid),
.dequeue_id (mshr_dequeue_id),
.dequeue_addr (mshr_addr),
.dequeue_data ({mshr_wsel, mshr_tag, mshr_tid, mshr_pmask}),
.dequeue_ready (mshr_ready),
// release
.release_valid (mshr_release),
.release_id (mshr_id_st1)
);
// Enqueue core response
wire [NUM_PORTS-1:0] crsq_pmask;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] crsq_data;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] crsq_tid;
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] crsq_tag;
assign crsq_valid = (do_read_st1 && !miss_st1)
|| do_mshr_st1;
assign crsq_stall = crsq_valid && !crsq_ready;
assign crsq_pmask = pmask_st1;
assign crsq_tid = req_tid_st1;
assign crsq_data = rdata_st1;
assign crsq_tag = tag_st1;
VX_elastic_buffer #(
.DATAW (NUM_PORTS * (CORE_TAG_WIDTH + 1 + `WORD_WIDTH + `REQS_BITS)),
.SIZE (CRSQ_SIZE),
.OUT_REG (1)
) core_rsp_req (
.clk (clk),
.reset (reset),
.valid_in (crsq_valid),
.data_in ({crsq_tag, crsq_pmask, crsq_data, crsq_tid}),
.ready_in (crsq_ready),
.valid_out (core_rsp_valid),
.data_out ({core_rsp_tag, core_rsp_pmask, core_rsp_data, core_rsp_tid}),
.ready_out (core_rsp_ready)
);
// Enqueue memory request
wire mreq_push, mreq_pop, mreq_empty;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mreq_data;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mreq_byteen;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mreq_wsel;
wire [NUM_PORTS-1:0] mreq_pmask;
wire [`LINE_ADDR_WIDTH-1:0] mreq_addr;
wire [MSHR_ADDR_WIDTH-1:0] mreq_id;
wire mreq_rw;
assign mreq_push = (do_read_st1 && miss_st1 && !mshr_pending_st1)
|| do_write_st1;
assign mreq_pop = mem_req_valid && mem_req_ready;
assign mreq_rw = WRITE_ENABLE && is_write_st1;
assign mreq_addr = addr_st1;
assign mreq_id = mshr_id_st1;
assign mreq_pmask= pmask_st1;
assign mreq_wsel = wsel_st1;
assign mreq_byteen = byteen_st1;
assign mreq_data = creq_data_st1;
VX_fifo_queue #(
.DATAW (1 + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)),
.SIZE (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-2),
.OUT_REG (1 == NUM_BANKS)
) mem_req_queue (
.clk (clk),
.reset (reset),
.push (mreq_push),
.pop (mreq_pop),
.data_in ({mreq_rw, mreq_addr, mreq_id, mreq_pmask, mreq_byteen, mreq_wsel, mreq_data}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_pmask, mem_req_byteen, mem_req_wsel, mem_req_data}),
.empty (mreq_empty),
.alm_full (mreq_alm_full),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
assign mem_req_valid = !mreq_empty;
///////////////////////////////////////////////////////////////////////////////
`SCOPE_ASSIGN (valid_st0, valid_st0);
`SCOPE_ASSIGN (valid_st1, valid_st1);
`SCOPE_ASSIGN (is_fill_st0, is_fill_st0);
`SCOPE_ASSIGN (is_mshr_st0, is_mshr_st0);
`SCOPE_ASSIGN (miss_st0, miss_st0);
`SCOPE_ASSIGN (crsq_stall, crsq_stall);
`SCOPE_ASSIGN (mreq_alm_full, mreq_alm_full);
`SCOPE_ASSIGN (mshr_alm_full, mshr_alm_full);
`SCOPE_ASSIGN (addr_st0, `LINE_TO_BYTE_ADDR(addr_st0, BANK_ID));
`SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
`ifdef PERF_ENABLE
assign perf_read_misses = do_read_st1 && miss_st1;
assign perf_write_misses = do_write_st1 && miss_st1;
assign perf_pipe_stalls = crsq_stall || mreq_alm_full || mshr_alm_full;
assign perf_mshr_stalls = mshr_alm_full;
`endif
`ifdef DBG_PRINT_CACHE_BANK
wire crsq_fire = crsq_valid && crsq_ready;
wire pipeline_stall = (mshr_valid || mem_rsp_valid || creq_valid)
&& ~(mshr_fire || mem_rsp_fire || creq_fire);
always @(posedge clk) begin
if (pipeline_stall) begin
dpi_trace("%d: *** cache%0d:%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, CACHE_ID, BANK_ID, crsq_stall, mreq_alm_full, mshr_alm_full);
end
if (flush_enable) begin
dpi_trace("%d: cache%0d:%0d flush: addr=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(flush_addr, BANK_ID));
end
if (mem_rsp_fire) begin
dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data);
end
if (mshr_fire) begin
dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, debug_wid_sel, debug_pc_sel);
end
if (creq_fire) begin
if (creq_rw)
dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel);
else
dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel);
end
if (crsq_fire) begin
dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1);
end
if (mreq_push) begin
if (is_write_st1)
dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1);
else
dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, debug_wid_st1, debug_pc_st1);
end
end
`endif
endmodule

585
hw/rtl/cache/VX_bank.v vendored
View file

@ -1,585 +0,0 @@
`include "VX_cache_define.vh"
module VX_bank #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of bankS
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Core Request Queue Size
parameter CREQ_SIZE = 1,
// Core Response Queue Size
parameter CRSQ_SIZE = 1,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 1,
// Memory Request Queue Size
parameter MREQ_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// core request tag size
parameter CORE_TAG_WIDTH = 1,
// size of tag id in core request tag
parameter CORE_TAG_ID_BITS = 0,
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0
) (
`SCOPE_IO_VX_bank
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output wire perf_read_misses,
output wire perf_write_misses,
output wire perf_mshr_stalls,
output wire perf_pipe_stalls,
`endif
// Core Request
input wire core_req_valid,
input wire [NUM_PORTS-1:0] core_req_pmask,
input wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] core_req_wsel,
input wire [NUM_PORTS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_req_data,
input wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_req_tid,
input wire core_req_rw,
input wire [`LINE_ADDR_WIDTH-1:0] core_req_addr,
input wire [CORE_TAG_WIDTH-1:0] core_req_tag,
output wire core_req_ready,
// Core Response
output wire core_rsp_valid,
output wire [NUM_PORTS-1:0] core_rsp_pmask,
output wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_rsp_tid,
output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
output wire [CORE_TAG_WIDTH-1:0] core_rsp_tag,
input wire core_rsp_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [CACHE_LINE_SIZE-1:0] mem_req_byteen,
output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CACHE_LINE_WIDTH-1:0] mem_req_data,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`LINE_ADDR_WIDTH-1:0] mem_rsp_addr,
input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data,
output wire mem_rsp_ready,
// flush
input wire flush_enable,
input wire [`LINE_SELECT_BITS-1:0] flush_addr
);
`UNUSED_PARAM (CORE_TAG_ID_BITS)
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_WARNINGS_BEGIN
wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1;
wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1;
`IGNORE_WARNINGS_END
`endif
wire [NUM_PORTS-1:0] creq_pmask;
wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] creq_wsel;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] creq_byteen;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] creq_tid;
wire creq_rw;
wire [`LINE_ADDR_WIDTH-1:0] creq_addr;
wire [CORE_TAG_WIDTH-1:0] creq_tag;
wire creq_out_valid, creq_out_ready;
VX_elastic_buffer #(
.DATAW (CORE_TAG_WIDTH + 1 + `LINE_ADDR_WIDTH + (1 + `UP(`WORD_SELECT_BITS) + WORD_SIZE + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS),
.SIZE (CREQ_SIZE),
.OUTPUT_REG (CREQ_SIZE > 2)
) core_req_queue (
.clk (clk),
.reset (reset),
.ready_in (core_req_ready),
.valid_in (core_req_valid),
.data_in ({core_req_tag, core_req_rw, core_req_addr, core_req_pmask, core_req_wsel, core_req_byteen, core_req_data, core_req_tid}),
.data_out ({creq_tag, creq_rw, creq_addr, creq_pmask, creq_wsel, creq_byteen, creq_data, creq_tid}),
.ready_out (creq_out_ready),
.valid_out (creq_out_valid)
);
wire mshr_alm_full;
wire mshr_pop;
wire mshr_valid;
wire [`LINE_ADDR_WIDTH-1:0] mshr_addr;
wire [CORE_TAG_WIDTH-1:0] mshr_tag;
wire [NUM_PORTS-1:0] mshr_pmask;
wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] mshr_wsel;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] mshr_tid;
wire [`LINE_ADDR_WIDTH-1:0] addr_st0, addr_st1;
wire mem_rw_st0, mem_rw_st1;
wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] wsel_st0, wsel_st1;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] byteen_st0, byteen_st1;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] req_tid_st0, req_tid_st1;
wire [NUM_PORTS-1:0] pmask_st0, pmask_st1;
wire [`CACHE_LINE_WIDTH-1:0] rdata_st1;
wire [`CACHE_LINE_WIDTH-1:0] wdata_st0, wdata_st1;
wire [CORE_TAG_WIDTH-1:0] tag_st0, tag_st1;
wire valid_st0, valid_st1;
wire is_fill_st0, is_fill_st1;
wire is_mshr_st0, is_mshr_st1;
wire miss_st0, miss_st1;
wire prev_miss_dep_st0;
wire force_miss_st0, force_miss_st1;
wire not_same_prev_mshr_st0, not_same_prev_mshr_st1;
wire writeen_unqual_st0, writeen_unqual_st1;
wire incoming_fill_unqual_st0, incoming_fill_unqual_st1;
wire mshr_pending_st0;
wire is_flush_st0;
wire crsq_in_valid, crsq_in_ready, crsq_in_stall;
wire mreq_alm_full;
wire creq_out_fire = creq_out_valid && creq_out_ready;
wire crsq_in_fire = crsq_in_valid && crsq_in_ready;
VX_pending_size #(
.SIZE (MSHR_SIZE)
) mshr_pending_size (
.clk (clk),
.reset (reset),
.push (creq_out_fire && !creq_rw),
.pop (crsq_in_fire),
.full (mshr_alm_full),
`UNUSED_PIN (empty),
`UNUSED_PIN (size)
);
// determine which queue to pop next in priority order
wire mshr_grant = !mreq_alm_full; // ensure memory request queue not full (deadlock prevention)
wire mshr_enable = mshr_grant && mshr_valid;
wire mrsq_grant = !mshr_enable;
wire mrsq_enable = mrsq_grant && mem_rsp_valid;
wire creq_grant = !mshr_enable && !mrsq_enable && !flush_enable;
wire is_miss_st1 = (miss_st1 || force_miss_st1);
assign mshr_pop = mshr_enable
&& !(valid_st1 && is_mshr_st1 && is_miss_st1) // do not schedule another mshr request if the previous one missed
&& !crsq_in_stall; // ensure core response ready
assign creq_out_ready = creq_grant
&& !mreq_alm_full // ensure memory request ready
&& !mshr_alm_full // ensure mshr enqueue ready
&& !crsq_in_stall; // ensure core response ready
assign mem_rsp_ready = mrsq_grant
&& !crsq_in_stall; // ensure core response ready
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`ifdef DBG_CACHE_REQ_INFO
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[`CACHE_REQ_INFO_RNG] : creq_tag[`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_wid_sel, debug_pc_sel} = 0;
end
`endif
wire [`CACHE_LINE_WIDTH-1:0] creq_line_data;
if (`WORDS_PER_LINE > 1) begin
if (NUM_PORTS > 1) begin
reg [`CACHE_LINE_WIDTH-1:0] creq_line_data_r;
always @(*) begin
creq_line_data_r = 'x;
for (integer p = 0; p < NUM_PORTS; p++) begin
if (creq_pmask[p]) begin
creq_line_data_r[creq_wsel[p] * `WORD_WIDTH +: `WORD_WIDTH] = creq_data[p];
end
end
end
assign creq_line_data = creq_line_data_r;
end else begin
assign creq_line_data = {`WORDS_PER_LINE{creq_data}};
end
end else begin
assign creq_line_data = creq_data;
end
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH + 1),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (!crsq_in_stall),
.data_in ({
flush_enable || mshr_pop || mem_rsp_fire || creq_out_fire,
flush_enable,
mshr_enable,
mrsq_enable || flush_enable,
mshr_enable ? 1'b0 : creq_rw,
mshr_enable ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)),
(mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data : creq_line_data,
mshr_enable ? mshr_wsel : creq_wsel,
creq_byteen,
mshr_enable ? mshr_tid : creq_tid,
mshr_enable ? mshr_pmask : creq_pmask,
mshr_enable ? mshr_tag : creq_tag
}),
.data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, mem_rw_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0})
);
`ifdef DBG_CACHE_REQ_INFO
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_st0, debug_pc_st0} = tag_st0[`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_wid_st0, debug_pc_st0} = 0;
end
`endif
wire do_lookup_st0 = valid_st0 && ~is_fill_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0 && !crsq_in_stall;
wire tag_match_st0;
VX_tag_access #(
.BANK_ID (BANK_ID),
.CACHE_ID (CACHE_ID),
.CACHE_SIZE (CACHE_SIZE),
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE (WORD_SIZE),
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET)
) tag_access (
.clk (clk),
.reset (reset),
`ifdef DBG_CACHE_REQ_INFO
.debug_pc (debug_pc_st0),
.debug_wid (debug_wid_st0),
`endif
// read/Fill
.lookup (do_lookup_st0),
.addr (addr_st0),
.fill (do_fill_st0),
.is_flush (is_flush_st0),
.tag_match (tag_match_st0)
);
// we had a miss with prior request for the current address
assign prev_miss_dep_st0 = valid_st1 && is_miss_st1 && (addr_st0 == addr_st1);
// we have a core request hit
assign miss_st0 = !is_fill_st0 && !tag_match_st0;
// force a miss to ensure commit order when a new request has pending previous requests to same block
// also force a miss for mshr requests when previous request was a missed
assign force_miss_st0 = (!is_fill_st0 && !is_mshr_st0 && (mshr_pending_st0 || prev_miss_dep_st0))
|| (is_mshr_st0 && valid_st1 && is_mshr_st1 && is_miss_st1);
// previous mshr request doesn't have same address
assign not_same_prev_mshr_st0 = valid_st1 && is_mshr_st1 && (addr_st1 != addr_st0);
// enable write when we have a fill request that is not redundant
assign writeen_unqual_st0 = is_fill_st0 && !tag_match_st0;
// check if incoming memory response match current address
assign incoming_fill_unqual_st0 = mem_rsp_valid && (addr_st0 == mem_rsp_addr);
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (!crsq_in_stall),
.data_in ({valid_st0, is_mshr_st0, is_fill_st0, writeen_unqual_st0, incoming_fill_unqual_st0, miss_st0, force_miss_st0, mem_rw_st0, not_same_prev_mshr_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0}),
.data_out ({valid_st1, is_mshr_st1, is_fill_st1, writeen_unqual_st1, incoming_fill_unqual_st1, miss_st1, force_miss_st1, mem_rw_st1, not_same_prev_mshr_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1})
);
`ifdef DBG_CACHE_REQ_INFO
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_st1, debug_pc_st1} = tag_st1[`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_wid_st1, debug_pc_st1} = 0;
end
`endif
wire writeen_st1 = (WRITE_ENABLE && !is_fill_st1 && mem_rw_st1 && ~is_miss_st1)
|| writeen_unqual_st1;
wire readen_st1 = !is_fill_st1 && !mem_rw_st1;
wire crsq_push_st1 = readen_st1 && ~is_miss_st1;
wire mshr_push_st1 = readen_st1 && is_miss_st1;
wire incoming_fill_st1 = (mem_rsp_valid && (addr_st1 == mem_rsp_addr))
|| incoming_fill_unqual_st1;
wire do_writeback_st1 = !is_fill_st1 && mem_rw_st1;
wire mreq_push_st1 = (readen_st1 && miss_st1 && (~force_miss_st1 || not_same_prev_mshr_st1) && !incoming_fill_st1)
|| do_writeback_st1;
wire [`WORDS_PER_LINE-1:0][WORD_SIZE-1:0] line_byteen_st1;
if (`WORDS_PER_LINE > 1) begin
reg [CACHE_LINE_SIZE-1:0] line_byteen_r;
always @(*) begin
line_byteen_r = 0;
for (integer p = 0; p < NUM_PORTS; p++) begin
if ((NUM_PORTS == 1) || pmask_st1[p]) begin
line_byteen_r[wsel_st1[p] * WORD_SIZE +: WORD_SIZE] = byteen_st1[p];
end
end
end
assign line_byteen_st1 = line_byteen_r;
end else begin
assign line_byteen_st1 = byteen_st1;
end
VX_data_access #(
.BANK_ID (BANK_ID),
.CACHE_ID (CACHE_ID),
.CACHE_SIZE (CACHE_SIZE),
.CACHE_LINE_SIZE(CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE)
) data_access (
.clk (clk),
.reset (reset),
`ifdef DBG_CACHE_REQ_INFO
.debug_pc (debug_pc_st1),
.debug_wid (debug_wid_st1),
`endif
.addr (addr_st1),
// reading
.readen (valid_st1 && readen_st1),
.rdata (rdata_st1),
// writing
.writeen (valid_st1 && writeen_st1),
.is_fill (is_fill_st1),
.byteen (line_byteen_st1),
.wdata (wdata_st1)
);
wire mshr_push = valid_st1 && mshr_push_st1;
wire mshr_dequeue = valid_st1 && is_mshr_st1 && !mshr_push_st1 && crsq_in_ready;
wire mshr_restore = is_mshr_st1;
// push a missed request as 'ready' if it was a forced miss that actually had a hit
// or the fill request for this block is comming
wire mshr_init_ready_state = !miss_st1 || incoming_fill_unqual_st1;
VX_miss_resrv #(
.BANK_ID (BANK_ID),
.CACHE_ID (CACHE_ID),
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_PORTS (NUM_PORTS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MSHR_SIZE (MSHR_SIZE),
.ALM_FULL (MSHR_SIZE-2),
.CORE_TAG_WIDTH (CORE_TAG_WIDTH)
) miss_resrv (
.clk (clk),
.reset (reset),
`ifdef DBG_CACHE_REQ_INFO
.deq_debug_pc (debug_pc_sel),
.deq_debug_wid (debug_wid_sel),
.enq_debug_pc (debug_pc_st1),
.enq_debug_wid (debug_wid_st1),
`endif
// enqueue
.enqueue (mshr_push),
.enqueue_addr (addr_st1),
.enqueue_data ({wsel_st1, tag_st1, req_tid_st1, pmask_st1}),
.enqueue_is_mshr (mshr_restore),
.enqueue_as_ready (mshr_init_ready_state),
`UNUSED_PIN (enqueue_almfull),
`UNUSED_PIN (enqueue_full),
// fill
.fill_start (mem_rsp_fire),
.fill_addr (mem_rsp_addr),
// lookup
.lookup_addr (addr_st0),
.lookup_match (mshr_pending_st0),
.lookup_fill (do_fill_st0),
// schedule
.schedule (mshr_pop),
.schedule_valid (mshr_valid),
.schedule_addr (mshr_addr),
.schedule_data ({mshr_wsel, mshr_tag, mshr_tid, mshr_pmask}),
// dequeue
.dequeue (mshr_dequeue)
);
// Enqueue core response
wire [NUM_PORTS-1:0] crsq_pmask;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] crsq_data;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] crsq_tid;
wire [CORE_TAG_WIDTH-1:0] crsq_tag;
assign crsq_in_valid = valid_st1 && crsq_push_st1;
assign crsq_in_stall = crsq_in_valid && !crsq_in_ready;
assign crsq_pmask = pmask_st1;
assign crsq_tid = req_tid_st1;
assign crsq_tag = tag_st1;
if (`WORDS_PER_LINE > 1) begin
for (genvar p = 0; p < NUM_PORTS; ++p) begin
assign crsq_data[p] = rdata_st1[wsel_st1[p] * `WORD_WIDTH +: `WORD_WIDTH];
end
end else begin
assign crsq_data = rdata_st1;
end
VX_elastic_buffer #(
.DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS),
.SIZE (CRSQ_SIZE),
.OUTPUT_REG (1 == NUM_BANKS)
) core_rsp_req (
.clk (clk),
.reset (reset),
.valid_in (crsq_in_valid),
.data_in ({crsq_tag, crsq_pmask, crsq_data, crsq_tid}),
.ready_in (crsq_in_ready),
.valid_out (core_rsp_valid),
.data_out ({core_rsp_tag, core_rsp_pmask, core_rsp_data, core_rsp_tid}),
.ready_out (core_rsp_ready)
);
// Enqueue memory request
wire [CACHE_LINE_SIZE-1:0] mreq_byteen;
wire [`LINE_ADDR_WIDTH-1:0] mreq_addr;
wire [`CACHE_LINE_WIDTH-1:0] mreq_data;
wire mreq_push, mreq_pop, mreq_empty, mreq_rw;
assign mreq_push = valid_st1 && mreq_push_st1;
assign mreq_pop = mem_req_valid && mem_req_ready;
assign mreq_rw = WRITE_ENABLE && do_writeback_st1;
assign mreq_byteen = mreq_rw ? line_byteen_st1 : {CACHE_LINE_SIZE{1'b1}};
assign mreq_addr = addr_st1;
assign mreq_data = wdata_st1;
VX_fifo_queue #(
.DATAW (1 + CACHE_LINE_SIZE + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH),
.SIZE (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-2)
) mem_req_queue (
.clk (clk),
.reset (reset),
.push (mreq_push),
.pop (mreq_pop),
.data_in ({mreq_rw, mreq_byteen, mreq_addr, mreq_data}),
.data_out ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data}),
.empty (mreq_empty),
.alm_full (mreq_alm_full),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
assign mem_req_valid = !mreq_empty;
`SCOPE_ASSIGN (valid_st0, valid_st0);
`SCOPE_ASSIGN (valid_st1, valid_st1);
`SCOPE_ASSIGN (is_fill_st0, is_fill_st0);
`SCOPE_ASSIGN (is_mshr_st0, is_mshr_st0);
`SCOPE_ASSIGN (miss_st0, miss_st0);
`SCOPE_ASSIGN (force_miss_st0, force_miss_st0);
`SCOPE_ASSIGN (mshr_push, mshr_push);
`SCOPE_ASSIGN (crsq_in_stall, crsq_in_stall);
`SCOPE_ASSIGN (mreq_alm_full, mreq_alm_full);
`SCOPE_ASSIGN (mshr_alm_full, mshr_alm_full);
`SCOPE_ASSIGN (addr_st0, `LINE_TO_BYTE_ADDR(addr_st0, BANK_ID));
`SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
`ifdef PERF_ENABLE
assign perf_read_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && !mem_rw_st1;
assign perf_write_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && mem_rw_st1;
assign perf_pipe_stalls = crsq_in_stall || mreq_alm_full || mshr_alm_full;
assign perf_mshr_stalls = mshr_alm_full;
`endif
`ifdef DBG_PRINT_CACHE_BANK
always @(posedge clk) begin
/*if (crsq_in_fire && (NUM_PORTS > 1) && $countones(crsq_pmask) > 1) begin
$display("%t: *** cache%0d:%0d multi-port-out: pmask=%b, addr=%0h, tag=%0h", $time, CACHE_ID, BANK_ID, crsq_pmask, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag);
end*/
if (valid_st1 && !is_fill_st1 && miss_st1 && incoming_fill_st1) begin
$display("%t: *** cache%0d:%0d miss with incoming fill - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
assert(!is_mshr_st1);
end
if (crsq_in_stall || mreq_alm_full || mshr_alm_full) begin
$display("%t: *** cache%0d:%0d pipeline-stall: cwbq=%b, dwbq=%b, mshr=%b", $time, CACHE_ID, BANK_ID, crsq_in_stall, mreq_alm_full, mshr_alm_full);
end
if (flush_enable) begin
$display("%t: cache%0d:%0d flush: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(flush_addr, BANK_ID));
end
if (mem_rsp_fire) begin
$display("%t: cache%0d:%0d fill-rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_data);
end
if (mshr_pop) begin
$display("%t: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, debug_wid_sel, debug_pc_sel);
end
if (creq_out_fire) begin
if (creq_rw)
$display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel);
else
$display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel);
end
if (crsq_in_fire) begin
$display("%t: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1);
end
if (mreq_push) begin
if (do_writeback_st1)
$display("%t: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1);
else
$display("%t: cache%0d:%0d fill-req: addr=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), debug_wid_st1, debug_pc_st1);
end
end
`endif
endmodule

View file

@ -18,15 +18,15 @@ module VX_cache #(
parameter WORD_SIZE = 4,
// Core Request Queue Size
parameter CREQ_SIZE = 2,
parameter CREQ_SIZE = 0,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 4,
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 2,
parameter MREQ_SIZE = 4,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
@ -44,13 +44,15 @@ module VX_cache #(
parameter BANK_ADDR_OFFSET = 0,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 0
parameter NC_ENABLE = 0,
parameter WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS)
) (
`SCOPE_IO_VX_cache
// PERF
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
VX_perf_cache_if.master perf_cache_if,
`endif
input wire clk,
@ -91,6 +93,8 @@ module VX_cache #(
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value"))
`STATIC_ASSERT(NUM_PORTS <= NUM_BANKS, ("invalid value"))
localparam MSHR_ADDR_WIDTH = $clog2(MSHR_SIZE);
localparam MEM_TAG_IN_WIDTH = `BANK_SELECT_BITS + MSHR_ADDR_WIDTH;
localparam CORE_TAG_X_WIDTH = CORE_TAG_WIDTH - NC_ENABLE;
localparam CORE_TAG_ID_X_BITS = (CORE_TAG_ID_BITS != 0) ? (CORE_TAG_ID_BITS - NC_ENABLE) : CORE_TAG_ID_BITS;
@ -103,6 +107,117 @@ module VX_cache #(
///////////////////////////////////////////////////////////////////////////
wire mem_req_valid_sb;
wire mem_req_rw_sb;
wire [CACHE_LINE_SIZE-1:0] mem_req_byteen_sb;
wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_sb;
wire [`CACHE_LINE_WIDTH-1:0] mem_req_data_sb;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_sb;
wire mem_req_ready_sb;
VX_skid_buffer #(
.DATAW (1+CACHE_LINE_SIZE+`MEM_ADDR_WIDTH+`CACHE_LINE_WIDTH+MEM_TAG_WIDTH),
.PASSTHRU (1 == NUM_BANKS)
) mem_req_sbuf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid_sb),
.ready_in (mem_req_ready_sb),
.data_in ({mem_req_rw_sb, mem_req_byteen_sb, mem_req_addr_sb, mem_req_data_sb, mem_req_tag_sb}),
.data_out ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data, mem_req_tag}),
.valid_out (mem_req_valid),
.ready_out (mem_req_ready)
);
///////////////////////////////////////////////////////////////////////////
wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_sb;
wire [NUM_REQS-1:0] core_rsp_tmask_sb;
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_sb;
wire [`CORE_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_sb;
wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_sb;
if (CORE_TAG_ID_BITS != 0) begin
VX_skid_buffer #(
.DATAW (NUM_REQS + NUM_REQS*`WORD_WIDTH + CORE_TAG_WIDTH),
.PASSTHRU (1 == NUM_BANKS)
) core_rsp_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_sb),
.ready_in (core_rsp_ready_sb),
.data_in ({core_rsp_tmask_sb, core_rsp_data_sb, core_rsp_tag_sb}),
.data_out ({core_rsp_tmask, core_rsp_data, core_rsp_tag}),
.valid_out (core_rsp_valid),
.ready_out (core_rsp_ready)
);
end else begin
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (1 + `WORD_WIDTH + CORE_TAG_WIDTH),
.PASSTHRU (1 == NUM_BANKS)
) core_rsp_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_sb[i]),
.ready_in (core_rsp_ready_sb[i]),
.data_in ({core_rsp_tmask_sb[i], core_rsp_data_sb[i], core_rsp_tag_sb[i]}),
.data_out ({core_rsp_tmask[i], core_rsp_data[i], core_rsp_tag[i]}),
.valid_out (core_rsp_valid[i]),
.ready_out (core_rsp_ready[i])
);
end
end
///////////////////////////////////////////////////////////////////////////
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_p;
wire [NUM_PORTS-1:0] mem_req_pmask_p;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_p;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_p;
wire mem_req_rw_p;
if (WRITE_ENABLE) begin
if (`WORDS_PER_LINE > 1) begin
reg [CACHE_LINE_SIZE-1:0] mem_req_byteen_r;
reg [`CACHE_LINE_WIDTH-1:0] mem_req_data_r;
always @(*) begin
mem_req_byteen_r = 0;
mem_req_data_r = 'x;
for (integer i = 0; i < NUM_PORTS; ++i) begin
if ((1 == NUM_PORTS) || mem_req_pmask_p[i]) begin
mem_req_byteen_r[mem_req_wsel_p[i] * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p[i];
mem_req_data_r[mem_req_wsel_p[i] * `WORD_WIDTH +: `WORD_WIDTH] = mem_req_data_p[i];
end
end
end
assign mem_req_rw_sb = mem_req_rw_p;
assign mem_req_byteen_sb = mem_req_byteen_r;
assign mem_req_data_sb = mem_req_data_r;
end else begin
`UNUSED_VAR (mem_req_pmask_p)
`UNUSED_VAR (mem_req_wsel_p)
assign mem_req_rw_sb = mem_req_rw_p;
assign mem_req_byteen_sb = mem_req_byteen_p;
assign mem_req_data_sb = mem_req_data_p;
end
end else begin
`UNUSED_VAR (mem_req_byteen_p)
`UNUSED_VAR (mem_req_pmask_p)
`UNUSED_VAR (mem_req_wsel_p)
`UNUSED_VAR (mem_req_data_p)
`UNUSED_VAR (mem_req_rw_p)
assign mem_req_rw_sb = 0;
assign mem_req_byteen_sb = 'x;
assign mem_req_data_sb = 'x;
end
///////////////////////////////////////////////////////////////////////////
// Core request
wire [NUM_REQS-1:0] core_req_valid_nc;
wire [NUM_REQS-1:0] core_req_rw_nc;
@ -122,20 +237,23 @@ module VX_cache #(
// Memory request
wire mem_req_valid_nc;
wire mem_req_rw_nc;
wire [CACHE_LINE_SIZE-1:0] mem_req_byteen_nc;
wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc;
wire [`CACHE_LINE_WIDTH-1:0] mem_req_data_nc;
wire [`MEM_ADDR_WIDTH-1:0] mem_req_tag_nc;
wire [NUM_PORTS-1:0] mem_req_pmask_nc;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_nc;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_nc;
wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_nc;
wire mem_req_ready_nc;
// Memory response
wire mem_rsp_valid_nc;
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_nc;
wire [`MEM_ADDR_WIDTH-1:0] mem_rsp_tag_nc;
wire mem_rsp_ready_nc;
wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_nc;
wire mem_rsp_ready_nc;
if (NC_ENABLE) begin
VX_nc_bypass #(
.NUM_PORTS (NUM_PORTS),
.NUM_REQS (NUM_REQS),
.NUM_RSP_TAGS (`CORE_RSP_TAGS),
.NC_TAG_BIT (0),
@ -145,12 +263,12 @@ module VX_cache #(
.CORE_TAG_IN_WIDTH (CORE_TAG_WIDTH),
.MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH),
.MEM_DATA_SIZE (CACHE_LINE_SIZE),
.MEM_TAG_IN_WIDTH (`MEM_ADDR_WIDTH),
.MEM_DATA_SIZE (CACHE_LINE_SIZE),
.MEM_TAG_IN_WIDTH (MEM_TAG_IN_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH)
) nc_bypass (
.clk (clk),
.reset (reset),
.clk (clk),
.reset (reset),
// Core request in
.core_req_valid_in (core_req_valid),
@ -178,29 +296,33 @@ module VX_cache #(
.core_rsp_ready_in (core_rsp_ready_nc),
// Core response out
.core_rsp_valid_out (core_rsp_valid),
.core_rsp_tmask_out (core_rsp_tmask),
.core_rsp_data_out (core_rsp_data),
.core_rsp_tag_out (core_rsp_tag),
.core_rsp_ready_out (core_rsp_ready),
.core_rsp_valid_out (core_rsp_valid_sb),
.core_rsp_tmask_out (core_rsp_tmask_sb),
.core_rsp_data_out (core_rsp_data_sb),
.core_rsp_tag_out (core_rsp_tag_sb),
.core_rsp_ready_out (core_rsp_ready_sb),
// Memory request in
.mem_req_valid_in (mem_req_valid_nc),
.mem_req_rw_in (mem_req_rw_nc),
.mem_req_byteen_in (mem_req_byteen_nc),
.mem_req_rw_in (mem_req_rw_nc),
.mem_req_addr_in (mem_req_addr_nc),
.mem_req_pmask_in (mem_req_pmask_nc),
.mem_req_byteen_in (mem_req_byteen_nc),
.mem_req_wsel_in (mem_req_wsel_nc),
.mem_req_data_in (mem_req_data_nc),
.mem_req_tag_in (mem_req_tag_nc),
.mem_req_ready_in (mem_req_ready_nc),
// Memory request out
.mem_req_valid_out (mem_req_valid),
.mem_req_rw_out (mem_req_rw),
.mem_req_byteen_out (mem_req_byteen),
.mem_req_addr_out (mem_req_addr),
.mem_req_data_out (mem_req_data),
.mem_req_tag_out (mem_req_tag),
.mem_req_ready_out (mem_req_ready),
.mem_req_valid_out (mem_req_valid_sb),
.mem_req_addr_out (mem_req_addr_sb),
.mem_req_rw_out (mem_req_rw_p),
.mem_req_pmask_out (mem_req_pmask_p),
.mem_req_byteen_out (mem_req_byteen_p),
.mem_req_wsel_out (mem_req_wsel_p),
.mem_req_data_out (mem_req_data_p),
.mem_req_tag_out (mem_req_tag_sb),
.mem_req_ready_out (mem_req_ready_sb),
// Memory response in
.mem_rsp_valid_in (mem_rsp_valid),
@ -223,19 +345,21 @@ module VX_cache #(
assign core_req_tag_nc = core_req_tag;
assign core_req_ready = core_req_ready_nc;
assign core_rsp_valid = core_rsp_valid_nc;
assign core_rsp_tmask = core_rsp_tmask_nc;
assign core_rsp_data = core_rsp_data_nc;
assign core_rsp_tag = core_rsp_tag_nc;
assign core_rsp_ready_nc = core_rsp_ready;
assign core_rsp_valid_sb = core_rsp_valid_nc;
assign core_rsp_tmask_sb = core_rsp_tmask_nc;
assign core_rsp_data_sb = core_rsp_data_nc;
assign core_rsp_tag_sb = core_rsp_tag_nc;
assign core_rsp_ready_nc = core_rsp_ready_sb;
assign mem_req_valid = mem_req_valid_nc;
assign mem_req_rw = mem_req_rw_nc;
assign mem_req_addr = mem_req_addr_nc;
assign mem_req_byteen = mem_req_byteen_nc;
assign mem_req_data = mem_req_data_nc;
assign mem_req_tag = mem_req_tag_nc;
assign mem_req_ready_nc = mem_req_ready;
assign mem_req_valid_sb = mem_req_valid_nc;
assign mem_req_addr_sb = mem_req_addr_nc;
assign mem_req_rw_p = mem_req_rw_nc;
assign mem_req_pmask_p = mem_req_pmask_nc;
assign mem_req_byteen_p = mem_req_byteen_nc;
assign mem_req_wsel_p = mem_req_wsel_nc;
assign mem_req_data_p = mem_req_data_nc;
assign mem_req_tag_sb = mem_req_tag_nc;
assign mem_req_ready_nc = mem_req_ready_sb;
assign mem_rsp_valid_nc = mem_rsp_valid;
assign mem_rsp_data_nc = mem_rsp_data;
@ -246,17 +370,19 @@ module VX_cache #(
///////////////////////////////////////////////////////////////////////////
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_qual;
wire [`MEM_ADDR_WIDTH-1:0] mem_rsp_tag_qual;
wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_qual;
wire mrsq_out_valid, mrsq_out_ready;
`RESET_RELAY (mrsq_reset);
VX_elastic_buffer #(
.DATAW (`MEM_ADDR_WIDTH + `CACHE_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.OUTPUT_REG (MRSQ_SIZE > 2)
.DATAW (MEM_TAG_IN_WIDTH + `CACHE_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.reset (mrsq_reset),
.ready_in (mem_rsp_ready_nc),
.valid_in (mem_rsp_valid_nc),
.data_in ({mem_rsp_tag_nc, mem_rsp_data_nc}),
@ -272,13 +398,15 @@ module VX_cache #(
wire [`LINE_SELECT_BITS-1:0] flush_addr;
wire flush_enable;
`RESET_RELAY (flush_reset);
VX_flush_ctrl #(
.CACHE_SIZE (CACHE_SIZE),
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS)
) flush_ctrl (
.clk (clk),
.reset (reset),
.reset (flush_reset),
.addr_out (flush_addr),
.valid_out (flush_enable)
);
@ -287,36 +415,38 @@ module VX_cache #(
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_req_pmask;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] per_bank_core_req_wsel;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] per_bank_core_req_wsel;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0] per_bank_core_req_rw;
wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
wire [NUM_BANKS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_pmask;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_rsp_tid;
wire [NUM_BANKS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_rsp_tag;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] per_bank_core_rsp_tag;
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][CACHE_LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_mem_req_pmask;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] per_bank_mem_req_wsel;
wire [NUM_BANKS-1:0][`MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0][`CACHE_LINE_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
if (NUM_BANKS == 1) begin
`UNUSED_VAR (mem_rsp_tag_qual)
assign mrsq_out_ready = per_bank_mem_rsp_ready;
end else begin
assign mrsq_out_ready = per_bank_mem_rsp_ready[`MEM_ADDR_BANK(mem_rsp_tag_qual)];
assign mrsq_out_ready = per_bank_mem_rsp_ready[`MEM_TAG_TO_BANK_ID(mem_rsp_tag_qual)];
end
VX_core_req_bank_sel #(
@ -358,31 +488,34 @@ module VX_cache #(
for (genvar i = 0; i < NUM_BANKS; i++) begin
wire curr_bank_core_req_valid;
wire [NUM_PORTS-1:0] curr_bank_core_req_pmask;
wire [NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] curr_bank_core_req_wsel;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] curr_bank_core_req_wsel;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] curr_bank_core_req_byteen;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] curr_bank_core_req_data;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] curr_bank_core_req_tid;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] curr_bank_core_req_tid;
wire [NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] curr_bank_core_req_tag;
wire curr_bank_core_req_rw;
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_core_req_addr;
wire [CORE_TAG_X_WIDTH-1:0] curr_bank_core_req_tag;
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_core_req_addr;
wire curr_bank_core_req_ready;
wire curr_bank_core_rsp_valid;
wire [NUM_PORTS-1:0] curr_bank_core_rsp_pmask;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] curr_bank_core_rsp_data;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] curr_bank_core_rsp_tid;
wire [CORE_TAG_X_WIDTH-1:0] curr_bank_core_rsp_tag;
wire [NUM_PORTS-1:0][CORE_TAG_X_WIDTH-1:0] curr_bank_core_rsp_tag;
wire curr_bank_core_rsp_ready;
wire curr_bank_mem_req_valid;
wire curr_bank_mem_req_rw;
wire [CACHE_LINE_SIZE-1:0] curr_bank_mem_req_byteen;
wire [NUM_PORTS-1:0] curr_bank_mem_req_pmask;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] curr_bank_mem_req_byteen;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] curr_bank_mem_req_wsel;
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire[`CACHE_LINE_WIDTH-1:0] curr_bank_mem_req_data;
wire [MSHR_ADDR_WIDTH-1:0] curr_bank_mem_req_id;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] curr_bank_mem_req_data;
wire curr_bank_mem_req_ready;
wire curr_bank_mem_rsp_valid;
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_rsp_addr;
wire curr_bank_mem_rsp_valid;
wire [MSHR_ADDR_WIDTH-1:0] curr_bank_mem_rsp_id;
wire [`CACHE_LINE_WIDTH-1:0] curr_bank_mem_rsp_data;
wire curr_bank_mem_rsp_ready;
@ -407,27 +540,31 @@ module VX_cache #(
assign per_bank_core_rsp_data [i] = curr_bank_core_rsp_data;
// Memory request
assign per_bank_mem_req_valid[i] = curr_bank_mem_req_valid;
assign per_bank_mem_req_rw[i] = curr_bank_mem_req_rw;
assign per_bank_mem_req_valid[i] = curr_bank_mem_req_valid;
assign per_bank_mem_req_rw[i] = curr_bank_mem_req_rw;
assign per_bank_mem_req_pmask[i] = curr_bank_mem_req_pmask;
assign per_bank_mem_req_byteen[i] = curr_bank_mem_req_byteen;
assign per_bank_mem_req_wsel[i] = curr_bank_mem_req_wsel;
if (NUM_BANKS == 1) begin
assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr;
end else begin
assign per_bank_mem_req_addr[i] = `LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i);
end
assign per_bank_mem_req_id[i] = curr_bank_mem_req_id;
assign per_bank_mem_req_data[i] = curr_bank_mem_req_data;
assign curr_bank_mem_req_ready = per_bank_mem_req_ready[i];
assign curr_bank_mem_req_ready = per_bank_mem_req_ready[i];
// Memory response
if (NUM_BANKS == 1) begin
assign curr_bank_mem_rsp_valid = mrsq_out_valid;
assign curr_bank_mem_rsp_addr = mem_rsp_tag_qual;
assign curr_bank_mem_rsp_valid = mrsq_out_valid;
end else begin
assign curr_bank_mem_rsp_valid = mrsq_out_valid && (`MEM_ADDR_BANK(mem_rsp_tag_qual) == i);
assign curr_bank_mem_rsp_addr = `MEM_TO_LINE_ADDR(mem_rsp_tag_qual);
assign curr_bank_mem_rsp_valid = mrsq_out_valid && (`MEM_TAG_TO_BANK_ID(mem_rsp_tag_qual) == i);
end
assign curr_bank_mem_rsp_id = `MEM_TAG_TO_REQ_ID(mem_rsp_tag_qual);
assign curr_bank_mem_rsp_data = mem_rsp_data_qual;
assign per_bank_mem_rsp_ready[i] = curr_bank_mem_rsp_ready;
`RESET_RELAY (bank_reset);
VX_bank #(
.BANK_ID (i),
@ -450,7 +587,7 @@ module VX_cache #(
`SCOPE_BIND_VX_cache_bank(i)
.clk (clk),
.reset (reset),
.reset (bank_reset),
`ifdef PERF_ENABLE
.perf_read_misses (perf_read_miss_per_bank[i]),
@ -482,14 +619,17 @@ module VX_cache #(
// Memory request
.mem_req_valid (curr_bank_mem_req_valid),
.mem_req_rw (curr_bank_mem_req_rw),
.mem_req_pmask (curr_bank_mem_req_pmask),
.mem_req_byteen (curr_bank_mem_req_byteen),
.mem_req_wsel (curr_bank_mem_req_wsel),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_id (curr_bank_mem_req_id),
.mem_req_data (curr_bank_mem_req_data),
.mem_req_ready (curr_bank_mem_req_ready),
// Memory response
.mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_addr (curr_bank_mem_rsp_addr),
.mem_rsp_id (curr_bank_mem_rsp_id),
.mem_rsp_data (curr_bank_mem_rsp_data),
.mem_rsp_ready (curr_bank_mem_rsp_ready),
@ -523,53 +663,66 @@ module VX_cache #(
.core_rsp_ready (core_rsp_ready_nc)
);
wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; i++) begin
assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_rw[i], per_bank_mem_req_byteen[i], per_bank_mem_req_data[i]};
wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_id[i], per_bank_mem_req_rw[i], per_bank_mem_req_pmask[i], per_bank_mem_req_byteen[i], per_bank_mem_req_wsel[i], per_bank_mem_req_data[i]};
end
wire [MSHR_ADDR_WIDTH-1:0] mem_req_id;
`RESET_RELAY (mreq_reset);
VX_stream_arbiter #(
.NUM_REQS (NUM_BANKS),
.DATAW (`MEM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH),
.BUFFERED (1)
.DATAW (`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)),
.TYPE ("R")
) mem_req_arb (
.clk (clk),
.reset (reset),
.reset (mreq_reset),
.valid_in (per_bank_mem_req_valid),
.data_in (data_in),
.ready_in (per_bank_mem_req_ready),
.valid_out (mem_req_valid_nc),
.data_out ({mem_req_addr_nc, mem_req_rw_nc, mem_req_byteen_nc, mem_req_data_nc}),
.data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_pmask_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}),
.ready_out (mem_req_ready_nc)
);
assign mem_req_tag_nc = mem_req_addr_nc;
if (NUM_BANKS == 1) begin
assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'(mem_req_id);
end else begin
assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_nc), mem_req_id});
end
`ifdef PERF_ENABLE
// per cycle: core_reads, core_writes
reg [($clog2(NUM_REQS+1)-1):0] perf_core_reads_per_cycle;
reg [($clog2(NUM_REQS+1)-1):0] perf_core_writes_per_cycle;
reg [($clog2(NUM_REQS+1)-1):0] perf_crsp_stall_per_cycle;
wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
assign perf_core_reads_per_cycle = $countones(core_req_valid & core_req_ready & ~core_req_rw);
assign perf_core_writes_per_cycle = $countones(core_req_valid & core_req_ready & core_req_rw);
wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw;
wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw;
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask);
if (CORE_TAG_ID_BITS != 0) begin
assign perf_crsp_stall_per_cycle = $countones(core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}});
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}};
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
end else begin
assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & ~core_rsp_ready);
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_valid & ~core_rsp_ready;
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
end
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
reg [($clog2(NUM_BANKS+1)-1):0] perf_read_miss_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_write_miss_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_mshr_stall_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle;
wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [$clog2(NUM_BANKS+1)-1:0] perf_pipe_stall_per_cycle;
assign perf_read_miss_per_cycle = $countones(perf_read_miss_per_bank);
assign perf_write_miss_per_cycle = $countones(perf_write_miss_per_bank);
assign perf_mshr_stall_per_cycle = $countones(perf_mshr_stall_per_bank);
assign perf_pipe_stall_per_cycle = $countones(perf_pipe_stall_per_bank);
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
`POP_COUNT(perf_pipe_stall_per_cycle, perf_pipe_stall_per_bank);
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;

View file

@ -9,8 +9,10 @@
`define REQS_BITS `LOG2UP(NUM_REQS)
// tag valid tid word_sel
`define MSHR_DATA_WIDTH (CORE_TAG_WIDTH + (1 + `REQS_BITS + `UP(`WORD_SELECT_BITS)) * NUM_PORTS)
`define PORTS_BITS `LOG2UP(NUM_PORTS)
// tag valid tid word_sel
`define MSHR_DATA_WIDTH ((CORE_TAG_WIDTH + 1 + `REQS_BITS + `UP(`WORD_SELECT_BITS)) * NUM_PORTS)
`define WORD_WIDTH (8 * WORD_SIZE)
@ -57,14 +59,14 @@
`define CORE_RSP_TAGS ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS)
`define BANK_READY_COUNT ((SHARED_BANK_READY != 0) ? 1 : NUM_BANKS)
`define MEM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET]
`define MEM_TO_LINE_ADDR(x) x[`MEM_ADDR_WIDTH-1 : `BANK_SELECT_BITS]
`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
`define MEM_ADDR_TO_BANK_ID(x) x[0 +: `BANK_SELECT_BITS]
`define MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0]
`define MEM_TAG_TO_BANK_ID(x) x[MSHR_ADDR_WIDTH +: `BANK_SELECT_BITS]
`define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))}
`define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)}

View file

@ -16,9 +16,7 @@ module VX_core_req_bank_sel #(
// core request tag size
parameter CORE_TAG_WIDTH = 3,
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0,
// shared bank ready signal
parameter SHARED_BANK_READY = 0
parameter BANK_ADDR_OFFSET = 0
) (
input wire clk,
input wire reset,
@ -43,8 +41,8 @@ module VX_core_req_bank_sel #(
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen,
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data,
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid,
output wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag,
input wire [`BANK_READY_COUNT-1:0] per_bank_core_req_ready
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag,
input wire [NUM_BANKS-1:0] per_bank_core_req_ready
);
`UNUSED_PARAM (CACHE_ID)
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value"))
@ -80,9 +78,9 @@ module VX_core_req_bank_sel #(
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen_r;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data_r;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid_r;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_r;
reg [NUM_BANKS-1:0] per_bank_core_req_rw_r;
reg [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr_r;
reg [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_r;
reg [NUM_REQS-1:0] core_req_ready_r;
if (NUM_REQS > 1) begin
@ -101,7 +99,7 @@ module VX_core_req_bank_sel #(
end
end
for (genvar i = NUM_REQS-1; i >= 0; --i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_line_match[i] = (core_req_line_addr[i] == per_bank_line_addr_r[core_req_bid[i]]);
end
@ -129,30 +127,19 @@ module VX_core_req_bank_sel #(
per_bank_core_req_byteen_r[core_req_bid[i]][i % NUM_PORTS] = core_req_byteen[i];
per_bank_core_req_data_r[core_req_bid[i]][i % NUM_PORTS] = core_req_data[i];
per_bank_core_req_tid_r[core_req_bid[i]][i % NUM_PORTS] = `REQS_BITS'(i);
per_bank_core_req_tag_r[core_req_bid[i]][i % NUM_PORTS] = core_req_tag[i];
per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i];
per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i];
per_bank_core_req_tag_r[core_req_bid[i]] = core_req_tag[i];
req_select_table_r[core_req_bid[i]][i % NUM_PORTS] = (1 << i);
end
end
end
if (SHARED_BANK_READY == 0) begin
always @(*) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
core_req_ready_r[i] = per_bank_core_req_ready[core_req_bid[i]]
&& core_req_line_match[i]
&& req_select_table_r[core_req_bid[i]][i % NUM_PORTS][i];
end
end
end else begin
always @(*) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
core_req_ready_r[i] = per_bank_core_req_ready
&& core_req_line_match[i]
&& req_select_table_r[core_req_bid[i]][i % NUM_PORTS][i];
end
always @(*) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
core_req_ready_r[i] = per_bank_core_req_ready[core_req_bid[i]]
&& core_req_line_match[i]
&& req_select_table_r[core_req_bid[i]][i % NUM_PORTS][i];
end
end
@ -177,32 +164,17 @@ module VX_core_req_bank_sel #(
per_bank_core_req_byteen_r[core_req_bid[i]][i % NUM_PORTS] = core_req_byteen[i];
per_bank_core_req_data_r[core_req_bid[i]][i % NUM_PORTS] = core_req_data[i];
per_bank_core_req_tid_r[core_req_bid[i]][i % NUM_PORTS] = `REQS_BITS'(i);
per_bank_core_req_tag_r[core_req_bid[i]][i % NUM_PORTS] = core_req_tag[i];
per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i];
per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i];
per_bank_core_req_tag_r[core_req_bid[i]] = core_req_tag[i];
per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i];
end
end
end
if (SHARED_BANK_READY == 0) begin
always @(*) begin
core_req_ready_r = 'x;
for (integer i = NUM_REQS-1; i >= 0; --i) begin
if (core_req_valid[i]) begin
core_req_ready_r[i] = per_bank_core_req_ready[core_req_bid[i]]
&& core_req_line_match[i];
end
end
end
end else begin
always @(*) begin
core_req_ready_r = 'x;
for (integer i = NUM_REQS-1; i >= 0; --i) begin
if (core_req_valid[i]) begin
core_req_ready_r[i] = per_bank_core_req_ready
&& core_req_line_match[i];
end
end
always @(*) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
core_req_ready_r[i] = per_bank_core_req_ready[core_req_bid[i]]
&& core_req_line_match[i];
end
end
end
@ -236,22 +208,11 @@ module VX_core_req_bank_sel #(
end
if (NUM_BANKS > 1) begin
if (SHARED_BANK_READY == 0) begin
always @(*) begin
core_req_ready_r = 0;
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid_r[i]) begin
core_req_ready_r[per_bank_core_req_tid_r[i]] = per_bank_core_req_ready[i];
end
end
end
end else begin
always @(*) begin
core_req_ready_r = 0;
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid_r[i]) begin
core_req_ready_r[per_bank_core_req_tid_r[i]] = per_bank_core_req_ready;
end
always @(*) begin
core_req_ready_r = 0;
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid_r[i]) begin
core_req_ready_r[per_bank_core_req_tid_r[i]] = per_bank_core_req_ready[i];
end
end
end
@ -320,33 +281,26 @@ module VX_core_req_bank_sel #(
`ifdef PERF_ENABLE
reg [NUM_REQS-1:0] core_req_sel_r;
if (SHARED_BANK_READY == 0) begin
always @(*) begin
core_req_sel_r = 0;
for (integer i = 0; i < NUM_REQS; ++i) begin
if (core_req_valid[i]) begin
core_req_sel_r[i] = per_bank_core_req_ready[core_req_bid[i]];
end
end
end
end else begin
always @(*) begin
core_req_sel_r = 0;
for (integer i = 0; i < NUM_REQS; ++i) begin
if (core_req_valid[i]) begin
core_req_sel_r[i] = per_bank_core_req_ready;
end
always @(*) begin
core_req_sel_r = 0;
for (integer i = 0; i < NUM_REQS; ++i) begin
if (core_req_valid[i]) begin
core_req_sel_r[i] = per_bank_core_req_ready[core_req_bid[i]];
end
end
end
reg [`PERF_CTR_BITS-1:0] bank_stalls_r;
wire [$clog2(NUM_REQS+1)-1:0] bank_stall_cnt;
wire [NUM_REQS-1:0] bank_stall_mask = core_req_sel_r & ~core_req_ready;
`POP_COUNT(bank_stall_cnt, bank_stall_mask);
always @(posedge clk) begin
if (reset) begin
bank_stalls_r <= 0;
end else begin
bank_stalls_r <= bank_stalls_r + `PERF_CTR_BITS'($countones(core_req_sel_r & ~core_req_ready));
bank_stalls_r <= bank_stalls_r + `PERF_CTR_BITS'(bank_stall_cnt);
end
end

350
hw/rtl/cache/VX_core_rsp_merge.sv vendored Normal file
View file

@ -0,0 +1,350 @@
`include "VX_cache_define.vh"
module VX_core_rsp_merge #(
parameter CACHE_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// core request tag size
parameter CORE_TAG_WIDTH = 1,
// size of tag id in core request tag
parameter CORE_TAG_ID_BITS = 0,
// output register
parameter OUT_REG = 0
) (
input wire clk,
input wire reset,
// Per Bank WB
input wire [NUM_BANKS-1:0] per_bank_core_rsp_valid,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_pmask,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_rsp_tid,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag,
output wire [NUM_BANKS-1:0] per_bank_core_rsp_ready,
// Core Response
output wire [`CORE_RSP_TAGS-1:0] core_rsp_valid,
output wire [NUM_REQS-1:0] core_rsp_tmask,
output wire [`CORE_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
input wire [`CORE_RSP_TAGS-1:0] core_rsp_ready
);
`UNUSED_PARAM (CACHE_ID)
if (NUM_BANKS > 1) begin
reg [NUM_REQS-1:0] core_rsp_valid_unqual;
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual;
reg [NUM_BANKS-1:0] per_bank_core_rsp_ready_r;
if (CORE_TAG_ID_BITS != 0) begin
// The core response bus handles a single tag at the time
// We first need to select the current tag to process,
// then send all bank responses for that tag as a batch
wire [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
wire core_rsp_ready_unqual;
if (NUM_PORTS > 1) begin
reg [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_r, per_bank_core_rsp_sent;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_n;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign per_bank_core_rsp_sent_n[i] = per_bank_core_rsp_sent_r[i] | per_bank_core_rsp_sent[i];
end
always @(posedge clk) begin
if (reset) begin
per_bank_core_rsp_sent_r <= '0;
end else begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]) begin
per_bank_core_rsp_sent_r[i] <= '0;
end else begin
per_bank_core_rsp_sent_r[i] <= per_bank_core_rsp_sent_n[i];
end
end
end
end
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_valid_p;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar p = 0; p < NUM_PORTS; ++p) begin
assign per_bank_core_rsp_valid_p[i][p] = per_bank_core_rsp_valid[i]
&& per_bank_core_rsp_pmask[i][p]
&& !per_bank_core_rsp_sent_r[i][p];
end
end
VX_find_first #(
.N (NUM_BANKS * NUM_PORTS),
.DATAW (CORE_TAG_WIDTH)
) find_first (
.valid_i (per_bank_core_rsp_valid_p),
.data_i (per_bank_core_rsp_tag),
.data_o (core_rsp_tag_unqual),
`UNUSED_PIN (valid_o)
);
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_data_unqual = 'x;
per_bank_core_rsp_sent = 0;
for (integer i = 0; i < NUM_BANKS; ++i) begin
for (integer p = 0; p < NUM_PORTS; ++p) begin
if (per_bank_core_rsp_valid[i]
&& per_bank_core_rsp_pmask[i][p]
&& !per_bank_core_rsp_sent_r[i][p]
&& (per_bank_core_rsp_tag[i][p][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1;
core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p];
per_bank_core_rsp_sent[i][p] = core_rsp_ready_unqual;
end
end
end
end
always @(*) begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
per_bank_core_rsp_ready_r[i] = (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]);
end
end
end else begin
`UNUSED_VAR (per_bank_core_rsp_pmask)
VX_find_first #(
.N (NUM_BANKS),
.DATAW (CORE_TAG_WIDTH)
) find_first (
.valid_i (per_bank_core_rsp_valid),
.data_i (per_bank_core_rsp_tag),
.data_o (core_rsp_tag_unqual),
`UNUSED_PIN (valid_o)
);
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_data_unqual = 'x;
per_bank_core_rsp_ready_r = 0;
for (integer i = 0; i < NUM_BANKS; i++) begin
if (per_bank_core_rsp_valid[i]
&& (per_bank_core_rsp_tag[i][0][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1;
core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i];
per_bank_core_rsp_ready_r[i] = core_rsp_ready_unqual;
end
end
end
end
wire core_rsp_valid_any = (| per_bank_core_rsp_valid);
VX_skid_buffer #(
.DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)),
.PASSTHRU (0 == OUT_REG)
) out_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_any),
.data_in ({core_rsp_valid_unqual, core_rsp_tag_unqual, core_rsp_data_unqual}),
.ready_in (core_rsp_ready_unqual),
.valid_out (core_rsp_valid),
.data_out ({core_rsp_tmask, core_rsp_tag, core_rsp_data}),
.ready_out (core_rsp_ready)
);
end else begin
reg [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
wire [NUM_REQS-1:0] core_rsp_ready_unqual;
if (NUM_PORTS > 1) begin
reg [NUM_REQS-1:0][(`PORTS_BITS + `BANK_SELECT_BITS)-1:0] bank_select_table;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_r, per_bank_core_rsp_sent;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_n;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign per_bank_core_rsp_sent_n[i] = per_bank_core_rsp_sent_r[i] | per_bank_core_rsp_sent[i];
end
always @(posedge clk) begin
if (reset) begin
per_bank_core_rsp_sent_r <= '0;
end else begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]) begin
per_bank_core_rsp_sent_r[i] <= '0;
end else begin
per_bank_core_rsp_sent_r[i] <= per_bank_core_rsp_sent_n[i];
end
end
end
end
always @(*) begin
core_rsp_valid_unqual = '0;
core_rsp_tag_unqual = 'x;
core_rsp_data_unqual = 'x;
bank_select_table = 'x;
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
for (integer p = 0; p < NUM_PORTS; ++p) begin
if (per_bank_core_rsp_valid[i]
&& per_bank_core_rsp_pmask[i][p]
&& !per_bank_core_rsp_sent_r[i][p]) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1;
core_rsp_tag_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_tag[i][p];
core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p];
bank_select_table[per_bank_core_rsp_tid[i][p]] = {`PORTS_BITS'(p), `BANK_SELECT_BITS'(i)};
end
end
end
end
always @(*) begin
per_bank_core_rsp_sent = '0;
for (integer i = 0; i < NUM_REQS; i++) begin
if (core_rsp_valid_unqual[i]) begin
per_bank_core_rsp_sent[bank_select_table[i][0 +: `BANK_SELECT_BITS]][bank_select_table[i][`BANK_SELECT_BITS +: `PORTS_BITS]] = core_rsp_ready_unqual[i];
end
end
end
always @(*) begin
for (integer i = 0; i < NUM_BANKS; i++) begin
per_bank_core_rsp_ready_r[i] = (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]);
end
end
end else begin
`UNUSED_VAR (per_bank_core_rsp_pmask)
reg [NUM_REQS-1:0][NUM_BANKS-1:0] bank_select_table;
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_tag_unqual = 'x;
core_rsp_data_unqual = 'x;
bank_select_table = 'x;
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
if (per_bank_core_rsp_valid[i]) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1;
core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i];
core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i];
bank_select_table[per_bank_core_rsp_tid[i]] = (1 << i);
end
end
end
always @(*) begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
per_bank_core_rsp_ready_r[i] = core_rsp_ready_unqual[per_bank_core_rsp_tid[i]]
&& bank_select_table[per_bank_core_rsp_tid[i]][i];
end
end
end
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (CORE_TAG_WIDTH + `WORD_WIDTH),
.PASSTHRU (0 == OUT_REG)
) out_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_unqual[i]),
.data_in ({core_rsp_tag_unqual[i], core_rsp_data_unqual[i]}),
.ready_in (core_rsp_ready_unqual[i]),
.valid_out (core_rsp_valid[i]),
.data_out ({core_rsp_tag[i],core_rsp_data[i]}),
.ready_out (core_rsp_ready[i])
);
end
assign core_rsp_tmask = core_rsp_valid;
end
assign per_bank_core_rsp_ready = per_bank_core_rsp_ready_r;
end else begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (per_bank_core_rsp_pmask)
if (NUM_REQS > 1) begin
reg [`CORE_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual;
if (CORE_TAG_ID_BITS != 0) begin
reg [NUM_REQS-1:0] core_rsp_tmask_unqual;
always @(*) begin
core_rsp_tmask_unqual = 0;
core_rsp_tmask_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_valid;
core_rsp_tag_unqual = per_bank_core_rsp_tag;
core_rsp_data_unqual = 'x;
core_rsp_data_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_data;
end
assign core_rsp_valid = per_bank_core_rsp_valid;
assign core_rsp_tmask = core_rsp_tmask_unqual;
assign per_bank_core_rsp_ready = core_rsp_ready;
end else begin
reg [`CORE_RSP_TAGS-1:0] core_rsp_valid_unqual;
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_valid_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_valid;
core_rsp_tag_unqual = 'x;
core_rsp_tag_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_tag;
core_rsp_data_unqual = 'x;
core_rsp_data_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_data;
end
assign core_rsp_valid = core_rsp_valid_unqual;
assign core_rsp_tmask = core_rsp_valid_unqual;
assign per_bank_core_rsp_ready = core_rsp_ready[per_bank_core_rsp_tid];
end
assign core_rsp_tag = core_rsp_tag_unqual;
assign core_rsp_data = core_rsp_data_unqual;
end else begin
`UNUSED_VAR(per_bank_core_rsp_tid)
assign core_rsp_valid = per_bank_core_rsp_valid;
assign core_rsp_tmask = per_bank_core_rsp_valid;
assign core_rsp_tag = per_bank_core_rsp_tag;
assign core_rsp_data = per_bank_core_rsp_data;
assign per_bank_core_rsp_ready = core_rsp_ready;
end
end
endmodule

View file

@ -1,239 +0,0 @@
`include "VX_cache_define.vh"
module VX_core_rsp_merge #(
parameter CACHE_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// core request tag size
parameter CORE_TAG_WIDTH = 1,
// size of tag id in core request tag
parameter CORE_TAG_ID_BITS = 0
) (
input wire clk,
input wire reset,
// Per Bank WB
input wire [NUM_BANKS-1:0] per_bank_core_rsp_valid,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_pmask,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_rsp_tid,
input wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag,
output wire [NUM_BANKS-1:0] per_bank_core_rsp_ready,
// Core Response
output wire [`CORE_RSP_TAGS-1:0] core_rsp_valid,
output wire [NUM_REQS-1:0] core_rsp_tmask,
output wire [`CORE_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
input wire [`CORE_RSP_TAGS-1:0] core_rsp_ready
);
`UNUSED_PARAM (CACHE_ID)
if (NUM_BANKS > 1) begin
reg [NUM_REQS-1:0] core_rsp_valid_unqual;
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual;
reg [NUM_BANKS-1:0] core_rsp_bank_select;
if (CORE_TAG_ID_BITS != 0) begin
// The core response bus handles a single tag at the time
// We first need to select the current tag to process,
// then send all bank responses for that tag as a batch
reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
wire core_rsp_ready_unqual;
always @(*) begin
core_rsp_tag_unqual = 'x;
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
if (per_bank_core_rsp_valid[i]) begin
core_rsp_tag_unqual = per_bank_core_rsp_tag[i];
end
end
end
if (NUM_PORTS > 1) begin
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_data_unqual = 'x;
core_rsp_bank_select = 0;
for (integer i = 0; i < NUM_BANKS; i++) begin
for (integer p = 0; p < NUM_PORTS; p++) begin
if (per_bank_core_rsp_valid[i]
&& per_bank_core_rsp_pmask[i][p]
&& (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1;
core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p];
core_rsp_bank_select[i] = core_rsp_ready_unqual;
end
end
end
end
end else begin
`UNUSED_VAR (per_bank_core_rsp_pmask)
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_data_unqual = 'x;
core_rsp_bank_select = 0;
for (integer i = 0; i < NUM_BANKS; i++) begin
if (per_bank_core_rsp_valid[i]
&& (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1;
core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i];
core_rsp_bank_select[i] = core_rsp_ready_unqual;
end
end
end
end
wire core_rsp_valid_any = (| per_bank_core_rsp_valid);
VX_skid_buffer #(
.DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH))
) pipe_reg (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_any),
.data_in ({core_rsp_valid_unqual, core_rsp_tag_unqual, core_rsp_data_unqual}),
.ready_in (core_rsp_ready_unqual),
.valid_out (core_rsp_valid),
.data_out ({core_rsp_tmask, core_rsp_tag, core_rsp_data}),
.ready_out (core_rsp_ready)
);
end else begin
`UNUSED_VAR (per_bank_core_rsp_pmask)
reg [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
reg [NUM_REQS-1:0][NUM_BANKS-1:0] bank_select_table;
wire [NUM_REQS-1:0] core_rsp_ready_unqual;
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_tag_unqual = 'x;
core_rsp_data_unqual = 'x;
bank_select_table = 'x;
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
if (per_bank_core_rsp_valid[i]) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1;
core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i];
core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i];
bank_select_table[per_bank_core_rsp_tid[i]] = (1 << i);
end
end
end
always @(*) begin
for (integer i = 0; i < NUM_BANKS; i++) begin
core_rsp_bank_select[i] = core_rsp_ready_unqual[per_bank_core_rsp_tid[i]]
&& bank_select_table[per_bank_core_rsp_tid[i]][i];
end
end
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (CORE_TAG_WIDTH + `WORD_WIDTH)
) pipe_reg (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_unqual[i]),
.data_in ({core_rsp_tag_unqual[i], core_rsp_data_unqual[i]}),
.ready_in (core_rsp_ready_unqual[i]),
.valid_out (core_rsp_valid[i]),
.data_out ({core_rsp_tag[i],core_rsp_data[i]}),
.ready_out (core_rsp_ready[i])
);
end
assign core_rsp_tmask = core_rsp_valid;
end
for (genvar i = 0; i < NUM_BANKS; i++) begin
assign per_bank_core_rsp_ready[i] = core_rsp_bank_select[i];
end
end else begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (per_bank_core_rsp_pmask)
if (NUM_REQS > 1) begin
reg [`CORE_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual;
if (CORE_TAG_ID_BITS != 0) begin
reg [NUM_REQS-1:0] core_rsp_tmask_unqual;
always @(*) begin
core_rsp_tmask_unqual = 0;
core_rsp_tmask_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_valid;
core_rsp_tag_unqual = per_bank_core_rsp_tag;
core_rsp_data_unqual = 'x;
core_rsp_data_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_data;
end
assign core_rsp_valid = per_bank_core_rsp_valid;
assign core_rsp_tmask = core_rsp_tmask_unqual;
assign per_bank_core_rsp_ready = core_rsp_ready;
end else begin
reg [`CORE_RSP_TAGS-1:0] core_rsp_valid_unqual;
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_valid_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_valid;
core_rsp_tag_unqual = 'x;
core_rsp_tag_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_tag;
core_rsp_data_unqual = 'x;
core_rsp_data_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_data;
end
assign core_rsp_valid = core_rsp_valid_unqual;
assign core_rsp_tmask = core_rsp_valid_unqual;
assign per_bank_core_rsp_ready = core_rsp_ready[per_bank_core_rsp_tid];
end
assign core_rsp_tag = core_rsp_tag_unqual;
assign core_rsp_data = core_rsp_data_unqual;
end else begin
`UNUSED_VAR(per_bank_core_rsp_tid)
assign core_rsp_valid = per_bank_core_rsp_valid;
assign core_rsp_tmask = per_bank_core_rsp_valid;
assign core_rsp_tag = per_bank_core_rsp_tag;
assign core_rsp_data = per_bank_core_rsp_data;
assign per_bank_core_rsp_ready = core_rsp_ready;
end
end
endmodule

136
hw/rtl/cache/VX_data_access.sv vendored Normal file
View file

@ -0,0 +1,136 @@
`include "VX_cache_define.vh"
module VX_data_access #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
parameter WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS)
) (
input wire clk,
input wire reset,
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_UNUSED_BEGIN
input wire[31:0] debug_pc,
input wire[`NW_BITS-1:0] debug_wid,
`IGNORE_UNUSED_END
`endif
input wire stall,
input wire read,
input wire fill,
input wire write,
input wire[`LINE_ADDR_WIDTH-1:0] addr,
input wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] wsel,
input wire [NUM_PORTS-1:0] pmask,
input wire [NUM_PORTS-1:0][WORD_SIZE-1:0] byteen,
input wire [`WORDS_PER_LINE-1:0][`WORD_WIDTH-1:0] fill_data,
input wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] write_data,
output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] read_data
);
`UNUSED_PARAM (CACHE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset)
`UNUSED_VAR (addr)
`UNUSED_VAR (read)
localparam BYTEENW = WRITE_ENABLE ? CACHE_LINE_SIZE : 1;
wire [`WORDS_PER_LINE-1:0][`WORD_WIDTH-1:0] rdata;
wire [`WORDS_PER_LINE-1:0][`WORD_WIDTH-1:0] wdata;
wire [BYTEENW-1:0] wren;
wire [`LINE_SELECT_BITS-1:0] line_addr = addr[`LINE_SELECT_BITS-1:0];
if (WRITE_ENABLE) begin
if (`WORDS_PER_LINE > 1) begin
reg [`WORDS_PER_LINE-1:0][`WORD_WIDTH-1:0] wdata_r;
reg [`WORDS_PER_LINE-1:0][WORD_SIZE-1:0] wren_r;
if (NUM_PORTS > 1) begin
always @(*) begin
wdata_r = 'x;
wren_r = 0;
for (integer i = 0; i < NUM_PORTS; ++i) begin
if (pmask[i]) begin
wdata_r[wsel[i]] = write_data[i];
wren_r[wsel[i]] = byteen[i];
end
end
end
end else begin
`UNUSED_VAR (pmask)
always @(*) begin
wdata_r = {`WORDS_PER_LINE{write_data}};
wren_r = 0;
wren_r[wsel] = byteen;
end
end
assign wdata = write ? wdata_r : fill_data;
assign wren = write ? wren_r : {BYTEENW{fill}};
end else begin
`UNUSED_VAR (wsel)
`UNUSED_VAR (pmask)
assign wdata = write ? write_data : fill_data;
assign wren = write ? byteen : {BYTEENW{fill}};
end
end else begin
`UNUSED_VAR (write)
`UNUSED_VAR (byteen)
`UNUSED_VAR (pmask)
`UNUSED_VAR (write_data)
assign wdata = fill_data;
assign wren = fill;
end
VX_sp_ram #(
.DATAW (`CACHE_LINE_WIDTH),
.SIZE (`LINES_PER_BANK),
.BYTEENW (BYTEENW),
.NO_RWCHECK (1)
) data_store (
.clk (clk),
.addr (line_addr),
.wren (wren),
.wdata (wdata),
.rdata (rdata)
);
if (`WORDS_PER_LINE > 1) begin
for (genvar i = 0; i < NUM_PORTS; ++i) begin
assign read_data[i] = rdata[wsel[i]];
end
end else begin
assign read_data = rdata;
end
`UNUSED_VAR (stall)
`ifdef DBG_PRINT_CACHE_DATA
always @(posedge clk) begin
if (fill && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data);
end
if (read && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, read_data);
end
if (write && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byteen, line_addr, write_data);
end
end
`endif
endmodule

View file

@ -1,93 +0,0 @@
`include "VX_cache_define.vh"
module VX_data_access #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1
) (
input wire clk,
input wire reset,
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_WARNINGS_BEGIN
input wire[31:0] debug_pc,
input wire[`NW_BITS-1:0] debug_wid,
`IGNORE_WARNINGS_END
`endif
`IGNORE_WARNINGS_BEGIN
input wire[`LINE_ADDR_WIDTH-1:0] addr,
`IGNORE_WARNINGS_END
// reading
input wire readen,
output wire [`CACHE_LINE_WIDTH-1:0] rdata,
// writing
input wire writeen,
input wire is_fill,
input wire [CACHE_LINE_SIZE-1:0] byteen,
input wire [`CACHE_LINE_WIDTH-1:0] wdata
);
`UNUSED_PARAM (CACHE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset)
`UNUSED_VAR (readen)
localparam BYTEENW = WRITE_ENABLE ? CACHE_LINE_SIZE : 1;
wire [`LINE_SELECT_BITS-1:0] line_addr;
wire [BYTEENW-1:0] byte_enable;
assign line_addr = addr[`LINE_SELECT_BITS-1:0];
if (WRITE_ENABLE) begin
assign byte_enable = is_fill ? {BYTEENW{1'b1}} : byteen;
end else begin
`UNUSED_VAR (byteen)
`UNUSED_VAR (is_fill)
assign byte_enable = 1'b1;
end
VX_sp_ram #(
.DATAW (CACHE_LINE_SIZE * 8),
.SIZE (`LINES_PER_BANK),
.BYTEENW (BYTEENW),
.RWCHECK (1)
) data_store (
.clk(clk),
.addr(line_addr),
.wren(writeen),
.byteen(byte_enable),
.rden(1'b1),
.din(wdata),
.dout(rdata)
);
`ifdef DBG_PRINT_CACHE_DATA
always @(posedge clk) begin
if (writeen) begin
if (is_fill) begin
$display("%t: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, wdata);
end else begin
$display("%t: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byte_enable, line_addr, wdata);
end
end
if (readen) begin
$display("%t: cache%0d:%0d data-read: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, rdata);
end
end
`endif
endmodule

240
hw/rtl/cache/VX_miss_resrv.sv vendored Normal file
View file

@ -0,0 +1,240 @@
`include "VX_cache_define.vh"
module VX_miss_resrv #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 1,
// core request tag size
parameter CORE_TAG_WIDTH = 1,
parameter MSHR_ADDR_WIDTH = $clog2(MSHR_SIZE)
) (
input wire clk,
input wire reset,
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_UNUSED_BEGIN
input wire[31:0] deq_debug_pc,
input wire[`NW_BITS-1:0] deq_debug_wid,
input wire[31:0] lkp_debug_pc,
input wire[`NW_BITS-1:0] lkp_debug_wid,
input wire[31:0] rel_debug_pc,
input wire[`NW_BITS-1:0] rel_debug_wid,
`IGNORE_UNUSED_END
`endif
// allocate
input wire allocate_valid,
input wire [`LINE_ADDR_WIDTH-1:0] allocate_addr,
input wire [`MSHR_DATA_WIDTH-1:0] allocate_data,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_id,
output wire allocate_ready,
// fill
input wire fill_valid,
input wire [MSHR_ADDR_WIDTH-1:0] fill_id,
output wire [`LINE_ADDR_WIDTH-1:0] fill_addr,
// lookup
input wire lookup_valid,
input wire lookup_replay,
input wire [MSHR_ADDR_WIDTH-1:0] lookup_id,
input wire [`LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire lookup_match,
// dequeue
output wire dequeue_valid,
output wire [MSHR_ADDR_WIDTH-1:0] dequeue_id,
output wire [`LINE_ADDR_WIDTH-1:0] dequeue_addr,
output wire [`MSHR_DATA_WIDTH-1:0] dequeue_data,
input wire dequeue_ready,
// release
input wire release_valid,
input wire [MSHR_ADDR_WIDTH-1:0] release_id
);
`UNUSED_PARAM (CACHE_ID)
`UNUSED_PARAM (BANK_ID)
reg [MSHR_SIZE-1:0][`LINE_ADDR_WIDTH-1:0] addr_table, addr_table_n;
reg [MSHR_SIZE-1:0] valid_table, valid_table_n;
reg [MSHR_SIZE-1:0] ready_table, ready_table_n;
reg allocate_rdy_r, allocate_rdy_n;
reg [MSHR_ADDR_WIDTH-1:0] allocate_id_r, allocate_id_n;
reg dequeue_val_r, dequeue_val_n, dequeue_val_x;
reg [MSHR_ADDR_WIDTH-1:0] dequeue_id_r, dequeue_id_n, dequeue_id_x;
reg [MSHR_SIZE-1:0] valid_table_x;
reg [MSHR_SIZE-1:0] ready_table_x;
wire [MSHR_SIZE-1:0] addr_matches;
wire allocate_fire = allocate_valid && allocate_ready;
wire dequeue_fire = dequeue_valid && dequeue_ready;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign addr_matches[i] = (addr_table[i] == lookup_addr);
end
always @(*) begin
valid_table_x = valid_table;
ready_table_x = ready_table;
if (dequeue_fire) begin
valid_table_x[dequeue_id] = 0;
end
if (lookup_replay) begin
ready_table_x |= addr_matches;
end
end
VX_lzc #(
.N (MSHR_SIZE)
) dequeue_sel (
.in_i (valid_table_x & ready_table_x),
.cnt_o (dequeue_id_x),
.valid_o (dequeue_val_x)
);
VX_lzc #(
.N (MSHR_SIZE)
) allocate_sel (
.in_i (~valid_table_n),
.cnt_o (allocate_id_n),
.valid_o (allocate_rdy_n)
);
always @(*) begin
valid_table_n = valid_table_x;
ready_table_n = ready_table_x;
addr_table_n = addr_table;
dequeue_val_n = dequeue_val_r;
dequeue_id_n = dequeue_id_r;
if (dequeue_fire) begin
dequeue_val_n = dequeue_val_x;
dequeue_id_n = dequeue_id_x;
end
if (allocate_fire) begin
valid_table_n[allocate_id] = 1;
ready_table_n[allocate_id] = 0;
addr_table_n[allocate_id] = allocate_addr;
end
if (fill_valid) begin
dequeue_val_n = 1;
dequeue_id_n = fill_id;
end
if (release_valid) begin
valid_table_n[release_id] = 0;
end
end
always @(posedge clk) begin
if (reset) begin
valid_table <= 0;
allocate_rdy_r <= 0;
dequeue_val_r <= 0;
end else begin
valid_table <= valid_table_n;
allocate_rdy_r <= allocate_rdy_n;
dequeue_val_r <= dequeue_val_n;
end
ready_table <= ready_table_n;
addr_table <= addr_table_n;
dequeue_id_r <= dequeue_id_n;
allocate_id_r <= allocate_id_n;
`ASSERT(!allocate_fire || !valid_table[allocate_id_r], ("runtime error"));
`ASSERT(!release_valid || valid_table[release_id], ("runtime error"));
end
`RUNTIME_ASSERT((!allocate_fire || ~valid_table[allocate_id]), ("%t: *** cache%0d:%0d in-use allocation: addr=%0h, id=%0d", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id))
`RUNTIME_ASSERT((!fill_valid || valid_table[fill_id]), ("%t: *** cache%0d:%0d invalid fill: addr=%0h, id=%0d", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id))
VX_dp_ram #(
.DATAW (`MSHR_DATA_WIDTH),
.SIZE (MSHR_SIZE),
.LUTRAM (1)
) entries (
.clk (clk),
.waddr (allocate_id_r),
.raddr (dequeue_id_r),
.wren (allocate_valid),
.wdata (allocate_data),
.rdata (dequeue_data)
);
assign fill_addr = addr_table[fill_id];
assign allocate_ready = allocate_rdy_r;
assign allocate_id = allocate_id_r;
assign dequeue_valid = dequeue_val_r;
assign dequeue_id = dequeue_id_r;
assign dequeue_addr = addr_table[dequeue_id_r];
wire [MSHR_SIZE-1:0] lookup_entries;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign lookup_entries[i] = (i != lookup_id);
end
assign lookup_match = |(lookup_entries & valid_table & addr_matches);
`UNUSED_VAR (lookup_valid)
`ifdef DBG_PRINT_CACHE_MSHR
always @(posedge clk) begin
if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin
if (allocate_fire)
dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_debug_wid, deq_debug_pc);
if (fill_valid)
dpi_trace("%d: cache%0d:%0d mshr-fill: addr=%0h, id=%0d, addr=%0h\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id, `LINE_TO_BYTE_ADDR(fill_addr, BANK_ID));
if (dequeue_fire)
dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_debug_wid, deq_debug_pc);
if (lookup_replay)
dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id);
if (lookup_valid)
dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_debug_wid, lkp_debug_pc);
if (release_valid)
dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID,
release_id, rel_debug_wid, rel_debug_pc);
dpi_trace("%d: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID);
for (integer i = 0; i < MSHR_SIZE; ++i) begin
if (valid_table[i]) begin
dpi_trace(" ");
if (ready_table[i])
dpi_trace("*");
dpi_trace("%0d=%0h", i, `LINE_TO_BYTE_ADDR(addr_table[i], BANK_ID));
end
end
dpi_trace("\n");
end
end
`endif
endmodule

View file

@ -1,233 +0,0 @@
`include "VX_cache_define.vh"
module VX_miss_resrv #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 1,
parameter ALM_FULL = (MSHR_SIZE-1),
// core request tag size
parameter CORE_TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_WARNINGS_BEGIN
input wire[31:0] deq_debug_pc,
input wire[`NW_BITS-1:0] deq_debug_wid,
input wire[31:0] enq_debug_pc,
input wire[`NW_BITS-1:0] enq_debug_wid,
`IGNORE_WARNINGS_END
`endif
// enqueue
input wire enqueue,
input wire [`LINE_ADDR_WIDTH-1:0] enqueue_addr,
input wire [`MSHR_DATA_WIDTH-1:0] enqueue_data,
input wire enqueue_is_mshr,
input wire enqueue_as_ready,
output wire enqueue_full,
output wire enqueue_almfull,
// fill
input wire fill_start,
input wire [`LINE_ADDR_WIDTH-1:0] fill_addr,
// lookup
input wire [`LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire lookup_match,
input wire lookup_fill,
// schedule
input wire schedule,
output wire schedule_valid,
output wire [`LINE_ADDR_WIDTH-1:0] schedule_addr,
output wire [`MSHR_DATA_WIDTH-1:0] schedule_data,
// dequeue
input wire dequeue
);
`UNUSED_PARAM (CACHE_ID)
`UNUSED_PARAM (BANK_ID)
localparam ADDRW = $clog2(MSHR_SIZE);
reg [MSHR_SIZE-1:0][`LINE_ADDR_WIDTH-1:0] addr_table;
reg [MSHR_SIZE-1:0] valid_table, valid_table_n;
reg [MSHR_SIZE-1:0] ready_table, ready_table_n;
reg [ADDRW-1:0] head_ptr, head_ptr_n;
reg [ADDRW-1:0] tail_ptr, tail_ptr_n;
reg [ADDRW-1:0] restore_ptr, restore_ptr_n;
reg [ADDRW-1:0] schedule_ptr, schedule_ptr_n;
reg [ADDRW-1:0] used_r;
reg alm_full_r, full_r;
reg valid_out_r;
wire [MSHR_SIZE-1:0] valid_address_match;
for (genvar i = 0; i < MSHR_SIZE; i++) begin
assign valid_address_match[i] = valid_table[i] && (addr_table[i] == lookup_addr);
end
wire push_new = enqueue && !enqueue_is_mshr;
wire restore = enqueue && enqueue_is_mshr;
always @(*) begin
valid_table_n = valid_table;
ready_table_n = ready_table;
head_ptr_n = head_ptr;
tail_ptr_n = tail_ptr;
schedule_ptr_n = schedule_ptr;
restore_ptr_n = restore_ptr;
if (lookup_fill) begin
// unlock pending requests for scheduling
ready_table_n |= valid_address_match;
end
if (schedule) begin
// schedule next entry
schedule_ptr_n = schedule_ptr + 1;
valid_table_n[schedule_ptr] = 0;
ready_table_n[schedule_ptr] = 0;
end
if (fill_start && (fill_addr == addr_table[schedule_ptr])) begin
ready_table_n[schedule_ptr] = valid_table[schedule_ptr];
end
if (push_new) begin
// push new entry
valid_table_n[tail_ptr] = 1;
ready_table_n[tail_ptr] = enqueue_as_ready;
tail_ptr_n = tail_ptr + 1;
end else if (restore) begin
// restore schedule, returning missed mshr entry
valid_table_n[restore_ptr] = 1;
ready_table_n[restore_ptr] = enqueue_as_ready;
restore_ptr_n = restore_ptr + 1;
schedule_ptr_n = head_ptr;
end else if (dequeue) begin
// clear scheduled entry
head_ptr_n = head_ptr + 1;
restore_ptr_n = head_ptr_n;
end
end
always @(posedge clk) begin
if (reset) begin
valid_table <= 0;
ready_table <= 0;
head_ptr <= 0;
tail_ptr <= 0;
schedule_ptr <= 0;
restore_ptr <= 0;
used_r <= 0;
alm_full_r <= 0;
full_r <= 0;
valid_out_r <= 0;
end else begin
if (schedule) begin
assert(schedule_valid);
assert(!fill_start);
assert(!restore);
end
if (push_new) begin
assert(!full_r);
end else if (restore) begin
assert(!schedule);
end
if (push_new) begin
if (!dequeue) begin
if (used_r == ADDRW'(ALM_FULL-1))
alm_full_r <= 1;
if (used_r == ADDRW'(MSHR_SIZE-1))
full_r <= 1;
end
end else if (dequeue) begin
if (used_r == ADDRW'(ALM_FULL))
alm_full_r <= 0;
full_r <= 0;
end
used_r <= used_r + ADDRW'($signed(2'(push_new) - 2'(dequeue)));
valid_table <= valid_table_n;
ready_table <= ready_table_n;
head_ptr <= head_ptr_n;
tail_ptr <= tail_ptr_n;
schedule_ptr <= schedule_ptr_n;
restore_ptr <= restore_ptr_n;
valid_out_r <= ready_table_n[schedule_ptr_n];
end
if (push_new) begin
addr_table[tail_ptr] <= enqueue_addr;
end
end
VX_dp_ram #(
.DATAW (`MSHR_DATA_WIDTH),
.SIZE (MSHR_SIZE),
.RWCHECK (1),
.FASTRAM (1)
) entries (
.clk (clk),
.waddr (tail_ptr),
.raddr (schedule_ptr),
.wren (push_new),
.byteen (1'b1),
.rden (1'b1),
.din (enqueue_data),
.dout (schedule_data)
);
assign lookup_match = (| valid_address_match);
assign schedule_valid = valid_out_r;
assign schedule_addr = addr_table[schedule_ptr];
assign enqueue_almfull = alm_full_r;
assign enqueue_full = full_r;
`ifdef DBG_PRINT_CACHE_MSHR
always @(posedge clk) begin
if (lookup_fill || schedule || enqueue || dequeue) begin
if (schedule)
$display("%t: cache%0d:%0d mshr-schedule: addr%0d=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, schedule_ptr, `LINE_TO_BYTE_ADDR(schedule_addr, BANK_ID), deq_debug_wid, deq_debug_pc);
if (enqueue) begin
if (enqueue_is_mshr)
$display("%t: cache%0d:%0d mshr-restore: addr%0d=%0h, ready=%b", $time, CACHE_ID, BANK_ID, restore_ptr, `LINE_TO_BYTE_ADDR(enqueue_addr, BANK_ID), enqueue_as_ready);
else
$display("%t: cache%0d:%0d mshr-enqueue: addr%0d=%0h, ready=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, tail_ptr, `LINE_TO_BYTE_ADDR(enqueue_addr, BANK_ID), enqueue_as_ready, enq_debug_wid, enq_debug_pc);
end
if (dequeue)
$display("%t: cache%0d:%0d mshr-dequeue addr%0d, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, head_ptr, enq_debug_wid, enq_debug_pc);
$write("%t: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID);
for (integer j = 0; j < MSHR_SIZE; j++) begin
if (valid_table[j]) begin
$write(" ");
if (schedule_ptr == $bits(schedule_ptr)'(j)) $write("*");
if (~ready_table[j]) $write("!");
$write("addr%0d=%0h", j, `LINE_TO_BYTE_ADDR(addr_table[j], BANK_ID));
end
end
$write("\n");
end
end
`endif
endmodule

View file

@ -1,6 +1,7 @@
`include "VX_cache_define.vh"
module VX_nc_bypass #(
parameter NUM_PORTS = 1,
parameter NUM_REQS = 1,
parameter NUM_RSP_TAGS = 0,
parameter NC_TAG_BIT = 0,
@ -10,13 +11,14 @@ module VX_nc_bypass #(
parameter CORE_TAG_IN_WIDTH = 1,
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_DATA_SIZE = 1,
parameter MEM_DATA_SIZE = 1,
parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1,
localparam CORE_DATA_WIDTH = CORE_DATA_SIZE * 8,
localparam MEM_DATA_WIDTH = MEM_DATA_SIZE * 8,
localparam CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - 1
parameter CORE_DATA_WIDTH = CORE_DATA_SIZE * 8,
parameter MEM_DATA_WIDTH = MEM_DATA_SIZE * 8,
parameter CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - 1,
parameter MEM_SELECT_BITS = `UP(`CLOG2(MEM_DATA_SIZE / CORE_DATA_SIZE))
) (
input wire clk,
input wire reset,
@ -57,8 +59,10 @@ module VX_nc_bypass #(
input wire mem_req_valid_in,
input wire mem_req_rw_in,
input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in,
input wire [MEM_DATA_SIZE-1:0] mem_req_byteen_in,
input wire [MEM_DATA_WIDTH-1:0] mem_req_data_in,
input wire [NUM_PORTS-1:0] mem_req_pmask_in,
input wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in,
input wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in,
input wire [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in,
input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
@ -66,8 +70,10 @@ module VX_nc_bypass #(
output wire mem_req_valid_out,
output wire mem_req_rw_out,
output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out,
output wire [MEM_DATA_SIZE-1:0] mem_req_byteen_out,
output wire [MEM_DATA_WIDTH-1:0] mem_req_data_out,
output wire [NUM_PORTS-1:0] mem_req_pmask_out,
output wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_out,
output wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_out,
output wire [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_out,
output wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
@ -99,9 +105,9 @@ module VX_nc_bypass #(
// core request handling
wire [NUM_REQS-1:0] core_req_valid_in_nc;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire [NUM_REQS-1:0] core_req_nc_tids;
wire [`UP(CORE_REQ_TIDW)-1:0] core_req_nc_tid;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_valid;
for (genvar i = 0; i < NUM_REQS; ++i) begin
@ -142,7 +148,6 @@ module VX_nc_bypass #(
(~mem_req_valid_in && mem_req_ready_out && core_req_nc_sel[i]) : core_req_ready_out[i];
end
end else begin
`UNUSED_VAR (core_req_nc_sel)
assign core_req_ready_in = core_req_valid_in_nc ? (~mem_req_valid_in && mem_req_ready_out) : core_req_ready_out;
end
@ -151,7 +156,7 @@ module VX_nc_bypass #(
assign mem_req_valid_out = mem_req_valid_in || core_req_nc_valid;
assign mem_req_ready_in = mem_req_ready_out;
wire [(MEM_TAG_IN_WIDTH+1)-1:0] mem_req_tag_in_nc;
wire [(MEM_TAG_IN_WIDTH+1)-1:0] mem_req_tag_in_c;
VX_bits_insert #(
.N (MEM_TAG_IN_WIDTH),
@ -160,81 +165,69 @@ module VX_nc_bypass #(
) mem_req_tag_insert (
.data_in (mem_req_tag_in),
.sel_in ('0),
.data_out (mem_req_tag_in_nc)
.data_out (mem_req_tag_in_c)
);
wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel;
wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel;
wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel;
wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel;
wire core_req_rw_in_sel;
if (NUM_REQS > 1) begin
wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel;
wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel;
wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel;
wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel;
wire core_req_rw_in_sel;
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_mux_in[i] = {core_req_tag_in[i], core_req_data_in[i], core_req_byteen_in[i], core_req_addr_in[i], core_req_rw_in[i]};
end
VX_onehot_mux #(
.DATAW (MUX_DATAW),
.N (NUM_REQS)
) core_req_nc_mux (
.data_in (core_req_nc_mux_in),
.sel_in (core_req_nc_sel),
.data_out ({core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel})
);
assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel;
assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[D +: MEM_ADDR_WIDTH];
for (genvar i = 0; i < P; ++i) begin
assign mem_req_data_out[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] = mem_req_valid_in ?
mem_req_data_in[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] : core_req_data_in_sel;
end
if (D != 0) begin
wire [D-1:0] req_addr_idx = core_req_addr_in_sel[D-1:0];
reg [MEM_DATA_SIZE-1:0] mem_req_byteen_in_r;
always @(*) begin
mem_req_byteen_in_r = 0;
mem_req_byteen_in_r[req_addr_idx * CORE_DATA_SIZE +: CORE_DATA_SIZE] = core_req_byteen_in_sel;
end
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r;
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel});
end else begin
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel;
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, core_req_tag_in_sel});
end
assign {core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel} = core_req_nc_mux_in[core_req_nc_tid];
end else begin
`UNUSED_VAR (core_req_nc_tid)
assign core_req_tag_in_sel = core_req_tag_in;
assign core_req_data_in_sel = core_req_data_in;
assign core_req_byteen_in_sel = core_req_byteen_in;
assign core_req_addr_in_sel = core_req_addr_in;
assign core_req_rw_in_sel = core_req_rw_in;
end
assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel;
assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[D +: MEM_ADDR_WIDTH];
if (D != 0) begin
reg [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in_r;
reg [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in_r;
reg [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
wire [D-1:0] req_addr_idx = core_req_addr_in_sel[D-1:0];
assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in;
assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in[0][D +: MEM_ADDR_WIDTH];
always @(*) begin
mem_req_byteen_in_r = 0;
mem_req_byteen_in_r[0] = core_req_byteen_in_sel;
for (genvar i = 0; i < P; ++i) begin
assign mem_req_data_out[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] = mem_req_valid_in ?
mem_req_data_in[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] : core_req_data_in;
mem_req_wsel_in_r = 'x;
mem_req_wsel_in_r[0] = req_addr_idx;
mem_req_data_in_r = 'x;
mem_req_data_in_r[0] = core_req_data_in_sel;
end
if (D != 0) begin
wire [D-1:0] req_addr_idx = core_req_addr_in[0][D-1:0];
reg [MEM_DATA_SIZE-1:0] mem_req_byteen_in_r;
always @(*) begin
mem_req_byteen_in_r = 0;
mem_req_byteen_in_r[req_addr_idx * CORE_DATA_SIZE +: CORE_DATA_SIZE] = core_req_byteen_in;
end
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r;
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'({req_addr_idx, core_req_tag_in});
end else begin
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in;
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : MEM_TAG_OUT_WIDTH'(core_req_tag_in);
end
assign mem_req_pmask_out = mem_req_valid_in ? mem_req_pmask_in : NUM_PORTS'(1'b1);
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r;
assign mem_req_wsel_out = mem_req_valid_in ? mem_req_wsel_in : mem_req_wsel_in_r;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : mem_req_data_in_r;
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_c) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel});
end else begin
`UNUSED_VAR (mem_req_wsel_in)
`UNUSED_VAR (mem_req_pmask_in)
assign mem_req_pmask_out = 0;
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : core_req_data_in_sel;
assign mem_req_wsel_out = 0;
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_c) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, core_req_tag_in_sel});
end
// core response handling
wire [NUM_RSP_TAGS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out_unqual;
wire [NUM_RSP_TAGS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out_c;
wire is_mem_rsp_nc = mem_rsp_valid_in && mem_rsp_tag_in[NC_TAG_BIT];
@ -246,7 +239,7 @@ module VX_nc_bypass #(
) core_rsp_tag_insert (
.data_in (core_rsp_tag_in[i]),
.sel_in ('0),
.data_out (core_rsp_tag_out_unqual[i])
.data_out (core_rsp_tag_out_c[i])
);
end
@ -272,14 +265,14 @@ module VX_nc_bypass #(
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ? core_rsp_data_in[i] : mem_rsp_data_in;
end
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_out_unqual[i] : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0];
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_out_c[i] : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0];
end
end else begin
assign core_rsp_valid_out = core_rsp_valid_in || is_mem_rsp_nc;
assign core_rsp_tag_out = core_rsp_valid_in ? core_rsp_tag_out_unqual : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0];
assign core_rsp_tag_out = core_rsp_valid_in ? core_rsp_tag_out_c : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0];
assign core_rsp_ready_in = core_rsp_ready_out;
if (NUM_REQS > 1) begin

View file

@ -24,14 +24,14 @@ module VX_shared_mem #(
parameter CORE_TAG_WIDTH = (2 + CORE_TAG_ID_BITS),
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = `CLOG2(256)
parameter BANK_ADDR_OFFSET = `CLOG2(256)
) (
input wire clk,
input wire reset,
// PERF
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
VX_perf_cache_if.master perf_cache_if,
`endif
// Core request
@ -64,7 +64,7 @@ module VX_shared_mem #(
wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data_unqual;
wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_unqual;
wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid_unqual;
wire per_bank_core_req_ready_unqual;
wire [NUM_BANKS-1:0] per_bank_core_req_ready_unqual;
VX_core_req_bank_sel #(
.CACHE_ID (CACHE_ID),
@ -74,8 +74,7 @@ module VX_shared_mem #(
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.CORE_TAG_WIDTH (CORE_TAG_WIDTH),
.BANK_ADDR_OFFSET(BANK_ADDR_OFFSET),
.SHARED_BANK_READY(1)
.BANK_ADDR_OFFSET(BANK_ADDR_OFFSET)
) core_req_bank_sel (
.clk (clk),
.reset (reset),
@ -103,41 +102,34 @@ module VX_shared_mem #(
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0] per_bank_core_req_rw;
wire [NUM_BANKS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr;
wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen;
wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid;
wire creq_in_ready;
wire creq_out_valid;
wire crsq_in_fire_last;
wire [NUM_BANKS-1:0] per_bank_req_reads = per_bank_core_req_valid & ~per_bank_core_req_rw;
wire per_bank_req_has_reads = (| per_bank_req_reads);
wire creq_in_valid = (| core_req_valid);
wire creq_out_ready = ~per_bank_req_has_reads // is write only
|| crsq_in_fire_last; // is sending last read response
assign per_bank_core_req_ready_unqual = creq_in_ready;
wire creq_out_valid, creq_out_ready;
wire creq_in_valid, creq_in_ready;
wire creq_in_fire = creq_in_valid && creq_in_ready;
`UNUSED_VAR (creq_in_fire)
wire creq_out_fire = creq_out_valid && creq_out_ready;
`UNUSED_VAR (creq_out_fire)
wire [NUM_BANKS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr_qual;
`UNUSED_VAR (per_bank_core_req_addr_unqual)
for (genvar i = 0; i < NUM_BANKS; i++) begin
assign per_bank_core_req_addr_qual[i] = per_bank_core_req_addr_unqual[i][`LINE_SELECT_BITS-1:0];
end
assign creq_in_valid = (| core_req_valid);
assign per_bank_core_req_ready_unqual = {NUM_BANKS{creq_in_ready}};
wire [NUM_BANKS-1:0] core_req_read_mask, core_req_read_mask_unqual;
wire core_req_writeonly, core_req_writeonly_unqual;
assign core_req_read_mask_unqual = per_bank_core_req_valid_unqual & ~per_bank_core_req_rw_unqual;
assign core_req_writeonly_unqual = ~(| core_req_read_mask_unqual);
VX_elastic_buffer #(
.DATAW (NUM_BANKS * (1 + 1 + `LINE_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + CORE_TAG_WIDTH + `REQS_BITS)),
.SIZE (CREQ_SIZE),
.OUTPUT_REG (1) // output should be registered for the data_store addr port
.DATAW (NUM_BANKS * (1 + 1 + `LINE_ADDR_WIDTH + WORD_SIZE + `WORD_WIDTH + CORE_TAG_WIDTH + `REQS_BITS) + NUM_BANKS + 1),
.SIZE (CREQ_SIZE),
.OUT_REG (1) // output should be registered for the data_store addr port
) core_req_queue (
.clk (clk),
.reset (reset),
@ -145,44 +137,53 @@ module VX_shared_mem #(
.valid_in (creq_in_valid),
.data_in ({per_bank_core_req_valid_unqual,
per_bank_core_req_rw_unqual,
per_bank_core_req_addr_qual,
per_bank_core_req_addr_unqual,
per_bank_core_req_byteen_unqual,
per_bank_core_req_data_unqual,
per_bank_core_req_tag_unqual,
per_bank_core_req_tid_unqual}),
per_bank_core_req_tid_unqual,
core_req_read_mask_unqual,
core_req_writeonly_unqual}),
.data_out ({per_bank_core_req_valid,
per_bank_core_req_rw,
per_bank_core_req_addr,
per_bank_core_req_byteen,
per_bank_core_req_data,
per_bank_core_req_tag,
per_bank_core_req_tid}),
per_bank_core_req_tid,
core_req_read_mask,
core_req_writeonly}),
.ready_out (creq_out_ready),
.valid_out (creq_out_valid)
);
`UNUSED_VAR (creq_in_fire)
wire crsq_in_valid, crsq_in_ready;
wire crsq_last_read;
assign creq_out_ready = core_req_writeonly
|| (crsq_in_ready && crsq_last_read);
wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data;
for (genvar i = 0; i < NUM_BANKS; i++) begin
wire wren = per_bank_core_req_rw[i]
&& per_bank_core_req_valid[i]
&& creq_out_fire;
wire [WORD_SIZE-1:0] wren = per_bank_core_req_byteen[i]
& {WORD_SIZE{per_bank_core_req_valid[i]
&& per_bank_core_req_rw[i]}};
wire [`LINE_SELECT_BITS-1:0] addr = per_bank_core_req_addr[i][`LINE_SELECT_BITS-1:0];
VX_sp_ram #(
.DATAW (`WORD_WIDTH),
.SIZE (`LINES_PER_BANK),
.BYTEENW (WORD_SIZE),
.RWCHECK (1)
.DATAW (`WORD_WIDTH),
.SIZE (`LINES_PER_BANK),
.BYTEENW (WORD_SIZE),
.NO_RWCHECK (1)
) data_store (
.clk (clk),
.addr (per_bank_core_req_addr[i]),
.wren (wren),
.byteen (per_bank_core_req_byteen[i]),
.rden (1'b1),
.din (per_bank_core_req_data[i]),
.dout (per_bank_core_rsp_data[i])
.clk (clk),
.addr (addr),
.wren (wren),
.wdata (per_bank_core_req_data[i]),
.rdata (per_bank_core_rsp_data[i])
);
end
@ -190,57 +191,54 @@ module VX_shared_mem #(
// We first need to select the current tag to process,
// then send all bank responses for that tag as a batch
wire crsq_in_valid, crsq_in_ready;
reg [NUM_BANKS-1:0] bank_rsp_sel_prv, bank_rsp_sel_cur;
wire [NUM_BANKS-1:0] bank_rsp_sel_n = bank_rsp_sel_prv | bank_rsp_sel_cur;
reg [NUM_REQS-1:0] core_rsp_valids_in;
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_in;
wire [CORE_TAG_WIDTH-1:0] core_rsp_tag_in;
reg [NUM_BANKS-1:0] bank_rsp_sel_r, bank_rsp_sel_n;
wire crsq_in_fire = crsq_in_valid && crsq_in_ready;
assign crsq_in_fire_last = crsq_in_fire && (bank_rsp_sel_n == per_bank_req_reads);
assign crsq_last_read = (bank_rsp_sel_n == core_req_read_mask);
always @(posedge clk) begin
if (reset) begin
bank_rsp_sel_prv <= 0;
bank_rsp_sel_r <= 0;
end else begin
if (crsq_in_fire) begin
if (bank_rsp_sel_n == per_bank_req_reads) begin
bank_rsp_sel_prv <= 0;
if (crsq_last_read) begin
bank_rsp_sel_r <= 0;
end else begin
bank_rsp_sel_prv <= bank_rsp_sel_n;
bank_rsp_sel_r <= bank_rsp_sel_n;
end
end
end
end
reg [NUM_REQS-1:0] core_rsp_valids_in;
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_in;
reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_in;
always @(*) begin
end
VX_find_first #(
.N (NUM_BANKS),
.DATAW (CORE_TAG_WIDTH)
) find_first (
.valid_i (core_req_read_mask & ~bank_rsp_sel_r),
.data_i (per_bank_core_req_tag),
.data_o (core_rsp_tag_in),
`UNUSED_PIN (valid_o)
);
always @(*) begin
core_rsp_valids_in = 0;
core_rsp_data_in = 'x;
core_rsp_tag_in = 'x;
bank_rsp_sel_cur = 0;
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
if (per_bank_req_reads[i] && ~bank_rsp_sel_prv[i]) begin
core_rsp_tag_in = per_bank_core_req_tag[i];
end
end
bank_rsp_sel_n = bank_rsp_sel_r;
for (integer i = 0; i < NUM_BANKS; i++) begin
if (per_bank_core_req_valid[i]
&& (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valids_in[per_bank_core_req_tid[i]] = 1;
core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i];
bank_rsp_sel_cur[i] = 1;
bank_rsp_sel_n[i] = 1;
end
end
end
assign crsq_in_valid = creq_out_valid && per_bank_req_has_reads;
assign crsq_in_valid = creq_out_valid && ~core_req_writeonly;
VX_elastic_buffer #(
.DATAW (NUM_BANKS * (1 + `WORD_WIDTH) + CORE_TAG_WIDTH),
@ -257,10 +255,10 @@ module VX_shared_mem #(
);
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_WARNINGS_BEGIN
`IGNORE_UNUSED_BEGIN
wire [NUM_BANKS-1:0][31:0] debug_pc_st0, debug_pc_st1;
wire [NUM_BANKS-1:0][`NW_BITS-1:0] debug_wid_st0, debug_wid_st1;
`IGNORE_WARNINGS_END
`IGNORE_UNUSED_END
for (genvar i = 0; i < NUM_BANKS; ++i) begin
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
@ -276,17 +274,21 @@ module VX_shared_mem #(
`ifdef DBG_PRINT_CACHE_BANK
reg is_multi_tag_req;
`IGNORE_WARNINGS_BEGIN
`IGNORE_UNUSED_BEGIN
reg [CORE_TAG_WIDTH-1:0] core_req_tag_sel;
`IGNORE_WARNINGS_END
`IGNORE_UNUSED_END
always @(*) begin
core_req_tag_sel ='x;
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
if (per_bank_core_req_valid[i]) begin
core_req_tag_sel = per_bank_core_req_tag[i];
end
end
VX_find_first #(
.N (NUM_BANKS),
.DATAW (CORE_TAG_WIDTH)
) find_first_d (
.valid_i (per_bank_core_req_valid),
.data_i (per_bank_core_req_tag),
.data_o (core_req_tag_sel),
`UNUSED_PIN (valid_o)
);
always @(*) begin
is_multi_tag_req = 0;
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid[i]
@ -298,22 +300,20 @@ module VX_shared_mem #(
always @(posedge clk) begin
if (!crsq_in_ready) begin
$display("%t: *** cache%0d pipeline-stall", $time, CACHE_ID);
dpi_trace("%d: *** cache%0d pipeline-stall\n", $time, CACHE_ID);
end
if (is_multi_tag_req) begin
$display("%t: *** cache%0d multi-tag request!", $time, CACHE_ID);
dpi_trace("%d: *** cache%0d multi-tag request!\n", $time, CACHE_ID);
end
if (creq_in_fire) begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid_unqual[i]) begin
if (per_bank_core_req_rw_unqual[i]) begin
$display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h",
$time, CACHE_ID, i, per_bank_core_req_addr_unqual[i], per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i],
debug_wid_st0[i], debug_pc_st0[i]);
dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n",
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], debug_wid_st0[i], debug_pc_st0[i]);
end else begin
$display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h",
$time, CACHE_ID, i, per_bank_core_req_addr_unqual[i], per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i],
debug_wid_st0[i], debug_pc_st0[i]);
dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h\n",
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], debug_wid_st0[i], debug_pc_st0[i]);
end
end
end
@ -322,13 +322,11 @@ module VX_shared_mem #(
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid[i]) begin
if (per_bank_core_req_rw[i]) begin
$display("%t: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h",
$time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_req_data[i],
debug_wid_st1[i], debug_pc_st1[i]);
dpi_trace("%d: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n",
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_req_data[i], debug_wid_st1[i], debug_pc_st1[i]);
end else begin
$display("%t: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h",
$time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_rsp_data[i],
debug_wid_st1[i], debug_pc_st1[i]);
dpi_trace("%d: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n",
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_rsp_data[i], debug_wid_st1[i], debug_pc_st1[i]);
end
end
end
@ -338,16 +336,22 @@ module VX_shared_mem #(
`ifdef PERF_ENABLE
// per cycle: core_reads, core_writes
reg [($clog2(NUM_REQS+1)-1):0] perf_core_reads_per_cycle, perf_core_writes_per_cycle;
reg [($clog2(NUM_REQS+1)-1):0] perf_crsp_stall_per_cycle;
wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
assign perf_core_reads_per_cycle = $countones(core_req_valid & core_req_ready & ~core_req_rw);
assign perf_core_writes_per_cycle = $countones(core_req_valid & core_req_ready & core_req_rw);
wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw;
wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw;
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask);
if (CORE_TAG_ID_BITS != 0) begin
assign perf_crsp_stall_per_cycle = $countones(core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}});
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}};
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
end else begin
assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & ~core_rsp_ready);
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_valid & ~core_rsp_ready;
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
end
reg [`PERF_CTR_BITS-1:0] perf_core_reads;

82
hw/rtl/cache/VX_tag_access.sv vendored Normal file
View file

@ -0,0 +1,82 @@
`include "VX_cache_define.vh"
module VX_tag_access #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0
) (
input wire clk,
input wire reset,
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_UNUSED_BEGIN
input wire[31:0] debug_pc,
input wire[`NW_BITS-1:0] debug_wid,
`IGNORE_UNUSED_END
`endif
input wire stall,
// read/fill
input wire lookup,
input wire[`LINE_ADDR_WIDTH-1:0] addr,
input wire fill,
input wire flush,
output wire tag_match
);
`UNUSED_PARAM (CACHE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (reset)
`UNUSED_VAR (lookup)
wire [`TAG_SELECT_BITS-1:0] read_tag;
wire read_valid;
wire [`LINE_SELECT_BITS-1:0] line_addr = addr[`LINE_SELECT_BITS-1:0];
wire [`TAG_SELECT_BITS-1:0] line_tag = `LINE_TAG_ADDR(addr);
VX_sp_ram #(
.DATAW (`TAG_SELECT_BITS + 1),
.SIZE (`LINES_PER_BANK),
.NO_RWCHECK (1)
) tag_store (
.clk( clk),
.addr (line_addr),
.wren (fill || flush),
.wdata ({!flush, line_tag}),
.rdata ({read_valid, read_tag})
);
assign tag_match = read_valid && (line_tag == read_tag);
`UNUSED_VAR (stall)
`ifdef DBG_PRINT_CACHE_TAG
always @(posedge clk) begin
if (fill && ~stall) begin
dpi_trace("%d: cache%0d:%0d tag-fill: addr=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag);
end
if (flush) begin
dpi_trace("%d: cache%0d:%0d tag-flush: addr=%0h, blk_addr=%0d\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr);
end
if (lookup && ~stall) begin
if (tag_match) begin
dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag);
end else begin
dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag, read_tag);
end
end
end
`endif
endmodule

View file

@ -1,84 +0,0 @@
`include "VX_cache_define.vh"
module VX_tag_access #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0
) (
input wire clk,
input wire reset,
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_WARNINGS_BEGIN
input wire[31:0] debug_pc,
input wire[`NW_BITS-1:0] debug_wid,
`IGNORE_WARNINGS_END
`endif
// read/fill
input wire lookup,
input wire[`LINE_ADDR_WIDTH-1:0] addr,
input wire fill,
input wire is_flush,
output wire tag_match
);
`UNUSED_PARAM (CACHE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (reset)
`UNUSED_VAR (lookup)
wire read_valid;
wire [`TAG_SELECT_BITS-1:0] read_tag;
wire [`TAG_SELECT_BITS-1:0] line_tag = `LINE_TAG_ADDR(addr);
wire [`LINE_SELECT_BITS-1:0] line_addr = addr [`LINE_SELECT_BITS-1:0];
VX_sp_ram #(
.DATAW(`TAG_SELECT_BITS + 1),
.SIZE(`LINES_PER_BANK),
.INITZERO(1),
.RWCHECK(1)
) tag_store (
.clk(clk),
.addr(line_addr),
.wren(fill),
.byteen(1'b1),
.rden(1'b1),
.din({!is_flush, line_tag}),
.dout({read_valid, read_tag})
);
assign tag_match = read_valid && (line_tag == read_tag);
`ifdef DBG_PRINT_CACHE_TAG
always @(posedge clk) begin
if (fill) begin
if (is_flush) begin
$display("%t: cache%0d:%0d tag-flush: addr=%0h, blk_addr=%0d", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr);
end else begin
$display("%t: cache%0d:%0d tag-fill: addr=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag, read_tag);
if (tag_match) begin
$display("%t: warning: redundant fill - addr=%0h", $time, `LINE_TO_BYTE_ADDR(addr, BANK_ID));
end
end
end else if (lookup) begin
if (tag_match) begin
$display("%t: cache%0d:%0d tag-hit: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag);
end else begin
$display("%t: cache%0d:%0d tag-miss: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag, read_tag);
end
end
end
`endif
endmodule

View file

@ -0,0 +1,28 @@
`include "VX_fpu_define.vh"
module VX_fp_class # (
parameter MAN_BITS = 23,
parameter EXP_BITS = 8
) (
input [EXP_BITS-1:0] exp_i,
input [MAN_BITS-1:0] man_i,
output fp_class_t clss_o
);
wire is_normal = (exp_i != '0) && (exp_i != '1);
wire is_zero = (exp_i == '0) && (man_i == '0);
wire is_subnormal = (exp_i == '0) && (man_i != '0);
wire is_inf = (exp_i == '1) && (man_i == '0);
wire is_nan = (exp_i == '1) && (man_i != '0);
wire is_signaling = is_nan && ~man_i[MAN_BITS-1];
wire is_quiet = is_nan && ~is_signaling;
assign clss_o.is_normal = is_normal;
assign clss_o.is_zero = is_zero;
assign clss_o.is_subnormal = is_subnormal;
assign clss_o.is_inf = is_inf;
assign clss_o.is_nan = is_nan;
assign clss_o.is_quiet = is_quiet;
assign clss_o.is_signaling = is_signaling;
endmodule

View file

@ -1,4 +1,4 @@
`include "VX_define.vh"
`include "VX_fpu_define.vh"
/// Modified port of cast module from fpnew Libray
/// reference: https://github.com/pulp-platform/fpnew
@ -15,7 +15,7 @@ module VX_fp_cvt #(
input wire [TAGW-1:0] tag_in,
input wire [`FRM_BITS-1:0] frm,
input wire [`INST_FRM_BITS-1:0] frm,
input wire is_itof,
input wire is_signed,
@ -59,13 +59,16 @@ module VX_fp_cvt #(
// Input processing
fp_type_t [LANES-1:0] in_a_type;
fp_class_t [LANES-1:0] fp_clss;
for (genvar i = 0; i < LANES; ++i) begin
VX_fp_type fp_type (
VX_fp_class #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class (
.exp_i (dataa[i][30:23]),
.man_i (dataa[i][22:0]),
.type_o (in_a_type[i])
.clss_o (fp_clss[i])
);
end
@ -74,16 +77,18 @@ module VX_fp_cvt #(
wire [LANES-1:0] input_sign;
for (genvar i = 0; i < LANES; ++i) begin
`IGNORE_WARNINGS_BEGIN
wire [INT_MAN_WIDTH-1:0] int_mantissa;
wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
wire fmt_sign = dataa[i][31];
wire int_sign = dataa[i][31] & is_signed;
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
assign fmt_mantissa = INT_MAN_WIDTH'({in_a_type[i].is_normal, dataa[i][MAN_BITS-1:0]});
assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS+EXP_BITS-1:MAN_BITS]};
assign fmt_mantissa = INT_MAN_WIDTH'({fp_clss[i].is_normal, dataa[i][MAN_BITS-1:0]});
assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} +
{1'b0, fp_clss[i].is_subnormal};
assign encoded_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
`IGNORE_WARNINGS_END
end
// Pipeline stage0
@ -93,7 +98,7 @@ module VX_fp_cvt #(
wire is_itof_s0;
wire unsigned_s0;
wire [2:0] rnd_mode_s0;
fp_type_t [LANES-1:0] in_a_type_s0;
fp_class_t [LANES-1:0] fp_clss_s0;
wire [LANES-1:0] input_sign_s0;
wire [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
wire [LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
@ -101,14 +106,14 @@ module VX_fp_cvt #(
wire stall;
VX_pipe_register #(
.DATAW (1 + TAGW + 1 + `FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)),
.DATAW (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_class_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in, tag_in, is_itof, !is_signed, frm, in_a_type, input_sign, fmt_exponent, encoded_mant}),
.data_out ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
.data_in ({valid_in, tag_in, is_itof, !is_signed, frm, fp_clss, input_sign, fmt_exponent, encoded_mant}),
.data_out ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fp_clss_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
);
// Normalization
@ -119,8 +124,8 @@ module VX_fp_cvt #(
for (genvar i = 0; i < LANES; ++i) begin
wire mant_is_nonzero;
VX_lzc #(
.WIDTH (INT_MAN_WIDTH),
.MODE (1)
.N (INT_MAN_WIDTH),
.MODE (1)
) lzc (
.in_i (encoded_mant_s0[i]),
.cnt_o (renorm_shamt_s0[i]),
@ -134,20 +139,12 @@ module VX_fp_cvt #(
for (genvar i = 0; i < LANES; ++i) begin
`IGNORE_WARNINGS_BEGIN
// Input mantissa needs to be normalized
wire [INT_EXP_WIDTH-1:0] fp_input_exp;
wire [INT_EXP_WIDTH-1:0] int_input_exp;
// Realign input mantissa, append zeroes if destination is wider
// Realign input mantissa, append zeroes if destination is wider
assign input_mant_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
// Unbias exponent and compensate for shift
assign fp_input_exp = fmt_exponent_s0[i] +
{1'b0, in_a_type_s0[i].is_subnormal} +
(FMT_SHIFT_COMPENSATION - EXP_BIAS) -
{1'b0, renorm_shamt_s0[i]};
assign int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] + (FMT_SHIFT_COMPENSATION - EXP_BIAS) - {1'b0, renorm_shamt_s0[i]};
wire [INT_EXP_WIDTH-1:0] int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
assign input_exp_s0[i] = is_itof_s0 ? int_input_exp : fp_input_exp;
`IGNORE_WARNINGS_END
@ -160,21 +157,21 @@ module VX_fp_cvt #(
wire is_itof_s1;
wire unsigned_s1;
wire [2:0] rnd_mode_s1;
fp_type_t [LANES-1:0] in_a_type_s1;
fp_class_t [LANES-1:0] fp_clss_s1;
wire [LANES-1:0] input_sign_s1;
wire [LANES-1:0] mant_is_zero_s1;
wire [LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
wire [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
VX_pipe_register #(
.DATAW (1 + TAGW + 1 + `FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)),
.DATAW (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_class_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, input_sign_s0, mant_is_zero_s0, input_mant_s0, input_exp_s0}),
.data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
.data_in ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fp_clss_s0, input_sign_s0, mant_is_zero_s0, input_mant_s0, input_exp_s0}),
.data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fp_clss_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
);
// Perform adjustments to mantissa and exponent
@ -183,39 +180,35 @@ module VX_fp_cvt #(
wire [LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
wire [LANES-1:0] of_before_round_s1;
for (genvar i = 0; i < LANES; ++i) begin
wire [INT_EXP_WIDTH-1:0] destination_exp; // re-biased exponent for destination
for (genvar i = 0; i < LANES; ++i) begin
reg [2*INT_MAN_WIDTH:0] preshift_mant; // mantissa before final shift
reg [SHAMT_BITS-1:0] denorm_shamt; // shift amount for denormalization
reg [INT_EXP_WIDTH-1:0] final_exp; // after eventual adjustments
reg of_before_round;
// Rebias the exponent
assign destination_exp = input_exp_s1[i] + EXP_BIAS;
always @(*) begin
`IGNORE_WARNINGS_BEGIN
// Default assignment
final_exp = destination_exp; // take exponent as is, only look at lower bits
preshift_mant = {input_mant_s1[i], 33'b0}; // Place mantissa to the left of the shifter
final_exp = input_exp_s1[i] + EXP_BIAS; // take exponent as is, only look at lower bits
preshift_mant = {input_mant_s1[i], 33'b0}; // Place mantissa to the left of the shifter
denorm_shamt = 0; // right of mantissa
of_before_round = 1'b0;
// Handle INT casts
if (is_itof_s1) begin
if ($signed(destination_exp) >= $signed(2**EXP_BITS-1)) begin
if ($signed(input_exp_s1[i]) >= $signed(2**EXP_BITS-1-EXP_BIAS)) begin
// Overflow or infinities (for proper rounding)
final_exp = (2**EXP_BITS-2); // largest normal value
preshift_mant = ~0; // largest normal value and RS bits set
of_before_round = 1'b1;
end else if ($signed(destination_exp) < $signed(-MAN_BITS)) begin
end else if ($signed(input_exp_s1[i]) < $signed(-MAN_BITS-EXP_BIAS)) begin
// Limit the shift to retain sticky bits
final_exp = 0; // denormal result
denorm_shamt = denorm_shamt + (2 + MAN_BITS); // to sticky
end else if ($signed(destination_exp) < $signed(1)) begin
denorm_shamt = (2 + MAN_BITS); // to sticky
end else if ($signed(input_exp_s1[i]) < $signed(1-EXP_BIAS)) begin
// Denormalize underflowing values
final_exp = 0; // denormal result
denorm_shamt = denorm_shamt + 1 - destination_exp; // adjust right shifting
denorm_shamt = (1-EXP_BIAS) - input_exp_s1[i]; // adjust right shifting
end
end else begin
if ($signed(input_exp_s1[i]) >= $signed((MAX_INT_WIDTH-1) + unsigned_s1)) begin
@ -224,7 +217,7 @@ module VX_fp_cvt #(
of_before_round = 1'b1;
end else if ($signed(input_exp_s1[i]) < $signed(-1)) begin
// underflow
denorm_shamt = MAX_INT_WIDTH + 1; // all bits go to the sticky
denorm_shamt = MAX_INT_WIDTH+1; // all bits go to the sticky
end else begin
// By default right shift mantissa to be an integer
denorm_shamt = (MAX_INT_WIDTH-1) - input_exp_s1[i];
@ -245,7 +238,7 @@ module VX_fp_cvt #(
wire is_itof_s2;
wire unsigned_s2;
wire [2:0] rnd_mode_s2;
fp_type_t [LANES-1:0] in_a_type_s2;
fp_class_t [LANES-1:0] fp_clss_s2;
wire [LANES-1:0] mant_is_zero_s2;
wire [LANES-1:0] input_sign_s2;
wire [LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
@ -253,14 +246,14 @@ module VX_fp_cvt #(
wire [LANES-1:0] of_before_round_s2;
VX_pipe_register #(
.DATAW (1 + TAGW + 1 + 1 + `FRM_BITS + LANES * ($bits(fp_type_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
.DATAW (1 + TAGW + 1 + 1 + `INST_FRM_BITS + LANES * ($bits(fp_class_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
.RESETW (1)
) pipe_reg2 (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
.data_in ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fp_clss_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fp_clss_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
);
wire [LANES-1:0] rounded_sign;
@ -314,7 +307,7 @@ module VX_fp_cvt #(
wire [TAGW-1:0] tag_in_s3;
wire is_itof_s3;
wire unsigned_s3;
fp_type_t [LANES-1:0] in_a_type_s3;
fp_class_t [LANES-1:0] fp_clss_s3;
wire [LANES-1:0] mant_is_zero_s3;
wire [LANES-1:0] input_sign_s3;
wire [LANES-1:0] rounded_sign_s3;
@ -322,14 +315,14 @@ module VX_fp_cvt #(
wire [LANES-1:0] of_before_round_s3;
VX_pipe_register #(
.DATAW (1 + TAGW + 1 + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + 32 + 1 + 1)),
.DATAW (1 + TAGW + 1 + 1 + LANES * ($bits(fp_class_t) + 1 + 1 + 32 + 1 + 1)),
.RESETW (1)
) pipe_reg3 (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, rounded_abs, rounded_sign, of_before_round_s2}),
.data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, in_a_type_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3})
.data_in ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, fp_clss_s2, mant_is_zero_s2, input_sign_s2, rounded_abs, rounded_sign, of_before_round_s2}),
.data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, fp_clss_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3})
);
wire [LANES-1:0] of_after_round;
@ -362,14 +355,14 @@ module VX_fp_cvt #(
for (genvar i = 0; i < LANES; ++i) begin
// Detect special case from source format, I2F casts don't produce a special result
assign fp_result_is_special[i] = ~is_itof_s3 & (in_a_type_s3[i].is_zero | in_a_type_s3[i].is_nan);
assign fp_result_is_special[i] = ~is_itof_s3 & (fp_clss_s3[i].is_zero | fp_clss_s3[i].is_nan);
// Signalling input NaNs raise invalid flag, otherwise no flags set
assign fp_special_status[i] = in_a_type_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
assign fp_special_status[i] = fp_clss_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
// Assemble result according to destination format
assign fp_special_result[i] = in_a_type_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
assign fp_special_result[i] = fp_clss_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
end
// INT Special case handling
@ -381,7 +374,7 @@ module VX_fp_cvt #(
for (genvar i = 0; i < LANES; ++i) begin
// Assemble result according to destination format
always @(*) begin
if (input_sign_s3[i] && !in_a_type_s3[i].is_nan) begin
if (input_sign_s3[i] && !fp_clss_s3[i].is_nan) begin
int_special_result[i][30:0] = 0; // alone yields 2**(31)-1
int_special_result[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31
end else begin
@ -391,8 +384,8 @@ module VX_fp_cvt #(
end
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
assign int_result_is_special[i] = in_a_type_s3[i].is_nan
| in_a_type_s3[i].is_inf
assign int_result_is_special[i] = fp_clss_s3[i].is_nan
| fp_clss_s3[i].is_inf
| of_before_round_s3[i]
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero[i]);
@ -411,11 +404,11 @@ module VX_fp_cvt #(
wire [31:0] fp_result, int_result;
wire inexact = is_itof_s3 ? (| fp_round_sticky_bits[i]) // overflow is invalid in i2f;
: (| fp_round_sticky_bits[i]) | (~in_a_type_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i]));
: (| fp_round_sticky_bits[i]) | (~fp_clss_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i]));
assign fp_regular_status.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round[i]); // overflow is invalid for I2F casts
assign fp_regular_status.DZ = 1'b0; // no divisions
assign fp_regular_status.OF = ~is_itof_s3 & (~in_a_type_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i])); // inf casts no OF
assign fp_regular_status.OF = ~is_itof_s3 & (~fp_clss_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i])); // inf casts no OF
assign fp_regular_status.UF = uf_after_round[i] & inexact;
assign fp_regular_status.NX = inexact;
@ -435,7 +428,7 @@ module VX_fp_cvt #(
assign stall = ~ready_out && valid_out;
VX_pipe_register #(
.DATAW (1 + TAGW + (LANES * 32) + (LANES * `FFG_BITS)),
.DATAW (1 + TAGW + (LANES * 32) + (LANES * `FFLAGS_BITS)),
.RESETW (1)
) pipe_reg4 (
.clk (clk),

View file

@ -1,8 +1,4 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
`include "VX_fpu_define.vh"
module VX_fp_div #(
parameter TAGW = 1,
@ -16,7 +12,7 @@ module VX_fp_div #(
input wire [TAGW-1:0] tag_in,
input wire [`FRM_BITS-1:0] frm,
input wire [`INST_FRM_BITS-1:0] frm,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
@ -39,7 +35,7 @@ module VX_fp_div #(
fflags_t f;
always @(*) begin
dpi_fdiv (dataa[i], datab[i], frm, r, f);
dpi_fdiv (enable && valid_in, dataa[i], datab[i], frm, r, f);
end
`UNUSED_VAR (f)

View file

@ -1,8 +1,4 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
`include "VX_fpu_define.vh"
module VX_fp_fma #(
parameter TAGW = 1,
@ -16,7 +12,7 @@ module VX_fp_fma #(
input wire [TAGW-1:0] tag_in,
input wire [`FRM_BITS-1:0] frm,
input wire [`INST_FRM_BITS-1:0] frm,
input wire do_madd,
input wire do_sub,
@ -68,7 +64,7 @@ module VX_fp_fma #(
fflags_t f;
always @(*) begin
dpi_fmadd (a, b, c, frm, r, f);
dpi_fmadd (enable && valid_in, a, b, c, frm, r, f);
end
`UNUSED_VAR (f)

View file

@ -1,4 +1,4 @@
`include "VX_define.vh"
`include "VX_fpu_define.vh"
/// Modified port of noncomp module from fpnew Libray
/// reference: https://github.com/pulp-platform/fpnew
@ -15,8 +15,8 @@ module VX_fp_ncomp #(
input wire [TAGW-1:0] tag_in,
input wire [`FPU_BITS-1:0] op_type,
input wire [`FRM_BITS-1:0] frm,
input wire [`INST_FPU_BITS-1:0] op_type,
input wire [`INST_FRM_BITS-1:0] frm,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
@ -30,6 +30,9 @@ module VX_fp_ncomp #(
input wire ready_out,
output wire valid_out
);
localparam EXP_BITS = 8;
localparam MAN_BITS = 23;
localparam NEG_INF = 32'h00000001,
NEG_NORM = 32'h00000002,
NEG_SUBNORM = 32'h00000004,
@ -38,86 +41,92 @@ module VX_fp_ncomp #(
POS_SUBNORM = 32'h00000020,
POS_NORM = 32'h00000040,
POS_INF = 32'h00000080,
SIG_NAN = 32'h00000100,
//SIG_NAN = 32'h00000100,
QUT_NAN = 32'h00000200;
wire [LANES-1:0] tmp_a_sign, tmp_b_sign;
wire [LANES-1:0][7:0] tmp_a_exponent, tmp_b_exponent;
wire [LANES-1:0][22:0] tmp_a_mantissa, tmp_b_mantissa;
fp_type_t [LANES-1:0] tmp_a_type, tmp_b_type;
wire [LANES-1:0] tmp_a_smaller, tmp_ab_equal;
wire [LANES-1:0] a_sign, b_sign;
wire [LANES-1:0][7:0] a_exponent, b_exponent;
wire [LANES-1:0][22:0] a_mantissa, b_mantissa;
fp_class_t [LANES-1:0] a_clss, b_clss;
wire [LANES-1:0] a_smaller, ab_equal;
// Setup
for (genvar i = 0; i < LANES; i++) begin
assign tmp_a_sign[i] = dataa[i][31];
assign tmp_a_exponent[i] = dataa[i][30:23];
assign tmp_a_mantissa[i] = dataa[i][22:0];
assign a_sign[i] = dataa[i][31];
assign a_exponent[i] = dataa[i][30:23];
assign a_mantissa[i] = dataa[i][22:0];
assign tmp_b_sign[i] = datab[i][31];
assign tmp_b_exponent[i] = datab[i][30:23];
assign tmp_b_mantissa[i] = datab[i][22:0];
assign b_sign[i] = datab[i][31];
assign b_exponent[i] = datab[i][30:23];
assign b_mantissa[i] = datab[i][22:0];
VX_fp_type fp_type_a (
.exp_i (tmp_a_exponent[i]),
.man_i (tmp_a_mantissa[i]),
.type_o (tmp_a_type[i])
VX_fp_class #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class_a (
.exp_i (a_exponent[i]),
.man_i (a_mantissa[i]),
.clss_o (a_clss[i])
);
VX_fp_type fp_type_b (
.exp_i (tmp_b_exponent[i]),
.man_i (tmp_b_mantissa[i]),
.type_o (tmp_b_type[i])
VX_fp_class #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class_b (
.exp_i (b_exponent[i]),
.man_i (b_mantissa[i]),
.clss_o (b_clss[i])
);
assign tmp_a_smaller[i] = $signed(dataa[i]) < $signed(datab[i]);
assign tmp_ab_equal[i] = (dataa[i] == datab[i]) | (tmp_a_type[i].is_zero & tmp_b_type[i].is_zero);
assign a_smaller[i] = $signed(dataa[i]) < $signed(datab[i]);
assign ab_equal[i] = (dataa[i] == datab[i]) | (a_clss[i].is_zero & b_clss[i].is_zero);
end
// Pipeline stage0
wire valid_in_s0;
wire [TAGW-1:0] tag_in_s0;
wire [`FPU_BITS-1:0] op_type_s0;
wire [`FRM_BITS-1:0] frm_s0;
wire [`INST_FPU_BITS-1:0] op_type_s0;
wire [`INST_FRM_BITS-1:0] frm_s0;
wire [LANES-1:0][31:0] dataa_s0, datab_s0;
wire [LANES-1:0] a_sign_s0, b_sign_s0;
wire [LANES-1:0][7:0] a_exponent_s0;
wire [LANES-1:0][22:0] a_mantissa_s0;
fp_type_t [LANES-1:0] a_type_s0, b_type_s0;
fp_class_t [LANES-1:0] a_clss_s0, b_clss_s0;
wire [LANES-1:0] a_smaller_s0, ab_equal_s0;
wire stall;
VX_pipe_register #(
.DATAW (1 + TAGW + `FPU_BITS + `FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_type_t) + 1 + 1)),
.DATAW (1 + TAGW + `INST_FPU_BITS + `INST_FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_class_t) + 1 + 1)),
.RESETW (1),
.DEPTH (0)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (!stall),
.data_in ({valid_in, tag_in, op_type, frm, dataa, datab, tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_a_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}),
.data_out ({valid_in_s0, tag_in_s0, op_type_s0, frm_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_type_s0, b_type_s0, a_smaller_s0, ab_equal_s0})
.data_in ({valid_in, tag_in, op_type, frm, dataa, datab, a_sign, b_sign, a_exponent, a_mantissa, a_clss, b_clss, a_smaller, ab_equal}),
.data_out ({valid_in_s0, tag_in_s0, op_type_s0, frm_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_clss_s0, b_clss_s0, a_smaller_s0, ab_equal_s0})
);
// FCLASS
reg [LANES-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
if (a_type_s0[i].is_normal) begin
if (a_clss_s0[i].is_normal) begin
fclass_mask[i] = a_sign_s0[i] ? NEG_NORM : POS_NORM;
end
else if (a_type_s0[i].is_inf) begin
else if (a_clss_s0[i].is_inf) begin
fclass_mask[i] = a_sign_s0[i] ? NEG_INF : POS_INF;
end
else if (a_type_s0[i].is_zero) begin
else if (a_clss_s0[i].is_zero) begin
fclass_mask[i] = a_sign_s0[i] ? NEG_ZERO : POS_ZERO;
end
else if (a_type_s0[i].is_subnormal) begin
else if (a_clss_s0[i].is_subnormal) begin
fclass_mask[i] = a_sign_s0[i] ? NEG_SUBNORM : POS_SUBNORM;
end
else if (a_type_s0[i].is_nan) begin
fclass_mask[i] = {22'h0, a_type_s0[i].is_quiet, a_type_s0[i].is_signaling, 8'h0};
else if (a_clss_s0[i].is_nan) begin
fclass_mask[i] = {22'h0, a_clss_s0[i].is_quiet, a_clss_s0[i].is_signaling, 8'h0};
end
else begin
fclass_mask[i] = QUT_NAN;
@ -129,11 +138,11 @@ module VX_fp_ncomp #(
reg [LANES-1:0][31:0] fminmax_res; // result of fmin/fmax
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
if (a_type_s0[i].is_nan && b_type_s0[i].is_nan)
if (a_clss_s0[i].is_nan && b_clss_s0[i].is_nan)
fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
else if (a_type_s0[i].is_nan)
else if (a_clss_s0[i].is_nan)
fminmax_res[i] = datab_s0[i];
else if (b_type_s0[i].is_nan)
else if (b_clss_s0[i].is_nan)
fminmax_res[i] = dataa_s0[i];
else begin
case (frm_s0) // use LSB to distinguish MIN and MAX
@ -160,33 +169,33 @@ module VX_fp_ncomp #(
// Comparison
reg [LANES-1:0][31:0] fcmp_res; // result of comparison
fflags_t [LANES-1:0] fcmp_fflags; // comparison fflags
fflags_t [LANES-1:0] fcmp_fflags; // comparison fflags
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
case (frm_s0)
`FRM_RNE: begin // LE
`INST_FRM_RNE: begin // LE
fcmp_fflags[i] = 5'h0;
if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
if (a_clss_s0[i].is_nan || b_clss_s0[i].is_nan) begin
fcmp_res[i] = 32'h0;
fcmp_fflags[i].NV = 1'b1;
end else begin
fcmp_res[i] = {31'h0, (a_smaller_s0[i] | ab_equal_s0[i])};
end
end
`FRM_RTZ: begin // LS
`INST_FRM_RTZ: begin // LS
fcmp_fflags[i] = 5'h0;
if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
if (a_clss_s0[i].is_nan || b_clss_s0[i].is_nan) begin
fcmp_res[i] = 32'h0;
fcmp_fflags[i].NV = 1'b1;
end else begin
fcmp_res[i] = {31'h0, (a_smaller_s0[i] & ~ab_equal_s0[i])};
end
end
`FRM_RDN: begin // EQ
`INST_FRM_RDN: begin // EQ
fcmp_fflags[i] = 5'h0;
if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
if (a_clss_s0[i].is_nan || b_clss_s0[i].is_nan) begin
fcmp_res[i] = 32'h0;
fcmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling;
fcmp_fflags[i].NV = a_clss_s0[i].is_signaling | b_clss_s0[i].is_signaling;
end else begin
fcmp_res[i] = {31'h0, ab_equal_s0[i]};
end
@ -207,11 +216,11 @@ module VX_fp_ncomp #(
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
case (op_type_s0)
`FPU_CLASS: begin
`INST_FPU_CLASS: begin
tmp_result[i] = fclass_mask[i];
tmp_fflags[i] = 'x;
end
`FPU_CMP: begin
`INST_FPU_CMP: begin
tmp_result[i] = fcmp_res[i];
tmp_fflags[i] = fcmp_fflags[i];
end
@ -225,11 +234,11 @@ module VX_fp_ncomp #(
3,4: begin
tmp_result[i] = fminmax_res[i];
tmp_fflags[i] = 0;
tmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling;
tmp_fflags[i].NV = a_clss_s0[i].is_signaling | b_clss_s0[i].is_signaling;
end
//5,6,7: MOVE
default: begin
tmp_result[i] = dataa[i];
tmp_result[i] = dataa_s0[i];
tmp_fflags[i] = 'x;
end
endcase
@ -238,15 +247,15 @@ module VX_fp_ncomp #(
end
end
wire has_fflags_s0 = ((op_type_s0 == `FPU_MISC)
&& (frm_s0 == 3 // MIN
|| frm_s0 == 4)) // MAX
|| (op_type_s0 == `FPU_CMP); // CMP
wire has_fflags_s0 = ((op_type_s0 == `INST_FPU_MISC)
&& (frm_s0 == 3 // MIN
|| frm_s0 == 4)) // MAX
|| (op_type_s0 == `INST_FPU_CMP); // CMP
assign stall = ~ready_out && valid_out;
VX_pipe_register #(
.DATAW (1 + TAGW + (LANES * 32) + 1 + (LANES * `FFG_BITS)),
.DATAW (1 + TAGW + (LANES * 32) + 1 + (LANES * `FFLAGS_BITS)),
.RESETW (1)
) pipe_reg1 (
.clk (clk),

View file

@ -1,5 +1,4 @@
`include "VX_define.vh"
`include "VX_fpu_define.vh"
/// Modified port of rouding module from fpnew Libray
/// reference: https://github.com/pulp-platform/fpnew
@ -34,7 +33,7 @@ module VX_fp_rounding #(
always @(*) begin
case (rnd_mode_i)
`FRM_RNE: // Decide accoring to round/sticky bits
`INST_FRM_RNE: // Decide accoring to round/sticky bits
case (round_sticky_bits_i)
2'b00,
2'b01: round_up = 1'b0; // < ulp/2 away, round down
@ -42,10 +41,10 @@ module VX_fp_rounding #(
2'b11: round_up = 1'b1; // > ulp/2 away, round up
default: round_up = 1'bx;
endcase
`FRM_RTZ: round_up = 1'b0; // always round down
`FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -
`FRM_RUP: round_up = (| round_sticky_bits_i) & ~sign_i; // to 0 if -, away if +
`FRM_RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up
`INST_FRM_RTZ: round_up = 1'b0; // always round down
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -
`INST_FRM_RUP: round_up = (| round_sticky_bits_i) & ~sign_i; // to 0 if -, away if +
`INST_FRM_RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up
default: round_up = 1'bx; // propagate x
endcase
end
@ -58,7 +57,7 @@ module VX_fp_rounding #(
// In case of effective subtraction (thus signs of addition operands must have differed) and a
// true zero result, the result sign is '-' in case of RDN and '+' for other modes.
assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == `FRM_RDN)
assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == `INST_FRM_RDN)
: sign_i;
endmodule

View file

@ -1,8 +1,4 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
`include "VX_fpu_define.vh"
module VX_fp_sqrt #(
parameter TAGW = 1,
@ -16,7 +12,7 @@ module VX_fp_sqrt #(
input wire [TAGW-1:0] tag_in,
input wire [`FRM_BITS-1:0] frm,
input wire [`INST_FRM_BITS-1:0] frm,
input wire [LANES-1:0][31:0] dataa,
output wire [LANES-1:0][31:0] result,
@ -38,7 +34,7 @@ module VX_fp_sqrt #(
fflags_t f;
always @(*) begin
dpi_fsqrt (dataa[i], frm, r, f);
dpi_fsqrt (enable && valid_in, dataa[i], frm, r, f);
end
`UNUSED_VAR (f)

View file

@ -1,27 +0,0 @@
`include "VX_define.vh"
module VX_fp_type (
// inputs
input [7:0] exp_i,
input [22:0] man_i,
// outputs
output fp_type_t type_o
);
wire is_normal = (exp_i != 8'd0) && (exp_i != 8'hff);
wire is_zero = (exp_i == 8'd0) && (man_i == 23'd0);
wire is_subnormal = (exp_i == 8'd0) && (man_i != 23'd0);
wire is_inf = (exp_i == 8'hff) && (man_i == 23'd0);
wire is_nan = (exp_i == 8'hff) && (man_i != 23'd0);
wire is_signaling = is_nan && (man_i[22] == 1'b0);
wire is_quiet = is_nan && !is_signaling;
assign type_o.is_normal = is_normal;
assign type_o.is_zero = is_zero;
assign type_o.is_subnormal = is_subnormal;
assign type_o.is_inf = is_inf;
assign type_o.is_nan = is_nan;
assign type_o.is_quiet = is_quiet;
assign type_o.is_signaling = is_signaling;
endmodule

Some files were not shown because too many files have changed in this diff Show more