This commit is contained in:
Santosh Raghav Srivatsan 2021-11-08 16:35:32 -05:00
commit 8a550b625c
147 changed files with 6615 additions and 547 deletions

View file

@ -30,25 +30,28 @@ jobs:
include:
- stage: test
name: coverage
script: cp -r $PWD ../build1 && cd ../build1 && ./ci/travis_run.py ./ci/regression.sh -coverage
script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage
- stage: test
name: tex
script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex
- stage: test
name: cluster
script: cp -r $PWD ../build2 && cd ../build2 && ./ci/travis_run.py ./ci/regression.sh -cluster
script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster
- stage: test
name: debug
script: cp -r $PWD ../build3 && cd ../build3 && ./ci/travis_run.py ./ci/regression.sh -debug
script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug
- stage: test
name: config
script: cp -r $PWD ../build4 && cd ../build4 && ./ci/travis_run.py ./ci/regression.sh -config
script: cp -r $PWD ../build_config && cd ../build_config && ./ci/travis_run.py ./ci/regression.sh -config
- stage: test
name: stress0
script: cp -r $PWD ../build5 && cd ../build5 && ./ci/travis_run.py ./ci/regression.sh -stress0
script: cp -r $PWD ../build_stress0 && cd ../build_stress0 && ./ci/travis_run.py ./ci/regression.sh -stress0
- stage: test
name: stress1
script: cp -r $PWD ../build6 && cd ../build6 && ./ci/travis_run.py ./ci/regression.sh -stress1
script: cp -r $PWD ../build_stress1 && cd ../build_stress1 && ./ci/travis_run.py ./ci/regression.sh -stress1
- stage: test
name: compiler
script: cp -r $PWD ../build7 && cd ../build7 && ./ci/travis_run.py ./ci/test_compiler.sh
script: cp -r $PWD ../build_compiler && cd ../build_compiler && ./ci/travis_run.py ./ci/test_compiler.sh
after_success:
# Gather code coverage

View file

@ -21,59 +21,32 @@ Vortex is a full-system RISCV-based GPGPU processor.
## Directory structure
- `doc`: [Documentation](doc/Vortex.md).
- `hw`: Hardware sources.
- `driver`: Host drivers repository.
- `runtime`: Kernel Runtime software.
- `sim`: Simulators repository.
- `tests`: Tests repository.
- `ci`: Continuous integration scripts.
- `miscs`: Miscellaneous resources.
## Basic Installation
## Build Instructions
### Supported OS Platforms
- Ubuntu 18.04
- Centos 7
### Toolchain Dependencies
- [POCL](http://portablecl.org/)
- [LLVM](https://llvm.org/)
- [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain)
- [Verilator](https://www.veripool.org/verilator)
### Install development tools
$ sudo apt-get install build-essential
$ sudo apt-get install git
### Install gnu-riscv-tools
$ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
$ sudo apt-get -y install \
binutils build-essential libtool texinfo \
gzip zip unzip patchutils curl git \
make cmake ninja-build automake bison flex gperf \
grep sed gawk python bc \
zlib1g-dev libexpat1-dev libmpc-dev \
libglib2.0-dev libfdt-dev libpixman-1-dev
$ git clone https://github.com/riscv/riscv-gnu-toolchain
$ cd riscv-gnu-toolchain
$ git submodule update --init --recursive
$ mkdir build
$ cd build
$ ../configure --prefix=$RISCV_TOOLCHAIN_PATH --with-arch=rv32im --with-abi=ilp32
$ make -j`nproc`
$ make -j`nproc` build-qemu
### Install Verilator
You need to build the latest version using the instructions on their website
$ https://www.veripool.org/projects/verilator/wiki/Installing
### Install Vortex
### Install Vortex codebase
$ git clone --recursive https://github.com/vortexgpgpu/vortex.git
$ cd Vortex
$ make
### Quick Test running OpenCL vecadd sample on 2 cores
$ ./ci/blackbox.sh --cores=2 --app=vecadd
### Install prebuilt toolchain
$ ./ci/toolchain_install.sh -all
### Build Vortex sources
$ make -s
### Quick demo running vecadd OpenCL kernel on 2 cores
$ ./ci/blackbox.sh --driver=rtlsim --cores=2 --app=vecadd

View file

@ -12,7 +12,7 @@ VORTEX_HOME=$SCRIPT_DIR/..
DRIVER=vlsim
APP=sgemm
CLUSTERS=1
CORES=2
CORES=1
WARPS=4
THREADS=4
L2=0
@ -132,9 +132,9 @@ if [ $DEBUG -eq 1 ]
then
if [ $SCOPE -eq 1 ]
then
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH
else
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH
fi
if [ $HAS_ARGS -eq 1 ]
@ -153,9 +153,9 @@ then
else
if [ $SCOPE -eq 1 ]
then
SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH
else
CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
CONFIGS="$CONFIGS" make -C $DRIVER_PATH
fi
if [ $HAS_ARGS -eq 1 ]

View file

@ -22,6 +22,17 @@ make -C tests/opencl run-simx
echo "coverage tests done!"
}
# Runs the texture regression tests.
# Each invocation sets CONFIGS="-DEXT_TEX_ENABLE=1" and launches the `tex`
# application through ./ci/blackbox.sh: once on the vlsim driver and twice on
# the rtlsim driver, each with its own input image, output image and -g value.
tex()
{
    echo "begin texture tests..."

    CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0"
    CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1"
    CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-irainbow.png -orainbow_result.png -g1"

    # Status line follows the "<suite> tests done!" wording used by the other suites.
    echo "texture tests done!"
}
cluster()
{
echo "begin clustering tests..."
@ -134,13 +145,15 @@ echo "stress1 tests done!"
usage()
{
echo "usage: regression [-coverage] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]"
echo "usage: regression [-coverage] [-tex] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]"
}
while [ "$1" != "" ]; do
case $1 in
-coverage ) coverage
;;
-tex ) tex
;;
-cluster ) cluster
;;
-debug ) debug
@ -155,6 +168,7 @@ while [ "$1" != "" ]; do
stress1
;;
-all ) coverage
tex
cluster
debug
config

View file

Before

Width:  |  Height:  |  Size: 60 KiB

After

Width:  |  Height:  |  Size: 60 KiB

Before After
Before After

View file

Before

Width:  |  Height:  |  Size: 77 KiB

After

Width:  |  Height:  |  Size: 77 KiB

Before After
Before After

View file

Before

Width:  |  Height:  |  Size: 67 KiB

After

Width:  |  Height:  |  Size: 67 KiB

Before After
Before After

View file

Before

Width:  |  Height:  |  Size: 517 KiB

After

Width:  |  Height:  |  Size: 517 KiB

Before After
Before After

View file

@ -8,7 +8,7 @@ The Vortex Cache Sub-system has the following main properties:
### Cache Hierarchy
![Image of Cache Hierarchy](./images/cache_hierarchy.png)
![Image of Cache Hierarchy](./assets/img/cache_hierarchy.png)
- Cache can be configured to be any level in the hierarchy
- Caches communicate via snooping
@ -18,7 +18,7 @@ The Vortex Cache Sub-system has the following main properties:
VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` directory.
![Image of Vortex Cache](./images/vortex_cache_top_module.png)
![Image of Vortex Cache](./assets/img/vortex_cache_top_module.png)
- Configurable (Cache size, number of banks, bank line size, etc.)
- I/O signals
@ -44,7 +44,7 @@ VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/c
VX_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
![Image of Vortex Cache Bank](./images/vortex_bank.png)
![Image of Vortex Cache Bank](./assets/img/vortex_bank.png)
- Allows for high throughput
- Each bank contains queues to hold requests to the cache

View file

@ -6,7 +6,7 @@
- [riscv-toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain)
- [llvm-riscv](https://github.com/llvm-mirror/llvm)
For installation, please see [Basic Installation](https://github.com/vortexgpgpu/vortex#basic-installation) for more details.
For installation, please see [Build Instructions](../README.md) for more details.
**For Ubuntu 18.04 users, you can directly download pre-built toolchains with the [toolchain_install.sh](https://github.com/vortexgpgpu/vortex/blob/master/ci/toolchain_install.sh) script.**
```bash

View file

@ -13,17 +13,6 @@ OPAE Environment Setup
$ export PATH=:/opt/verilator/bin:$PATH
$ export VERILATOR_ROOT=/opt/verilator
OPAE Build Configuration
------------------------
Within the `/hw/syn/opae` directory, there are source text files for each core-option for the fpga build (the 32 and 64 core options are not currently implemented) which have the following parameters that can be configured:
- NUM_CORES: the number of cores per cluster
- NUM_CLUSTERS: the number of clusters allotted to the processor
- L3_ENABLE: enable the use of the L3 cache
- PERF_ENABLE: enable the use of all profile counters
To enable L3 cache and profile counters for a build, simply uncomment the definition within the respective source file.
OPAE Build
------------------
@ -33,41 +22,58 @@ The FPGA has to following configuration options:
- 4 cores fpga (fpga-4c)
- 8 cores fpga (fpga-8c)
- 16 cores fpga (fpga-16c)
- 32 cores fpga (fpga-32c)
- 64 cores fpga (fpga-64c)
Command line:
$ cd hw/syn/opae
$ make fpga- *# of cores* c
$ make fpga-<num-of-cores>c
Example: `make fpga-4c`
A new folder (ex: `build_fpga_4c`) will be created and the build will start and take ~30-45 min to complete.
A new folder (ex: `build_fpga_4c`) will be created and the build will start and take ~30-480 min to complete.
OPAE Build Configuration
------------------------
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
- `NUM_WARPS`: Number of warps per core
- `NUM_THREADS`: Number of threads per warp
- `PERF_ENABLE`: enable the use of all profile counters
You can configure the synthesis build from the command line:
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make fpga-4c
OPAE Build Progress
-------------------
You could check the last 10 lines in the build log for possible errors until build completion.
$ tail -n 10 ./build_fpga_4c/build.log
$ tail -n 10 ./build_fpga_<num-of-cores>c/build.log
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
$ ps -u *username*
$ ps -u <username>
If the build fails and you need to restart it, clean up the build folder using the following command:
$ make clean-fpga- *# of cores* c
$ make clean-fpga-<num-of-cores>c
Example: `make clean-fpga-4c`
The file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa ./build_fpga_ *# of cores* c/vortex_afu.gbs
$ ls -lsa ./build_fpga_<num-of-cores>c/vortex_afu.gbs
Signing the bitstream and Programming the FPGA
----------------------------------------------
$ cd ./build_fpga_`# of cores`c/
$ cd ./build_fpga_<num-of-cores>c
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs

View file

@ -14,17 +14,17 @@
## Installation
- Refer to the install instructions in [README](../README.md).
- Refer to the build instructions in [README](../README.md).
## Quick Start Scenarios
Running Vortex simulators with different configurations:
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
$ ./ci/blackbox.sh --clusters=2 --cores=2 --warps=2 --threads=4 --driver=rtlsim --app=basic
$ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
- Run demo driver test with vlsim driver and Vortex config of 1 cluster, 4 cores, 4 warps, 2 threads
$ ./ci/blackbox.sh --clusters=1 --cores=4 --warps=4 --threads=2 --driver=vlsim --app=demo
$ ./ci/blackbox.sh --driver=vlsim --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
- Run dogfood driver test with simx driver and Vortex config of 4 clusters, 4 cores, 8 warps, 6 threads
$ ./ci/blackbox.sh --clusters=4 --cores=4 --warps=8 --threads=6 --driver=simx --app=dogfood
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood

View file

@ -32,7 +32,7 @@ Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with
### Vortex Pipeline/Datapath
![Image of Vortex Microarchitecture](./images/vortex_microarchitecture_v2.png)
![Image of Vortex Microarchitecture](./assets/img/vortex_microarchitecture_v2.png)
Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB.

View file

@ -63,12 +63,5 @@ scope: scope-defs.h
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) -o $(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean:
rm -rf $(PROJECT) *.o .depend scope-defs.h
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
rm -rf $(PROJECT) *.o scope-defs.h

View file

@ -65,12 +65,5 @@ scope: scope-defs.h
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) -o $(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean:
rm -rf $(PROJECT) *.o .depend scope-defs.h
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
rm -rf $(PROJECT) *.o scope-defs.h

View file

@ -35,4 +35,4 @@ $(PROJECT): $(SRCS)
clean:
$(MAKE) -C $(RTLSIM_DIR) clean-static
rm -rf $(PROJECT) *.o .depend
rm -rf $(PROJECT) *.o

View file

@ -21,9 +21,6 @@ $(PROJECT): $(SRCS)
$(MAKE) -C $(SIMX_DIR) static
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
$(MAKE) -C $(SIMX_DIR) clean-static
rm -rf $(PROJECT) *.o .depend
rm -rf $(PROJECT) *.o

View file

@ -50,13 +50,6 @@ $(PROJECT): $(SRCS) $(SCOPE_H)
$(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static
$(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean:
$(MAKE) -C $(VLSIM_DIR) clean-static
rm -rf $(PROJECT) *.o .depend scope-defs.h
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
rm -rf $(PROJECT) *.o scope-defs.h

View file

@ -217,7 +217,7 @@ module VX_alu_unit #(
// can accept new request?
assign alu_req_if.ready = ready_in;
`ifdef DBG_PRINT_PIPELINE
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (branch_ctl_if.valid) begin
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n",

159
hw/rtl/VX_cache_arb.sv Normal file
View file

@ -0,0 +1,159 @@
`include "VX_define.vh"
// VX_cache_arb: merges NUM_REQS cache request ports into a single request
// port and routes the shared response port back to one of NUM_REQS outputs.
//
// Request path (NUM_REQS > 1): for every lane of every input port the tag is
// passed through VX_bits_insert (S = LOG_NUM_REQS, POS = TAG_SEL_IDX) with
// the port index as sel_in, producing a TAG_OUT_WIDTH-bit tag. The packed
// {tag, addr, rw, byteen, data} words are then fed to VX_stream_arbiter.
//
// Response path (NUM_REQS > 1): the destination port is read from
// rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS], VX_bits_remove produces the
// TAG_IN_WIDTH-bit tag, and VX_stream_demux steers the packed
// {tmask, tag, data} word to the selected output port.
//
// When NUM_REQS == 1 every signal is wired straight through.
//
// Parameters:
//   NUM_REQS     - number of input request ports
//   LANES        - number of lanes per port
//   DATA_SIZE    - per-lane data size in bytes (DATA_WIDTH = 8 * DATA_SIZE)
//   TAG_IN_WIDTH - tag width on the input-port side
//   TAG_SEL_IDX  - bit position of the port index inside the output tag
//   BUFFERED_REQ - forwarded to VX_stream_arbiter's BUFFERED parameter
//   BUFFERED_RSP - forwarded to VX_stream_demux's BUFFERED parameter
//   TYPE         - forwarded to VX_stream_arbiter's TYPE parameter
//                  (NOTE(review): presumably selects the arbitration policy - confirm)
module VX_cache_arb #(
parameter NUM_REQS = 1,
parameter LANES = 1,
parameter DATA_SIZE = 1,
parameter TAG_IN_WIDTH = 1,
parameter TAG_SEL_IDX = 0,
parameter BUFFERED_REQ = 0,
parameter BUFFERED_RSP = 0,
parameter TYPE = "R",
// NOTE(review): presumably a DATA_SIZE-aligned address within a 32-bit byte address space - confirm
localparam ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)),
localparam DATA_WIDTH = (8 * DATA_SIZE),
// number of tag bits added on the output side to carry the port index
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS),
localparam TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
) (
input wire clk,
input wire reset,
// input requests (one set of per-lane signals for each of the NUM_REQS ports)
input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in,
input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in,
input wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0] req_byteen_in,
input wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0] req_addr_in,
input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] req_data_in,
input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in,
output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in,
// output request (single merged port; tag is TAG_OUT_WIDTH bits wide)
output wire [LANES-1:0] req_valid_out,
output wire [LANES-1:0] req_rw_out,
output wire [LANES-1:0][DATA_SIZE-1:0] req_byteen_out,
output wire [LANES-1:0][ADDR_WIDTH-1:0] req_addr_out,
output wire [LANES-1:0][DATA_WIDTH-1:0] req_data_out,
output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out,
input wire [LANES-1:0] req_ready_out,
// input response (single port; one valid and one tag shared by all lanes)
input wire rsp_valid_in,
input wire [LANES-1:0] rsp_tmask_in,
input wire [LANES-1:0][DATA_WIDTH-1:0] rsp_data_in,
input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in,
output wire rsp_ready_in,
// output responses (one per port; tag is back to TAG_IN_WIDTH bits)
output wire [NUM_REQS-1:0] rsp_valid_out,
output wire [NUM_REQS-1:0][LANES-1:0] rsp_tmask_out,
output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] rsp_data_out,
output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out,
input wire [NUM_REQS-1:0] rsp_ready_out
);
// per-lane packed request word: {tag, addr, rw, byteen, data}
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
// packed response word: {tmask[LANES], tag, data[LANES]}
localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH;
if (NUM_REQS > 1) begin
wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_data_in_merged;
wire [LANES-1:0][REQ_DATAW-1:0] req_data_out_merged;
// widen each lane's tag with the port index i and pack the request fields
for (genvar i = 0; i < NUM_REQS; i++) begin
for (genvar j = 0; j < LANES; ++j) begin
wire [TAG_OUT_WIDTH-1:0] req_tag_in_w;
VX_bits_insert #(
.N (TAG_IN_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_insert (
.data_in (req_tag_in[i][j]),
.sel_in (LOG_NUM_REQS'(i)),
.data_out (req_tag_in_w)
);
assign req_data_in_merged[i][j] = {req_tag_in_w, req_addr_in[i][j], req_rw_in[i][j], req_byteen_in[i][j], req_data_in[i][j]};
end
end
// arbitrate between the NUM_REQS packed request streams
VX_stream_arbiter #(
.NUM_REQS (NUM_REQS),
.LANES (LANES),
.DATAW (REQ_DATAW),
.BUFFERED (BUFFERED_REQ),
.TYPE (TYPE)
) req_arb (
.clk (clk),
.reset (reset),
.valid_in (req_valid_in),
.data_in (req_data_in_merged),
.ready_in (req_ready_in),
.valid_out (req_valid_out),
.data_out (req_data_out_merged),
.ready_out (req_ready_out)
);
// unpack the selected request word in the same field order it was packed
for (genvar i = 0; i < LANES; ++i) begin
assign {req_tag_out[i], req_addr_out[i], req_rw_out[i], req_byteen_out[i], req_data_out[i]} = req_data_out_merged[i];
end
///////////////////////////////////////////////////////////////////////
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out_merged;
// destination port index, read from the tag field located at TAG_SEL_IDX
wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS];
wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w;
// narrow the response tag from TAG_OUT_WIDTH back to TAG_IN_WIDTH bits
VX_bits_remove #(
.N (TAG_OUT_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_remove (
.data_in (rsp_tag_in),
.data_out (rsp_tag_in_w)
);
// the whole response (all lanes) travels as one RSP_DATAW-bit word, hence LANES = 1
VX_stream_demux #(
.NUM_REQS (NUM_REQS),
.LANES (1),
.DATAW (RSP_DATAW),
.BUFFERED (BUFFERED_RSP)
) rsp_demux (
.clk (clk),
.reset (reset),
.sel_in (rsp_sel),
.valid_in (rsp_valid_in),
.data_in ({rsp_tmask_in, rsp_tag_in_w, rsp_data_in}),
.ready_in (rsp_ready_in),
.valid_out (rsp_valid_out),
.data_out (rsp_data_out_merged),
.ready_out (rsp_ready_out)
);
// unpack each port's response word in the same field order it was packed
for (genvar i = 0; i < NUM_REQS; i++) begin
assign {rsp_tmask_out[i], rsp_tag_out[i], rsp_data_out[i]} = rsp_data_out_merged[i];
end
end else begin
// single requester: pass-through wiring, clk and reset are not referenced
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign req_valid_out = req_valid_in;
assign req_tag_out = req_tag_in;
assign req_addr_out = req_addr_in;
assign req_rw_out = req_rw_in;
assign req_byteen_out = req_byteen_in;
assign req_data_out = req_data_in;
assign req_ready_in = req_ready_out;
assign rsp_valid_out = rsp_valid_in;
assign rsp_tmask_out = rsp_tmask_in;
assign rsp_tag_out = rsp_tag_in;
assign rsp_data_out = rsp_data_in;
assign rsp_ready_in = rsp_ready_out;
end
endmodule

View file

@ -78,14 +78,14 @@ module VX_commit #(
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if),
`endif
.gpu_commit_if (gpu_commit_if),
.writeback_if (writeback_if)
);
// store and gpu commits don't writeback
assign st_commit_if.ready = 1'b1;
assign gpu_commit_if.ready = 1'b1;
`ifdef DBG_PRINT_PIPELINE
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (alu_commit_if.valid && alu_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd);

View file

@ -230,6 +230,21 @@
`define CSR_NW 12'hFC1
`define CSR_NC 12'hFC2
////////// Texture Units //////////////////////////////////////////////////////
`define NUM_TEX_UNITS 2
`define CSR_TEX_STATES 7
`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES)
`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00)
`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01)
`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 12'h02)
`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03)
`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04)
`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05)
`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06)
// Pipeline Queues ////////////////////////////////////////////////////////////
// Size of Instruction Buffer

View file

@ -17,6 +17,9 @@ module VX_csr_data #(
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if,
`endif
`ifdef EXT_TEX_ENABLE
VX_tex_csr_if.master tex_csr_if,
`endif
input wire read_enable,
input wire[`CSR_ADDR_BITS-1:0] read_addr,
@ -26,7 +29,7 @@ module VX_csr_data #(
input wire write_enable,
input wire[`CSR_ADDR_BITS-1:0] write_addr,
input wire[`NW_BITS-1:0] write_wid,
input wire[`CSR_WIDTH-1:0] write_data,
input wire[31:0] write_data,
input wire busy
);
@ -46,13 +49,13 @@ module VX_csr_data #(
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr;
always @(posedge clk) begin
always @(posedge clk) begin
`ifdef EXT_F_ENABLE
if (reset) begin
fcsr <= '0;
end
end
if (fpu_to_csr_if.write_enable) begin
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
| fpu_to_csr_if.write_fflags;
end
`endif
@ -61,27 +64,33 @@ module VX_csr_data #(
`CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0];
`CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0];
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0];
`CSR_SATP: csr_satp <= write_data;
`CSR_MSTATUS: csr_mstatus <= write_data;
`CSR_MEDELEG: csr_medeleg <= write_data;
`CSR_MIDELEG: csr_mideleg <= write_data;
`CSR_MIE: csr_mie <= write_data;
`CSR_MTVEC: csr_mtvec <= write_data;
`CSR_MEPC: csr_mepc <= write_data;
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data;
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data;
default: begin
`ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, write_addr));
`CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0];
`CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0];
`CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0];
`CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0];
`CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0];
`CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0];
`CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0];
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
default: begin
`ASSERT(write_addr >= `CSR_TEX_BEGIN(0)
&& write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES),
("%t: invalid CSR write address: %0h", $time, write_addr));
end
endcase
endcase
end
end
`UNUSED_VAR (write_data)
// TEX CSRs
`ifdef EXT_TEX_ENABLE
assign tex_csr_if.write_enable = write_enable;
assign tex_csr_if.write_addr = write_addr;
assign tex_csr_if.write_data = write_data;
`endif
always @(posedge clk) begin
if (reset) begin
csr_cycle <= 0;
@ -209,7 +218,8 @@ module VX_csr_data #(
default: begin
if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32))
| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)))) begin
|| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)
|| (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin
read_addr_valid_r = 0;
end
end

View file

@ -20,6 +20,9 @@ module VX_csr_unit #(
VX_fpu_to_csr_if.slave fpu_to_csr_if,
input wire[`NUM_WARPS-1:0] fpu_pending,
`endif
`ifdef EXT_TEX_ENABLE
VX_tex_csr_if.master tex_csr_if,
`endif
output wire[`NUM_WARPS-1:0] pending,
input wire busy
@ -46,6 +49,9 @@ module VX_csr_unit #(
.fetch_to_csr_if(fetch_to_csr_if),
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
`endif
`ifdef EXT_TEX_ENABLE
.tex_csr_if (tex_csr_if),
`endif
.read_enable (csr_req_if.valid),
.read_addr (csr_req_if.addr),
@ -54,7 +60,7 @@ module VX_csr_unit #(
.write_enable (write_enable),
.write_addr (csr_addr_s1),
.write_wid (csr_commit_if.wid),
.write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]),
.write_data (csr_updated_data_s1),
.busy (busy)
);

View file

@ -1,6 +1,6 @@
`include "VX_define.vh"
`ifdef DBG_PRINT_PIPELINE
`include "VX_print_instr.vh"
`ifdef DBG_TRACE_PIPELINE
`include "VX_trace_instr.vh"
`endif
`ifdef EXT_F_ENABLE
@ -42,6 +42,7 @@ module VX_decode #(
wire [31:0] instr = ifetch_rsp_if.data;
wire [6:0] opcode = instr[6:0];
wire [1:0] func2 = instr[26:25];
wire [2:0] func3 = instr[14:12];
wire [6:0] func7 = instr[31:25];
wire [11:0] u_12 = instr[31:20];
@ -193,7 +194,6 @@ module VX_decode #(
end
`INST_F: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'(func3[0]);
op_mod = `INST_MOD_BITS'(1);
end
`INST_SYS : begin
@ -375,11 +375,21 @@ module VX_decode #(
`USED_IREG (rs1);
`USED_IREG (rs2);
end
`ifdef EXT_TEX_ENABLE
3'h5: begin
op_type = `INST_OP_BITS'(`INST_GPU_TEX);
op_mod = `INST_MOD_BITS'(func2);
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
`USED_IREG (rs3);
end
`endif
3'h6: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'(`INST_GPU_PRED);
imm = {{20{u_12[11]}}, u_12};
use_rd = 0;
op_type = `INST_OP_BITS'(`INST_LSU_LW);
op_mod = `INST_MOD_BITS'(2);
`USED_IREG (rs1);
end
default:;
@ -389,6 +399,8 @@ module VX_decode #(
endcase
end
`UNUSED_VAR (func2)
// disable write to integer register r0
wire wb = use_rd && (| rd_r);
@ -421,13 +433,13 @@ module VX_decode #(
assign ifetch_rsp_if.ready = decode_if.ready;
`ifdef DBG_PRINT_PIPELINE
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin
dpi_trace("%d: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.PC);
print_ex_type(decode_if.ex_type);
trace_ex_type(decode_if.ex_type);
dpi_trace(", op=");
print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm);
end
end

View file

@ -18,6 +18,8 @@
`define NRI_BITS `LOG2UP(`NUM_IREGS)
`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS)
`ifdef EXT_F_ENABLE
`define NUM_REGS (2 * `NUM_IREGS)
`else
@ -66,6 +68,8 @@
`define INST_GPU 7'b1101011
`define INST_TEX 7'b0101011
///////////////////////////////////////////////////////////////////////////////
`define INST_FRM_RNE 3'b000 // round to nearest even
@ -150,8 +154,8 @@
`define INST_LSU_BITS 4
`define INST_LSU_FMT(x) x[2:0]
`define INST_LSU_WSIZE(x) x[1:0]
`define INST_LSU_IS_FENCE(x) x[0]
`define INST_LSU_IS_PREF(x) (x==3'b111)
`define INST_LSU_IS_FENCE(x) (3'h1 == x)
`define INST_LSU_IS_PREFETCH(x) (3'h2 == x)
`define INST_FENCE_BITS 1
`define INST_FENCE_D 1'h0
@ -187,6 +191,7 @@
`define INST_GPU_JOIN 3'h3
`define INST_GPU_BAR 3'h4
`define INST_GPU_PRED 3'h5
`define INST_GPU_TEX 3'h6
`define INST_GPU_BITS 3
///////////////////////////////////////////////////////////////////////////////
@ -238,8 +243,11 @@
`define DBG_CACHE_REQ_MDATAW 0
`endif
// non-cacheable address bit
`define NC_FLAG_BITS 1
// non-cacheable tag bits
`define NC_TAG_BIT 1
// texture tag bits
`define TEX_TAG_BIT 1
////////////////////////// Icache Configurable Knobs //////////////////////////
@ -278,12 +286,20 @@
// Block size in bytes
`define DCACHE_LINE_SIZE `L1_BLOCK_SIZE
// TAG sharing enable
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_FLAG_BITS + `SM_ENABLE)
// Input request tag bits
`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS)
// Core request tag bits
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
`ifdef EXT_TEX_ENABLE
`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE)
`define TEX_TAG_ID_BITS (2)
`define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS)
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `TEX_TAG_BIT)
`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS)
`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS)
`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS)
`else
`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE)
`endif
`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS)
// Memory request data bits
`define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8)
@ -300,7 +316,7 @@
// Memory request tag bits
`define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DCACHE_WORD_SIZE)
`define _DNC_MEM_TAG_WIDTH ($clog2(`DCACHE_NUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCACHE_CORE_TAG_WIDTH)
`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_FLAG_BITS), `_DNC_MEM_TAG_WIDTH)
`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_TAG_BIT), `_DNC_MEM_TAG_WIDTH)
// Merged D-cache/I-cache memory tag
`define L1_MEM_TAG_WIDTH (`MAX(`ICACHE_MEM_TAG_WIDTH, `DCACHE_MEM_TAG_WIDTH) + `CLOG2(2))
@ -348,7 +364,7 @@
// Memory request tag bits
`define _L2_MEM_ADDR_RATIO_W $clog2(`L2_CACHE_LINE_SIZE / `L2_WORD_SIZE)
`define _L2_NC_MEM_TAG_WIDTH ($clog2(`L2_NUM_REQS) + `_L2_MEM_ADDR_RATIO_W + `L1_MEM_TAG_WIDTH)
`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_FLAG_BITS), `_L2_NC_MEM_TAG_WIDTH)
`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_TAG_BIT), `_L2_NC_MEM_TAG_WIDTH)
`define L2_MEM_TAG_WIDTH ((`L2_ENABLE) ? `_L2_MEM_TAG_WIDTH : (`L1_MEM_TAG_WIDTH + `CLOG2(`L2_NUM_REQS)))
////////////////////////// L3cache Configurable Knobs /////////////////////////
@ -380,7 +396,7 @@
// Memory request tag bits
`define _L3_MEM_ADDR_RATIO_W $clog2(`L3_CACHE_LINE_SIZE / `L3_WORD_SIZE)
`define _L3_NC_MEM_TAG_WIDTH ($clog2(`L3_NUM_REQS) + `_L3_MEM_ADDR_RATIO_W + `L2_MEM_TAG_WIDTH)
`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_FLAG_BITS), `_L3_NC_MEM_TAG_WIDTH)
`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_TAG_BIT), `_L3_NC_MEM_TAG_WIDTH)
`define L3_MEM_TAG_WIDTH ((`L3_ENABLE) ? `_L3_MEM_TAG_WIDTH : (`L2_MEM_TAG_WIDTH + `CLOG2(`L3_NUM_REQS)))
///////////////////////////////////////////////////////////////////////////////

View file

@ -1,6 +1,6 @@
`include "VX_define.vh"
module VX_instr_demux (
module VX_dispatch (
input wire clk,
input wire reset,
@ -60,7 +60,7 @@ module VX_instr_demux (
wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU);
wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type);
wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod);
wire lsu_is_prefetch = (~ibuffer_if.wb) && ~(ibuffer_if.op_type[`INST_OP_BITS-1]);
wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1),
@ -125,18 +125,17 @@ module VX_instr_demux (
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type);
wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid];
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)),
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)),
.OUT_REG (1)
) gpu_buffer (
.clk (clk),
.reset (reset),
.valid_in (gpu_req_valid),
.ready_in (gpu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpu_rs2_data}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
.valid_out (gpu_req_if.valid),
.ready_out (gpu_req_if.ready)
);

View file

@ -45,12 +45,108 @@ module VX_execute #(
VX_commit_if.master gpu_commit_if,
input wire busy
);
);
`ifdef EXT_TEX_ENABLE
// Texture extension: the LSU and the texture unit each get their own
// dcache request/response interface, and the two are merged onto the
// module's dcache_req_if/dcache_rsp_if by the arbiter instantiated below.
// LSU-side dcache interfaces (tag width `LSU_DCACHE_TAG_BITS).
VX_dcache_req_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
) lsu_dcache_req_if();
VX_dcache_rsp_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
) lsu_dcache_rsp_if();
// Texture-side dcache interfaces (tag width `TEX_DCACHE_TAG_BITS).
VX_dcache_req_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
) tex_dcache_req_if();
VX_dcache_rsp_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
) tex_dcache_rsp_if();
// Texture CSR interface instance.
VX_tex_csr_if tex_csr_if();
// Both requesters' tags are converted to a common width
// (`LSU_TEX_DCACHE_TAG_BITS) before entering the arbiter.
wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in, lsu_tag_in;
wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out, lsu_tag_out;
// Only slices of the returned tags are read below.
`UNUSED_VAR (tex_tag_out)
`UNUSED_VAR (lsu_tag_out)
// Request path: per-thread tag conversion.
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
// Low ID field of each tag is size-cast to the common ID width.
assign tex_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(tex_dcache_req_if.tag[i][`TEX_TAG_ID_BITS-1:0]);
assign lsu_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(lsu_dcache_req_if.tag[i][`LSU_TAG_ID_BITS-1:0]);
`ifdef DBG_CACHE_REQ_INFO
// Bits above the ID field are copied through only in this configuration.
// NOTE(review): without DBG_CACHE_REQ_INFO nothing here drives those upper
// bits; presumably the common tag is exactly the ID field then - confirm
// against the macro definitions.
assign tex_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = tex_dcache_req_if.tag[i][`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS];
assign lsu_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = lsu_dcache_req_if.tag[i][`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS];
`endif
end
// Response path: slice the returned common-width tag back to each
// requester's own ID width.
assign tex_dcache_rsp_if.tag[`TEX_TAG_ID_BITS-1:0] = tex_tag_out[`TEX_TAG_ID_BITS-1:0];
assign lsu_dcache_rsp_if.tag[`LSU_TAG_ID_BITS-1:0] = lsu_tag_out[`LSU_TAG_ID_BITS-1:0];
`ifdef DBG_CACHE_REQ_INFO
assign tex_dcache_rsp_if.tag[`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS] = tex_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
assign lsu_dcache_rsp_if.tag[`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS] = lsu_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
`endif
// Two-requester arbiter in front of the dcache port. In every
// concatenation below the LSU signal is the rightmost (low) element and
// the texture signal the leftmost (high) element.
VX_cache_arb #(
.NUM_REQS (2),
.LANES (`NUM_THREADS),
.DATA_SIZE (4),
.TAG_IN_WIDTH (`LSU_TEX_DCACHE_TAG_BITS),
.TAG_SEL_IDX (`NC_TAG_BIT + `SM_ENABLE)
) tex_lsu_arb (
.clk (clk),
.reset (reset),
// Tex/LSU request
.req_valid_in ({tex_dcache_req_if.valid, lsu_dcache_req_if.valid}),
.req_rw_in ({tex_dcache_req_if.rw, lsu_dcache_req_if.rw}),
.req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}),
.req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}),
.req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}),
.req_tag_in ({tex_tag_in, lsu_tag_in}),
.req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}),
// Dcache request
.req_valid_out (dcache_req_if.valid),
.req_rw_out (dcache_req_if.rw),
.req_byteen_out (dcache_req_if.byteen),
.req_addr_out (dcache_req_if.addr),
.req_data_out (dcache_req_if.data),
.req_tag_out (dcache_req_if.tag),
.req_ready_out (dcache_req_if.ready),
// Dcache response
.rsp_valid_in (dcache_rsp_if.valid),
.rsp_tmask_in (dcache_rsp_if.tmask),
.rsp_tag_in (dcache_rsp_if.tag),
.rsp_data_in (dcache_rsp_if.data),
.rsp_ready_in (dcache_rsp_if.ready),
// Tex/LSU response
.rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}),
.rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}),
.rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}),
.rsp_tag_out ({tex_tag_out, lsu_tag_out}),
.rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready})
);
`endif
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if fpu_to_csr_if();
wire[`NUM_WARPS-1:0] fpu_pending;
wire[`NUM_WARPS-1:0] csr_pending;
`endif
wire [`NUM_WARPS-1:0] csr_pending;
wire [`NUM_WARPS-1:0] fpu_pending;
VX_fpu_to_csr_if fpu_to_csr_if();
`endif
`RESET_RELAY (alu_reset);
`RESET_RELAY (lsu_reset);
@ -58,7 +154,7 @@ module VX_execute #(
`RESET_RELAY (gpu_reset);
VX_alu_unit #(
.CORE_ID (CORE_ID)
.CORE_ID(CORE_ID)
) alu_unit (
.clk (clk),
.reset (alu_reset),
@ -68,20 +164,25 @@ module VX_execute #(
);
VX_lsu_unit #(
.CORE_ID (CORE_ID)
.CORE_ID(CORE_ID)
) lsu_unit (
`SCOPE_BIND_VX_execute_lsu_unit
.clk (clk),
.reset (lsu_reset),
`ifdef EXT_TEX_ENABLE
.dcache_req_if (lsu_dcache_req_if),
.dcache_rsp_if (lsu_dcache_rsp_if),
`else
.dcache_req_if (dcache_req_if),
.dcache_rsp_if (dcache_rsp_if),
`endif
.lsu_req_if (lsu_req_if),
.ld_commit_if (ld_commit_if),
.st_commit_if (st_commit_if)
);
VX_csr_unit #(
.CORE_ID (CORE_ID)
.CORE_ID(CORE_ID)
) csr_unit (
.clk (clk),
.reset (csr_reset),
@ -89,7 +190,7 @@ module VX_execute #(
.perf_memsys_if (perf_memsys_if),
.perf_pipeline_if(perf_pipeline_if),
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if),
.csr_req_if (csr_req_if),
.csr_commit_if (csr_commit_if),
@ -100,6 +201,9 @@ module VX_execute #(
`else
`UNUSED_PIN (pending),
`endif
`ifdef EXT_TEX_ENABLE
.tex_csr_if (tex_csr_if),
`endif
.busy (busy)
);
@ -107,7 +211,7 @@ module VX_execute #(
`RESET_RELAY (fpu_reset);
VX_fpu_unit #(
.CORE_ID (CORE_ID)
.CORE_ID(CORE_ID)
) fpu_unit (
.clk (clk),
.reset (fpu_reset),
@ -120,12 +224,17 @@ module VX_execute #(
`endif
VX_gpu_unit #(
.CORE_ID (CORE_ID)
.CORE_ID(CORE_ID)
) gpu_unit (
`SCOPE_BIND_VX_execute_gpu_unit
.clk (clk),
.reset (gpu_reset),
.gpu_req_if (gpu_req_if),
`ifdef EXT_TEX_ENABLE
.tex_csr_if (tex_csr_if),
.dcache_req_if (tex_dcache_req_if),
.dcache_rsp_if (tex_dcache_rsp_if),
`endif
.warp_ctl_if (warp_ctl_if),
.gpu_commit_if (gpu_commit_if)
);
@ -137,4 +246,4 @@ module VX_execute #(
&& (`INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_EBREAK
|| `INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_ECALL);
endmodule
endmodule

View file

@ -11,6 +11,12 @@ module VX_gpu_unit #(
// Inputs
VX_gpu_req_if.slave gpu_req_if,
`ifdef EXT_TEX_ENABLE
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
VX_tex_csr_if.slave tex_csr_if,
`endif
// Outputs
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master gpu_commit_if
@ -18,14 +24,29 @@ module VX_gpu_unit #(
import gpu_types::*;
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW);
wire rsp_valid;
wire [`NW_BITS-1:0] rsp_wid;
wire [`NUM_THREADS-1:0] rsp_tmask;
wire [31:0] rsp_PC;
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire [RSP_DATAW-1:0] rsp_data, rsp_data_r;
gpu_tmc_t tmc;
gpu_wspawn_t wspawn;
gpu_barrier_t barrier;
gpu_split_t split;
wire [WCTL_DATAW-1:0] warp_ctl_data;
wire is_warp_ctl;
wire stall_in, stall_out;
wire is_wspawn = (gpu_req_if.op_type == `INST_GPU_WSPAWN);
wire is_tmc = (gpu_req_if.op_type == `INST_GPU_TMC);
wire is_split = (gpu_req_if.op_type == `INST_GPU_SPLIT);
@ -33,7 +54,8 @@ module VX_gpu_unit #(
wire is_pred = (gpu_req_if.op_type == `INST_GPU_PRED);
wire [31:0] rs1_data = gpu_req_if.rs1_data[gpu_req_if.tid];
wire [31:0] rs2_data = gpu_req_if.rs2_data[gpu_req_if.tid];
wire [`NUM_THREADS-1:0] taken_tmask;
wire [`NUM_THREADS-1:0] not_taken_tmask;
@ -52,7 +74,7 @@ module VX_gpu_unit #(
// wspawn
wire [31:0] wspawn_pc = gpu_req_if.rs2_data;
wire [31:0] wspawn_pc = rs2_data;
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; i++) begin
assign wspawn_wmask[i] = (i < rs1_data);
@ -73,30 +95,109 @@ module VX_gpu_unit #(
assign barrier.valid = is_bar;
assign barrier.id = rs1_data[`NB_BITS-1:0];
assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1);
assign barrier.size_m1 = (`NW_BITS)'(rs2_data - 1);
// pack warp ctl result
assign warp_ctl_data = {tmc, wspawn, split, barrier};
// texture
`ifdef EXT_TEX_ENABLE
// Texture-enabled build: TEX instructions are routed to the texture unit,
// and its responses share the output register with warp-control results.
// Only the low `NTEX_BITS of op_mod are read below.
`UNUSED_VAR (gpu_req_if.op_mod)
VX_tex_req_if tex_req_if();
VX_tex_rsp_if tex_rsp_if();
wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX);
// Build the texture request from the incoming GPU request.
assign tex_req_if.valid = gpu_req_if.valid && is_tex;
assign tex_req_if.wid = gpu_req_if.wid;
assign tex_req_if.tmask = gpu_req_if.tmask;
assign tex_req_if.PC = gpu_req_if.PC;
assign tex_req_if.rd = gpu_req_if.rd;
assign tex_req_if.wb = gpu_req_if.wb;
// Texture unit index comes from the instruction modifier field.
assign tex_req_if.unit = gpu_req_if.op_mod[`NTEX_BITS-1:0];
// rs1/rs2 carry the two coordinates and rs3 carries the lod value.
assign tex_req_if.coords[0] = gpu_req_if.rs1_data;
assign tex_req_if.coords[1] = gpu_req_if.rs2_data;
assign tex_req_if.lod = gpu_req_if.rs3_data;
VX_tex_unit #(
.CORE_ID(CORE_ID)
) tex_unit (
.clk (clk),
.reset (reset),
.tex_req_if (tex_req_if),
.tex_csr_if (tex_csr_if),
.tex_rsp_if (tex_rsp_if),
.dcache_req_if (dcache_req_if),
.dcache_rsp_if (dcache_rsp_if)
);
// A texture response is accepted whenever the output register can advance.
assign tex_rsp_if.ready = !stall_out;
// Input back-pressure:
// - a TEX request waits until the texture unit is ready;
// - any other request waits while a texture response is valid (the
// response is what the muxes below select) or while the output is stalled.
assign stall_in = (is_tex && ~tex_req_if.ready)
|| (~is_tex && (tex_rsp_if.valid || stall_out));
// The registered entry is a warp-control result only if it is neither a
// TEX request nor a forwarded texture response.
assign is_warp_ctl = !(is_tex || tex_rsp_if.valid);
// Output mux: a valid texture response is selected over the incoming request.
assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex);
assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid;
assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask;
assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC;
// rd is always taken from the texture response; wb is only asserted for a
// valid texture response, so warp-control results never write back.
assign rsp_rd = tex_rsp_if.rd;
assign rsp_wb = tex_rsp_if.valid && tex_rsp_if.wb;
assign rsp_data = tex_rsp_if.valid ? RSP_DATAW'(tex_rsp_if.data) : RSP_DATAW'(warp_ctl_data);
`else
// Build without the texture extension: every request is a warp-control
// operation, and rd/wb are forced to 0.
`UNUSED_VAR (gpu_req_if.op_mod)
`UNUSED_VAR (gpu_req_if.rs3_data)
`UNUSED_VAR (gpu_req_if.wb)
`UNUSED_VAR (gpu_req_if.rd)
assign stall_in = stall_out;
assign is_warp_ctl = 1;
assign rsp_valid = gpu_req_if.valid;
assign rsp_wid = gpu_req_if.wid;
assign rsp_tmask = gpu_req_if.tmask;
assign rsp_PC = gpu_req_if.PC;
assign rsp_rd = 0;
assign rsp_wb = 0;
assign rsp_data = RSP_DATAW'(warp_ctl_data);
`endif
wire is_warp_ctl_r;
// output
wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid;
assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid;
VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS),
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall),
.data_in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}),
.data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier})
);
.enable (!stall_out),
.data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
.data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r})
);
assign gpu_commit_if.eop = 1'b1;
assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0];
assign gpu_commit_if.eop = 1'b1;
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready;
assign warp_ctl_if.wid = gpu_commit_if.wid;
// warp control response
assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier} = rsp_data_r[WCTL_DATAW-1:0];
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready && is_warp_ctl_r;
assign warp_ctl_if.wid = gpu_commit_if.wid;
// can accept new request?
assign gpu_req_if.ready = ~stall;
assign gpu_req_if.ready = ~stall_in;
`SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid);
`SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid);

View file

@ -88,7 +88,7 @@ module VX_icache_stage #(
`SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data);
`SCOPE_ASSIGN (icache_rsp_tag, rsp_tag);
`ifdef DBG_PRINT_CORE_ICACHE
`ifdef DBG_TRACE_CORE_ICACHE
always @(posedge clk) begin
if (icache_req_if.valid && icache_req_if.ready) begin
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC);

View file

@ -23,56 +23,60 @@ module VX_issue #(
`endif
VX_gpu_req_if.master gpu_req_if
);
VX_ibuffer_if ibuffer_if();
VX_gpr_rsp_if gpr_rsp_if();
VX_gpr_req_if gpr_req_if();
assign gpr_req_if.wid = ibuffer_if.wid;
assign gpr_req_if.rs1 = ibuffer_if.rs1;
assign gpr_req_if.rs2 = ibuffer_if.rs2;
assign gpr_req_if.rs3 = ibuffer_if.rs3;
VX_ibuffer_if ibuffer_if();
VX_gpr_req_if gpr_req_if();
VX_gpr_rsp_if gpr_rsp_if();
VX_writeback_if sboard_wb_if();
assign sboard_wb_if.valid = writeback_if.valid;
assign sboard_wb_if.wid = writeback_if.wid;
assign sboard_wb_if.PC = writeback_if.PC;
assign sboard_wb_if.rd = writeback_if.rd;
assign sboard_wb_if.eop = writeback_if.eop;
assign sboard_wb_if.ready = writeback_if.ready;
VX_ibuffer_if sboard_ib_if();
assign sboard_ib_if.valid = ibuffer_if.valid && idmux_ib_if.ready;
assign sboard_ib_if.wid = ibuffer_if.wid;
assign sboard_ib_if.PC = ibuffer_if.PC;
assign sboard_ib_if.wb = ibuffer_if.wb;
assign sboard_ib_if.rd = ibuffer_if.rd;
assign sboard_ib_if.rd_n = ibuffer_if.rd_n;
assign sboard_ib_if.rs1_n = ibuffer_if.rs1_n;
assign sboard_ib_if.rs2_n = ibuffer_if.rs2_n;
assign sboard_ib_if.rs3_n = ibuffer_if.rs3_n;
assign sboard_ib_if.wid_n = ibuffer_if.wid_n;
VX_ibuffer_if scoreboard_if();
VX_ibuffer_if dispatch_if();
VX_ibuffer_if idmux_ib_if();
assign idmux_ib_if.valid = ibuffer_if.valid && sboard_ib_if.ready;
assign idmux_ib_if.wid = ibuffer_if.wid;
assign idmux_ib_if.tmask = ibuffer_if.tmask;
assign idmux_ib_if.PC = ibuffer_if.PC;
assign idmux_ib_if.ex_type = ibuffer_if.ex_type;
assign idmux_ib_if.op_type = ibuffer_if.op_type;
assign idmux_ib_if.op_mod = ibuffer_if.op_mod;
assign idmux_ib_if.wb = ibuffer_if.wb;
assign idmux_ib_if.rd = ibuffer_if.rd;
assign idmux_ib_if.rs1 = ibuffer_if.rs1;
assign idmux_ib_if.imm = ibuffer_if.imm;
assign idmux_ib_if.use_PC = ibuffer_if.use_PC;
assign idmux_ib_if.use_imm = ibuffer_if.use_imm;
// GPR request interface
assign gpr_req_if.wid = ibuffer_if.wid;
assign gpr_req_if.rs1 = ibuffer_if.rs1;
assign gpr_req_if.rs2 = ibuffer_if.rs2;
assign gpr_req_if.rs3 = ibuffer_if.rs3;
// scoreboard writeback interface
assign sboard_wb_if.valid = writeback_if.valid;
assign sboard_wb_if.wid = writeback_if.wid;
assign sboard_wb_if.PC = writeback_if.PC;
assign sboard_wb_if.rd = writeback_if.rd;
assign sboard_wb_if.eop = writeback_if.eop;
// scoreboard interface
assign scoreboard_if.valid = ibuffer_if.valid && dispatch_if.ready;
assign scoreboard_if.wid = ibuffer_if.wid;
assign scoreboard_if.PC = ibuffer_if.PC;
assign scoreboard_if.wb = ibuffer_if.wb;
assign scoreboard_if.rd = ibuffer_if.rd;
assign scoreboard_if.rd_n = ibuffer_if.rd_n;
assign scoreboard_if.rs1_n = ibuffer_if.rs1_n;
assign scoreboard_if.rs2_n = ibuffer_if.rs2_n;
assign scoreboard_if.rs3_n = ibuffer_if.rs3_n;
assign scoreboard_if.wid_n = ibuffer_if.wid_n;
// dispatch interface
assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready;
assign dispatch_if.wid = ibuffer_if.wid;
assign dispatch_if.tmask = ibuffer_if.tmask;
assign dispatch_if.PC = ibuffer_if.PC;
assign dispatch_if.ex_type = ibuffer_if.ex_type;
assign dispatch_if.op_type = ibuffer_if.op_type;
assign dispatch_if.op_mod = ibuffer_if.op_mod;
assign dispatch_if.wb = ibuffer_if.wb;
assign dispatch_if.rd = ibuffer_if.rd;
assign dispatch_if.rs1 = ibuffer_if.rs1;
assign dispatch_if.imm = ibuffer_if.imm;
assign dispatch_if.use_PC = ibuffer_if.use_PC;
assign dispatch_if.use_imm = ibuffer_if.use_imm;
// issue the instruction
assign ibuffer_if.ready = sboard_ib_if.ready && idmux_ib_if.ready;
assign ibuffer_if.ready = scoreboard_if.ready && dispatch_if.ready;
`RESET_RELAY (ibuf_reset);
`RESET_RELAY (scoreboard_reset);
`RESET_RELAY (gpr_reset);
`RESET_RELAY (demux_reset);
`RESET_RELAY (dispatch_reset);
VX_ibuffer #(
.CORE_ID(CORE_ID)
@ -87,9 +91,9 @@ module VX_issue #(
.CORE_ID(CORE_ID)
) scoreboard (
.clk (clk),
.reset (reset),
.ibuffer_if (sboard_ib_if),
.writeback_if(sboard_wb_if)
.reset (scoreboard_reset),
.writeback_if(sboard_wb_if),
.ibuffer_if (scoreboard_if)
);
VX_gpr_stage #(
@ -102,10 +106,10 @@ module VX_issue #(
.gpr_rsp_if (gpr_rsp_if)
);
VX_instr_demux instr_demux (
VX_dispatch dispatch (
.clk (clk),
.reset (demux_reset),
.ibuffer_if (idmux_ib_if),
.reset (dispatch_reset),
.ibuffer_if (dispatch_if),
.gpr_rsp_if (gpr_rsp_if),
.alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if),
@ -131,11 +135,11 @@ module VX_issue #(
`SCOPE_ASSIGN (issue_imm, ibuffer_if.imm);
`SCOPE_ASSIGN (issue_use_pc, ibuffer_if.use_PC);
`SCOPE_ASSIGN (issue_use_imm, ibuffer_if.use_imm);
`SCOPE_ASSIGN (scoreboard_delay, !sboard_wb_if.ready);
`SCOPE_ASSIGN (execute_delay, !idmux_ib_if.ready);
`SCOPE_ASSIGN (gpr_rsp_a, gpr_rsp_if.rs1_data);
`SCOPE_ASSIGN (gpr_rsp_b, gpr_rsp_if.rs2_data);
`SCOPE_ASSIGN (gpr_rsp_c, gpr_rsp_if.rs3_data);
`SCOPE_ASSIGN (scoreboard_delay, !scoreboard_if.ready);
`SCOPE_ASSIGN (dispatch_delay, !dispatch_if.ready);
`SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data);
`SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data);
`SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data);
`SCOPE_ASSIGN (writeback_valid, writeback_if.valid);
`SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask);
`SCOPE_ASSIGN (writeback_wid, writeback_if.wid);
@ -170,7 +174,7 @@ module VX_issue #(
if (decode_if.valid & !decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
end
if (ibuffer_if.valid & !sboard_wb_if.ready) begin
if (scoreboard_if.valid & !scoreboard_if.ready) begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
end
if (alu_req_if.valid & !alu_req_if.ready) begin
@ -204,7 +208,7 @@ module VX_issue #(
`endif
`endif
`ifdef DBG_PRINT_PIPELINE
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (alu_req_if.valid && alu_req_if.ready) begin
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=",
@ -246,6 +250,8 @@ module VX_issue #(
`TRACE_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS);
dpi_trace(", rs2_data=");
`TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
dpi_trace(", rs3_data=");
`TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
dpi_trace("\n");
end
end

View file

@ -24,7 +24,7 @@ module VX_lsu_unit #(
localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE);
localparam ADDR_TYPEW = `NC_FLAG_BITS + `SM_ENABLE;
localparam ADDR_TYPEW = `NC_TAG_BIT + `SM_ENABLE;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
@ -80,6 +80,8 @@ module VX_lsu_unit #(
wire lsu_valid = lsu_req_if.valid && ~fence_wait;
wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
.RESETW (1)
@ -87,8 +89,8 @@ module VX_lsu_unit #(
.clk (clk),
.reset (reset),
.enable (!stall_in),
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_req_if.wb | lsu_req_if.is_prefetch, lsu_req_if.store_data}),
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
);
// Can accept new request?
@ -103,6 +105,7 @@ module VX_lsu_unit #(
wire rsp_is_prefetch;
`UNUSED_VAR (rsp_type)
`UNUSED_VAR (rsp_is_prefetch)
reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask;
wire [`NUM_THREADS-1:0] rsp_rem_mask_n;
@ -132,7 +135,11 @@ module VX_lsu_unit #(
wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n);
assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS];
assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS];
`UNUSED_VAR (dcache_rsp_if.tag)
// do not writeback from software prefetch
wire req_wb2 = req_wb && ~req_is_prefetch;
VX_index_buffer #(
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
@ -143,8 +150,8 @@ module VX_lsu_unit #(
.write_addr (mbuf_waddr),
.acquire_slot (mbuf_push),
.read_addr (mbuf_raddr),
.write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb, req_type, req_offset, req_is_dup, req_is_prefetch}),
.read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
.write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
.read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
.release_addr (mbuf_raddr),
.release_slot (mbuf_pop),
.full (mbuf_full),
@ -276,8 +283,6 @@ module VX_lsu_unit #(
// send load commit
// ignore response from software prefetch
wire rsp_valid = (rsp_is_prefetch)? 0:(| dcache_rsp_if.valid);
wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid;
VX_pipe_register #(
@ -287,12 +292,12 @@ module VX_lsu_unit #(
.clk (clk),
.reset (reset),
.enable (!load_rsp_stall),
.data_in ({rsp_valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
.data_in ({dcache_rsp_if.valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
.data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop})
);
// Can accept new cache response?
assign dcache_rsp_if.ready = rsp_is_prefetch ? 1 : ~load_rsp_stall;
assign dcache_rsp_if.ready = ~load_rsp_stall;
// scope registration
`SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire);
@ -333,7 +338,7 @@ module VX_lsu_unit #(
end
`endif
`ifdef DBG_PRINT_CORE_DCACHE
`ifdef DBG_TRACE_CORE_DCACHE
wire dcache_req_fire_any = (| dcache_req_fire);
always @(posedge clk) begin
if (lsu_req_if.valid && fence_wait) begin
@ -349,7 +354,7 @@ module VX_lsu_unit #(
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
dpi_trace("\n");
end else begin
dpi_trace("%d: D$%0d Rd Req: req_is_prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire);
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire);
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
@ -357,7 +362,7 @@ module VX_lsu_unit #(
end
end
if (dcache_rsp_fire) begin
dpi_trace("%d: D$%0d Rsp: rsp_is_prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
$time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd);
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
dpi_trace(", is_dup=%b\n", rsp_is_dup);

View file

@ -206,6 +206,7 @@ module VX_mem_unit # (
.LANES (`NUM_THREADS),
.DATA_SIZE (4),
.TAG_IN_WIDTH (`DCACHE_CORE_TAG_WIDTH),
.TAG_SEL_IDX (0), // SM flag
.TYPE ("P"),
.BUFFERED_REQ (2),
.BUFFERED_RSP (1)

View file

@ -119,9 +119,9 @@
`define UP(x) (((x) > 0) ? (x) : 1)
`define RTRIM(x,s) x[$bits(x)-1:($bits(x)-s)]
`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)]
`define LTRIM(x,s) x[s-1:0]
`define LTRIM(x, s) x[s-1:0]
`define TRACE_ARRAY1D(a, m) \
dpi_trace("{"); \

View file

@ -6,8 +6,8 @@ module VX_scoreboard #(
input wire clk,
input wire reset,
VX_ibuffer_if.scoreboard ibuffer_if,
VX_writeback_if.scoreboard writeback_if
VX_ibuffer_if.slave ibuffer_if,
VX_writeback_if.slave writeback_if
);
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
@ -53,11 +53,12 @@ module VX_scoreboard #(
// 32-bit counter, cleared to 0 on reset by the always block below.
reg [31:0] deadlock_ctr;
// NOTE(review): 1 ** n evaluates to 1 for every n, so this expression is
// always 10000 regardless of L2_ENABLE / L3_ENABLE. If the timeout was meant
// to grow with the number of enabled cache levels, the base of the power is
// presumably wrong - confirm the intended scaling before changing it.
wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
always @(posedge clk) begin
if (reset) begin
deadlock_ctr <= 0;
end else begin
`ifdef DBG_PRINT_PIPELINE
`ifdef DBG_TRACE_PIPELINE
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,

View file

@ -1,9 +1,9 @@
`ifndef VX_PRINT_INSTR
`define VX_PRINT_INSTR
`ifndef VX_TRACE_INSTR
`define VX_TRACE_INSTR
`include "VX_define.vh"
task print_ex_type (
task trace_ex_type (
input [`EX_BITS-1:0] ex_type
);
case (ex_type)
@ -16,7 +16,7 @@ task print_ex_type (
endcase
endtask
task print_ex_op (
task trace_ex_op (
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input [`INST_MOD_BITS-1:0] op_mod
@ -137,6 +137,7 @@ task print_ex_op (
`INST_GPU_JOIN: dpi_trace("JOIN");
`INST_GPU_BAR: dpi_trace("BAR");
`INST_GPU_PRED: dpi_trace("PRED");
`INST_GPU_TEX: dpi_trace("TEX");
default: dpi_trace("?");
endcase
end

View file

@ -71,8 +71,8 @@ module VX_warp_sched #(
// activate first warp
warp_pcs[0] <= `STARTUP_ADDR;
active_warps[0] <= '1;
thread_masks[0] <= '1;
active_warps[0] <= 1;
thread_masks[0] <= 1;
end else begin
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));

View file

@ -12,7 +12,8 @@ module VX_writeback #(
VX_commit_if.slave csr_commit_if,
`ifdef EXT_F_ENABLE
VX_commit_if.slave fpu_commit_if,
`endif
`endif
VX_commit_if.slave gpu_commit_if,
// outputs
VX_writeback_if.master writeback_if
@ -22,9 +23,17 @@ module VX_writeback #(
localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1;
// Number of commit sources arbitrated for writeback: the three sources that
// are always present (ld, alu, csr) plus one for each enabled optional
// extension (FPU, texture). Yields 3, 4 or 5.
`ifdef EXT_F_ENABLE
localparam NUM_FPU_RSPS = 1;
`else
localparam NUM_FPU_RSPS = 0;
`endif
`ifdef EXT_TEX_ENABLE
localparam NUM_TEX_RSPS = 1;
`else
localparam NUM_TEX_RSPS = 0;
`endif
localparam NUM_RSPS = 3 + NUM_FPU_RSPS + NUM_TEX_RSPS;
wire wb_valid;
@ -40,22 +49,27 @@ module VX_writeback #(
wire [NUM_RSPS-1:0] rsp_ready;
wire stall;
assign rsp_valid = {
assign rsp_valid = {
`ifdef EXT_TEX_ENABLE
gpu_commit_if.valid && gpu_commit_if.wb,
`endif
csr_commit_if.valid && csr_commit_if.wb,
alu_commit_if.valid && alu_commit_if.wb,
alu_commit_if.valid && alu_commit_if.wb,
`ifdef EXT_F_ENABLE
fpu_commit_if.valid && fpu_commit_if.wb,
`endif
ld_commit_if.valid && ld_commit_if.wb
};
assign rsp_data = {
assign rsp_data = {
`ifdef EXT_TEX_ENABLE
{gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop},
`endif
{csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop},
{alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop},
`ifdef EXT_F_ENABLE
{fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop},
`endif
`endif
{ ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop}
};
@ -82,8 +96,20 @@ module VX_writeback #(
`else
// Without the FPU source, alu and csr move down one slot in rsp_ready.
assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb;
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
`endif
// gpu_commit_if.ready is driven exactly once, here. A second, identical
// assignment used to sit inside the branch above and produced a duplicate
// continuous driver when EXT_TEX_ENABLE was set without EXT_F_ENABLE.
//
// When the texture extension is enabled the GPU commit is the most
// significant entry of rsp_valid/rsp_data, i.e. slot NUM_RSPS-1: slot 4 with
// the FPU source present, slot 3 without it. A commit that does not write
// back never needs an arbiter grant, so it is always ready.
`ifdef EXT_TEX_ENABLE
`ifdef EXT_F_ENABLE
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
`else
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
`endif
`else
// No texture extension: the GPU commit is not part of the writeback
// arbitration here, so it is unconditionally ready.
assign gpu_commit_if.ready = 1;
`endif
assign stall = ~writeback_if.ready && writeback_if.valid;

View file

@ -201,7 +201,7 @@ module Vortex (
`SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag);
`SCOPE_ASSIGN (busy, busy);
`ifdef DBG_PRINT_MEM
`ifdef DBG_TRACE_MEM
always @(posedge clk) begin
if (mem_req_valid && mem_req_ready) begin
if (mem_req_rw)

View file

@ -158,7 +158,7 @@ module VX_avs_wrapper #(
.ready_out (mem_rsp_ready)
);
`ifdef DBG_PRINT_AVS
`ifdef DBG_TRACE_AVS
always @(posedge clk) begin
if (mem_req_valid && mem_req_ready) begin
if (mem_req_rw) begin

View file

@ -45,12 +45,14 @@ localparam CCI_DATA_WIDTH = $bits(t_ccip_clData);
localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8;
localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE);
localparam AVS_RD_QUEUE_SIZE = 4;
localparam AVS_REQ_TAGW_VX_ = `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, AVS_REQ_TAGW_VX_);
localparam AVS_REQ_TAGW_CCI_ = CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH);
localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, AVS_REQ_TAGW_CCI_);
localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI);
localparam _VX_MEM_TAG_WIDTH = `VX_MEM_TAG_WIDTH;
localparam _AVS_REQ_TAGW_VX = _VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
localparam _AVS_REQ_TAGW_VX2 = `MAX(_VX_MEM_TAG_WIDTH, _AVS_REQ_TAGW_VX);
localparam _AVS_REQ_TAGW_CCI = CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH);
localparam _AVS_REQ_TAGW_CCI2 = `MAX(CCI_ADDR_WIDTH, _AVS_REQ_TAGW_CCI);
localparam AVS_REQ_TAGW = `MAX(_AVS_REQ_TAGW_VX2, _AVS_REQ_TAGW_CCI2);
localparam CCI_RD_WINDOW_SIZE = 8;
localparam CCI_RW_PENDING_SIZE= 256;
@ -185,36 +187,36 @@ always @(posedge clk) begin
case (mmio_hdr.address)
MMIO_IO_ADDR: begin
cmd_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_IO_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, t_ccip_clAddr'(cp2af_sRxPort.c0.data));
`endif
end
MMIO_MEM_ADDR: begin
cmd_mem_addr <= $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_MEM_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data));
`endif
end
MMIO_DATA_SIZE: begin
cmd_data_size <= $bits(cmd_data_size)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_DATA_SIZE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
`endif
end
MMIO_CMD_TYPE: begin
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_CMD_TYPE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_type)'(cp2af_sRxPort.c0.data));
`endif
end
`ifdef SCOPE
MMIO_SCOPE_WRITE: begin
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_SCOPE_WRITE: addr=%0h, data=%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data));
`endif
end
`endif
default: begin
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: Unknown MMIO Wr: addr=%0h, data=%0h\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
`endif
end
@ -241,7 +243,7 @@ always @(posedge clk) begin
16'h0008: mmio_tx.data <= 64'h0; // reserved
MMIO_STATUS: begin
mmio_tx.data <= 64'({cout_q_dout, !cout_q_empty, 8'(state)});
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
if (state != STATE_WIDTH'(mmio_tx.data)) begin
dpi_trace("%d: MMIO_STATUS: addr=%0h, state=%0d\n", $time, mmio_hdr.address, state);
end
@ -250,20 +252,20 @@ always @(posedge clk) begin
`ifdef SCOPE
MMIO_SCOPE_READ: begin
mmio_tx.data <= cmd_scope_rdata;
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_SCOPE_READ: addr=%0h, data=%0h\n", $time, mmio_hdr.address, cmd_scope_rdata);
`endif
end
`endif
MMIO_DEV_CAPS: begin
mmio_tx.data <= dev_caps;
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_DEV_CAPS: addr=%0h, data=%0h\n", $time, mmio_hdr.address, dev_caps);
`endif
end
default: begin
mmio_tx.data <= 64'h0;
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: Unknown MMIO Rd: addr=%0h\n", $time, mmio_hdr.address);
`endif
end
@ -297,19 +299,19 @@ always @(posedge clk) begin
STATE_IDLE: begin
case (cmd_type)
CMD_MEM_READ: begin
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE READ: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
`endif
state <= STATE_READ;
end
CMD_MEM_WRITE: begin
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE WRITE: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
`endif
state <= STATE_WRITE;
end
CMD_RUN: begin
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE START\n", $time);
`endif
vx_reset <= 1;
@ -324,7 +326,7 @@ always @(posedge clk) begin
STATE_READ: begin
if (cmd_read_done) begin
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE IDLE\n", $time);
`endif
end
@ -333,7 +335,7 @@ always @(posedge clk) begin
STATE_WRITE: begin
if (cmd_write_done) begin
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE IDLE\n", $time);
`endif
end
@ -345,7 +347,7 @@ always @(posedge clk) begin
if (cmd_run_done) begin
vx_started <= 0;
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE IDLE\n", $time);
`endif
end
@ -699,7 +701,7 @@ always @(posedge clk) begin
if (cci_rd_req_fire) begin
cci_rd_req_addr <= cci_rd_req_addr + 1;
cci_rd_req_ctr <= cci_rd_req_ctr + 1;
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads);
`endif
end
@ -709,13 +711,13 @@ always @(posedge clk) begin
if (CCI_RD_QUEUE_TAGW'(cci_rd_rsp_ctr) == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE);
end
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
`endif
end
if (cci_rdq_pop) begin
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads);
`endif
end
@ -856,13 +858,13 @@ begin
if (cci_wr_req_ctr == CCI_ADDR_WIDTH'(1)) begin
cci_wr_req_done <= 1;
end
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
`endif
end
if (cci_wr_rsp_fire) begin
`ifdef DBG_PRINT_OPAE
`ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes);
`endif
end

View file

@ -509,7 +509,7 @@ module VX_bank #(
assign perf_mshr_stalls = mshr_alm_full;
`endif
`ifdef DBG_PRINT_CACHE_BANK
`ifdef DBG_TRACE_CACHE_BANK
wire crsq_fire = crsq_valid && crsq_ready;
wire pipeline_stall = (mshr_valid || mem_rsp_valid || creq_valid)
&& ~(mshr_fire || mem_rsp_fire || creq_fire);

View file

@ -53,7 +53,7 @@
`define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS]
`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1:(CORE_TAG_WIDTH-`NW_BITS-32)
`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_MDATAW)
///////////////////////////////////////////////////////////////////////////////

View file

@ -119,7 +119,7 @@ module VX_data_access #(
`UNUSED_VAR (stall)
`ifdef DBG_PRINT_CACHE_DATA
`ifdef DBG_TRACE_CACHE_DATA
always @(posedge clk) begin
if (fill && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data);

View file

@ -202,7 +202,7 @@ module VX_miss_resrv #(
`UNUSED_VAR (lookup_valid)
`ifdef DBG_PRINT_CACHE_MSHR
`ifdef DBG_TRACE_CACHE_MSHR
always @(posedge clk) begin
if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin
if (allocate_fire)

View file

@ -229,7 +229,7 @@ module VX_shared_mem #(
core_rsp_data_in = 'x;
bank_rsp_sel_n = bank_rsp_sel_r;
for (integer i = 0; i < NUM_BANKS; i++) begin
if (per_bank_core_req_valid[i]
if (core_req_read_mask[i]
&& (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valids_in[per_bank_core_req_tid[i]] = 1;
core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i];
@ -271,7 +271,7 @@ module VX_shared_mem #(
end
`endif
`ifdef DBG_PRINT_CACHE_BANK
`ifdef DBG_TRACE_CACHE_BANK
reg is_multi_tag_req;
`IGNORE_UNUSED_BEGIN

View file

@ -61,7 +61,7 @@ module VX_tag_access #(
`UNUSED_VAR (stall)
`ifdef DBG_PRINT_CACHE_TAG
`ifdef DBG_TRACE_CACHE_TAG
always @(posedge clk) begin
if (fill && ~stall) begin
dpi_trace("%d: cache%0d:%0d tag-fill: addr=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag);

View file

@ -3,8 +3,7 @@
`include "defs_div_sqrt_mvp.sv"
`TRACING_OFF
module VX_fpu_fpnew
#(
module VX_fpu_fpnew #(
parameter TAGW = 1,
parameter FMULADD = 1,
parameter FDIVSQRT = 1,

View file

@ -12,9 +12,11 @@ interface VX_gpu_req_if();
wire [31:0] PC;
wire [31:0] next_PC;
wire [`INST_GPU_BITS-1:0] op_type;
wire [`INST_MOD_BITS-1:0] op_mod;
wire [`NT_BITS-1:0] tid;
wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [31:0] rs2_data;
wire [`NUM_THREADS-1:0][31:0] rs2_data;
wire [`NUM_THREADS-1:0][31:0] rs3_data;
wire [`NR_BITS-1:0] rd;
wire wb;
@ -27,9 +29,11 @@ interface VX_gpu_req_if();
output PC,
output next_PC,
output op_type,
output op_mod,
output tid,
output rs1_data,
output rs2_data,
output rs3_data,
output rd,
output wb,
input ready
@ -42,9 +46,11 @@ interface VX_gpu_req_if();
input PC,
input next_PC,
input op_type,
input op_mod,
input tid,
input rs1_data,
input rs2_data,
input rs3_data,
input rd,
input wb,
output ready

View file

@ -76,20 +76,6 @@ interface VX_ibuffer_if ();
input wid_n,
output ready
);
modport scoreboard (
input valid,
input wid,
input PC,
input wb,
input rd,
input rd_n,
input rs1_n,
input rs2_n,
input rs3_n,
input wid_n,
output ready
);
endinterface

View file

@ -0,0 +1,26 @@
`ifndef VX_TEX_CSR_IF
`define VX_TEX_CSR_IF
`include "VX_define.vh"
// Texture CSR write channel: carries a write strobe together with the CSR
// address and the 32-bit value being written. The channel has no
// back-pressure signal; the master drives all three wires and the slave
// only observes them.
interface VX_tex_csr_if ();

    wire                        write_enable;   // write strobe
    wire [`CSR_ADDR_BITS-1:0]   write_addr;     // CSR address being written
    wire [31:0]                 write_data;     // value being written

    // producer side: drives every signal
    modport master (
        output write_enable, write_addr, write_data
    );

    // consumer side: observes every signal
    modport slave (
        input write_enable, write_addr, write_data
    );

endinterface
`endif

View file

@ -0,0 +1,51 @@
`ifndef VX_TEX_REQ_IF
`define VX_TEX_REQ_IF
`include "VX_define.vh"
// Texture request channel (valid/ready handshake).
// The master drives the request payload and `valid`; the slave answers
// with `ready`.
interface VX_tex_req_if ();

    wire                                valid;  // request is present
    wire [`NW_BITS-1:0]                 wid;    // warp id
    wire [`NUM_THREADS-1:0]             tmask;  // per-thread enable mask
    wire [31:0]                         PC;     // program counter of the request
    wire [`NR_BITS-1:0]                 rd;     // destination register
    wire                                wb;     // writeback flag
    wire [`NTEX_BITS-1:0]               unit;   // texture unit selector
    wire [1:0][`NUM_THREADS-1:0][31:0]  coords; // two coordinate components per thread
    wire [`NUM_THREADS-1:0][31:0]       lod;    // per-thread level-of-detail word
    wire                                ready;  // back-pressure from the slave

    // requester side
    modport master (
        output valid, wid, tmask, PC, rd, wb, unit, coords, lod,
        input  ready
    );

    // texture-unit side
    modport slave (
        input  valid, wid, tmask, PC, rd, wb, unit, coords, lod,
        output ready
    );

endinterface
`endif

View file

@ -0,0 +1,43 @@
`ifndef VX_TEX_RSP_IF
`define VX_TEX_RSP_IF
`include "VX_define.vh"
// Texture response channel (valid/ready handshake).
// The master drives the response payload and `valid`; the slave answers
// with `ready`.
interface VX_tex_rsp_if ();

    wire                            valid;  // response is present
    wire [`NW_BITS-1:0]             wid;    // warp id
    wire [`NUM_THREADS-1:0]         tmask;  // per-thread enable mask
    wire [31:0]                     PC;     // program counter of the originating request
    wire [`NR_BITS-1:0]             rd;     // destination register
    wire                            wb;     // writeback flag
    wire [`NUM_THREADS-1:0][31:0]   data;   // one 32-bit result per thread
    wire                            ready;  // back-pressure from the slave

    // producer side
    modport master (
        output valid, wid, tmask, PC, rd, wb, data,
        input  ready
    );

    // consumer side
    modport slave (
        input  valid, wid, tmask, PC, rd, wb, data,
        output ready
    );

endinterface
`endif

View file

@ -36,15 +36,6 @@ interface VX_writeback_if ();
output ready
);
modport scoreboard (
input valid,
input wid,
input PC,
input rd,
input eop,
output ready
);
endinterface
`endif

View file

@ -93,13 +93,13 @@ module VX_scope #(
CMD_SET_START: begin
delay_val <= $bits(delay_val)'(cmd_data);
cmd_start <= 1;
`ifdef DBG_PRINT_SCOPE
`ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: CMD_SET_START: delay_val=%0d\n", $time, $bits(delay_val)'(cmd_data));
`endif
end
CMD_SET_STOP: begin
waddr_end <= $bits(waddr)'(cmd_data);
`ifdef DBG_PRINT_SCOPE
`ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: CMD_SET_STOP: waddr_end=%0d\n", $time, $bits(waddr)'(cmd_data));
`endif
end
@ -116,7 +116,7 @@ module VX_scope #(
delta <= 0;
delay_cntr <= 0;
start_time <= timestamp;
`ifdef DBG_PRINT_SCOPE
`ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp);
`endif
end else begin
@ -132,7 +132,7 @@ module VX_scope #(
recording <= 1;
delta <= 0;
start_time <= timestamp;
`ifdef DBG_PRINT_SCOPE
`ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp);
`endif
end
@ -161,7 +161,7 @@ module VX_scope #(
if (stop
|| (waddr >= waddr_end)) begin
`ifdef DBG_PRINT_SCOPE
`ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: recording stop - waddr=(%0d, %0d)\n", $time, waddr, waddr_end);
`endif
waddr <= waddr; // keep last address
@ -229,7 +229,7 @@ module VX_scope #(
assign bus_out = bus_out_r;
`ifdef DBG_PRINT_SCOPE
`ifdef DBG_TRACE_SCOPE
always @(posedge clk) begin
if (bus_read) begin
dpi_trace("%d: scope-read: cmd=%0d, addr=%0d, value=%0h\n", $time, get_cmd, raddr, bus_out);

View file

@ -0,0 +1,178 @@
`include "VX_tex_define.vh"
// Texture address generator.
//
// Two-stage pipeline that turns per-lane fixed-point coordinates into the
// four byte addresses of a texel quad:
//   stage 0: apply the wrap mode to the (optionally half-texel shifted)
//            coordinates and compute the per-lane row pitch and mip base
//            address;
//   stage 1: scale the wrapped coordinates to the mip dimensions, derive the
//            blend weights and form the four addresses.
// Both stages stall together whenever the output is valid but not accepted.
//
// Parameters:
//   CORE_ID   - core identifier, used only in trace output
//   REQ_INFOW - width of the opaque request info carried through the pipe
//   NUM_REQS  - number of lanes processed in parallel
module VX_tex_addr #(
    parameter CORE_ID   = 0,
    parameter REQ_INFOW = 1,
    parameter NUM_REQS  = 1
) (
    input wire clk,
    input wire reset,

    // inputs
    input wire                                          req_valid,
    input wire [NUM_REQS-1:0]                           req_tmask,
    input wire [1:0][NUM_REQS-1:0][31:0]                req_coords,
    input wire [`TEX_FORMAT_BITS-1:0]                   req_format,
    input wire [`TEX_FILTER_BITS-1:0]                   req_filter,
    input wire [1:0][`TEX_WRAP_BITS-1:0]                req_wraps,
    input wire [`TEX_ADDR_BITS-1:0]                     req_baseaddr,
    input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0]     req_mipoff,
    input wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0]   req_logdims,
    input wire [REQ_INFOW-1:0]                          req_info,
    output wire                                         req_ready,

    // outputs
    output wire                                         rsp_valid,
    output wire [NUM_REQS-1:0]                          rsp_tmask,
    output wire [`TEX_FILTER_BITS-1:0]                  rsp_filter,
    output wire [`TEX_STRIDE_BITS-1:0]                  rsp_stride,
    output wire [NUM_REQS-1:0][3:0][31:0]               rsp_addr,
    output wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0]    rsp_blends,
    output wire [REQ_INFOW-1:0]                         rsp_info,
    input wire                                          rsp_ready
);
    `UNUSED_PARAM (CORE_ID)

    localparam PITCH_BITS = `MAX(`TEX_DIM_BITS, `TEX_STRIDE_BITS) + 1;
    localparam SCALED_U_W = `FIXED_INT + `TEX_STRIDE_BITS;
    localparam SCALED_X_W = (2 * `FIXED_INT);
    localparam SCALED_V_W = SCALED_X_W + `TEX_STRIDE_BITS;

    wire                        valid_s0;
    wire [NUM_REQS-1:0]         tmask_s0;
    wire [`TEX_FILTER_BITS-1:0] filter_s0;
    wire [REQ_INFOW-1:0]        req_info_s0;

    wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0]   clamped_lo, clamped_lo_s0;
    wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0]   clamped_hi, clamped_hi_s0;
    wire [`TEX_STRIDE_BITS-1:0]                 log_stride, log_stride_s0;
    wire [NUM_REQS-1:0][31:0]                   mip_addr, mip_addr_s0;
    wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] log_dims_s0;
    wire [NUM_REQS-1:0][PITCH_BITS-1:0]         log_pitch, log_pitch_s0;

    wire stall_out;

    // stride

    VX_tex_stride #(
        .CORE_ID (CORE_ID)
    ) tex_stride (
        .format     (req_format),
        .log_stride (log_stride)
    );

    // addressing mode

    for (genvar i = 0; i < NUM_REQS; ++i) begin
        for (genvar j = 0; j < 2; ++j) begin
            // half of one texel at this mip level, in fixed-point fraction units
            wire [`FIXED_FRAC-1:0] delta = (`FIXED_HALF >> req_logdims[i][j]);
            // with filtering enabled, sample half a texel on either side
            wire [31:0] coord_lo = req_filter ? (req_coords[j][i] - 32'(delta)) : req_coords[j][i];
            wire [31:0] coord_hi = req_filter ? (req_coords[j][i] + 32'(delta)) : req_coords[j][i];

            VX_tex_wrap #(
                .CORE_ID (CORE_ID)
            ) tex_wrap_lo (
                .wrap_i  (req_wraps[j]),
                .coord_i (coord_lo),
                .coord_o (clamped_lo[i][j])
            );

            VX_tex_wrap #(
                .CORE_ID (CORE_ID)
            ) tex_wrap_hi (
                .wrap_i  (req_wraps[j]),
                .coord_i (coord_hi),
                .coord_o (clamped_hi[i][j])
            );
        end
        // log2 of the row pitch in bytes: log2(dimension 0) + log2(texel size)
        assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride);
        assign mip_addr[i]  = req_baseaddr + 32'(req_mipoff[i]);
    end

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)),
        .RESETW (1)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({req_valid, req_tmask, req_filter, log_stride,    req_info,    log_pitch,    req_logdims, mip_addr,    clamped_lo,    clamped_hi}),
        .data_out ({valid_s0,  tmask_s0,  filter_s0,  log_stride_s0, req_info_s0, log_pitch_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0})
    );

    // addresses generation

    wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0]  scaled_lo;
    wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0]  scaled_hi;
    wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] blends;
    wire [NUM_REQS-1:0][3:0][31:0]            addr;

    for (genvar i = 0; i < NUM_REQS; ++i) begin
        for (genvar j = 0; j < 2; ++j) begin
            assign scaled_lo[i][j] = scale_to_dim(clamped_lo_s0[i][j], log_dims_s0[i][j]);
            assign scaled_hi[i][j] = scale_to_dim(clamped_hi_s0[i][j], log_dims_s0[i][j]);
            // NOTE(review): the blend weight is taken from the low BLEND_FRAC
            // bits of the unscaled wrapped coordinate; confirm this is the
            // intended fractional position relative to the scaled texel index.
            assign blends[i][j]    = filter_s0 ? clamped_lo_s0[i][j][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0);
        end
    end

    `UNUSED_VAR (log_pitch_s0)

    for (genvar i = 0; i < NUM_REQS; ++i) begin
        // byte offsets within a row (component 0) and of the row itself (component 1)
        wire [SCALED_U_W-1:0] offset_u_lo = SCALED_U_W'(scaled_lo[i][0]) << log_stride_s0;
        wire [SCALED_U_W-1:0] offset_u_hi = SCALED_U_W'(scaled_hi[i][0]) << log_stride_s0;

        wire [SCALED_V_W-1:0] offset_v_lo = SCALED_V_W'(scaled_lo[i][1]) << log_pitch_s0[i];
        wire [SCALED_V_W-1:0] offset_v_hi = SCALED_V_W'(scaled_hi[i][1]) << log_pitch_s0[i];

        wire [31:0] base_addr_lo = mip_addr_s0[i] + 32'(offset_v_lo);
        wire [31:0] base_addr_hi = mip_addr_s0[i] + 32'(offset_v_hi);

        // quad order: {row lo, col lo}, {row lo, col hi}, {row hi, col lo}, {row hi, col hi}
        assign addr[i][0] = base_addr_lo + 32'(offset_u_lo);
        assign addr[i][1] = base_addr_lo + 32'(offset_u_hi);
        assign addr[i][2] = base_addr_hi + 32'(offset_u_lo);
        assign addr[i][3] = base_addr_hi + 32'(offset_u_hi);
    end

    assign stall_out = rsp_valid && ~rsp_ready;

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFOW),
        .RESETW (1)
    ) pipe_reg1 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({valid_s0,  tmask_s0,  filter_s0,  log_stride_s0, addr,     blends,     req_info_s0}),
        .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride,    rsp_addr, rsp_blends, rsp_info})
    );

    // can accept a new request whenever the pipeline is not stalled
    assign req_ready = ~stall_out;

`ifdef DBG_TRACE_TEX
    wire [`NW_BITS-1:0] rsp_wid;
    wire [31:0]         rsp_PC;
    assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];

    always @(posedge clk) begin
        if (rsp_valid && rsp_ready) begin
            dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, stride=%0d, addr=",
                    $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride);
            `TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS);
            dpi_trace("\n");
        end
    end
`endif

    // Scale a fractional coordinate to a power-of-two dimension: shift left by
    // log2(dimension) and return the integer part of the fixed-point result.
    function logic [`FIXED_INT-1:0] scale_to_dim (input logic [`FIXED_FRAC-1:0] src,
                                                  input logic [`TEX_DIM_BITS-1:0] dim);
    `IGNORE_WARNINGS_BEGIN
        logic [`FIXED_BITS-1:0] out;
    `IGNORE_WARNINGS_END
        out = `FIXED_BITS'(src) << dim;
        return out[`FIXED_FRAC +: `FIXED_INT];
    endfunction

endmodule

View file

@ -0,0 +1,39 @@
`ifndef VX_TEX_DEFINE
`define VX_TEX_DEFINE
`include "VX_define.vh"
// Fixed-point format used for texture coordinates: FIXED_BITS total bits,
// of which FIXED_FRAC are fractional and the remaining FIXED_INT are integer.
`define FIXED_BITS 32
`define FIXED_FRAC 20
`define FIXED_INT (`FIXED_BITS - `FIXED_FRAC)
// 1.0, 0.5 and the fraction mask in that fixed-point format
`define FIXED_ONE (2 ** `FIXED_FRAC)
`define FIXED_HALF (`FIXED_ONE >> 1)
`define FIXED_MASK (`FIXED_ONE - 1)
// Bit widths of the texture state fields
`define TEX_ADDR_BITS 32
`define TEX_FORMAT_BITS 3
`define TEX_WRAP_BITS 2
`define TEX_DIM_BITS 4
`define TEX_FILTER_BITS 1
`define TEX_MIPOFF_BITS (2*12+1)
`define TEX_STRIDE_BITS 2
`define TEX_LOD_BITS 4
`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS)
// Wrap mode encodings
`define TEX_WRAP_CLAMP 0
`define TEX_WRAP_REPEAT 1
`define TEX_WRAP_MIRROR 2
// Number of fractional bits in a blend weight, and the weight representing 1.0
`define BLEND_FRAC 8
`define BLEND_ONE (2 ** `BLEND_FRAC)
// Texel format encodings (TEX_FORMAT_BITS wide)
`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0)
`define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1)
`define TEX_FORMAT_R4G4B4A4 `TEX_FORMAT_BITS'(2)
`define TEX_FORMAT_L8A8 `TEX_FORMAT_BITS'(3)
`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(4)
`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(5)
`endif

View file

@ -0,0 +1,58 @@
`include "VX_tex_define.vh"
// Texel format expander (purely combinational).
//
// Converts a raw texel word in the selected source format into a 32-bit
// value with four 8-bit channels laid out as
//   texel_out[7:0], [15:8], [23:16], [31:24].
// Narrow channels are widened by replicating their most significant bits
// into the low bits; channels missing from the source format are filled
// with 8'hff or 0 as coded per format below.
//
// Parameters:
//   CORE_ID - core identifier (unused here)
module VX_tex_format #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_FORMAT_BITS-1:0]   format,
    input wire [31:0]                   texel_in,
    output wire [31:0]                  texel_out
);
    `UNUSED_PARAM (CORE_ID)

    reg [31:0] texel_out_r;

    always @(*) begin
        case (format)
        `TEX_FORMAT_R8G8B8A8: begin
            // already four 8-bit channels: pass through
            texel_out_r[07:00] = texel_in[7:0];
            texel_out_r[15:08] = texel_in[15:8];
            texel_out_r[23:16] = texel_in[23:16];
            texel_out_r[31:24] = texel_in[31:24];
        end
        `TEX_FORMAT_R5G6B5: begin
            // 5/6/5-bit channels widened with their own top bits
            texel_out_r[07:00] = {texel_in[15:11], texel_in[15:13]};
            texel_out_r[15:08] = {texel_in[10:5], texel_in[10:9]};
            texel_out_r[23:16] = {texel_in[4:0], texel_in[4:2]};
            texel_out_r[31:24] = 8'hff;
        end
        `TEX_FORMAT_R4G4B4A4: begin
            // every 4-bit channel is widened by replicating its own nibble;
            // the first channel previously took its low nibble from bits
            // [15:12], which belong to the last channel
            texel_out_r[07:00] = {2{texel_in[11:8]}};
            texel_out_r[15:08] = {2{texel_in[7:4]}};
            texel_out_r[23:16] = {2{texel_in[3:0]}};
            texel_out_r[31:24] = {2{texel_in[15:12]}};
        end
        `TEX_FORMAT_L8A8: begin
            // low byte broadcast to the first three channels, high byte to the last
            texel_out_r[07:00] = texel_in[7:0];
            texel_out_r[15:08] = texel_in[7:0];
            texel_out_r[23:16] = texel_in[7:0];
            texel_out_r[31:24] = texel_in[15:8];
        end
        `TEX_FORMAT_L8: begin
            // single byte broadcast to the first three channels, last channel saturated
            texel_out_r[07:00] = texel_in[7:0];
            texel_out_r[15:08] = texel_in[7:0];
            texel_out_r[23:16] = texel_in[7:0];
            texel_out_r[31:24] = 8'hff;
        end
        //`TEX_FORMAT_A8
        default: begin
            // single byte goes to the last channel, the others are zero
            texel_out_r[07:00] = 0;
            texel_out_r[15:08] = 0;
            texel_out_r[23:16] = 0;
            texel_out_r[31:24] = texel_in[7:0];
        end
        endcase
    end

    assign texel_out = texel_out_r;

endmodule

View file

@ -0,0 +1,16 @@
`include "VX_tex_define.vh"
// Per-channel weighted sum of two 4x8-bit texels (purely combinational).
// For every 8-bit channel: out = (in1 * alpha + in2 * beta) >> 8, keeping
// bits [15:8] of the 17-bit sum.
module VX_tex_lerp (
    input wire [3:0][7:0]   in1,
    input wire [3:0][7:0]   in2,
    input wire [8:0]        alpha,  // weight applied to in1
    input wire [7:0]        beta,   // weight applied to in2
    output wire [3:0][7:0]  out
);
    for (genvar ch = 0; ch < 4; ++ch) begin
        // the whole expression is evaluated at the 17-bit width of the target
        wire [16:0] weighted_sum;
        assign weighted_sum = in1[ch] * alpha + in2[ch] * beta;
        `UNUSED_VAR (weighted_sum)

        // drop the 8 fractional bits of the weights
        assign out[ch] = weighted_sum[15:8];
    end

endmodule

View file

@ -0,0 +1,295 @@
`include "VX_tex_define.vh"
// Texture memory stage.
// Queues incoming quad addresses, issues the texel reads to the data cache
// one quad index at a time (index 0 only when filtering is off, 0..3 when it
// is on), accumulates the returned words per quad index and lane, and emits
// the assembled texels once the last expected response has arrived.
//
// Parameters:
//   CORE_ID   - core identifier, used only in trace output
//   REQ_INFOW - width of the opaque request info carried alongside the data
//   NUM_REQS  - number of lanes
module VX_tex_mem #(
parameter CORE_ID = 0,
parameter REQ_INFOW = 1,
parameter NUM_REQS = 1
) (
input wire clk,
input wire reset,
// memory interface
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
// inputs
input wire req_valid,
input wire [NUM_REQS-1:0] req_tmask,
input wire [`TEX_FILTER_BITS-1:0] req_filter,
input wire [`TEX_STRIDE_BITS-1:0] req_stride,
input wire [NUM_REQS-1:0][3:0][31:0] req_addr,
input wire [REQ_INFOW-1:0] req_info,
output wire req_ready,
// outputs
output wire rsp_valid,
output wire [NUM_REQS-1:0] rsp_tmask,
output wire [NUM_REQS-1:0][3:0][31:0] rsp_data,
output wire [REQ_INFOW-1:0] rsp_info,
input wire rsp_ready
);
`UNUSED_PARAM (CORE_ID)
// wide enough to count up to NUM_REQS * 4 outstanding responses
localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1);
wire [3:0] dup_reqs;
wire [3:0][NUM_REQS-1:0][29:0] req_addr_w;
wire [3:0][NUM_REQS-1:0][1:0] align_offs;
// reorder address into quads
// (split each byte address into a 30-bit word address and a 2-bit byte offset)
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 4; ++j) begin
assign req_addr_w[j][i] = req_addr[i][j][31:2];
assign align_offs[j][i] = req_addr[i][j][1:0];
end
end
// find duplicate addresses
// (a quad index is a duplicate when lane 0 is active and every active lane
// targets the same word address as lane 0)
for (genvar i = 0; i < 4; ++i) begin
wire [NUM_REQS-1:0] addr_matches;
for (genvar j = 0; j < NUM_REQS; j++) begin
assign addr_matches[j] = (req_addr_w[i][0] == req_addr_w[i][j]) || ~req_tmask[j];
end
assign dup_reqs[i] = req_tmask[0] && (& addr_matches);
end
// save request addresses into fifo
wire reqq_push, reqq_pop, reqq_empty, reqq_full;
wire [3:0][NUM_REQS-1:0][29:0] q_req_addr;
wire [NUM_REQS-1:0] q_req_tmask;
wire [`TEX_FILTER_BITS-1:0] q_req_filter;
wire [REQ_INFOW-1:0] q_req_info;
wire [`TEX_STRIDE_BITS-1:0] q_req_stride;
wire [3:0][NUM_REQS-1:0][1:0] q_align_offs;
wire [3:0] q_dup_reqs;
assign reqq_push = req_valid && req_ready;
VX_fifo_queue #(
.DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4),
.SIZE (`LSUQ_SIZE),
.OUT_REG (1)
) req_queue (
.clk (clk),
.reset (reset),
.push (reqq_push),
.pop (reqq_pop),
.data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}),
.data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}),
.empty (reqq_empty),
.full (reqq_full),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
// can take more requests?
assign req_ready = ~reqq_full;
///////////////////////////////////////////////////////////////////////////
wire req_texel_valid;
wire sent_all_ready, last_texel_sent;
wire req_texel_dup;
wire [NUM_REQS-1:0][29:0] req_texel_addr;
reg [1:0] req_texel_idx;
reg req_texels_done;
// quad index currently being issued; advances once all lanes of the
// current index have been sent and wraps after the last one
always @(posedge clk) begin
if (reset || last_texel_sent) begin
req_texel_idx <= 0;
end else if (req_texel_valid && sent_all_ready) begin
req_texel_idx <= req_texel_idx + 1;
end
end
// set once every quad index of the head entry has been issued; cleared
// when that entry is popped from the queue
always @(posedge clk) begin
if (reset || reqq_pop) begin
req_texels_done <= 0;
end else if (last_texel_sent) begin
req_texels_done <= 1;
end
end
assign req_texel_valid = ~reqq_empty && ~req_texels_done;
assign req_texel_addr = q_req_addr[req_texel_idx];
assign req_texel_dup = q_dup_reqs[req_texel_idx];
// four texels are fetched per lane when filtering, otherwise only one
wire is_last_texel = (req_texel_idx == (q_req_filter ? 3 : 0));
assign last_texel_sent = req_texel_valid && sent_all_ready && is_last_texel;
// DCache Request
reg [NUM_REQS-1:0] texel_sent_mask;
wire [NUM_REQS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
wire dcache_req_fire_any = (| dcache_req_fire);
// every lane is ready, already sent or masked off; or, for a duplicate
// index, lane 0 alone is ready
assign sent_all_ready = (&(dcache_req_if.ready | texel_sent_mask | ~q_req_tmask))
|| (req_texel_dup & dcache_req_if.ready[0]);
// remembers which lanes of the current quad index were already accepted
always @(posedge clk) begin
if (reset || sent_all_ready) begin
texel_sent_mask <= 0;
end else begin
texel_sent_mask <= texel_sent_mask | dcache_req_fire;
end
end
// for a duplicate index only lane 0 issues a request
wire [NUM_REQS-1:0] req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1};
assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask;
assign dcache_req_if.rw = {NUM_REQS{1'b0}};
assign dcache_req_if.addr = req_texel_addr;
assign dcache_req_if.byteen = {NUM_REQS{4'b1111}};
assign dcache_req_if.data = 'x;
// the quad index travels in the low bits of the tag and is recovered
// from the response tag below
`ifdef DBG_CACHE_REQ_INFO
assign dcache_req_if.tag = {NUM_REQS{q_req_info[`DBG_CACHE_REQ_MDATAW-1:0], req_texel_idx}};
`else
assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}};
`endif
// Dcache Response
reg [3:0][NUM_REQS-1:0][31:0] rsp_texels, rsp_texels_n;
wire [NUM_REQS-1:0][3:0][31:0] rsp_texels_qual;
reg [NUM_REQS-1:0][31:0] rsp_data_qual;
reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init;
wire [RSP_CTR_W-1:0] rsp_rem_ctr_n;
wire dcache_rsp_fire;
wire [1:0] rsp_texel_idx;
wire rsp_texel_dup;
assign rsp_texel_idx = dcache_rsp_if.tag[1:0];
`UNUSED_VAR (dcache_rsp_if.tag)
assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx];
assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
for (genvar i = 0; i < NUM_REQS; i++) begin
// NOTE(review): for a duplicate index only lane 0 issues a request (see
// req_dup_mask), yet lanes i > 0 still AND the lane-0 data with
// dcache_rsp_if.tmask[i]. Confirm the cache response sets tmask for those
// lanes; otherwise they accumulate zero.
wire [31:0] src_mask = {32{dcache_rsp_if.tmask[i]}};
wire [31:0] src_data = ((i == 0 || rsp_texel_dup) ? dcache_rsp_if.data[0] : dcache_rsp_if.data[i]) & src_mask;
reg [31:0] rsp_data_shifted;
// move the addressed halfword / byte down to bit 0 using the saved
// 2-bit byte offset of this lane
always @(*) begin
rsp_data_shifted[31:16] = src_data[31:16];
rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0];
rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0];
end
// keep 8, 16 or 32 bits depending on the texel stride (zero-extended)
always @(*) begin
case (q_req_stride)
0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]);
1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]);
default: rsp_data_qual[i] = rsp_data_shifted;
endcase
end
end
// merge the new response into the accumulated texels of its quad index;
// OR-ing relies on rsp_texels being cleared on every pop
always @(*) begin
rsp_texels_n = rsp_texels;
rsp_texels_n[rsp_texel_idx] |= rsp_data_qual;
end
always @(posedge clk) begin
if (reset || reqq_pop) begin
rsp_texels <= '0;
end else if (dcache_rsp_fire) begin
rsp_texels <= rsp_texels_n;
end
end
// number of lane responses expected for the head entry: one per active
// lane (or a single one for a duplicate index), over one or four indices
always @(*) begin
rsp_rem_ctr_init = RSP_CTR_W'($countones(q_dup_reqs[0] ? NUM_REQS'(1) : q_req_tmask));
if (q_req_filter) begin
for (integer i = 1; i < 4; ++i) begin
rsp_rem_ctr_init += RSP_CTR_W'($countones(q_dup_reqs[i] ? NUM_REQS'(1) : q_req_tmask));
end
end
end
assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'($countones(dcache_rsp_if.tmask));
// loaded on the first cache request fired while the counter is zero, then
// decremented by the number of lanes in each response
always @(posedge clk) begin
if (reset) begin
rsp_rem_ctr <= 0;
end else begin
if (dcache_req_fire_any && 0 == rsp_rem_ctr) begin
rsp_rem_ctr <= rsp_rem_ctr_init;
end else if (dcache_rsp_fire) begin
rsp_rem_ctr <= rsp_rem_ctr_n;
end
end
end
// transpose back from [quad index][lane] to [lane][quad index]
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 4; ++j) begin
assign rsp_texels_qual[i][j] = rsp_texels_n[j][i];
end
end
wire stall_out = rsp_valid && ~rsp_ready;
wire is_last_rsp = (0 == rsp_rem_ctr_n);
wire rsp_texels_done = dcache_rsp_fire && is_last_rsp;
assign reqq_pop = rsp_texels_done && ~stall_out;
VX_pipe_register #(
.DATAW (1 + NUM_REQS + REQ_INFOW + (4 * NUM_REQS * 32)),
.RESETW (1)
) rsp_pipe_reg (
.clk (clk),
.reset (reset),
.enable (~stall_out),
.data_in ({rsp_texels_done, q_req_tmask, q_req_info, rsp_texels_qual}),
.data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data})
);
// Can accept new cache response?
// (only the final response is held back while the output register is stalled)
assign dcache_rsp_if.ready = ~(is_last_rsp && stall_out);
`ifdef DBG_TRACE_TEX
wire [`NW_BITS-1:0] q_req_wid, req_wid, rsp_wid;
wire [31:0] q_req_PC, req_PC, rsp_PC;
assign {q_req_wid, q_req_PC} = q_req_info[`NW_BITS+32-1:0];
assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0];
assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];
always @(posedge clk) begin
if (dcache_req_fire_any) begin
dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=",
$time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx);
`TRACE_ARRAY1D(req_texel_addr, NUM_REQS);
dpi_trace(", is_dup=%b\n", req_texel_dup);
end
if (dcache_rsp_fire) begin
dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=",
$time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx);
`TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS);
dpi_trace("\n");
end
if (req_valid && req_ready) begin
dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=",
$time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride);
`TRACE_ARRAY2D(req_addr, 4, NUM_REQS);
dpi_trace("\n");
end
if (rsp_valid && rsp_ready) begin
dpi_trace("%d: core%0d-tex-mem-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
$time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask);
`TRACE_ARRAY2D(rsp_data, 4, NUM_REQS);
dpi_trace("\n");
end
end
`endif
endmodule

View file

@ -0,0 +1,146 @@
`include "VX_tex_define.vh"
// Texture sampler.
// Two-stage pipeline that expands the four fetched texels of every lane to
// 8-bit channels, blends them pairwise with the first blend weight
// (stage 0) and then blends the two intermediate results with the second
// blend weight (stage 1). Both stages stall together whenever the output is
// valid but not accepted.
//
// Parameters:
//   CORE_ID   - core identifier, used only in trace output
//   REQ_INFOW - width of the opaque request info carried through the pipe
//   NUM_REQS  - number of lanes
module VX_tex_sampler #(
parameter CORE_ID = 0,
parameter REQ_INFOW = 1,
parameter NUM_REQS = 1
) (
input wire clk,
input wire reset,
// inputs
input wire req_valid,
input wire [NUM_REQS-1:0] req_tmask,
input wire [`TEX_FORMAT_BITS-1:0] req_format,
input wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] req_blends,
input wire [NUM_REQS-1:0][3:0][31:0] req_data,
input wire [REQ_INFOW-1:0] req_info,
output wire req_ready,
// outputs
output wire rsp_valid,
output wire [NUM_REQS-1:0] rsp_tmask,
output wire [NUM_REQS-1:0][31:0] rsp_data,
output wire [REQ_INFOW-1:0] rsp_info,
input wire rsp_ready
);
`UNUSED_PARAM (CORE_ID)
wire valid_s0;
wire [NUM_REQS-1:0] tmask_s0;
wire [REQ_INFOW-1:0] req_info_s0;
wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh;
wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0;
wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v, blend_v_s0;
wire [NUM_REQS-1:0][31:0] texel_v;
wire stall_out;
// stage 0: format conversion and first-axis blend per lane
for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [3:0][31:0] fmt_texels;
for (genvar j = 0; j < 4; ++j) begin
VX_tex_format #(
.CORE_ID (CORE_ID)
) tex_format (
.format (req_format),
.texel_in (req_data[i][j]),
.texel_out (fmt_texels[j])
);
end
// complementary weights: alpha + beta == BLEND_ONE
wire [7:0] beta = req_blends[i][0];
wire [8:0] alpha = `BLEND_ONE - beta;
// blend texels 0 and 1
VX_tex_lerp #(
) tex_lerp_ul (
.in1 (fmt_texels[0]),
.in2 (fmt_texels[1]),
.alpha (alpha),
.beta (beta),
.out (texel_ul[i])
);
// blend texels 2 and 3
VX_tex_lerp #(
) tex_lerp_uh (
.in1 (fmt_texels[2]),
.in2 (fmt_texels[3]),
.alpha (alpha),
.beta (beta),
.out (texel_uh[i])
);
// second-axis weight is used by the next stage
assign blend_v[i] = req_blends[i][1];
end
VX_pipe_register #(
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~stall_out),
.data_in ({req_valid, req_tmask, req_info, blend_v, texel_ul, texel_uh}),
.data_out ({valid_s0, tmask_s0, req_info_s0, blend_v_s0, texel_ul_s0, texel_uh_s0})
);
// stage 1: second-axis blend of the two intermediate texels
for (genvar i = 0; i < NUM_REQS; i++) begin
wire [7:0] beta = blend_v_s0[i];
wire [8:0] alpha = `BLEND_ONE - beta;
VX_tex_lerp #(
) tex_lerp_v (
.in1 (texel_ul_s0[i]),
.in2 (texel_uh_s0[i]),
.alpha (alpha),
.beta (beta),
.out (texel_v[i])
);
end
assign stall_out = rsp_valid && ~rsp_ready;
VX_pipe_register #(
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 32)),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~stall_out),
.data_in ({valid_s0, tmask_s0, req_info_s0, texel_v}),
.data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data})
);
// can accept new request?
assign req_ready = ~stall_out;
`ifdef DBG_TRACE_TEX
wire [`NW_BITS-1:0] req_wid, rsp_wid;
wire [31:0] req_PC, rsp_PC;
assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0];
assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];
always @(posedge clk) begin
if (req_valid && req_ready) begin
dpi_trace("%d: core%0d-tex-sampler-req: wid=%0d, PC=%0h, tmask=%b, format=%0d, data=",
$time, CORE_ID, req_wid, req_PC, req_tmask, req_format);
`TRACE_ARRAY2D(req_data, 4, NUM_REQS);
// NOTE(review): req_blends is indexed [lane][axis], so req_blends[0] and
// req_blends[1] are the two weights of lanes 0 and 1, not the per-lane
// weights of one axis; passing NUM_REQS as the element count looks
// mismatched - confirm against the TRACE_ARRAY1D definition.
dpi_trace(", u0=");
`TRACE_ARRAY1D(req_blends[0], NUM_REQS);
dpi_trace(", v0=");
`TRACE_ARRAY1D(req_blends[1], NUM_REQS);
dpi_trace("\n");
end
if (rsp_valid && rsp_ready) begin
dpi_trace("%d: core%0d-tex-sampler-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
$time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask);
`TRACE_ARRAY1D(rsp_data, NUM_REQS);
dpi_trace("\n");
end
end
`endif
endmodule

View file

@ -0,0 +1,21 @@
`include "VX_platform.vh"
// Saturating narrower (purely combinational).
// Reduces an IN_W-bit value whose top bit acts as a sign flag to OUT_W bits:
//   - top bit set                      -> 0
//   - any bit in [IN_W-2:OUT_W] set    -> all ones
//   - otherwise                        -> the low OUT_W bits of data_in
//
// Parameters:
//   IN_W  - input width; must be greater than OUT_W + 1
//   OUT_W - output width
//   MODEL - 1 selects the mask-based form, any other value the compare-based form
module VX_tex_sat #(
    parameter IN_W  = 1,
    parameter OUT_W = 1,
    parameter MODEL = 1
) (
    input wire [IN_W-1:0]   data_in,
    output wire [OUT_W-1:0] data_out
);
    `STATIC_ASSERT(((OUT_W+1) < IN_W), ("invalid parameter"))

    if (MODEL == 1) begin
        // mask-based: force ones on overflow, then force zeros when negative
        wire is_negative = data_in[IN_W-1];
        wire is_overflow = (| data_in[IN_W-2:OUT_W]);

        wire [OUT_W-1:0] pass_mask = {OUT_W{~is_negative}};
        wire [OUT_W-1:0] ones_mask = {OUT_W{is_overflow}};

        assign data_out = pass_mask & (ones_mask | data_in[OUT_W-1:0]);
    end else begin
        // compare-based: same result expressed with a priority select
        wire [OUT_W-1:0] max_value = {OUT_W{1'b1}};
        wire [OUT_W-1:0] clipped   = (data_in > max_value) ? max_value : OUT_W'(data_in);

        assign data_out = data_in[IN_W-1] ? OUT_W'(0) : clipped;
    end

endmodule

View file

@ -0,0 +1,27 @@
`include "VX_tex_define.vh"
// Texel stride lookup (purely combinational).
// Maps a texture format to log2 of its texel size in bytes:
//   A8, L8                     -> 0
//   L8A8, R5G6B5, R4G4B4A4     -> 1
//   anything else (R8G8B8A8)   -> 2
//
// Parameters:
//   CORE_ID - core identifier (unused here)
module VX_tex_stride #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_FORMAT_BITS-1:0]   format,
    output wire [`TEX_STRIDE_BITS-1:0]  log_stride
);
    `UNUSED_PARAM (CORE_ID)

    reg [`TEX_STRIDE_BITS-1:0] stride_sel;

    always @(*) begin
        case (format)
            // one byte per texel
            `TEX_FORMAT_A8,
            `TEX_FORMAT_L8:         stride_sel = 0;
            // two bytes per texel
            `TEX_FORMAT_L8A8,
            `TEX_FORMAT_R5G6B5,
            `TEX_FORMAT_R4G4B4A4:   stride_sel = 1;
            // four bytes per texel (`TEX_FORMAT_R8G8B8A8)
            default:                stride_sel = 2;
        endcase
    end

    assign log_stride = stride_sel;

endmodule

View file

@ -0,0 +1,234 @@
`include "VX_tex_define.vh"
module VX_tex_unit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// Texture unit <-> Memory Unit
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
// Inputs
VX_tex_req_if.slave tex_req_if,
VX_tex_csr_if.slave tex_csr_if,
// Outputs
VX_tex_rsp_if.master tex_rsp_if
);
localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32;
localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S;
localparam REQ_INFOW_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFOW_A;
reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];
reg [1:0][`TEX_DIM_BITS-1:0] tex_dims [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];
reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0];
reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0];
reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0];
reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0];
// CSRs programming
reg [`NUM_TEX_UNITS-1:0] csrs_dirty;
`UNUSED_VAR (csrs_dirty)
for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin
wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS];
always @(posedge clk) begin
if (tex_csr_if.write_enable) begin
case (tex_csr_if.write_addr)
`CSR_TEX_ADDR(i) : begin
tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_FORMAT(i) : begin
tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_WRAP(i) : begin
tex_wraps[i][0] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS];
tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS];
csrs_dirty[i] <= 1;
end
`CSR_TEX_FILTER(i) : begin
tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_MIPOFF(i) : begin
tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_WIDTH(i) : begin
tex_dims[i][mip_level][0] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_HEIGHT(i) : begin
tex_dims[i][mip_level][1] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
csrs_dirty[i] <= 1;
end
endcase
end
if (reset || (tex_req_if.valid && tex_req_if.ready)) begin
csrs_dirty[i] <= '0;
end
end
end
// mipmap attributes
wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff;
wire [`NUM_THREADS-1:0][1:0][`TEX_DIM_BITS-1:0] sel_dims;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0];
wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS];
assign sel_mipoff[i] = tex_mipoff[unit][mip_level];
assign sel_dims[i] = tex_dims[unit][mip_level];
end
// address generation
wire mem_req_valid;
wire [`NUM_THREADS-1:0] mem_req_tmask;
wire [`TEX_FILTER_BITS-1:0] mem_req_filter;
wire [`TEX_STRIDE_BITS-1:0] mem_req_stride;
wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] mem_req_blends;
wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr;
wire [REQ_INFOW_A-1:0] mem_req_info;
wire mem_req_ready;
VX_tex_addr #(
.CORE_ID (CORE_ID),
.REQ_INFOW (REQ_INFOW_A),
.NUM_REQS (`NUM_THREADS)
) tex_addr (
.clk (clk),
.reset (reset),
.req_valid (tex_req_if.valid),
.req_tmask (tex_req_if.tmask),
.req_coords (tex_req_if.coords),
.req_format (tex_format[tex_req_if.unit]),
.req_filter (tex_filter[tex_req_if.unit]),
.req_wraps (tex_wraps[tex_req_if.unit]),
.req_baseaddr (tex_baddr[tex_req_if.unit]),
.req_mipoff (sel_mipoff),
.req_logdims (sel_dims),
.req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}),
.req_ready (tex_req_if.ready),
.rsp_valid (mem_req_valid),
.rsp_tmask (mem_req_tmask),
.rsp_filter (mem_req_filter),
.rsp_stride (mem_req_stride),
.rsp_addr (mem_req_addr),
.rsp_blends (mem_req_blends),
.rsp_info (mem_req_info),
.rsp_ready (mem_req_ready)
);
// retrieve texel values from memory
wire mem_rsp_valid;
wire [`NUM_THREADS-1:0] mem_rsp_tmask;
wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data;
wire [REQ_INFOW_M-1:0] mem_rsp_info;
wire mem_rsp_ready;
VX_tex_mem #(
.CORE_ID (CORE_ID),
.REQ_INFOW (REQ_INFOW_M),
.NUM_REQS (`NUM_THREADS)
) tex_mem (
.clk (clk),
.reset (reset),
// memory interface
.dcache_req_if (dcache_req_if),
.dcache_rsp_if (dcache_rsp_if),
// inputs
.req_valid (mem_req_valid),
.req_tmask (mem_req_tmask),
.req_filter(mem_req_filter),
.req_stride(mem_req_stride),
.req_addr (mem_req_addr),
.req_info ({mem_req_blends, mem_req_info}),
.req_ready (mem_req_ready),
// outputs
.rsp_valid (mem_rsp_valid),
.rsp_tmask (mem_rsp_tmask),
.rsp_data (mem_rsp_data),
.rsp_info (mem_rsp_info),
.rsp_ready (mem_rsp_ready)
);
// apply sampler
wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends;
wire [`TEX_FORMAT_BITS-1:0] rsp_format;
wire [REQ_INFOW_S-1:0] rsp_info;
assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info;
VX_tex_sampler #(
.CORE_ID (CORE_ID),
.REQ_INFOW (REQ_INFOW_S),
.NUM_REQS (`NUM_THREADS)
) tex_sampler (
.clk (clk),
.reset (reset),
// inputs
.req_valid (mem_rsp_valid),
.req_tmask (mem_rsp_tmask),
.req_data (mem_rsp_data),
.req_format (rsp_format),
.req_blends (rsp_blends),
.req_info (rsp_info),
.req_ready (mem_rsp_ready),
// outputs
.rsp_valid (tex_rsp_if.valid),
.rsp_tmask (tex_rsp_if.tmask),
.rsp_data (tex_rsp_if.data),
.rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}),
.rsp_ready (tex_rsp_if.ready)
);
`ifdef DBG_TRACE_TEX
always @(posedge clk) begin
if (tex_req_if.valid && tex_req_if.ready) begin
for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin
if (csrs_dirty[i]) begin
dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_mipoff[0]=%0h\n", $time, CORE_ID, i, tex_mipoff[i][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_width[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_height[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][1]);
end
end
dpi_trace("%d: core%0d-tex-req: wid=%0d, PC=%0h, tmask=%b, unit=%0d, lod=%0h, u=",
$time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod);
`TRACE_ARRAY1D(tex_req_if.coords[0], `NUM_THREADS);
dpi_trace(", v=");
`TRACE_ARRAY1D(tex_req_if.coords[1], `NUM_THREADS);
dpi_trace("\n");
end
if (tex_rsp_if.valid && tex_rsp_if.ready) begin
dpi_trace("%d: core%0d-tex-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
$time, CORE_ID, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.tmask);
`TRACE_ARRAY1D(tex_rsp_if.data, `NUM_THREADS);
dpi_trace("\n");
end
end
`endif
endmodule

View file

@ -0,0 +1,38 @@
`include "VX_tex_define.vh"
// Texture coordinate wrapping.
// Reduces a 32-bit coordinate to `FIXED_FRAC bits according to the wrap
// mode selected by wrap_i. The module is purely combinational.
module VX_tex_wrap #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_WRAP_BITS-1:0] wrap_i,   // wrap mode selector
    input wire [31:0]               coord_i,  // incoming coordinate
    output wire [`FIXED_FRAC-1:0]   coord_o   // wrapped coordinate
);
    `UNUSED_PARAM (CORE_ID)

    // CLAMP candidate: the full 32-bit coordinate narrowed to `FIXED_FRAC
    // bits by VX_tex_sat (presumably saturating; see that module).
    wire [`FIXED_FRAC-1:0] coord_clamp;

    VX_tex_sat #(
        .IN_W  (32),
        .OUT_W (`FIXED_FRAC)
    ) sat_fx (
        .data_in  (coord_i),
        .data_out (coord_clamp)
    );

    // REPEAT candidate: keep only the low `FIXED_FRAC bits.
    wire [`FIXED_FRAC-1:0] coord_repeat = coord_i[`FIXED_FRAC-1:0];

    // MIRROR candidate: invert the low bits whenever bit `FIXED_FRAC of the
    // input coordinate is set.
    wire [`FIXED_FRAC-1:0] coord_mirror = coord_repeat ^ {`FIXED_FRAC{coord_i[`FIXED_FRAC]}};

    reg [`FIXED_FRAC-1:0] coord_sel;

    // Any mode other than CLAMP or MIRROR falls back to REPEAT.
    always @(*) begin
        case (wrap_i)
        `TEX_WRAP_CLAMP:  coord_sel = coord_clamp;
        `TEX_WRAP_MIRROR: coord_sel = coord_mirror;
        default:          coord_sel = coord_repeat; // `TEX_WRAP_REPEAT
        endcase
    end

    assign coord_o = coord_sel;
endmodule

View file

@ -194,9 +194,9 @@
"issue_imm": 32,
"issue_use_pc": 1,
"issue_use_imm": 1,
"gpr_rsp_a":"`NUM_THREADS * 32",
"gpr_rsp_b":"`NUM_THREADS * 32",
"gpr_rsp_c":"`NUM_THREADS * 32",
"gpr_rs1":"`NUM_THREADS * 32",
"gpr_rs2":"`NUM_THREADS * 32",
"gpr_rs3":"`NUM_THREADS * 32",
"?writeback_valid": 1,
"writeback_wid":"`NW_BITS",
"writeback_pc": 32,
@ -205,7 +205,7 @@
"writeback_data":"`NUM_THREADS * 32",
"writeback_eop": 1,
"!scoreboard_delay": 1,
"!execute_delay": 1
"!dispatch_delay": 1
},
"afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": {
"?valid_st0": 1,

View file

@ -262,7 +262,7 @@ def expand_text(text, params):
has_func = do_repl.has_func
if not (params_updated or do_repl.expanded):
break
text = new_text
text = new_text
changed = True
if not has_func:
break

View file

@ -8,20 +8,21 @@ else
RUN_SYNTH=qsub-synth
endif
# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE
DBG_TRACE_FLAGS += -DDBG_TRACE_AVS
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
@ -33,7 +34,8 @@ CONFIG32 := -DNUM_CLUSTERS=4 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_
CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL3_CACHE_SIZE=524288 $(CONFIGS)
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)
RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/afu $(FPU_INCLUDE) $(TEX_INCLUDE)
CFLAGS += $(RTL_INCLUDE)

View file

@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE)
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -12,12 +12,12 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
# Part, Family
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
# Executable Configuration
SYN_ARGS = --parallel --read_settings_files=on

View file

@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -12,7 +12,8 @@ FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -12,7 +12,8 @@ DEVICE = 1SX280HN2F43E2VG
FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -12,7 +12,9 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
# Executable Configuration

View file

@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE)
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -1,46 +1,42 @@
PARAM += -DCACHE_SIZE=4096 -DCACHE_WORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DCACHE_NUM_BANKS=4 -DCACHE_CREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4
PARAMS += -DCACHE_SIZE=4096 -DCACHE_WORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DCACHE_NUM_BANKS=4 -DCACHE_CREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4
# control RTL debug tracing states
DBG_TRACE_FLAGS = -DDBG_TRACE_CORE_ICACHE \
-DDBG_TRACE_CORE_DCACHE \
-DDBG_TRACE_CACHE_BANK \
-DDBG_TRACE_CACHE_SNP \
-DDBG_TRACE_CACHE_MSHR \
-DDBG_TRACE_CACHE_TAG \
-DDBG_TRACE_CACHE_DATA \
-DDBG_TRACE_MEM \
-DDBG_TRACE_OPAE \
-DDBG_TRACE_AVS
# control RTL debug print states
DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \
-DDBG_PRINT_CORE_DCACHE \
-DDBG_PRINT_CACHE_BANK \
-DDBG_PRINT_CACHE_SNP \
-DDBG_PRINT_CACHE_MSHR \
-DDBG_PRINT_CACHE_TAG \
-DDBG_PRINT_CACHE_DATA \
-DDBG_PRINT_MEM \
-DDBG_PRINT_OPAE \
-DDBG_PRINT_AVS
#DBG_PRINT=$(DBG_PRINT_FLAGS)
#DBG_PRINT=$(DBG_TRACE_FLAGS)
INCLUDE = -I../../rtl/ -I../../rtl/cache -I../../rtl/libs
SRCS = cachesim.cpp testbench.cpp
all: build
CF += -std=c++11 -fms-extensions -I../..
CF += $(PARAMS)
VF += --language 1800-2009 --assert -Wall --trace #-Wpedantic
VF += -Wno-DECLFILENAME
VF += --x-initial unique
VF += -exe $(SRCS) $(INCLUDE)
DBG += -DVCD_OUTPUT $(DBG_PRINT)
VF += $(PARAMS)
gen:
verilator $(VF) -DNDEBUG -cc VX_cache.v $(PARAM) -CFLAGS '$(CF) -DNDEBUG $(PARAM)' --exe $(SRCS)
verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)
build: gen
(cd obj_dir && make -j -f VVX_cache.mk)
(cd obj_dir && make -j -f V$(TOP).mk)
run: build
(cd obj_dir && ./VVX_cache)
(cd obj_dir && ./V$(TOP))
clean:
rm -rf obj_dir

View file

@ -173,10 +173,10 @@ void CacheSim::stall_mem(){
}
void CacheSim::send_snoop_req(){
cache_->snp_req_valid = 1;
/*cache_->snp_req_valid = 1;
cache_->snp_req_addr = 0x12222222;
cache_->snp_req_invalidate = 1;
cache_->snp_req_tag = 0xff;
cache_->snp_req_tag = 0xff; */
}
void CacheSim::eval_mem_bus() {
@ -274,9 +274,9 @@ bool CacheSim::assert_equal(unsigned int* data, unsigned int tag){
//DEBUG
void CacheSim::display_miss(){
int i = (unsigned int)cache_->miss_vec;
std::bitset<8> x(i);
if (i) std::cout << "Miss Vec " << x << std::endl;
//int i = (unsigned int)cache_->miss_vec;
//std::bitset<8> x(i);
//if (i) std::cout << "Miss Vec " << x << std::endl;
//std::cout << "Miss Vec 0" << cache_->miss_vec[0] << std::endl;
}

View file

@ -1,11 +1,30 @@
all: testbench.iv
TOP = VX_fifo_queue
testbench.iv: testbench.v
iverilog testbench.v -o testbench.iv -I ../../rtl/
PARAMS ?=
run: testbench.iv
! vvp testbench.iv | grep 'ERROR' || false
INCLUDE = -I../../rtl/ -I../../rtl/libs
SRCS = main.cpp
all: build
CF += -std=c++11 -fms-extensions -I../..
VF += $(PARAMS)
VF += --language 1800-2009 --assert -Wall --trace
VF += -Wno-DECLFILENAME
VF += --x-initial unique
VF += -exe $(SRCS) $(INCLUDE)
VF += $(PARAMS)
gen:
verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)
build: gen
(cd obj_dir && make -j -f V$(TOP).mk)
run: build
(cd obj_dir && ./V$(TOP))
clean:
rm testbench.iv
rm -rf obj_dir

View file

@ -0,0 +1,93 @@
#include "vl_simulator.h"
#include "VVX_fifo_queue.h"
#include <iostream>
#define MAX_TICKS 20
// Test assertion: when the condition is false, print the failing expression
// text to stdout and abort the process. The do/while wrapper lets the macro
// be used as a single statement.
#define CHECK(x)                                    \
  do {                                              \
    if (!(x)) {                                     \
      std::cout << "FAILED: " << #x << std::endl;   \
      std::abort();                                 \
    }                                               \
  } while (false)
// Simulation time in clock half-periods; main() advances it through
// vl_simulator::step() and reports ticks/2 as the cycle count.
uint64_t ticks = 0;

// Time-stamp hook: exposes the current tick count as a double
// (the signature Verilator expects for this callback).
double sc_time_stamp() {
  return static_cast<double>(ticks);
}
using Device = VVX_fifo_queue;
// Directed test for VX_fifo_queue: push two entries, check full/empty and
// data_out after each step, then pop both entries again.
// Returns 0 on success; any failed CHECK aborts the process.
int main(int argc, char **argv) {
  // Initialize Verilator's variables
  Verilated::commandArgs(argc, argv);

  vl_simulator<Device> sim;

  // Drive idle inputs before reset. sim.reset(0) already advances the clock
  // by two ticks, so the schedule below starts at tick 2 and a "case 0"
  // entry in the switch would never execute.
  sim->pop     = 0;
  sim->push    = 0;
  sim->data_in = 0;

  // run test
  ticks = sim.reset(0);
  while (ticks < MAX_TICKS) {
    switch (ticks) {
    case 2:
      // verify outputs: queue is empty after reset
      CHECK(sim->full == 0x0);
      CHECK(sim->empty == 0x1);
      // push 0xa
      sim->pop     = 0;
      sim->push    = 1;
      sim->data_in = 0xa;
      break;
    case 4:
      // verify outputs: one entry, 0xa at the head
      CHECK(sim->data_out == 0xa);
      CHECK(sim->full == 0x0);
      CHECK(sim->empty == 0x0);
      // push 0xb
      sim->pop     = 0;
      sim->push    = 1;
      sim->data_in = 0xb;
      break;
    case 6:
      // verify outputs: queue is full, head is still 0xa
      CHECK(sim->data_out == 0xa);
      CHECK(sim->full == 0x1);
      CHECK(sim->empty == 0x0);
      // pop
      sim->pop  = 1;
      sim->push = 0;
      break;
    case 8:
      // verify outputs: 0xb moved to the head
      CHECK(sim->data_out == 0xb);
      CHECK(sim->full == 0x0);
      CHECK(sim->empty == 0x0);
      // pop
      sim->pop  = 1;
      sim->push = 0;
      break;
    case 10:
      // verify outputs: queue drained
      CHECK(sim->full == 0x0);
      CHECK(sim->empty == 0x1);
      sim->pop  = 0;
      sim->push = 0;
      break;
    }
    // advance clock by one full cycle (two half-period ticks)
    ticks = sim.step(ticks, 2);
  }

  std::cout << "PASSED!" << std::endl;
  std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl;

  return 0;
}

View file

@ -0,0 +1,81 @@
#pragma once
#include <array>
#include <cstdint>
#include "verilated.h"
#ifdef VM_TRACE
#include <verilated_vcd_c.h> // Trace file format header
#endif
// Minimal clock/reset harness around a Verilated model.
//
// T must expose `clk` and `reset` members plus eval() and final(); when
// tracing is enabled it must also provide trace().
//
// VCD tracing is compiled in only when VM_TRACE evaluates to non-zero.
// Verilator's generated makefiles define VM_TRACE as 0 or 1, so the value
// has to be tested: `#ifdef` would also enable tracing for VM_TRACE=0.
template <typename T>
class vl_simulator {
private:

  T top_;
#if VM_TRACE
  VerilatedVcdC tfp_;
#endif

public:

  // Starts with clk and reset low; opens "trace.vcd" when tracing is enabled.
  vl_simulator() {
    top_.clk = 0;
    top_.reset = 0;
  #if VM_TRACE
    Verilated::traceEverOn(true);
    top_.trace(&tfp_, 99);
    tfp_.open("trace.vcd");
  #endif
  }

  // Closes the trace file (if any) and finalizes the model.
  ~vl_simulator() {
  #if VM_TRACE
    tfp_.close();
  #endif
    top_.final();
  }

  // Holds reset high for two ticks starting at `ticks`, then releases it.
  // Returns the updated tick count (ticks + 2).
  uint64_t reset(uint64_t ticks) {
    top_.reset = 1;
    ticks = this->step(ticks, 2);
    top_.reset = 0;
    return ticks;
  }

  // Advances the simulation by `count` ticks. Each tick evaluates the model,
  // dumps the trace (if enabled) and then toggles clk.
  // Returns the updated tick count (ticks + count).
  uint64_t step(uint64_t ticks, uint32_t count = 1) {
    while (count--) {
      top_.eval();
    #if VM_TRACE
      tfp_.dump(ticks);
    #endif
      top_.clk = !top_.clk;
      ++ticks;
    }
    return ticks;
  }

  // Direct access to the model's ports.
  T* operator->() {
    return &top_;
  }
};
// Writes the given values, each converted to uint32_t, into consecutive
// words of `sig` starting at sig[0]. `sig` must have room for one word per
// argument.
template <typename... Args>
void vl_setw(uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> words{static_cast<uint32_t>(std::forward<Args>(args))...};
  size_t idx = 0;
  for (uint32_t word : words) {
    sig[idx++] = word;
  }
}
// Compares consecutive words of `sig` (starting at sig[0]) against the given
// values, each converted to uint32_t. Returns -1 or 1 at the first word that
// is smaller or larger than its reference value, and 0 when all words match.
template <typename... Args>
int vl_cmpw(const uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> expected{static_cast<uint32_t>(std::forward<Args>(args))...};
  size_t idx = 0;
  for (uint32_t ref : expected) {
    const uint32_t actual = sig[idx++];
    if (actual != ref) {
      return (actual < ref) ? -1 : 1;
    }
  }
  return 0;
}

View file

@ -0,0 +1,30 @@
# Verilator unit-test build for the VX_tex_sampler RTL module.
#   make        - generate the model and build the test executable
#   make run    - build, then run the test from obj_dir
#   make clean  - remove the Verilator output directory

# Top-level RTL module; Verilator names its outputs V$(TOP).*
TOP = VX_tex_sampler

# Extra Verilator arguments supplied by the caller
PARAMS ?=

INCLUDE = -I../../../rtl/ -I../../../rtl/libs -I../../../rtl/tex_unit

SRCS = main.cpp

# None of these targets produce a file of the same name.
.PHONY: all gen build run clean

all: build

# C++ flags forwarded through -CFLAGS. The generated makefile is run from
# obj_dir, so -I../.. is resolved relative to that directory.
CF += -std=c++11 -fms-extensions -I../..

# Verilator flags
VF += $(PARAMS)
VF += --language 1800-2009 --assert -Wall --trace
VF += -Wno-DECLFILENAME
VF += --x-initial unique
VF += $(INCLUDE)

# Generate the C++ model and its makefile under obj_dir.
gen:
	verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)

# $(MAKE) keeps command-line flags and variables flowing into the sub-make.
build: gen
	(cd obj_dir && $(MAKE) -j -f V$(TOP).mk)

run: build
	(cd obj_dir && ./V$(TOP))

clean:
	rm -rf obj_dir

View file

@ -0,0 +1,215 @@
#include "vl_simulator.h"
#include "VVX_tex_sampler.h"
#include <iostream>
#include <map>
// Simulation stops once this many half-cycle ticks have elapsed.
#define MAX_TICKS 20
// Capacity of the per-test input/output vector arrays.
#define MAX_UNIT_CYCLES 5
// Number of threads handled per request. An empty definition breaks every
// array bound and loop that uses it, so a value is required.
// NOTE(review): 4 is assumed to match the NUM_THREADS the RTL is built
// with - confirm against the RTL configuration.
#define NUM_THREADS 4
// Test assertion: when the condition is false, print the failing expression
// text to stdout and abort the process. The do/while wrapper lets the macro
// be used as a single statement.
#define CHECK(x)                                    \
  do {                                              \
    if (!(x)) {                                     \
      std::cout << "FAILED: " << #x << std::endl;   \
      std::abort();                                 \
    }                                               \
  } while (false)
uint64_t ticks = 0;
// using Device = VVX_tex_sampler;
// Table-driven testbench for the texture sampler model T.
//
// Unit tests are flattened into two cycle-indexed maps (stimulus and expected
// outputs) by generate_test_vectors(); run() then replays them against the
// simulated model, applying inputs and checking outputs cycle by cycle.
//
// NOTE(review): the port names driven and checked in run() (req_wid, req_PC,
// req_texels, rsp_wid, ...) are kept from the original code and must exist on
// the Verilated model T - confirm against the current VX_tex_sampler ports.
template <typename T>
class testbench
{
public:

  // Stimulus applied to the model for one cycle.
  struct Input {
    bool req_valid;
    unsigned int req_wid;
    unsigned int req_tmask;
    unsigned int req_PC;
    unsigned int req_rd;
    unsigned int req_wb;
    unsigned int req_filter;
    unsigned int req_format;
    unsigned int req_u[NUM_THREADS];
    unsigned int req_v[NUM_THREADS];
    unsigned int req_texels[NUM_THREADS][4];
    bool rsp_ready;
    // Cycle, relative to the start of the unit test, at which this stimulus
    // is applied. generate_test_vectors() reads this field; it is added at
    // the end so existing positional initializers keep their meaning.
    int input_cycle;
  };

  // Expected model outputs for one cycle.
  struct Output {
    // Cycle, relative to the start of the unit test, at which to check.
    int output_cycle;
    // outputs
    bool req_ready;
    bool rsp_valid;
    unsigned int rsp_wid;
    unsigned int rsp_tmask;
    unsigned int rsp_PC;
    unsigned int rsp_rd;
    bool rsp_wb;
    unsigned int rsp_data[NUM_THREADS];
  };

  // One unit test: a short sequence of inputs and expected outputs.
  struct UnitTest {
    bool use_reset;
    unsigned int num_cycles;
    bool use_cmodel;   // derive the expected outputs with unittest_Cmodel()
    Output outputs[MAX_UNIT_CYCLES];
    Input inputs[MAX_UNIT_CYCLES];
    unsigned int num_output_check;
    unsigned int check_output_cycle[MAX_UNIT_CYCLES];
  };

private:

  vl_simulator<T> sim;
  std::map<int, Input> input_map;    // absolute cycle -> stimulus
  std::map<int, Output> output_map;  // absolute cycle -> expected outputs

public:

  testbench() {}

  ~testbench() {}

  // Fills test->outputs[0] from test->inputs[0] using a software model.
  // Only the req_filter == 0 case is modelled: each thread's result is its
  // first texel.
  void unittest_Cmodel(UnitTest* test) {
    if (test->inputs[0].req_filter == 0) {
      for (int i = 0; i < NUM_THREADS; i++)
        test->outputs[0].rsp_data[i] = test->inputs[0].req_texels[i][0];
    } else {
      // TODO: other filter modes are not modelled yet. The original sketch
      // split each texel into its 0x00ff00ff and (>> 8) & 0x00ff00ff halves
      // as a first step.
    }

    test->outputs[0].output_cycle = 1;
    // NOTE(review): with num_cycles forced to 1, generate_test_vectors() only
    // visits cycle 0, so the output scheduled for cycle 1 is never
    // registered - confirm the intended value.
    test->num_cycles = 1;
  }

  // Flattens `num_tests` unit tests into the cycle-indexed input/output maps.
  // When is_pipe is false the next test starts on the last visited cycle of
  // the previous one; when true it starts right after the previous test's
  // last input cycle.
  void generate_test_vectors(UnitTest* tests, int num_tests, bool is_pipe) {
    // for all unit tests create output test vectors (with or without c-model)
    int prev_test_cycle = 0;

    for (int i = 0; i < num_tests; i++) {
      unsigned int op_counter = 0;
      unsigned int ip_counter = 0;
      int test_cycle = 0;
      int last_ip_cycle = 0;

      // Work on a copy so the C model can fill in outputs without touching
      // the caller's table.
      UnitTest curr_test = tests[i];

      if (curr_test.use_cmodel) {
        unittest_Cmodel(&curr_test);
      }

      for (unsigned int j = 0; j < curr_test.num_cycles; j++) {
        // The bounds checks keep the cursors inside the fixed-size arrays.
        if (ip_counter < MAX_UNIT_CYCLES
         && curr_test.inputs[ip_counter].input_cycle == test_cycle) {
          input_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test.inputs[ip_counter]));
          last_ip_cycle = prev_test_cycle + test_cycle;
          ip_counter++;
        }

        if (op_counter < MAX_UNIT_CYCLES
         && curr_test.outputs[op_counter].output_cycle == test_cycle) {
          output_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test.outputs[op_counter]));
          op_counter++;
        }

        test_cycle++;
      }

      if (!is_pipe) {
        prev_test_cycle += (test_cycle - 1);
      } else {
        prev_test_cycle = last_ip_cycle + 1;
      }
    }
  }

  // Resets the model, then replays the registered vectors until MAX_TICKS is
  // reached. A failed CHECK aborts the process.
  void run() {
    ticks = sim.reset(0);
    int cycle = 0;

    while (ticks < MAX_TICKS) {
      auto input = input_map.find(cycle);
      auto output = output_map.find(cycle);

      if (input != input_map.end()) {
        const Input& in = input->second;
        sim->req_valid  = in.req_valid;
        sim->req_wid    = in.req_wid;
        sim->req_tmask  = in.req_tmask;
        sim->req_PC     = in.req_PC;
        sim->req_rd     = in.req_rd;
        sim->req_wb     = in.req_wb;
        sim->req_filter = in.req_filter;
        sim->req_format = in.req_format;
        // req_u / req_v are not driven here (as in the original code).
        // assumes req_texels is a flat array of 32-bit words, four per
        // thread - TODO confirm against the generated model
        for (int t = 0; t < NUM_THREADS; t++) {
          for (int k = 0; k < 4; k++) {
            sim->req_texels[t * 4 + k] = in.req_texels[t][k];
          }
        }
        sim->rsp_ready = in.rsp_ready;
      } else {
        std::cout << "Warning! No Input on Cycle " << cycle << std::endl;
      }

      if (output != output_map.end()) {
        const Output& out = output->second;
        CHECK(sim->req_ready == out.req_ready);
        CHECK(sim->rsp_valid == out.rsp_valid);
        CHECK(sim->rsp_wid == out.rsp_wid);
        CHECK(sim->rsp_tmask == out.rsp_tmask);
        CHECK(sim->rsp_PC == out.rsp_PC);
        CHECK(sim->rsp_rd == out.rsp_rd);
        CHECK(sim->rsp_wb == out.rsp_wb);
        // assumes rsp_data holds one 32-bit word per thread - TODO confirm
        for (int t = 0; t < NUM_THREADS; t++) {
          CHECK(sim->rsp_data[t] == out.rsp_data[t]);
        }
      }

      cycle++;
      ticks = sim.step(ticks, 2);
    }

    std::cout << "PASSED!" << std::endl;
    std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl;
  }
};
// Time-stamp hook: exposes the global tick counter as a double
// (the signature Verilator expects for this callback).
double sc_time_stamp() {
  return static_cast<double>(ticks);
}
int main(int argc, char **argv) {
// Initialize Verilators variables
Verilated::commandArgs(argc, argv);
testbench<VVX_tex_sampler> sampler_testbench;
sampler_testbench.generate_test_vectors(tests, 1, 0);
sampler_test_bench.run();
return 0;
}

View file

@ -0,0 +1,81 @@
#pragma once
#include <array>
#include <cstdint>
#include "verilated.h"
#ifdef VM_TRACE
#include <verilated_vcd_c.h> // Trace file format header
#endif
// Minimal clock/reset harness around a Verilated model.
//
// T must expose `clk` and `reset` members plus eval() and final(); when
// tracing is enabled it must also provide trace().
//
// VCD tracing is compiled in only when VM_TRACE evaluates to non-zero.
// Verilator's generated makefiles define VM_TRACE as 0 or 1, so the value
// has to be tested: `#ifdef` would also enable tracing for VM_TRACE=0.
template <typename T>
class vl_simulator {
private:

  T top_;
#if VM_TRACE
  VerilatedVcdC tfp_;
#endif

public:

  // Starts with clk and reset low; opens "trace.vcd" when tracing is enabled.
  vl_simulator() {
    top_.clk = 0;
    top_.reset = 0;
  #if VM_TRACE
    Verilated::traceEverOn(true);
    top_.trace(&tfp_, 99);
    tfp_.open("trace.vcd");
  #endif
  }

  // Closes the trace file (if any) and finalizes the model.
  ~vl_simulator() {
  #if VM_TRACE
    tfp_.close();
  #endif
    top_.final();
  }

  // Holds reset high for two ticks starting at `ticks`, then releases it.
  // Returns the updated tick count (ticks + 2).
  uint64_t reset(uint64_t ticks) {
    top_.reset = 1;
    ticks = this->step(ticks, 2);
    top_.reset = 0;
    return ticks;
  }

  // Advances the simulation by `count` ticks. Each tick evaluates the model,
  // dumps the trace (if enabled) and then toggles clk.
  // Returns the updated tick count (ticks + count).
  uint64_t step(uint64_t ticks, uint32_t count = 1) {
    while (count--) {
      top_.eval();
    #if VM_TRACE
      tfp_.dump(ticks);
    #endif
      top_.clk = !top_.clk;
      ++ticks;
    }
    return ticks;
  }

  // Direct access to the model's ports.
  T* operator->() {
    return &top_;
  }
};
// Writes the given values, each converted to uint32_t, into consecutive
// words of `sig` starting at sig[0]. `sig` must have room for one word per
// argument.
template <typename... Args>
void vl_setw(uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> words{static_cast<uint32_t>(std::forward<Args>(args))...};
  size_t idx = 0;
  for (uint32_t word : words) {
    sig[idx++] = word;
  }
}
// Compares consecutive words of `sig` (starting at sig[0]) against the given
// values, each converted to uint32_t. Returns -1 or 1 at the first word that
// is smaller or larger than its reference value, and 0 when all words match.
template <typename... Args>
int vl_cmpw(const uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> expected{static_cast<uint32_t>(std::forward<Args>(args))...};
  size_t idx = 0;
  for (uint32_t ref : expected) {
    const uint32_t actual = sig[idx++];
    if (actual != ref) {
      return (actual < ref) ? -1 : 1;
    }
  }
  return 0;
}

View file

@ -5,7 +5,62 @@
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __ASSEMBLY__
#define __ASM_STR(x) x
#else
#define __ASM_STR(x) #x
#endif
#define vx_csr_swap(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
#define vx_csr_read(csr) ({ \
register unsigned __v; \
__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \
__v; \
})
#define vx_csr_write(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
#define vx_csr_read_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
#define vx_csr_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
#define vx_csr_read_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
#define vx_csr_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// Texture load
#define vx_tex(unit, u, v, l) ({ \
unsigned __r; \
unsigned __u = u; \
unsigned __v = v; \
unsigned __l = l; \
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
__r; \
})
#ifdef __ASSEMBLY__
#define __ASM_STR(x) x
@ -52,6 +107,16 @@ extern "C" {
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// Texture load
#define vx_tex(unit, u, v, l) ({ \
unsigned __r; \
unsigned __u = u; \
unsigned __v = v; \
unsigned __l = l; \
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
__r; \
})
// Set thread mask
inline void vx_tmc(unsigned thread_mask) {
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
@ -86,7 +151,7 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
// Prefetch
inline void vx_prefetch(unsigned addr) {
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) );
}
// Return active warp's thread id
@ -170,6 +235,8 @@ inline void vx_fence() {
#define __endif vx_join();
#define __DIVERGENT__ __attribute__((annotate("divergent")))
#ifdef __cplusplus
}
#endif

View file

@ -34,7 +34,7 @@ int vx_vprintf(const char* format, va_list va) {
printf_arg_t arg;
arg.format = format;
arg.va = &va;
vx_serial(__printf_cb, &arg);
vx_serial((vx_serial_cb)__printf_cb, &arg);
return arg.ret;
}
@ -63,7 +63,7 @@ void vx_putint(int value, int base) {
putint_arg_t arg;
arg.value = value;
arg.base = base;
vx_serial(__putint_cb, &arg);
vx_serial((vx_serial_cb)__putint_cb, &arg);
}
static void __putfloat_cb(const putfloat_arg_t* arg) {
@ -83,7 +83,7 @@ void vx_putfloat(float value, int precision) {
putfloat_arg_t arg;
arg.value = value;
arg.precision = precision;
vx_serial(__putfloat_cb, &arg);
vx_serial((vx_serial_cb)__putfloat_cb, &arg);
}
#ifdef __cplusplus

View file

@ -1,32 +1,34 @@
RTL_DIR=../../hw/rtl
DPI_DIR=../../hw/dpi
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../../../hw -I../../common
CXXFLAGS += -I../../common/softfloat/source/include
LDFLAGS += ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE
DBG_TRACE_FLAGS += -DDBG_TRACE_AVS
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
DBG_FLAGS += -DVCD_OUTPUT
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE)
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp

View file

@ -182,7 +182,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PREFETCH";
case 6: return "PREFETCH";
default:
std::abort();
}

View file

@ -712,7 +712,7 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) {
pipeline->stall_warp = true;
runOnce = true;
} break;
case 5: {
case 6: {
// PREFETCH
int addr = rsdata[0];
printf("*** PREFETCHED %d ***\n", addr);

View file

@ -2,27 +2,28 @@ RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
SCRIPT_DIR=../../hw/scripts
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common
CXXFLAGS += -I../../common/softfloat/source/include
LDFLAGS += -shared ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE
DBG_TRACE_FLAGS += -DDBG_TRACE_AVS
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
@ -30,7 +31,8 @@ SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += fpga.cpp opae_sim.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
TOP = vortex_afu_shim
@ -84,12 +86,12 @@ VL_FLAGS += -D$(FPU_CORE)
PROJECT = libopae-c-vlsim
all: shared
all: $(PROJECT).so
vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h
shared: $(SRCS) vortex_afu.h
$(PROJECT).so: $(SRCS) vortex_afu.h
verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT).so
static: $(SRCS) vortex_afu.h

View file

@ -421,7 +421,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
#else
commandQueue = clCreateCommandQueue(context,
devices[device_touse], NULL, &status);
devices[device_touse], 0, &status);
#endif // PROFILING
@ -451,8 +451,8 @@ void cl_cleanup()
printf("clReleaseContext()\n");
}
for (int p = 0; p < numPlatforms; ++p) {
for (int d = 0; d < numDevices[p]; ++d) {
for (cl_uint p = 0; p < numPlatforms; ++p) {
for (cl_uint d = 0; d < numDevices[p]; ++d) {
status = clReleaseDevice(devices[d]);
cl_errChk(status, "Oops!", true);
printf("clReleaseDevice()\n");

View file

@ -421,7 +421,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
#else
commandQueue = clCreateCommandQueue(context,
devices[device_touse], NULL, &status);
devices[device_touse], 0, &status);
#endif // PROFILING
@ -451,8 +451,8 @@ void cl_cleanup()
printf("clReleaseContext()\n");
}
for (int p = 0; p < numPlatforms; ++p) {
for (int d = 0; d < numDevices[p]; ++d) {
for (cl_uint p = 0; p < numPlatforms; ++p) {
for (cl_uint d = 0; d < numDevices[p]; ++d) {
status = clReleaseDevice(devices[d]);
cl_errChk(status, "Oops!", true);
printf("clReleaseDevice()\n");

Some files were not shown because too many files have changed in this diff Show more