From b08c7403f6471f7587a027b85287cb40cafc0ff5 Mon Sep 17 00:00:00 2001 From: dhy2000 <46858361+dhy2000@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:56:50 +0800 Subject: [PATCH 01/89] fix #100: change return type to float --- tests/regression/vecaddx/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp index 117f34709..15d58e013 100644 --- a/tests/regression/vecaddx/main.cpp +++ b/tests/regression/vecaddx/main.cpp @@ -50,7 +50,7 @@ public: static const char* type_str() { return "float"; } - static int generate() { + static float generate() { return static_cast(rand()) / RAND_MAX; } static bool compare(float a, float b, int index, int errors) { From 2b481024bbe948b3faa8adfe8966202bb319e3bc Mon Sep 17 00:00:00 2001 From: Jacob Levinson Date: Thu, 25 Jul 2024 16:35:43 -0700 Subject: [PATCH 02/89] Fixes readme by removing $ from shell commands + small tweaks Removed $ from all the shell commands so that they can be easily 1-click copy-pasted from github without the dollar sign, as well as changed "cd Vortex" to "cd vortex" to match the actual directory spelling. Also removed obsolete travis ci link as the project has moved to github ci. --- README.md | 80 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 97484ff57..eaad30cd0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[![Build Status](https://travis-ci.com/vortexgpgpu/vortex.svg?branch=master)](https://travis-ci.com/vortexgpgpu/vortex) - # Vortex GPGPU Vortex is a full-stack open-source RISC-V GPGPU. @@ -47,49 +45,63 @@ More detailed build instructions can be found [here](docs/install_vortex.md). - [Yosys](https://github.com/YosysHQ/yosys) - [Sv2v](https://github.com/zachjs/sv2v) ### Install development tools - $ sudo apt-get install build-essential - $ sudo apt-get install binutils - $ sudo apt-get install python - $ sudo apt-get install uuid-dev - $ sudo apt-get install git +```sh +sudo apt-get install build-essential +sudo apt-get install binutils +sudo apt-get install python +sudo apt-get install uuid-dev +sudo apt-get install git +``` ### Install Vortex codebase - $ git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git - $ cd Vortex +```sh +git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git +cd vortex +``` ### Configure your build folder - $ mkdir build - $ cd build - $ ../configure --xlen=32 --tooldir=$HOME/tools +```sh +mkdir build +cd build +../configure --xlen=32 --tooldir=$HOME/tools +``` ### Install prebuilt toolchain - $ ./ci/toolchain_install.sh --all +```sh +./ci/toolchain_install.sh --all +``` ### set environment variables - # should always run before using the toolchain! - $ source ./ci/toolchain_env.sh +```sh +# should always run before using the toolchain! +source ./ci/toolchain_env.sh +``` ### Building Vortex - $ make -s +```sh +make -s +``` ### Quick demo running vecadd OpenCL kernel on 2 cores - $ ./ci/blackbox.sh --cores=2 --app=vecadd +```sh +./ci/blackbox.sh --cores=2 --app=vecadd +``` ### Common Developer Tips - Installing Vortex kernel and runtime libraries to use with external tools requires passing --prefix= to the configure script. - ```sh - $ ../configure --xlen=32 --tooldir=$HOME/tools --prefix= - $ make -s - $ make install - `````` +```sh +../configure --xlen=32 --tooldir=$HOME/tools --prefix= +make -s +make install +``` - Building Vortex 64-bit simply requires using --xlen=64 configure option. - ```sh - $ ../configure --xlen=32 --tooldir=$HOME/tools - ``` +```sh +../configure --xlen=32 --tooldir=$HOME/tools +``` - Sourcing "./ci/toolchain_env.sh" is required everytime you start a new terminal. we recommend adding "source /ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login. - ```sh - $ echo "source /ci/toolchain_env.sh" >> ~/.bashrc - ``` +```sh +echo "source /ci/toolchain_env.sh" >> ~/.bashrc +``` - Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get it propagated into your build folder. - ```sh - $ ../configure - ``` +```sh +../configure +``` - To debug the GPU, you can generate a "run.log" trace. see /docs/debugging.md for more information. - ```sh - $ ./ci/blackbox.sh --app=demo --debug=3 - ``` +```sh +./ci/blackbox.sh --app=demo --debug=3 +``` - For additional information, check out the /docs. From bdbe22ff4da2a8219d3a16833aa440a31e51c3f8 Mon Sep 17 00:00:00 2001 From: Jacob Levinson Date: Thu, 25 Jul 2024 16:40:19 -0700 Subject: [PATCH 03/89] Capitalize S in "set enviroment vairables" --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index eaad30cd0..7cafd498d 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ cd build ```sh ./ci/toolchain_install.sh --all ``` -### set environment variables +### Set environment variables ```sh # should always run before using the toolchain! source ./ci/toolchain_env.sh From bad280ae8016e5228dadfa54ccb86341a37b26b7 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 28 Jul 2024 12:48:01 -0700 Subject: [PATCH 04/89] testing writeback cache --- hw/rtl/VX_config.vh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 804715aad..f43eb2581 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -537,7 +537,7 @@ // Enable Cache Writeback `ifndef DCACHE_WRITEBACK -`define DCACHE_WRITEBACK 0 +`define DCACHE_WRITEBACK 1 `endif // LMEM Configurable Knobs //////////////////////////////////////////////////// @@ -601,7 +601,7 @@ // Enable Cache Writeback `ifndef L2_WRITEBACK -`define L2_WRITEBACK 0 +`define L2_WRITEBACK 1 `endif // L3cache Configurable Knobs ///////////////////////////////////////////////// @@ -647,7 +647,7 @@ // Enable Cache Writeback `ifndef L3_WRITEBACK -`define L3_WRITEBACK 0 +`define L3_WRITEBACK 1 `endif // ISA Extensions ///////////////////////////////////////////////////////////// From 7f990075684a3c5ef64615749fae2db770d59663 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 28 Jul 2024 13:17:14 -0700 Subject: [PATCH 05/89] CI script update --- .github/workflows/ci.yml | 1 + tests/opencl/Makefile | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 386ad0ba1..388ec9ab7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -165,6 +165,7 @@ jobs: else ./ci/regression.sh --${{ matrix.name }} fi + continue-on-error: true complete: runs-on: ubuntu-20.04 diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index e4be7e712..e60cd6ec7 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -8,6 +8,9 @@ all: $(MAKE) -C psort $(MAKE) -C saxpy $(MAKE) -C sfilter + $(MAKE) -C sgemm2 + $(MAKE) -C sgemm3 + $(MAKE) -C psum $(MAKE) -C oclprintf $(MAKE) -C dotproduct $(MAKE) -C transpose @@ -19,9 +22,6 @@ all: $(MAKE) -C kmeans $(MAKE) -C blackscholes $(MAKE) -C bfs - $(MAKE) -C sgemm2 - $(MAKE) -C sgemm3 - $(MAKE) -C psum run-simx: $(MAKE) -C vecadd run-simx @@ -30,6 +30,9 @@ run-simx: $(MAKE) -C psort run-simx $(MAKE) -C saxpy run-simx $(MAKE) -C sfilter run-simx + $(MAKE) -C sgemm2 run-simx + $(MAKE) -C sgemm3 run-simx + $(MAKE) -C psum run-simx $(MAKE) -C oclprintf run-simx $(MAKE) -C dotproduct run-simx $(MAKE) -C transpose run-simx @@ -40,9 +43,6 @@ run-simx: $(MAKE) -C kmeans run-simx $(MAKE) -C blackscholes run-simx $(MAKE) -C bfs run-simx - $(MAKE) -C sgemm2 run-simx - $(MAKE) -C sgemm3 run-simx - $(MAKE) -C psum run-simx run-rtlsim: $(MAKE) -C vecadd run-rtlsim @@ -51,6 +51,9 @@ run-rtlsim: $(MAKE) -C psort run-rtlsim $(MAKE) -C saxpy run-rtlsim $(MAKE) -C sfilter run-rtlsim + $(MAKE) -C sgemm2 run-rtlsim + $(MAKE) -C sgemm3 run-rtlsim + $(MAKE) -C psum run-rtlsim $(MAKE) -C oclprintf run-rtlsim $(MAKE) -C dotproduct run-rtlsim $(MAKE) -C transpose run-rtlsim @@ -61,9 +64,6 @@ run-rtlsim: $(MAKE) -C kmeans run-rtlsim $(MAKE) -C blackscholes run-rtlsim $(MAKE) -C bfs run-rtlsim - $(MAKE) -C sgemm2 run-rtlsim - $(MAKE) -C sgemm3 run-rtlsim - $(MAKE) -C psum run-rtlsim clean: $(MAKE) -C vecadd clean @@ -72,6 +72,9 @@ clean: $(MAKE) -C psort clean $(MAKE) -C saxpy clean $(MAKE) -C sfilter clean + $(MAKE) -C sgemm2 clean + $(MAKE) -C sgemm3 clean + $(MAKE) -C psum clean $(MAKE) -C oclprintf clean $(MAKE) -C dotproduct clean $(MAKE) -C transpose clean @@ -82,7 +85,4 @@ clean: $(MAKE) -C guassian clean $(MAKE) -C kmeans clean $(MAKE) -C blackscholes clean - $(MAKE) -C bfs clean - $(MAKE) -C sgemm2 clean - $(MAKE) -C sgemm3 clean - $(MAKE) -C psum clean \ No newline at end of file + $(MAKE) -C bfs clean \ No newline at end of file From 160c428ef5ba3000060b3c6f087def25f2fa8e17 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 28 Jul 2024 17:29:15 -0700 Subject: [PATCH 06/89] fixed uuid format --- hw/dpi/util_dpi.cpp | 2 +- sim/simx/emulator.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/dpi/util_dpi.cpp b/hw/dpi/util_dpi.cpp index 020816b0b..b0b36f4cb 100644 --- a/hw/dpi/util_dpi.cpp +++ b/hw/dpi/util_dpi.cpp @@ -215,6 +215,6 @@ uint64_t dpi_uuid_gen(bool reset, int wid) { return 0; } uint32_t instr_uuid = g_uuid_gens[wid]++; - uint64_t uuid = (uint64_t(wid) << 32) | instr_uuid; + uint64_t uuid = (uint64_t(instr_uuid) << 12) | wid; return uuid; } \ No newline at end of file diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index cd305bb0d..841fbc0c0 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -157,7 +157,7 @@ instr_trace_t* Emulator::step() { #ifndef NDEBUG uint32_t instr_uuid = warp.uuid++; uint32_t g_wid = core_->id() * arch_.num_warps() + scheduled_warp; - uint64_t uuid = (uint64_t(g_wid) << 32) | instr_uuid; + uint64_t uuid = (uint64_t(instr_uuid) << 12) | g_wid; #else uint64_t uuid = 0; #endif From 382b686d59121a55a2d099032d0fd45d4a7338b9 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 28 Jul 2024 17:40:03 -0700 Subject: [PATCH 07/89] reset GRPs only in debug mode --- docs/debugging.md | 4 ++-- hw/rtl/core/VX_operands.sv | 7 +++++++ sim/simx/emulator.cpp | 10 ++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/debugging.md b/docs/debugging.md index e0450e5e7..6e2e14890 100644 --- a/docs/debugging.md +++ b/docs/debugging.md @@ -53,9 +53,9 @@ A waveform trace `trace.vcd` will be generated in the current directory during t ## Analyzing Vortex trace log When debugging Vortex RTL or SimX Simulator, reading the trace run.log file can be overwhelming when the trace gets really large. -We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can use to convert the large trace into a CSV file containing all the instructions that executed with their source and destination operands. To increase compatibility between traces you will need to initialize RTLSIM's GPRs to zero by defining GPR_RESET. +We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can use to convert the large trace into a CSV file containing all the instructions that executed with their source and destination operands. - $ CONFIGS="-DGPR_RESET" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=3 --log=run_rtlsim.log + $ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=3 --log=run_rtlsim.log $ ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv $ ./ci/blackbox.sh --driver=simx --app=demo --debug=3 --log=run_simx.log diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 17d8a9d0c..04a12e4c6 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -13,6 +13,13 @@ `include "VX_define.vh" +// reset all GPRs in debug mode +`ifdef SIMULATION +`ifndef NDEBUG +`define GPR_RESET +`endif +`endif + module VX_operands import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "", parameter NUM_BANKS = 4, diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 841fbc0c0..4bcffd14e 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -53,15 +53,25 @@ void Emulator::warp_t::clear(uint64_t startup_addr) { this->uuid = 0; this->fcsr = 0; + std::srand(50); + for (auto& reg_file : this->ireg_file) { for (auto& reg : reg_file) { + #ifndef NDEBUG reg = 0; + #else + reg = std::rand(); + #endif } } for (auto& reg_file : this->freg_file) { for (auto& reg : reg_file) { + #ifndef NDEBUG reg = 0; + #else + reg = std::rand(); + #endif } } } From 48b1ab7494fa5639901f313101058e8495ee7d0d Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 28 Jul 2024 18:03:34 -0700 Subject: [PATCH 08/89] fixed uuid format --- hw/dpi/util_dpi.cpp | 3 ++- sim/simx/emulator.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/dpi/util_dpi.cpp b/hw/dpi/util_dpi.cpp index b0b36f4cb..0c0e92e82 100644 --- a/hw/dpi/util_dpi.cpp +++ b/hw/dpi/util_dpi.cpp @@ -215,6 +215,7 @@ uint64_t dpi_uuid_gen(bool reset, int wid) { return 0; } uint32_t instr_uuid = g_uuid_gens[wid]++; - uint64_t uuid = (uint64_t(instr_uuid) << 12) | wid; + uint32_t total_warps = NUM_WARPS * NUM_CORES * NUM_CLUSTERS; + uint64_t uuid = uint64_t(instr_uuid) * total_warps + wid; return uuid; } \ No newline at end of file diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 4bcffd14e..7b35d1ae0 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -167,7 +167,8 @@ instr_trace_t* Emulator::step() { #ifndef NDEBUG uint32_t instr_uuid = warp.uuid++; uint32_t g_wid = core_->id() * arch_.num_warps() + scheduled_warp; - uint64_t uuid = (uint64_t(instr_uuid) << 12) | g_wid; + uint32_t total_warps = arch_.num_warps() * arch_.num_cores() * arch_.num_clusters(); + uint64_t uuid = uint64_t(instr_uuid) * total_warps + g_wid; #else uint64_t uuid = 0; #endif From 2e060faaf4228e3e379fd0a40d9716b6dfb39749 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 00:05:51 -0700 Subject: [PATCH 09/89] reverting uuid format to ease file diff --- hw/dpi/util_dpi.cpp | 3 +-- sim/simx/emulator.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/hw/dpi/util_dpi.cpp b/hw/dpi/util_dpi.cpp index 0c0e92e82..020816b0b 100644 --- a/hw/dpi/util_dpi.cpp +++ b/hw/dpi/util_dpi.cpp @@ -215,7 +215,6 @@ uint64_t dpi_uuid_gen(bool reset, int wid) { return 0; } uint32_t instr_uuid = g_uuid_gens[wid]++; - uint32_t total_warps = NUM_WARPS * NUM_CORES * NUM_CLUSTERS; - uint64_t uuid = uint64_t(instr_uuid) * total_warps + wid; + uint64_t uuid = (uint64_t(wid) << 32) | instr_uuid; return uuid; } \ No newline at end of file diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 7b35d1ae0..0567eb853 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -167,8 +167,7 @@ instr_trace_t* Emulator::step() { #ifndef NDEBUG uint32_t instr_uuid = warp.uuid++; uint32_t g_wid = core_->id() * arch_.num_warps() + scheduled_warp; - uint32_t total_warps = arch_.num_warps() * arch_.num_cores() * arch_.num_clusters(); - uint64_t uuid = uint64_t(instr_uuid) * total_warps + g_wid; + uint64_t uuid = (uint64_t(g_wid) << 32) | instr_uuid; #else uint64_t uuid = 0; #endif From 03e21924f4bc7eab9ef126299681123028b9739f Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 00:28:07 -0700 Subject: [PATCH 10/89] Verilator bug workaround This was causing a buffer overflow, ignoring range checks --- hw/rtl/libs/VX_find_first.sv | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/hw/rtl/libs/VX_find_first.sv b/hw/rtl/libs/VX_find_first.sv index f06971106..18f345855 100644 --- a/hw/rtl/libs/VX_find_first.sv +++ b/hw/rtl/libs/VX_find_first.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,10 +17,10 @@ module VX_find_first #( parameter N = 1, parameter DATAW = 1, - parameter REVERSE = 0 + parameter REVERSE = 0 ) ( input wire [N-1:0][DATAW-1:0] data_in, - input wire [N-1:0] valid_in, + input wire [N-1:0] valid_in, output wire [DATAW-1:0] data_out, output wire valid_out ); @@ -37,10 +37,12 @@ module VX_find_first #( assign s_n[TL+i] = REVERSE ? valid_in[N-1-i] : valid_in[i]; assign d_n[TL+i] = REVERSE ? data_in[N-1-i] : data_in[i]; end - - for (genvar i = TL+N; i < TN; ++i) begin - assign s_n[i] = 0; - assign d_n[i] = '0; + + if (TL < (TN-N)) begin + for (genvar i = TL+N; i < TN; ++i) begin + assign s_n[i] = 0; + assign d_n[i] = '0; + end end for (genvar j = 0; j < LOGN; ++j) begin @@ -48,10 +50,10 @@ module VX_find_first #( assign s_n[2**j-1+i] = s_n[2**(j+1)-1+i*2] | s_n[2**(j+1)-1+i*2+1]; assign d_n[2**j-1+i] = s_n[2**(j+1)-1+i*2] ? d_n[2**(j+1)-1+i*2] : d_n[2**(j+1)-1+i*2+1]; end - end - + end + assign valid_out = s_n[0]; assign data_out = d_n[0]; - + endmodule `TRACING_ON From 96831c8b896e4f365e83cd9366f13bd55454d321 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 03:11:33 -0700 Subject: [PATCH 11/89] writeback cache fixes --- hw/rtl/cache/VX_cache_bank.sv | 75 ++++++++++++++++++-------------- hw/rtl/cache/VX_cache_data.sv | 66 +++++++++++------------------ hw/rtl/cache/VX_cache_tags.sv | 80 ++++++++++++++++++++++++----------- 3 files changed, 124 insertions(+), 97 deletions(-) diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index 03f3efd41..ced0778af 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -149,7 +149,8 @@ module VX_cache_bank #( wire is_creq_st0, is_creq_st1; wire is_fill_st0, is_fill_st1; wire is_replay_st0, is_replay_st1; - wire creq_flush_st0, creq_flush_st1; + wire creq_flush_sel, creq_flush_st0, creq_flush_st1; + wire evict_dirty_st0, evict_dirty_st1; wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1; wire [NUM_WAYS-1:0] tag_matches_st0; wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0; @@ -223,15 +224,16 @@ module VX_cache_bank #( wire init_fire = line_flush_init; wire replay_fire = replay_valid && replay_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; - wire flush_fire = line_flush_valid && line_flush_ready; + wire line_flush_fire = line_flush_valid && line_flush_ready; wire core_req_fire = core_req_valid && core_req_ready; - assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire; + assign valid_sel = init_fire || replay_fire || mem_rsp_fire || line_flush_fire || core_req_fire; assign rw_sel = replay_valid ? replay_rw : core_req_rw; assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen; assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel; assign req_idx_sel = replay_valid ? replay_idx : core_req_idx; assign tag_sel = replay_valid ? replay_tag : core_req_tag; + assign creq_flush_sel = core_req_valid && core_req_flush; assign addr_sel = (line_flush_init | line_flush_valid) ? `CS_LINE_ADDR_WIDTH'(line_flush_sel) : (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr)); @@ -260,7 +262,7 @@ module VX_cache_bank #( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_sel, line_flush_init, replay_enable, fill_enable, flush_enable, creq_enable, core_req_flush, line_flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), + .data_in ({valid_sel, line_flush_init, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, line_flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0}) ); @@ -273,16 +275,18 @@ module VX_cache_bank #( wire do_init_st0 = valid_st0 && is_init_st0; wire do_flush_st0 = valid_st0 && is_flush_st0; wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0; + wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0; wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0; + wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0; wire do_fill_st0 = valid_st0 && is_fill_st0; - wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0); - wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0; + wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0; + wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0; wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0]; - wire [NUM_WAYS-1:0] repl_way_st0; - wire [`CS_TAG_SEL_BITS-1:0] repl_tag_st0; + wire [NUM_WAYS-1:0] evict_way_st0; + wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0; `RESET_RELAY (tag_reset, reset); @@ -294,6 +298,7 @@ module VX_cache_bank #( .NUM_BANKS (NUM_BANKS), .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), + .WRITEBACK (WRITEBACK), .UUID_WIDTH (UUID_WIDTH) ) cache_tags ( .clk (clk), @@ -303,33 +308,41 @@ module VX_cache_bank #( .stall (pipe_stall), - // init/fill/lookup/flush - .init (do_init_st0 || do_flush_st0), + // init/flush/fill/write/lookup + .init (do_init_st0), + .flush (do_flush_st0), .fill (do_fill_st0), + .write (do_cache_wr_st0), .lookup (do_lookup_st0), .line_addr (addr_st0), + .way_sel (flush_way_st0), .tag_matches(tag_matches_st0), // replacement - .repl_way (repl_way_st0), - .repl_tag (repl_tag_st0) + .evict_dirty(evict_dirty_st0), + .evict_way (evict_way_st0), + .evict_tag (evict_tag_st0) ); + wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0; + + wire is_flush2_st0 = WRITEBACK && is_flush_st0; + assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0; - assign way_sel_st0 = is_fill_st0 ? repl_way_st0 : (is_flush_st0 ? flush_way_st0 : tag_matches_st0); + assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0; - wire [`CS_LINE_ADDR_WIDTH-1:0] addr_r_st0 = (is_fill_st0 || is_flush_st0) ? {repl_tag_st0, addr_st0[`CS_LINE_SEL_BITS-1:0]} : addr_st0; + assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, addr_st0[`CS_LINE_SEL_BITS-1:0]} : addr_st0; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1), + .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_st0, is_flush_st0, is_replay_st0, is_fill_st0, is_creq_st0, creq_flush_st0, rw_st0, addr_r_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, mshr_pending_st0}), - .data_out ({valid_st1, is_flush_st1, is_replay_st1, is_fill_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, mshr_pending_st1}) + .data_in ({valid_st0, is_flush2_st0, is_replay_st0, is_fill_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}), + .data_out ({valid_st1, is_flush_st1, is_replay_st1, is_fill_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1}) ); // we have a tag hit @@ -363,7 +376,7 @@ module VX_cache_bank #( `UNUSED_VAR (do_write_miss_st1) // ensure mshr replay always get a hit - `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay")); + `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: missed mshr replay")); // detect BRAM's read-during-write hazard assign rdw_hazard_st0 = do_fill_st0; // stall cycle after a fill @@ -380,7 +393,6 @@ module VX_cache_bank #( wire [`CS_LINE_WIDTH-1:0] dirty_data_st1; wire [LINE_SIZE-1:0] dirty_byteen_st1; - wire dirty_valid_st1; if (`CS_WORDS_PER_LINE > 1) begin reg [LINE_SIZE-1:0] write_byteen_r; @@ -416,7 +428,7 @@ module VX_cache_bank #( .read (do_cache_rd_st1), .fill (do_fill_st1 && ~rdw_hazard_st1), - .flush (do_flush_st1), + .flush (do_flush_st1 && ~rdw_hazard_st1), .write (do_cache_wr_st1), .way_sel (way_sel_st1), .line_addr (addr_st1), @@ -425,7 +437,6 @@ module VX_cache_bank #( .write_data (write_data_st1), .write_byteen(write_byteen_st1), .read_data (read_data_st1), - .dirty_valid(dirty_valid_st1), .dirty_data (dirty_data_st1), .dirty_byteen(dirty_byteen_st1) ); @@ -565,7 +576,7 @@ module VX_cache_bank #( wire mreq_queue_rw; wire mreq_queue_flush; - wire is_evict_st1 = (is_fill_st1 || is_flush_st1) && dirty_valid_st1; + wire is_evict_st1 = (is_fill_st1 || is_flush_st1) && evict_dirty_st1; wire do_writeback_st1 = valid_st1 && is_evict_st1; `UNUSED_VAR (do_writeback_st1) @@ -574,7 +585,7 @@ module VX_cache_bank #( || do_writeback_st1) && ~rdw_hazard_st1; end else begin - `UNUSED_VAR (dirty_valid_st1) + `UNUSED_VAR (evict_dirty_st1) assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1) || do_creq_wr_st1) && ~rdw_hazard_st1; @@ -621,32 +632,32 @@ module VX_cache_bank #( `ifdef DBG_TRACE_CACHE wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready; - wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid || line_flush_valid) - && ~(replay_fire || mem_rsp_fire || core_req_fire || line_flush_valid); + wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || line_flush_valid) + && ~(replay_fire || mem_rsp_fire || core_req_fire || line_flush_fire); always @(posedge clk) begin - if (pipeline_stall) begin - `TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw_st0=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard_st0)); + if (input_stall || pipe_stall) begin + `TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw_st0=%b, rdw_st1=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard_st0, rdw_hazard_st1)); end if (mem_rsp_fire) begin - `TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data)); + `TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data)); end if (replay_fire) begin `TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel)); end if (core_req_fire) begin if (core_req_rw) - `TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel)); + `TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel)); else `TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel)); end if (crsp_queue_fire) begin - `TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1)); + `TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1)); end if (mreq_queue_push) begin if (do_creq_wr_st1 && !WRITEBACK) - `TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1)); + `TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1)); else if (do_writeback_st1) - `TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%b, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data)); + `TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data)); else `TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1)); end diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index 6bf8f1c3e..f6b77c1c5 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -53,7 +53,6 @@ module VX_cache_data #( input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen, input wire [NUM_WAYS-1:0] way_sel, output wire [`CS_WORD_WIDTH-1:0] read_data, - output wire dirty_valid, output wire [`CS_LINE_WIDTH-1:0] dirty_data, output wire [LINE_SIZE-1:0] dirty_byteen ); @@ -69,12 +68,11 @@ module VX_cache_data #( localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1; wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; + + wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata; wire [`LOG2UP(NUM_WAYS)-1:0] way_idx; if (WRITEBACK) begin - reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0][LINE_SIZE-1:0] dirty_bytes_r; - reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0] dirty_blocks_r; - wire [`CLOG2(`CS_LINES_PER_BANK * NUM_WAYS)-1:0] way_addr; if (NUM_WAYS > 1) begin assign way_addr = {line_sel, way_idx}; @@ -82,33 +80,29 @@ module VX_cache_data #( assign way_addr = line_sel; end - always @(posedge clk) begin - if (fill) begin - dirty_bytes_r[way_addr] <= '0; - end else if (write) begin - dirty_bytes_r[way_addr] <= dirty_bytes_r[way_addr] | write_byteen; + VX_sp_ram #( + .DATAW (LINE_SIZE * NUM_WAYS), + .SIZE (`CS_LINES_PER_BANK) + ) byteen_store ( + .clk (clk), + .read (1'b1), + .write (write || fill || flush), + `UNUSED_PIN (wren), + .addr (way_addr), + .wdata (write ? (dirty_byteen | write_byteen) : ((fill || flush) ? '0 : dirty_byteen)), + .rdata (dirty_byteen) + ); + + wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] dirty_data_w; + for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin + for (genvar j = 0; j < NUM_WAYS; ++j) begin + assign dirty_data_w[j][i] = rdata[i][j]; end end - - always @(posedge clk) begin - if (reset) begin - for (integer i = 0; i < `CS_LINES_PER_BANK * NUM_WAYS; ++i) begin - dirty_blocks_r[i] <= 0; - end - end else begin - if (fill) begin - dirty_blocks_r[way_addr] <= 0; - end else if (write) begin - dirty_blocks_r[way_addr] <= 1; - end - end - end - - assign dirty_byteen = dirty_bytes_r[way_addr]; - assign dirty_valid = dirty_blocks_r[way_addr]; + assign dirty_data = dirty_data_w[way_idx]; end else begin assign dirty_byteen = '0; - assign dirty_valid = 0; + assign dirty_data = '0; end // order the data layout to perform ways multiplexing last. @@ -146,8 +140,6 @@ module VX_cache_data #( `UNUSED_PIN (valid_out) ); - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata; - VX_sp_ram #( .DATAW (`CS_LINE_WIDTH * NUM_WAYS), .SIZE (`CS_LINES_PER_BANK), @@ -172,27 +164,19 @@ module VX_cache_data #( end assign read_data = per_way_rdata[way_idx]; - wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] dirty_data_w; - for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin - for (genvar j = 0; j < NUM_WAYS; ++j) begin - assign dirty_data_w[j][i] = rdata[i][j]; - end - end - assign dirty_data = dirty_data_w[way_idx]; - `ifdef DBG_TRACE_CACHE always @(posedge clk) begin if (fill && ~stall) begin - `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data)); + `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data)); end if (flush && ~stall) begin - `TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b, byteen=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_valid, dirty_byteen)); + `TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data)); end if (read && ~stall) begin - `TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid)); + `TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid)); end if (write && ~stall) begin - `TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid)); + `TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid)); end end `endif diff --git a/hw/rtl/cache/VX_cache_tags.sv b/hw/rtl/cache/VX_cache_tags.sv index 4595bdbcf..6e57301c4 100644 --- a/hw/rtl/cache/VX_cache_tags.sv +++ b/hw/rtl/cache/VX_cache_tags.sv @@ -26,6 +26,8 @@ module VX_cache_tags #( parameter NUM_WAYS = 1, // Size of a word in bytes parameter WORD_SIZE = 1, + // Enable cache writeback + parameter WRITEBACK = 0, // Request debug identifier parameter UUID_WIDTH = 0 ) ( @@ -40,61 +42,81 @@ module VX_cache_tags #( // init/fill/lookup input wire init, + input wire flush, input wire fill, + input wire write, input wire lookup, input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr, + input wire [NUM_WAYS-1:0] way_sel, output wire [NUM_WAYS-1:0] tag_matches, - // replacement - output wire [NUM_WAYS-1:0] repl_way, - output wire [`CS_TAG_SEL_BITS-1:0] repl_tag + // eviction + output wire evict_dirty, + output wire [NUM_WAYS-1:0] evict_way, + output wire [`CS_TAG_SEL_BITS-1:0] evict_tag ); `UNUSED_SPARAM (INSTANCE_ID) `UNUSED_PARAM (BANK_ID) `UNUSED_VAR (reset) `UNUSED_VAR (lookup) - // valid, tag - localparam TAG_WIDTH = 1 + `CS_TAG_SEL_BITS; + // valid, dirty, tag + localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS; wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr); wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag; wire [NUM_WAYS-1:0] read_valid; + wire [NUM_WAYS-1:0] read_dirty; if (NUM_WAYS > 1) begin - reg [NUM_WAYS-1:0] repl_way_r; + reg [NUM_WAYS-1:0] evict_way_r; // cyclic assignment of replacement way always @(posedge clk) begin if (reset) begin - repl_way_r <= 1; - end else if (~stall) begin // hold the value on stalls prevent filling different slots twice - repl_way_r <= {repl_way_r[NUM_WAYS-2:0], repl_way_r[NUM_WAYS-1]}; + evict_way_r <= 1; + end else if (~stall) begin // holding the value on stalls prevents filling different slots twice + evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]}; end end - assign repl_way = repl_way_r; + assign evict_way = fill ? evict_way_r : way_sel; VX_onehot_mux #( .DATAW (`CS_TAG_SEL_BITS), .N (NUM_WAYS) - ) repl_tag_sel ( + ) evict_tag_sel ( .data_in (read_tag), - .sel_in (repl_way_r), - .data_out (repl_tag) + .sel_in (evict_way), + .data_out (evict_tag) ); end else begin `UNUSED_VAR (stall) - assign repl_way = 1'b1; - assign repl_tag = read_tag; + assign evict_way = 1'b1; + assign evict_tag = read_tag; end for (genvar i = 0; i < NUM_WAYS; ++i) begin - wire do_fill = fill && repl_way[i]; - wire do_write = init || do_fill; - wire line_valid = ~init; + wire do_fill = fill && evict_way[i]; + wire do_flush = flush && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode + wire do_write = WRITEBACK && write && tag_matches[i]; + + wire line_write = init || do_fill || do_flush || do_write; + wire line_valid = ~(init || flush); + + wire [TAG_WIDTH-1:0] line_wdata; + wire [TAG_WIDTH-1:0] line_rdata; + + if (WRITEBACK) begin + assign line_wdata = {line_valid, write, line_tag}; + assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata; + end else begin + assign line_wdata = {line_valid, line_tag}; + assign {read_valid[i], read_tag[i]} = line_rdata; + assign read_dirty[i] = 1'b0; + end VX_sp_ram #( .DATAW (TAG_WIDTH), @@ -103,11 +125,11 @@ module VX_cache_tags #( ) tag_store ( .clk (clk), .read (1'b1), - .write (do_write), + .write (line_write), `UNUSED_PIN (wren), .addr (line_sel), - .wdata ({line_valid, line_tag}), - .rdata ({read_valid[i], read_tag[i]}) + .wdata (line_wdata), + .rdata (line_rdata) ); end @@ -115,19 +137,29 @@ module VX_cache_tags #( assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]); end + assign evict_dirty = (| read_dirty); + + // ensure fills and flushes do not stall + `RUNTIME_ASSERT (~fill || ~stall, ("runtime error: stalled fill")); + `RUNTIME_ASSERT (~flush || ~stall, ("runtime error: stalled fill")); + `ifdef DBG_TRACE_CACHE + wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel}; always @(posedge clk) begin - if (fill && ~stall) begin - `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), repl_way, line_sel, line_tag)); + if (fill) begin + `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID))); end if (init) begin `TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel)); end + if (flush) begin + `TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty)); + end if (lookup && ~stall) begin if (tag_matches != 0) begin `TRACE(3, ("%d: %s hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); end else begin - `TRACE(3, ("%d: %s miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)); + `TRACE(3, ("%d: %s miss: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); end end end From 75f1f957d4dd396bf57ec479ad92c018e905280c Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 03:28:51 -0700 Subject: [PATCH 12/89] minor updates --- hw/rtl/Vortex.sv | 6 ++-- hw/rtl/afu/opae/vortex_afu.sv | 20 +++++++------- hw/rtl/afu/xrt/VX_afu_wrap.sv | 4 +-- hw/rtl/cache/VX_bank_flush.sv | 3 +- hw/rtl/cache/VX_cache_wrap.sv | 8 +++--- hw/rtl/core/VX_dcr_data.sv | 2 +- hw/rtl/mem/VX_local_mem.sv | 8 +++--- hw/syn/altera/opae/Makefile | 4 +-- hw/syn/xilinx/xrt/Makefile | 4 +-- sim/opaesim/Makefile | 2 +- sim/rtlsim/Makefile | 4 +-- sim/rtlsim/processor.cpp | 52 ++++++++++++++++++++--------------- sim/simx/Makefile | 2 +- sim/xrtsim/Makefile | 2 +- 14 files changed, 64 insertions(+), 57 deletions(-) diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index d3ef57c72..816558094 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -192,12 +192,12 @@ module Vortex import VX_gpu_pkg::*; ( always @(posedge clk) begin if (mem_req_fire) begin if (mem_req_rw) - `TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h data=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data)); + `TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data)); else - `TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen)); + `TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen)); end if (mem_rsp_fire) begin - `TRACE(1, ("%d: MEM Rsp: tag=0x%0h, data=0x%0h\n", $time, mem_rsp_tag, mem_rsp_data)); + `TRACE(1, ("%d: MEM Rd Rsp: tag=0x%0h, data=0x%h\n", $time, mem_rsp_tag, mem_rsp_data)); end end `endif diff --git a/hw/rtl/afu/opae/vortex_afu.sv b/hw/rtl/afu/opae/vortex_afu.sv index cd49e7ddd..63a7a6ed1 100644 --- a/hw/rtl/afu/opae/vortex_afu.sv +++ b/hw/rtl/afu/opae/vortex_afu.sv @@ -240,13 +240,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ MMIO_CMD_ARG0: begin cmd_args[0] <= 64'(cp2af_sRxPort.c0.data); `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data))); + `TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data))); `endif end MMIO_CMD_ARG1: begin cmd_args[1] <= 64'(cp2af_sRxPort.c0.data); `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data))); + `TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data))); `endif end MMIO_CMD_ARG2: begin @@ -263,13 +263,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ `ifdef SCOPE MMIO_SCOPE_WRITE: begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%0h\n", $time, cmd_scope_wdata)); + `TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%h\n", $time, cmd_scope_wdata)); `endif end `endif default: begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data))); + `TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data))); `endif end endcase @@ -305,14 +305,14 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ MMIO_SCOPE_READ: begin mmio_tx.data <= cmd_scope_rdata; `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%0h\n", $time, cmd_scope_rdata)); + `TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%h\n", $time, cmd_scope_rdata)); `endif end `endif MMIO_DEV_CAPS: begin mmio_tx.data <= dev_caps; `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%0h\n", $time, dev_caps)); + `TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%h\n", $time, dev_caps)); `endif end MMIO_ISA_CAPS: begin @@ -760,7 +760,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE); end `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data)); + `TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data)); `endif end @@ -906,7 +906,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ cci_wr_req_done <= 1; end `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data)); + `TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data)); `endif end @@ -1093,13 +1093,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ always @(posedge clk) begin for (integer i = 0; i < NUM_LOCAL_MEM_BANKS; ++i) begin if (avs_write[i] && ~avs_waitrequest[i]) begin - `TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i])); + `TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i])); end if (avs_read[i] && ~avs_waitrequest[i]) begin `TRACE(2, ("%d: AVS Rd Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i])); end if (avs_readdatavalid[i]) begin - `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h\n", $time, i, avs_readdata[i])); + `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h\n", $time, i, avs_readdata[i])); end end end diff --git a/hw/rtl/afu/xrt/VX_afu_wrap.sv b/hw/rtl/afu/xrt/VX_afu_wrap.sv index 15be69007..a844802e9 100644 --- a/hw/rtl/afu/xrt/VX_afu_wrap.sv +++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv @@ -377,13 +377,13 @@ module VX_afu_wrap #( `TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i])); end if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin - `TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%0h\n", $time, i, m_axi_mem_wdata_a[i])); + `TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i])); end if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin `TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i])); end if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin - `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i])); + `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i])); end end end diff --git a/hw/rtl/cache/VX_bank_flush.sv b/hw/rtl/cache/VX_bank_flush.sv index 15d1e8379..69afd060a 100644 --- a/hw/rtl/cache/VX_bank_flush.sv +++ b/hw/rtl/cache/VX_bank_flush.sv @@ -36,6 +36,7 @@ module VX_bank_flush #( input wire flush_out_ready, input wire mshr_empty ); + // ways interation is only needed when eviction is enabled parameter CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0); parameter STATE_IDLE = 2'd0; @@ -89,9 +90,7 @@ module VX_bank_flush #( end assign flush_in_ready = flush_in_ready_r; - assign flush_out_init = (state_r == STATE_INIT); - assign flush_out_valid = (state_r == STATE_FLUSH); assign flush_out_line = counter_r[`CS_LINE_SEL_BITS-1:0]; diff --git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv index 082d8b4e1..546f172b4 100644 --- a/hw/rtl/cache/VX_cache_wrap.sv +++ b/hw/rtl/cache/VX_cache_wrap.sv @@ -223,12 +223,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( always @(posedge clk) begin if (core_req_fire) begin if (core_bus_if[i].req_data.rw) - `TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid)); + `TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid)); else `TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid)); end if (core_rsp_fire) begin - `TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid)); + `TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid)); end end end @@ -250,14 +250,14 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( always @(posedge clk) begin if (mem_req_fire) begin if (mem_bus_if.req_data.rw) - `TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", + `TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid)); else `TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid)); end if (mem_rsp_fire) begin - `TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n", + `TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid)); end end diff --git a/hw/rtl/core/VX_dcr_data.sv b/hw/rtl/core/VX_dcr_data.sv index 58e51efc5..4ac137547 100644 --- a/hw/rtl/core/VX_dcr_data.sv +++ b/hw/rtl/core/VX_dcr_data.sv @@ -52,7 +52,7 @@ module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; ( if (dcr_bus_if.write_valid) begin `TRACE(1, ("%d: base-dcr: state=", $time)); trace_base_dcr(1, dcr_bus_if.write_addr); - `TRACE(1, (", data=0x%0h\n", dcr_bus_if.write_data)); + `TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data)); end end `endif diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index f59ebae5b..e31524927 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -310,7 +310,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( always @(posedge clk) begin if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin if (mem_bus_if[i].req_data.rw) begin - `TRACE(1, ("%d: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", + `TRACE(1, ("%d: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, req_uuid[i])); end else begin `TRACE(1, ("%d: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n", @@ -318,7 +318,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( end end if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin - `TRACE(1, ("%d: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%0h (#%0d)\n", + `TRACE(1, ("%d: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.tag, mem_bus_if[i].rsp_data.data[i], rsp_uuid[i])); end end @@ -328,7 +328,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( always @(posedge clk) begin if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin if (per_bank_req_rw[i]) begin - `TRACE(2, ("%d: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", + `TRACE(2, ("%d: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_uuid[i])); end else begin `TRACE(2, ("%d: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", @@ -336,7 +336,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( end end if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin - `TRACE(2, ("%d: %s-bank%0d rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n", + `TRACE(2, ("%d: %s-bank%0d rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, per_bank_rsp_tag[i], per_bank_rsp_data[i], per_bank_rsp_uuid[i])); end end diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile index 235c79c8d..8cd37b47c 100644 --- a/hw/syn/altera/opae/Makefile +++ b/hw/syn/altera/opae/Makefile @@ -76,9 +76,9 @@ endif # Debugigng ifdef DEBUG ifneq ($(TARGET), fpga) - CFLAGS += -DNDEBUG - else CFLAGS += $(DBG_TRACE_FLAGS) + else + CFLAGS += -DNDEBUG endif else CFLAGS += -DNDEBUG diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index f8f0f5cb0..cbf0f4068 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -115,10 +115,10 @@ endif ifdef DEBUG VPP_FLAGS += -g --debug.protocol all ifneq ($(TARGET), hw) - CFLAGS += -DNDEBUG - else VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all CFLAGS += $(DBG_TRACE_FLAGS) + else + CFLAGS += -DNDEBUG endif else VPP_FLAGS += --optimize 3 diff --git a/sim/opaesim/Makefile b/sim/opaesim/Makefile index 7b0d543d2..fcb1b84b1 100644 --- a/sim/opaesim/Makefile +++ b/sim/opaesim/Makefile @@ -88,7 +88,7 @@ ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) else - VL_FLAGS += -DNDEBUG + VL_FLAGS += -O3 -DNDEBUG CXXFLAGS += -O3 -DNDEBUG endif diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index e9487a2f4..8fd8bae79 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -70,8 +70,8 @@ ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) else - VL_FLAGS += -DNDEBUG - CXXFLAGS += -O2 -DNDEBUG + VL_FLAGS += -O3 -DNDEBUG + CXXFLAGS += -O3 -DNDEBUG endif # Enable perf counters diff --git a/sim/rtlsim/processor.cpp b/sim/rtlsim/processor.cpp index 2c31f939b..e8ce35329 100644 --- a/sim/rtlsim/processor.cpp +++ b/sim/rtlsim/processor.cpp @@ -316,11 +316,11 @@ private: auto mem_rsp_it = pending_mem_reqs_.begin(); auto mem_rsp = *mem_rsp_it; /* - printf("%0ld: [sim] MEM Rd Rsp: addr=%0lx, data=", timestamp, mem_rsp->addr); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); + printf("%0ld: [sim] MEM Rd Rsp: addr=0x%0lx, data=0x", timestamp, mem_rsp->addr); + for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { + printf("%02x", mem_rsp->block[i]); + } + printf("\n"); */ device_->m_axi_rvalid[0] = 1; device_->m_axi_rid[0] = mem_rsp->tag; @@ -347,7 +347,7 @@ private: auto mem_rsp_it = pending_mem_reqs_.begin(); auto mem_rsp = *mem_rsp_it; /* - printf("%0ld: [sim] MEM Wr Rsp: addr=%0lx\n", timestamp, mem_rsp->addr); + printf("%0ld: [sim] MEM Wr Rsp: addr=0x%0lx\n", timestamp, mem_rsp->addr); */ device_->m_axi_bvalid[0] = 1; device_->m_axi_bid[0] = mem_rsp->tag; @@ -387,11 +387,15 @@ private: } else { // process writes /* - printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); + printf("%0ld: [sim] MEM Wr: addr=0x%0lx, byteen=0x", timestamp, base_addr); + for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) { + printf("%x", (int)((byteen >> (4 * i)) & 0xf)); + } + printf(", data=0x"); + for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { + printf("%02x", data[i]); + } + printf("\n"); */ for (int i = 0; i < MEM_BLOCK_SIZE; i++) { if ((byteen >> i) & 0x1) { @@ -459,11 +463,11 @@ private: auto mem_rsp_it = pending_mem_reqs_.begin(); auto mem_rsp = *mem_rsp_it; /* - printf("%0ld: [sim] MEM Rd: tag=%0lx, addr=%0lx, data=", timestamp, mem_rsp->tag, mem_rsp->addr); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); + printf("%0ld: [sim] MEM Rd Rsp: tag=0x%0lx, addr=0x%0lx, data=0x", timestamp, mem_rsp->tag, mem_rsp->addr); + for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { + printf("%02x", mem_rsp->block[i]); + } + printf("\n"); */ memcpy(device_->mem_rsp_data.data(), mem_rsp->block.data(), MEM_BLOCK_SIZE); device_->mem_rsp_tag = mem_rsp->tag; @@ -499,11 +503,15 @@ private: } else { // process writes /* - printf("%0ld: [sim] MEM Wr: tag=%0lx, addr=%0x, byteen=%0lx, data=", timestamp, device_->mem_req_tag, byte_addr, byteen); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); + printf("%0ld: [sim] MEM Wr Req: tag=0x%0lx, addr=0x%0lx, byteen=0x", timestamp, device_->mem_req_tag, byte_addr); + for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) { + printf("%x", (int)((byteen >> (4 * i)) & 0xf)); + } + printf(", data=0x"); + for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { + printf("%d=%02x,", i, data[i]); + } + printf("\n"); */ for (int i = 0; i < MEM_BLOCK_SIZE; i++) { if ((byteen >> i) & 0x1) { @@ -530,7 +538,7 @@ private: ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE); pending_mem_reqs_.emplace_back(mem_req); - //printf("%0ld: [sim] MEM Rd Req: addr=%0x, tag=%0lx\n", timestamp, byte_addr, device_->mem_req_tag); + //printf("%0ld: [sim] MEM Rd Req: addr=0x%0lx, tag=0x%0lx\n", timestamp, byte_addr, device_->mem_req_tag); // send dram request dram_queue_.push(mem_req); diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 622f653dd..22d9726bf 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -25,7 +25,7 @@ ifdef DEBUG CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) #CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer else - CXXFLAGS += -O2 -DNDEBUG + CXXFLAGS += -O3 -DNDEBUG endif # Enable perf counters diff --git a/sim/xrtsim/Makefile b/sim/xrtsim/Makefile index dd11c8d64..1d462d1f9 100644 --- a/sim/xrtsim/Makefile +++ b/sim/xrtsim/Makefile @@ -87,7 +87,7 @@ ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) else - VL_FLAGS += -DNDEBUG + VL_FLAGS += -O3 -DNDEBUG CXXFLAGS += -O3 -DNDEBUG endif From e34f824bf999ab6ae428b2dd871b108b19877b07 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 03:56:08 -0700 Subject: [PATCH 13/89] minor update --- sim/simx/emulator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 0567eb853..8ef322beb 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -63,6 +63,7 @@ void Emulator::warp_t::clear(uint64_t startup_addr) { reg = std::rand(); #endif } + reg_file.at(0) = 0; // r0 = 0 } for (auto& reg_file : this->freg_file) { From 0709d656ca66222dfe0128b8294973122ab5ef07 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 13:32:35 -0700 Subject: [PATCH 14/89] writeback cache fixes --- hw/rtl/cache/VX_cache_bank.sv | 38 +++++++++++++++++------------------ hw/rtl/cache/VX_cache_data.sv | 14 +++++++++---- hw/rtl/cache/VX_cache_tags.sv | 22 +++++++++++--------- hw/rtl/core/VX_alu_int.sv | 4 ++-- hw/rtl/core/VX_decode.sv | 7 +++---- hw/rtl/libs/VX_dp_ram.sv | 5 +++++ hw/rtl/libs/VX_sp_ram.sv | 2 ++ 7 files changed, 53 insertions(+), 39 deletions(-) diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index ced0778af..fde1af13a 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -184,10 +184,11 @@ module VX_cache_bank #( .mshr_empty (mshr_empty) ); - wire rdw_hazard_st0; - reg rdw_hazard_st1; + wire rdw_hazard1_sel; + wire rdw_hazard2_sel; + reg rdw_hazard3_st1; - wire pipe_stall = crsp_queue_stall || rdw_hazard_st1; + wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1; // inputs arbitration: // mshr replay has highest priority to maximize utilization since there is no miss. @@ -206,14 +207,16 @@ module VX_cache_bank #( wire creq_enable = creq_grant && core_req_valid; assign replay_ready = replay_grant - && ~rdw_hazard_st0 + && ~rdw_hazard1_sel && ~pipe_stall; assign mem_rsp_ready = fill_grant + && ~rdw_hazard2_sel && ~pipe_stall; assign line_flush_ready = flush_grant && ~mreq_queue_alm_full + && ~rdw_hazard2_sel && ~pipe_stall; assign core_req_ready = creq_grant @@ -376,15 +379,14 @@ module VX_cache_bank #( `UNUSED_VAR (do_write_miss_st1) // ensure mshr replay always get a hit - `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: missed mshr replay")); + `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay")); // detect BRAM's read-during-write hazard - assign rdw_hazard_st0 = do_fill_st0; // stall cycle after a fill - wire rdw_case1 = do_cache_rd_st0 && do_cache_wr_st1 && (addr_st0 == addr_st1); // standard cache access - wire rdw_case2 = WRITEBACK && (do_flush_st0 || do_fill_st0) && do_cache_wr_st1; // a writeback can evict preceeding write - always @(posedge clk) begin // after a write to same address - rdw_hazard_st1 <= (rdw_case1 || rdw_case2) - && ~rdw_hazard_st1; // invalidate if pipeline stalled to avoid repeats + assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill + assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceeding write + always @(posedge clk) begin // stall reads following writes to same address + rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (addr_st0 == addr_st1) + && ~rdw_hazard3_st1; // release pipeline stall end wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}}; @@ -427,8 +429,8 @@ module VX_cache_bank #( .stall (pipe_stall), .read (do_cache_rd_st1), - .fill (do_fill_st1 && ~rdw_hazard_st1), - .flush (do_flush_st1 && ~rdw_hazard_st1), + .fill (do_fill_st1), + .flush (do_flush_st1), .write (do_cache_wr_st1), .way_sel (way_sel_st1), .line_addr (addr_st1), @@ -556,7 +558,7 @@ module VX_cache_bank #( ) core_rsp_queue ( .clk (clk), .reset (crsp_queue_reset), - .valid_in (crsp_queue_valid && ~rdw_hazard_st1), + .valid_in (crsp_queue_valid && ~rdw_hazard3_st1), .ready_in (crsp_queue_ready), .data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}), .data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}), @@ -582,13 +584,11 @@ module VX_cache_bank #( if (WRITEBACK) begin assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1) - || do_writeback_st1) - && ~rdw_hazard_st1; + || do_writeback_st1); end else begin `UNUSED_VAR (evict_dirty_st1) assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1) - || do_creq_wr_st1) - && ~rdw_hazard_st1; + || do_creq_wr_st1); end assign mreq_queue_pop = mem_req_valid && mem_req_ready; @@ -636,7 +636,7 @@ module VX_cache_bank #( && ~(replay_fire || mem_rsp_fire || core_req_fire || line_flush_fire); always @(posedge clk) begin if (input_stall || pipe_stall) begin - `TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw_st0=%b, rdw_st1=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard_st0, rdw_hazard_st1)); + `TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1)); end if (mem_rsp_fire) begin `TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data)); diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index f6b77c1c5..45dc8b210 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -82,10 +82,12 @@ module VX_cache_data #( VX_sp_ram #( .DATAW (LINE_SIZE * NUM_WAYS), - .SIZE (`CS_LINES_PER_BANK) + .SIZE (`CS_LINES_PER_BANK), + .NO_RWCHECK (1), + .RW_ASSERT (1) ) byteen_store ( .clk (clk), - .read (1'b1), + .read (write || fill || flush), .write (write || fill || flush), `UNUSED_PIN (wren), .addr (way_addr), @@ -140,14 +142,18 @@ module VX_cache_data #( `UNUSED_PIN (valid_out) ); + wire line_read = (read && ~stall) + || (WRITEBACK && (fill || flush)); + VX_sp_ram #( .DATAW (`CS_LINE_WIDTH * NUM_WAYS), .SIZE (`CS_LINES_PER_BANK), .WRENW (BYTEENW), - .NO_RWCHECK (1) + .NO_RWCHECK (1), + .RW_ASSERT (1) ) data_store ( .clk (clk), - .read (1'b1), + .read (line_read), .write (write || fill), .wren (wren), .addr (line_sel), diff --git a/hw/rtl/cache/VX_cache_tags.sv b/hw/rtl/cache/VX_cache_tags.sv index 6e57301c4..4cba8f299 100644 --- a/hw/rtl/cache/VX_cache_tags.sv +++ b/hw/rtl/cache/VX_cache_tags.sv @@ -97,12 +97,17 @@ module VX_cache_tags #( assign evict_tag = read_tag; end + // fill and flush need to also read in writeback mode + wire fill_s = fill && (!WRITEBACK || ~stall); + wire flush_s = flush && (!WRITEBACK || ~stall); + for (genvar i = 0; i < NUM_WAYS; ++i) begin - wire do_fill = fill && evict_way[i]; - wire do_flush = flush && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode + wire do_fill = fill_s && evict_way[i]; + wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode wire do_write = WRITEBACK && write && tag_matches[i]; + wire line_read = (lookup && ~stall) || (WRITEBACK && (fill_s || flush_s)); wire line_write = init || do_fill || do_flush || do_write; wire line_valid = ~(init || flush); @@ -121,10 +126,11 @@ module VX_cache_tags #( VX_sp_ram #( .DATAW (TAG_WIDTH), .SIZE (`CS_LINES_PER_BANK), - .NO_RWCHECK (1) + .NO_RWCHECK (1), + .RW_ASSERT (1) ) tag_store ( .clk (clk), - .read (1'b1), + .read (line_read), .write (line_write), `UNUSED_PIN (wren), .addr (line_sel), @@ -139,20 +145,16 @@ module VX_cache_tags #( assign evict_dirty = (| read_dirty); - // ensure fills and flushes do not stall - `RUNTIME_ASSERT (~fill || ~stall, ("runtime error: stalled fill")); - `RUNTIME_ASSERT (~flush || ~stall, ("runtime error: stalled fill")); - `ifdef DBG_TRACE_CACHE wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel}; always @(posedge clk) begin - if (fill) begin + if (fill && ~stall) begin `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID))); end if (init) begin `TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel)); end - if (flush) begin + if (flush && ~stall) begin `TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty)); end if (lookup && ~stall) begin diff --git a/hw/rtl/core/VX_alu_int.sv b/hw/rtl/core/VX_alu_int.sv index 47bfcc6bf..9d5e32fe3 100644 --- a/hw/rtl/core/VX_alu_int.sv +++ b/hw/rtl/core/VX_alu_int.sv @@ -108,7 +108,7 @@ module VX_alu_int #( 2'b00: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; // AND 2'b01: msc_result[i] = alu_in1[i] | alu_in2_imm[i]; // OR 2'b10: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; // XOR - 2'b11: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS-1:0]; // SLL + default: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS-1:0]; // SLL endcase end assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW @@ -126,7 +126,7 @@ module VX_alu_int #( 3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW 3'b101: alu_result[i] = sub_result_w[i]; // SUBW 3'b110: alu_result[i] = shr_result_w[i]; // SRLW, SRAW, SRLIW, SRAIW - 3'b111: alu_result[i] = msc_result_w[i]; // SLLW + default: alu_result[i] = msc_result_w[i]; // SLLW endcase end end diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index 9660859ce..95157055c 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -99,7 +99,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( 3'h4: r_type = `INST_ALU_XOR; 3'h5: r_type = func7[5] ? `INST_ALU_SRA : `INST_ALU_SRL; 3'h6: r_type = `INST_ALU_OR; - 3'h7: r_type = `INST_ALU_AND; + default: r_type = `INST_ALU_AND; endcase end @@ -111,8 +111,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( 3'h4: b_type = `INST_BR_LT; 3'h5: b_type = `INST_BR_GE; 3'h6: b_type = `INST_BR_LTU; - 3'h7: b_type = `INST_BR_GEU; - default: b_type = 'x; + default: b_type = `INST_BR_GEU; endcase end @@ -139,7 +138,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( 3'h4: m_type = `INST_M_DIV; 3'h5: m_type = `INST_M_DIVU; 3'h6: m_type = `INST_M_REM; - 3'h7: m_type = `INST_M_REMU; + default: m_type = `INST_M_REMU; endcase end `endif diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index fa11a541f..d307a9576 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -22,6 +22,7 @@ module VX_dp_ram #( parameter OUT_REG = 0, parameter NO_RWCHECK = 0, parameter LUTRAM = 0, + parameter RW_ASSERT = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, @@ -50,6 +51,7 @@ module VX_dp_ram #( end \ end + `UNUSED_PARAM (RW_ASSERT) `UNUSED_VAR (read) `ifdef SYNTHESIS @@ -307,6 +309,9 @@ module VX_dp_ram #( assign rdata = ram[raddr]; end else begin assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; + if (RW_ASSERT) begin + `RUNTIME_ASSERT (~read || (rdata == ram[raddr]), ("read after write mismatch")); + end end end `endif diff --git a/hw/rtl/libs/VX_sp_ram.sv b/hw/rtl/libs/VX_sp_ram.sv index 297a23d20..a62099b1b 100644 --- a/hw/rtl/libs/VX_sp_ram.sv +++ b/hw/rtl/libs/VX_sp_ram.sv @@ -21,6 +21,7 @@ module VX_sp_ram #( parameter WRENW = 1, parameter OUT_REG = 0, parameter NO_RWCHECK = 0, + parameter RW_ASSERT = 0, parameter LUTRAM = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", @@ -42,6 +43,7 @@ module VX_sp_ram #( .WRENW (WRENW), .OUT_REG (OUT_REG), .NO_RWCHECK (NO_RWCHECK), + .RW_ASSERT (RW_ASSERT), .LUTRAM (LUTRAM), .INIT_ENABLE (INIT_ENABLE), .INIT_FILE (INIT_FILE), From a91dabcc726465e7be3decadc3280f3088af86b5 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 13:52:04 -0700 Subject: [PATCH 15/89] minor update --- hw/rtl/VX_config.vh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index f43eb2581..804715aad 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -537,7 +537,7 @@ // Enable Cache Writeback `ifndef DCACHE_WRITEBACK -`define DCACHE_WRITEBACK 1 +`define DCACHE_WRITEBACK 0 `endif // LMEM Configurable Knobs //////////////////////////////////////////////////// @@ -601,7 +601,7 @@ // Enable Cache Writeback `ifndef L2_WRITEBACK -`define L2_WRITEBACK 1 +`define L2_WRITEBACK 0 `endif // L3cache Configurable Knobs ///////////////////////////////////////////////// @@ -647,7 +647,7 @@ // Enable Cache Writeback `ifndef L3_WRITEBACK -`define L3_WRITEBACK 1 +`define L3_WRITEBACK 0 `endif // ISA Extensions ///////////////////////////////////////////////////////////// From 84571631140a0e859a3c3257c61afd3a2e89d583 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 14:43:49 -0700 Subject: [PATCH 16/89] adding dirty bytes configuration to writeback cache --- hw/rtl/VX_cluster.sv | 1 + hw/rtl/VX_socket.sv | 1 + hw/rtl/Vortex.sv | 1 + hw/rtl/cache/VX_cache.sv | 5 ++++ hw/rtl/cache/VX_cache_bank.sv | 4 +++ hw/rtl/cache/VX_cache_cluster.sv | 4 +++ hw/rtl/cache/VX_cache_data.sv | 32 ++++++++++++---------- hw/rtl/cache/VX_cache_tags.sv | 12 ++++++--- hw/rtl/cache/VX_cache_top.sv | 46 +++++++++++++++++++------------- hw/rtl/cache/VX_cache_wrap.sv | 4 +++ 10 files changed, 74 insertions(+), 36 deletions(-) diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index 108e95073..c84aadcb7 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -100,6 +100,7 @@ module VX_cluster import VX_gpu_pkg::*; #( .TAG_WIDTH (L2_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L2_WRITEBACK), + .DIRTY_BYTES (`L2_WRITEBACK), .UUID_WIDTH (`UUID_WIDTH), .CORE_OUT_BUF (2), .MEM_OUT_BUF (2), diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index abdf67612..8d7b86160 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -150,6 +150,7 @@ module VX_socket import VX_gpu_pkg::*; #( .UUID_WIDTH (`UUID_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`DCACHE_WRITEBACK), + .DIRTY_BYTES (`DCACHE_WRITEBACK), .NC_ENABLE (1), .CORE_OUT_BUF (2), .MEM_OUT_BUF (2) diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index 816558094..d3a308009 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -84,6 +84,7 @@ module Vortex import VX_gpu_pkg::*; ( .TAG_WIDTH (L2_MEM_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L3_WRITEBACK), + .DIRTY_BYTES (`L3_WRITEBACK), .UUID_WIDTH (`UUID_WIDTH), .CORE_OUT_BUF (2), .MEM_OUT_BUF (2), diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index acaa1dac3..dfad8baad 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -45,6 +45,9 @@ module VX_cache import VX_gpu_pkg::*; #( // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -71,6 +74,7 @@ module VX_cache import VX_gpu_pkg::*; #( `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter")) `STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter")) + `STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter")) localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS); localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS); @@ -373,6 +377,7 @@ module VX_cache import VX_gpu_pkg::*; #( .MSHR_SIZE (MSHR_SIZE), .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), + .DIRTY_BYTES (DIRTY_BYTES), .WRITEBACK (WRITEBACK), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index fde1af13a..ca0c0b3cf 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -44,6 +44,9 @@ module VX_cache_bank #( // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -419,6 +422,7 @@ module VX_cache_bank #( .WORD_SIZE (WORD_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH) ) cache_data ( .clk (clk), diff --git a/hw/rtl/cache/VX_cache_cluster.sv b/hw/rtl/cache/VX_cache_cluster.sv index c567ddbc5..716e69561 100644 --- a/hw/rtl/cache/VX_cache_cluster.sv +++ b/hw/rtl/cache/VX_cache_cluster.sv @@ -49,6 +49,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -155,6 +158,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (ARB_TAG_WIDTH), .TAG_SEL_IDX (TAG_SEL_IDX), diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index 45dc8b210..747f02917 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -30,6 +30,8 @@ module VX_cache_data #( parameter WRITE_ENABLE = 1, // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, // Request debug identifier parameter UUID_WIDTH = 0 ) ( @@ -80,20 +82,22 @@ module VX_cache_data #( assign way_addr = line_sel; end - VX_sp_ram #( - .DATAW (LINE_SIZE * NUM_WAYS), - .SIZE (`CS_LINES_PER_BANK), - .NO_RWCHECK (1), - .RW_ASSERT (1) - ) byteen_store ( - .clk (clk), - .read (write || fill || flush), - .write (write || fill || flush), - `UNUSED_PIN (wren), - .addr (way_addr), - .wdata (write ? (dirty_byteen | write_byteen) : ((fill || flush) ? '0 : dirty_byteen)), - .rdata (dirty_byteen) - ); + if (DIRTY_BYTES) begin + VX_sp_ram #( + .DATAW (LINE_SIZE * NUM_WAYS), + .SIZE (`CS_LINES_PER_BANK) + ) byteen_store ( + .clk (clk), + .read (write || fill || flush), + .write (write || fill || flush), + `UNUSED_PIN (wren), + .addr (way_addr), + .wdata (write ? (dirty_byteen | write_byteen) : ((fill || flush) ? '0 : dirty_byteen)), + .rdata (dirty_byteen) + ); + end else begin + assign dirty_byteen = {LINE_SIZE{1'b1}}; + end wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] dirty_data_w; for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin diff --git a/hw/rtl/cache/VX_cache_tags.sv b/hw/rtl/cache/VX_cache_tags.sv index 4cba8f299..d7a948c62 100644 --- a/hw/rtl/cache/VX_cache_tags.sv +++ b/hw/rtl/cache/VX_cache_tags.sv @@ -107,7 +107,7 @@ module VX_cache_tags #( wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode wire do_write = WRITEBACK && write && tag_matches[i]; - wire line_read = (lookup && ~stall) || (WRITEBACK && (fill_s || flush_s)); + wire line_read = (WRITEBACK && (fill_s || flush_s)); wire line_write = init || do_fill || do_flush || do_write; wire line_valid = ~(init || flush); @@ -159,9 +159,15 @@ module VX_cache_tags #( end if (lookup && ~stall) begin if (tag_matches != 0) begin - `TRACE(3, ("%d: %s hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); + if (write) + `TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); + else + `TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); end else begin - `TRACE(3, ("%d: %s miss: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); + if (write) + `TRACE(3, ("%d: %s read-miss: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); + else + `TRACE(3, ("%d: %s read-miss: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); end end end diff --git a/hw/rtl/cache/VX_cache_top.sv b/hw/rtl/cache/VX_cache_top.sv index 59dd1c364..0959701aa 100644 --- a/hw/rtl/cache/VX_cache_top.sv +++ b/hw/rtl/cache/VX_cache_top.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,20 +20,20 @@ module VX_cache_top import VX_gpu_pkg::*; #( parameter NUM_REQS = 4, // Size of cache in bytes - parameter CACHE_SIZE = 16384, + parameter CACHE_SIZE = 16384, // Size of line inside a bank in bytes - parameter LINE_SIZE = 64, + parameter LINE_SIZE = 64, // Number of banks parameter NUM_BANKS = 4, // Number of associative ways parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = 4, + parameter WORD_SIZE = 4, // Core Response Queue Size parameter CRSQ_SIZE = 2, // Miss Reserv Queue Knob - parameter MSHR_SIZE = 16, + parameter MSHR_SIZE = 16, // Memory Response Queue Size parameter MRSQ_SIZE = 0, // Memory Request Queue Size @@ -42,6 +42,12 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Enable cache writeable parameter WRITE_ENABLE = 1, + // Enable cache writeback + parameter WRITEBACK = 0, + + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -55,7 +61,7 @@ module VX_cache_top import VX_gpu_pkg::*; #( parameter MEM_OUT_BUF = 2, parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS) - ) ( + ) ( input wire clk, input wire reset, @@ -82,17 +88,17 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Memory request output wire mem_req_valid, - output wire mem_req_rw, + output wire mem_req_rw, output wire [LINE_SIZE-1:0] mem_req_byteen, output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr, - output wire [`CS_LINE_WIDTH-1:0] mem_req_data, - output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, + output wire [`CS_LINE_WIDTH-1:0] mem_req_data, + output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, input wire mem_req_ready, - + // Memory response - input wire mem_rsp_valid, + input wire mem_rsp_valid, input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data, - input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag, + input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag, output wire mem_rsp_ready ); VX_mem_bus_if #( @@ -127,18 +133,18 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Memory request assign mem_req_valid = mem_bus_if.req_valid; - assign mem_req_rw = mem_bus_if.req_data.rw; + assign mem_req_rw = mem_bus_if.req_data.rw; assign mem_req_byteen = mem_bus_if.req_data.byteen; assign mem_req_addr = mem_bus_if.req_data.addr; - assign mem_req_data = mem_bus_if.req_data.data; - assign mem_req_tag = mem_bus_if.req_data.tag; + assign mem_req_data = mem_bus_if.req_data.data; + assign mem_req_tag = mem_bus_if.req_data.tag; assign mem_bus_if.req_ready = mem_req_ready; `UNUSED_VAR (mem_bus_if.req_data.atype) - + // Memory response - assign mem_bus_if.rsp_valid = mem_rsp_valid; + assign mem_bus_if.rsp_valid = mem_rsp_valid; assign mem_bus_if.rsp_data.data = mem_rsp_data; - assign mem_bus_if.rsp_data.tag = mem_rsp_tag; + assign mem_bus_if.rsp_data.tag = mem_rsp_tag; assign mem_rsp_ready = mem_bus_if.rsp_ready; VX_cache #( @@ -156,6 +162,8 @@ module VX_cache_top import VX_gpu_pkg::*; #( .TAG_WIDTH (TAG_WIDTH), .UUID_WIDTH (UUID_WIDTH), .WRITE_ENABLE (WRITE_ENABLE), + .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), .CORE_OUT_BUF (CORE_OUT_BUF), .MEM_OUT_BUF (MEM_OUT_BUF) ) cache ( diff --git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv index 546f172b4..37940297f 100644 --- a/hw/rtl/cache/VX_cache_wrap.sv +++ b/hw/rtl/cache/VX_cache_wrap.sv @@ -48,6 +48,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( // Enable cache writeback parameter WRITEBACK = 0, + // Enable dirty bytes on writeback + parameter DIRTY_BYTES = 0, + // Request debug identifier parameter UUID_WIDTH = 0, @@ -187,6 +190,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), .CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF), From 3223a40a76de4e2af78fb418b5c2f51b6dcd1f73 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 14:58:35 -0700 Subject: [PATCH 17/89] Verilator optimization flags update --- sim/opaesim/Makefile | 6 +++--- sim/rtlsim/Makefile | 4 ++-- sim/simx/Makefile | 2 +- sim/xrtsim/Makefile | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sim/opaesim/Makefile b/sim/opaesim/Makefile index fcb1b84b1..2a4eaf02d 100644 --- a/sim/opaesim/Makefile +++ b/sim/opaesim/Makefile @@ -88,8 +88,8 @@ ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) else - VL_FLAGS += -O3 -DNDEBUG - CXXFLAGS += -O3 -DNDEBUG + VL_FLAGS += -DNDEBUG + CXXFLAGS += -O2 -DNDEBUG endif # Enable scope analyzer @@ -123,7 +123,7 @@ $(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh $(SCRIPT_DIR)/gen_config.py -i $^ -o $@ $(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON) - verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@ + verilator --build --exe $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@ clean: rm -rf $(DESTDIR)/$(PROJECT).obj_dir diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index 8fd8bae79..e9487a2f4 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -70,8 +70,8 @@ ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) else - VL_FLAGS += -O3 -DNDEBUG - CXXFLAGS += -O3 -DNDEBUG + VL_FLAGS += -DNDEBUG + CXXFLAGS += -O2 -DNDEBUG endif # Enable perf counters diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 22d9726bf..622f653dd 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -25,7 +25,7 @@ ifdef DEBUG CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) #CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer else - CXXFLAGS += -O3 -DNDEBUG + CXXFLAGS += -O2 -DNDEBUG endif # Enable perf counters diff --git a/sim/xrtsim/Makefile b/sim/xrtsim/Makefile index 1d462d1f9..c2128f3c4 100644 --- a/sim/xrtsim/Makefile +++ b/sim/xrtsim/Makefile @@ -87,8 +87,8 @@ ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) else - VL_FLAGS += -O3 -DNDEBUG - CXXFLAGS += -O3 -DNDEBUG + VL_FLAGS += -DNDEBUG + CXXFLAGS += -O2 -DNDEBUG endif # Enable scope analyzer @@ -119,7 +119,7 @@ $(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml $(SCRIPT_DIR)/scope.py $^ -o $@ $(DESTDIR)/$(PROJECT): $(SRCS) $(SCOPE_JSON) - verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@ + verilator --build --exe $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@ clean: rm -rf $(DESTDIR)/$(PROJECT).obj_dir From a62e651b026fa9f5fbe9b6bcc704bf781784105a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 15:51:52 -0700 Subject: [PATCH 18/89] minor update --- hw/rtl/libs/VX_dp_ram.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index d307a9576..13589bcdf 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -298,7 +298,7 @@ module VX_dp_ram #( if (write) begin ram[waddr] <= ram_n; end - prev_write <= (| wren); + prev_write <= write; prev_data <= ram[waddr]; prev_waddr <= waddr; end From 724cb4084918f7b01938ba4b0e0416b9becb84a0 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 16:01:08 -0700 Subject: [PATCH 19/89] minor update --- hw/rtl/libs/VX_dp_ram.sv | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 13589bcdf..2ed12ebad 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -54,6 +54,8 @@ module VX_dp_ram #( `UNUSED_PARAM (RW_ASSERT) `UNUSED_VAR (read) + `RUNTIME_ASSERT(~write || (| wren), ("invalid write enable mask")); + `ifdef SYNTHESIS if (WRENW > 1) begin `ifdef QUARTUS @@ -310,7 +312,7 @@ module VX_dp_ram #( end else begin assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; if (RW_ASSERT) begin - `RUNTIME_ASSERT (~read || (rdata == ram[raddr]), ("read after write mismatch")); + `RUNTIME_ASSERT(~read || (rdata == ram[raddr]), ("read after write mismatch")); end end end From eab1791d466b90f408909ff627d38cef5127445d Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 16:07:46 -0700 Subject: [PATCH 20/89] CI script update --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 388ec9ab7..8e9bbeae9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -115,6 +115,7 @@ jobs: runs-on: ubuntu-20.04 needs: build strategy: + fail-fast: false matrix: name: [regression, opencl, config1, config2, debug, stress] xlen: [32, 64] @@ -165,7 +166,6 @@ jobs: else ./ci/regression.sh --${{ matrix.name }} fi - continue-on-error: true complete: runs-on: ubuntu-20.04 From 2a9895c3373de54f15b0e5b9fc241242a4ee4d14 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 16:20:23 -0700 Subject: [PATCH 21/89] minor update --- hw/rtl/libs/VX_dp_ram.sv | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 2ed12ebad..17d602a78 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -54,7 +54,9 @@ module VX_dp_ram #( `UNUSED_PARAM (RW_ASSERT) `UNUSED_VAR (read) - `RUNTIME_ASSERT(~write || (| wren), ("invalid write enable mask")); + if (WRENW > 1) begin + `RUNTIME_ASSERT(~write || (| wren), ("invalid write enable mask")); + end `ifdef SYNTHESIS if (WRENW > 1) begin From bce5226614774e6901c12ea2a5d21d5480633499 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 17:07:01 -0700 Subject: [PATCH 22/89] minor update --- hw/rtl/core/VX_alu_int.sv | 4 ++-- hw/rtl/core/VX_decode.sv | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/hw/rtl/core/VX_alu_int.sv b/hw/rtl/core/VX_alu_int.sv index 9d5e32fe3..47bfcc6bf 100644 --- a/hw/rtl/core/VX_alu_int.sv +++ b/hw/rtl/core/VX_alu_int.sv @@ -108,7 +108,7 @@ module VX_alu_int #( 2'b00: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; // AND 2'b01: msc_result[i] = alu_in1[i] | alu_in2_imm[i]; // OR 2'b10: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; // XOR - default: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS-1:0]; // SLL + 2'b11: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS-1:0]; // SLL endcase end assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW @@ -126,7 +126,7 @@ module VX_alu_int #( 3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW 3'b101: alu_result[i] = sub_result_w[i]; // SUBW 3'b110: alu_result[i] = shr_result_w[i]; // SRLW, SRAW, SRLIW, SRAIW - default: alu_result[i] = msc_result_w[i]; // SLLW + 3'b111: alu_result[i] = msc_result_w[i]; // SLLW endcase end end diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index 95157055c..9660859ce 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -99,7 +99,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( 3'h4: r_type = `INST_ALU_XOR; 3'h5: r_type = func7[5] ? `INST_ALU_SRA : `INST_ALU_SRL; 3'h6: r_type = `INST_ALU_OR; - default: r_type = `INST_ALU_AND; + 3'h7: r_type = `INST_ALU_AND; endcase end @@ -111,7 +111,8 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( 3'h4: b_type = `INST_BR_LT; 3'h5: b_type = `INST_BR_GE; 3'h6: b_type = `INST_BR_LTU; - default: b_type = `INST_BR_GEU; + 3'h7: b_type = `INST_BR_GEU; + default: b_type = 'x; endcase end @@ -138,7 +139,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( 3'h4: m_type = `INST_M_DIV; 3'h5: m_type = `INST_M_DIVU; 3'h6: m_type = `INST_M_REM; - default: m_type = `INST_M_REMU; + 3'h7: m_type = `INST_M_REMU; endcase end `endif From 5600a8dd4206dff99184d808c31feb9c4d05a3ec Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 29 Jul 2024 19:49:12 -0700 Subject: [PATCH 23/89] writeback cache fixes --- hw/rtl/cache/VX_cache_bank.sv | 50 +++++++++++++++++++++----------- hw/rtl/cache/VX_cache_data.sv | 20 +++++++------ hw/rtl/cache/VX_cache_flush.sv | 3 +- hw/rtl/cache/VX_cache_init.sv | 52 ---------------------------------- 4 files changed, 47 insertions(+), 78 deletions(-) delete mode 100644 hw/rtl/cache/VX_cache_init.sv diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index ca0c0b3cf..83800d536 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -135,7 +135,7 @@ module VX_cache_bank #( wire [MSHR_ADDR_WIDTH-1:0] replay_id; wire replay_ready; - wire is_init_st0; + wire is_init_st0, is_init_st1; wire is_flush_st0, is_flush_st1; wire [NUM_WAYS-1:0] flush_way_st0; @@ -341,14 +341,14 @@ module VX_cache_bank #( assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, addr_st0[`CS_LINE_SEL_BITS-1:0]} : addr_st0; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1), + .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_st0, is_flush2_st0, is_replay_st0, is_fill_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}), - .data_out ({valid_st1, is_flush_st1, is_replay_st1, is_fill_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1}) + .data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}), + .data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1}) ); // we have a tag hit @@ -362,22 +362,24 @@ module VX_cache_bank #( wire is_read_st1 = is_creq_st1 && ~rw_st1; wire is_write_st1 = is_creq_st1 && rw_st1; + + wire do_init_st1 = valid_st1 && is_init_st1; + wire do_fill_st1 = valid_st1 && is_fill_st1; + wire do_flush_st1 = valid_st1 && is_flush_st1; + wire do_creq_rd_st1 = valid_st1 && is_read_st1; wire do_creq_wr_st1 = valid_st1 && is_write_st1; - wire do_fill_st1 = valid_st1 && is_fill_st1; wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1; wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1; - wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1; - wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1; - wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1; wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1; wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1; wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1; - wire do_flush_st1 = valid_st1 && is_flush_st1; + wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1; + wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1; `UNUSED_VAR (do_write_miss_st1) @@ -432,6 +434,7 @@ module VX_cache_bank #( .stall (pipe_stall), + .init (do_init_st1), .read (do_cache_rd_st1), .fill (do_fill_st1), .flush (do_flush_st1), @@ -582,11 +585,17 @@ module VX_cache_bank #( wire mreq_queue_rw; wire mreq_queue_flush; - wire is_evict_st1 = (is_fill_st1 || is_flush_st1) && evict_dirty_st1; - wire do_writeback_st1 = valid_st1 && is_evict_st1; + wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1; + wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1; + wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1; `UNUSED_VAR (do_writeback_st1) if (WRITEBACK) begin + if (DIRTY_BYTES) begin + // ensure dirty bytes are valid + wire has_dirty_bytes = (| dirty_byteen_st1); + `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes")); + end assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1) || do_writeback_st1); end else begin @@ -595,14 +604,23 @@ module VX_cache_bank #( || do_creq_wr_st1); end - assign mreq_queue_pop = mem_req_valid && mem_req_ready; - assign mreq_queue_rw = WRITE_ENABLE && (WRITEBACK ? is_evict_st1 : rw_st1); + assign mreq_queue_pop = mem_req_valid && mem_req_ready; assign mreq_queue_addr = addr_st1; - assign mreq_queue_id = mshr_id_st1; - assign mreq_queue_data = is_write_st1 ? write_data_st1 : dirty_data_st1; - assign mreq_queue_byteen = is_write_st1 ? write_byteen_st1 : dirty_byteen_st1; + assign mreq_queue_id = mshr_id_st1; assign mreq_queue_flush = creq_flush_st1; + if (WRITE_ENABLE) begin + assign mreq_queue_rw = WRITEBACK ? is_fill_or_flush_st1 : rw_st1; + assign mreq_queue_data = WRITEBACK ? dirty_data_st1 : write_data_st1; + assign mreq_queue_byteen = WRITEBACK ? dirty_byteen_st1 : write_byteen_st1; + end else begin + assign mreq_queue_rw = 0; + assign mreq_queue_data = 0; + assign mreq_queue_byteen = 0; + `UNUSED_VAR (dirty_data_st1) + `UNUSED_VAR (dirty_byteen_st1) + end + `RESET_RELAY (mreq_queue_reset, reset); VX_fifo_queue #( diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index 747f02917..ecfefa26d 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -44,6 +44,7 @@ module VX_cache_data #( input wire stall, + input wire init, input wire read, input wire fill, input wire flush, @@ -64,6 +65,7 @@ module VX_cache_data #( `UNUSED_VAR (reset) `UNUSED_VAR (stall) `UNUSED_VAR (line_addr) + `UNUSED_VAR (init) `UNUSED_VAR (read) `UNUSED_VAR (flush) @@ -75,24 +77,24 @@ module VX_cache_data #( wire [`LOG2UP(NUM_WAYS)-1:0] way_idx; if (WRITEBACK) begin - wire [`CLOG2(`CS_LINES_PER_BANK * NUM_WAYS)-1:0] way_addr; - if (NUM_WAYS > 1) begin - assign way_addr = {line_sel, way_idx}; - end else begin - assign way_addr = line_sel; - end - if (DIRTY_BYTES) begin + wire [`CLOG2(`CS_LINES_PER_BANK * NUM_WAYS)-1:0] way_addr; + if (NUM_WAYS > 1) begin + assign way_addr = {line_sel, way_idx}; + end else begin + assign way_addr = line_sel; + end + VX_sp_ram #( .DATAW (LINE_SIZE * NUM_WAYS), .SIZE (`CS_LINES_PER_BANK) ) byteen_store ( .clk (clk), .read (write || fill || flush), - .write (write || fill || flush), + .write (init || write || fill || flush), `UNUSED_PIN (wren), .addr (way_addr), - .wdata (write ? (dirty_byteen | write_byteen) : ((fill || flush) ? '0 : dirty_byteen)), + .wdata (write ? (dirty_byteen | write_byteen) : ((init || fill || flush) ? '0 : dirty_byteen)), .rdata (dirty_byteen) ); end else begin diff --git a/hw/rtl/cache/VX_cache_flush.sv b/hw/rtl/cache/VX_cache_flush.sv index 7c46a48f0..70fbf0584 100644 --- a/hw/rtl/cache/VX_cache_flush.sv +++ b/hw/rtl/cache/VX_cache_flush.sv @@ -119,11 +119,12 @@ module VX_cache_flush #( STATE_WAIT: begin if (no_inflight_reqs) begin state_n = STATE_FLUSH; + flush_done_n = '0; end end STATE_FLUSH: begin flush_done_n = flush_done | flush_ready; - if (flush_done_n == 0) begin + if (flush_done_n == {NUM_BANKS{1'b1}}) begin state_n = STATE_DONE; lock_released_n = flush_req_mask; end diff --git a/hw/rtl/cache/VX_cache_init.sv b/hw/rtl/cache/VX_cache_init.sv deleted file mode 100644 index 3cccdcdae..000000000 --- a/hw/rtl/cache/VX_cache_init.sv +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -`include "VX_cache_define.vh" - -// cache flush unit -module VX_cache_init #( - // Size of cache in bytes - parameter CACHE_SIZE = 1024, - // Size of line inside a bank in bytes - parameter LINE_SIZE = 16, - // Number of banks - parameter NUM_BANKS = 1, - // Number of associative ways - parameter NUM_WAYS = 1 -) ( - input wire clk, - input wire reset, - output wire [`CS_LINE_SEL_BITS-1:0] addr_out, - output wire valid_out -); - reg enabled; - reg [`CS_LINE_SEL_BITS-1:0] line_ctr; - - always @(posedge clk) begin - if (reset) begin - enabled <= 1; - line_ctr <= '0; - end else begin - if (enabled) begin - if (line_ctr == ((2 ** `CS_LINE_SEL_BITS)-1)) begin - enabled <= 0; - end - line_ctr <= line_ctr + `CS_LINE_SEL_BITS'(1); - end - end - end - - assign addr_out = line_ctr; - assign valid_out = enabled; - -endmodule From 22b0525c51822bee51cf73e307199e83e3a7ef4a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 00:06:44 -0700 Subject: [PATCH 24/89] writeback cache fixes --- hw/rtl/VX_config.vh | 6 ++-- hw/rtl/cache/VX_cache_bank.sv | 28 +++++++++++------- hw/rtl/cache/VX_cache_data.sv | 56 ++++++++++++++++++----------------- hw/rtl/cache/VX_cache_tags.sv | 8 ++--- hw/rtl/libs/VX_dp_ram.sv | 2 +- 5 files changed, 55 insertions(+), 45 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 804715aad..f43eb2581 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -537,7 +537,7 @@ // Enable Cache Writeback `ifndef DCACHE_WRITEBACK -`define DCACHE_WRITEBACK 0 +`define DCACHE_WRITEBACK 1 `endif // LMEM Configurable Knobs //////////////////////////////////////////////////// @@ -601,7 +601,7 @@ // Enable Cache Writeback `ifndef L2_WRITEBACK -`define L2_WRITEBACK 0 +`define L2_WRITEBACK 1 `endif // L3cache Configurable Knobs ///////////////////////////////////////////////// @@ -647,7 +647,7 @@ // Enable Cache Writeback `ifndef L3_WRITEBACK -`define L3_WRITEBACK 0 +`define L3_WRITEBACK 1 `endif // ISA Extensions ///////////////////////////////////////////////////////////// diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index 83800d536..94ea614e3 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -140,6 +140,7 @@ module VX_cache_bank #( wire [NUM_WAYS-1:0] flush_way_st0; wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1; + wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1; wire rw_sel, rw_st0, rw_st1; wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1; wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1; @@ -291,6 +292,8 @@ module VX_cache_bank #( wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0]; + assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0]; + wire [NUM_WAYS-1:0] evict_way_st0; wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0; @@ -338,7 +341,7 @@ module VX_cache_bank #( assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0; - assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, addr_st0[`CS_LINE_SEL_BITS-1:0]} : addr_st0; + assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0; VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1), @@ -381,16 +384,20 @@ module VX_cache_bank #( wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1; wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1; + assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0]; + `UNUSED_VAR (do_write_miss_st1) // ensure mshr replay always get a hit `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay")); - // detect BRAM's read-during-write hazard + // both tag and data stores use BRAM with no read-during-write protection. + // we ned to stall the pipeline to prevent read-after-write hazards. assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceeding write - always @(posedge clk) begin // stall reads following writes to same address - rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (addr_st0 == addr_st1) + always @(posedge clk) begin + // stall reads following writes to same line address + rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1) && ~rdw_hazard3_st1; // release pipeline stall end @@ -588,20 +595,21 @@ module VX_cache_bank #( wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1; wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1; wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1; - `UNUSED_VAR (do_writeback_st1) if (WRITEBACK) begin if (DIRTY_BYTES) begin - // ensure dirty bytes are valid + // ensure dirty bytes match the tag info wire has_dirty_bytes = (| dirty_byteen_st1); - `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes")); + `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID))); end assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1) - || do_writeback_st1); + || do_writeback_st1) + && ~rdw_hazard3_st1; end else begin - `UNUSED_VAR (evict_dirty_st1) + `UNUSED_VAR (do_writeback_st1) assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1) - || do_creq_wr_st1); + || do_creq_wr_st1) + && ~rdw_hazard3_st1; end assign mreq_queue_pop = mem_req_valid && mem_req_ready; diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index ecfefa26d..d1dd8050f 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -73,16 +73,17 @@ module VX_cache_data #( wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata; + wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata; wire [`LOG2UP(NUM_WAYS)-1:0] way_idx; if (WRITEBACK) begin if (DIRTY_BYTES) begin - wire [`CLOG2(`CS_LINES_PER_BANK * NUM_WAYS)-1:0] way_addr; - if (NUM_WAYS > 1) begin - assign way_addr = {line_sel, way_idx}; - end else begin - assign way_addr = line_sel; + wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata; + wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata; + + for (genvar i = 0; i < NUM_WAYS; ++i) begin + wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]); + assign bs_wdata[i] = init ? '0 : (way_sel[i] ? wdata : bs_rdata[i]); end VX_sp_ram #( @@ -93,21 +94,23 @@ module VX_cache_data #( .read (write || fill || flush), .write (init || write || fill || flush), `UNUSED_PIN (wren), - .addr (way_addr), - .wdata (write ? (dirty_byteen | write_byteen) : ((init || fill || flush) ? '0 : dirty_byteen)), - .rdata (dirty_byteen) + .addr (line_sel), + .wdata (bs_wdata), + .rdata (bs_rdata) ); + + assign dirty_byteen = bs_rdata[way_idx]; end else begin assign dirty_byteen = {LINE_SIZE{1'b1}}; end - wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] dirty_data_w; + wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] flipped_rdata; for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin for (genvar j = 0; j < NUM_WAYS; ++j) begin - assign dirty_data_w[j][i] = rdata[i][j]; + assign flipped_rdata[j][i] = line_rdata[i][j]; end end - assign dirty_data = dirty_data_w[way_idx]; + assign dirty_data = flipped_rdata[way_idx]; end else begin assign dirty_byteen = '0; assign dirty_data = '0; @@ -116,28 +119,25 @@ module VX_cache_data #( // order the data layout to perform ways multiplexing last. // this allows converting way index to binary in parallel with BRAM read. - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata; - wire [BYTEENW-1:0] wren; + wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata; + wire [BYTEENW-1:0] line_wren; if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin - for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin - assign wdata[i] = (fill || !WRITE_ENABLE) ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{write_data[i]}}; - end - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w; for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin for (genvar j = 0; j < NUM_WAYS; ++j) begin + assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i]; assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i]) & {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}}; end end - assign wren = wren_w; + assign line_wren = wren_w; end else begin `UNUSED_VAR (write) `UNUSED_VAR (write_byteen) `UNUSED_VAR (write_data) - assign wdata = fill_data; - assign wren = fill; + assign line_wdata = fill_data; + assign line_wren = fill; end VX_onehot_encoder #( @@ -151,6 +151,8 @@ module VX_cache_data #( wire line_read = (read && ~stall) || (WRITEBACK && (fill || flush)); + wire line_write = write || fill; + VX_sp_ram #( .DATAW (`CS_LINE_WIDTH * NUM_WAYS), .SIZE (`CS_LINES_PER_BANK), @@ -160,19 +162,19 @@ module VX_cache_data #( ) data_store ( .clk (clk), .read (line_read), - .write (write || fill), - .wren (wren), + .write (line_write), + .wren (line_wren), .addr (line_sel), - .wdata (wdata), - .rdata (rdata) + .wdata (line_wdata), + .rdata (line_rdata) ); wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata; if (`CS_WORDS_PER_LINE > 1) begin - assign per_way_rdata = rdata[wsel]; + assign per_way_rdata = line_rdata[wsel]; end else begin `UNUSED_VAR (wsel) - assign per_way_rdata = rdata; + assign per_way_rdata = line_rdata; end assign read_data = per_way_rdata[way_idx]; diff --git a/hw/rtl/cache/VX_cache_tags.sv b/hw/rtl/cache/VX_cache_tags.sv index d7a948c62..bdb4479ce 100644 --- a/hw/rtl/cache/VX_cache_tags.sv +++ b/hw/rtl/cache/VX_cache_tags.sv @@ -143,7 +143,7 @@ module VX_cache_tags #( assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]); end - assign evict_dirty = (| read_dirty); + assign evict_dirty = | (read_dirty & evict_way); `ifdef DBG_TRACE_CACHE wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel}; @@ -162,12 +162,12 @@ module VX_cache_tags #( if (write) `TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); else - `TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); + `TRACE(3, ("%d: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); end else begin if (write) - `TRACE(3, ("%d: %s read-miss: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); + `TRACE(3, ("%d: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)); else - `TRACE(3, ("%d: %s read-miss: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); + `TRACE(3, ("%d: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)); end end end diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 17d602a78..a2e323772 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -314,7 +314,7 @@ module VX_dp_ram #( end else begin assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; if (RW_ASSERT) begin - `RUNTIME_ASSERT(~read || (rdata == ram[raddr]), ("read after write mismatch")); + `RUNTIME_ASSERT(~read || (rdata == ram[raddr]), ("read after write hazard")); end end end From edf960d9ed18fd183d4dd9b3bb7cf42e945e0930 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 00:58:31 -0700 Subject: [PATCH 25/89] writeback cache fixes --- hw/rtl/cache/VX_cache_bank.sv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index 94ea614e3..885fdb943 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -215,11 +215,12 @@ module VX_cache_bank #( && ~pipe_stall; assign mem_rsp_ready = fill_grant + && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions && ~rdw_hazard2_sel && ~pipe_stall; assign line_flush_ready = flush_grant - && ~mreq_queue_alm_full + && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions && ~rdw_hazard2_sel && ~pipe_stall; From 99cbae182084c40d03120474e4ed01057f7a757e Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 01:55:32 -0700 Subject: [PATCH 26/89] writeback cache deadlock fix --- hw/rtl/VX_cluster.sv | 2 +- hw/rtl/VX_socket.sv | 2 +- hw/rtl/Vortex.sv | 2 +- hw/rtl/cache/VX_cache.sv | 10 +++++++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index c84aadcb7..714e69dd4 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -96,7 +96,7 @@ module VX_cluster import VX_gpu_pkg::*; #( .CRSQ_SIZE (`L2_CRSQ_SIZE), .MSHR_SIZE (`L2_MSHR_SIZE), .MRSQ_SIZE (`L2_MRSQ_SIZE), - .MREQ_SIZE (`L2_MREQ_SIZE), + .MREQ_SIZE (`L2_WRITEBACK ? `L2_MSHR_SIZE : `L2_MREQ_SIZE), .TAG_WIDTH (L2_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L2_WRITEBACK), diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index 8d7b86160..df2b284eb 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -145,7 +145,7 @@ module VX_socket import VX_gpu_pkg::*; #( .CRSQ_SIZE (`DCACHE_CRSQ_SIZE), .MSHR_SIZE (`DCACHE_MSHR_SIZE), .MRSQ_SIZE (`DCACHE_MRSQ_SIZE), - .MREQ_SIZE (`DCACHE_MREQ_SIZE), + .MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE), .TAG_WIDTH (DCACHE_TAG_WIDTH), .UUID_WIDTH (`UUID_WIDTH), .WRITE_ENABLE (1), diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index d3a308009..978259101 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -80,7 +80,7 @@ module Vortex import VX_gpu_pkg::*; ( .CRSQ_SIZE (`L3_CRSQ_SIZE), .MSHR_SIZE (`L3_MSHR_SIZE), .MRSQ_SIZE (`L3_MRSQ_SIZE), - .MREQ_SIZE (`L3_MREQ_SIZE), + .MREQ_SIZE (`L3_WRITEBACK ? `L3_MSHR_SIZE : `L3_MREQ_SIZE), .TAG_WIDTH (L2_MEM_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L3_WRITEBACK), diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index dfad8baad..d1b4a6d17 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -72,9 +72,13 @@ module VX_cache import VX_gpu_pkg::*; #( VX_mem_bus_if.master mem_bus_if ); - `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter")) - `STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter")) - `STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter")) + `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2")) + `STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable")) + `STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback")) + + // In writeback mode, memory fill response may issue a new memory request to handle evicted blocks. + // We need to ensure that the memory request queue never fills up to avoid deadlock. + `STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE")) localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS); localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS); From f46b764748092404abf58ec4130fd91e551418c7 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 01:59:50 -0700 Subject: [PATCH 27/89] minor update --- hw/rtl/core/VX_sfu_unit.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv index add229893..5ef4211d0 100644 --- a/hw/rtl/core/VX_sfu_unit.sv +++ b/hw/rtl/core/VX_sfu_unit.sv @@ -179,7 +179,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( VX_gather_unit #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), - .OUT_BUF (1) + .OUT_BUF (3) ) gather_unit ( .clk (clk), .reset (reset), From 54e6421854d4db8ecce21e3c8a78e1d9942e8c6e Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 02:42:25 -0700 Subject: [PATCH 28/89] minor update --- Makefile.in | 8 ++++---- config.mk.in | 2 +- configure | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile.in b/Makefile.in index 82a2ebdb5..264738aca 100644 --- a/Makefile.in +++ b/Makefile.in @@ -36,10 +36,10 @@ clean: clean-build $(MAKE) -C $(VORTEX_HOME)/third_party clean # Install setup -KERNEL_INC_DST = $(PREFIX)/kernel/include -KERNEL_LIB_DST = $(PREFIX)/kernel/lib$(XLEN) -RUNTIME_INC_DST = $(PREFIX)/runtime/include -RUNTIME_LIB_DST = $(PREFIX)/runtime/lib +KERNEL_INC_DST = $(INSTALLDIR)/kernel/include +KERNEL_LIB_DST = $(INSTALLDIR)/kernel/lib$(XLEN) +RUNTIME_INC_DST = $(INSTALLDIR)/runtime/include +RUNTIME_LIB_DST = $(INSTALLDIR)/runtime/lib KERNEL_HEADERS = $(wildcard $(VORTEX_HOME)/kernel/include/*.h) KERNEL_LIBS = $(wildcard kernel/*.a) diff --git a/config.mk.in b/config.mk.in index c1f67e5a9..81339f195 100644 --- a/config.mk.in +++ b/config.mk.in @@ -19,7 +19,7 @@ TOOLDIR ?= @TOOLDIR@ OSVERSION ?= @OSVERSION@ -PREFIX ?= @PREFIX@ +INSTALLDIR ?= @INSTALLDIR@ LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex diff --git a/configure b/configure index 643c27150..62975784b 100755 --- a/configure +++ b/configure @@ -63,7 +63,7 @@ copy_files() { filename_no_ext="${filename%.in}" dest_file="$dest_dir/$filename_no_ext" mkdir -p "$dest_dir" - sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@PREFIX@|$PREFIX|g" "$file" > "$dest_file" + sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g" "$file" > "$dest_file" # apply permissions to bash scripts read -r firstline < "$dest_file" if [[ "$firstline" =~ ^#!.*bash ]]; then From 047960ac4d35d6830d8565e828f2958ecf1a7954 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 02:51:12 -0700 Subject: [PATCH 29/89] minor update --- hw/rtl/cache/VX_bank_flush.sv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/rtl/cache/VX_bank_flush.sv b/hw/rtl/cache/VX_bank_flush.sv index 69afd060a..9a9e1796a 100644 --- a/hw/rtl/cache/VX_bank_flush.sv +++ b/hw/rtl/cache/VX_bank_flush.sv @@ -37,11 +37,11 @@ module VX_bank_flush #( input wire mshr_empty ); // ways interation is only needed when eviction is enabled - parameter CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0); + localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0); - parameter STATE_IDLE = 2'd0; - parameter STATE_INIT = 2'd1; - parameter STATE_FLUSH = 2'd2; + localparam STATE_IDLE = 2'd0; + localparam STATE_INIT = 2'd1; + localparam STATE_FLUSH = 2'd2; reg [CTR_WIDTH-1:0] counter_r; reg [1:0] state_r, state_n; From 6e55840a3296cae38ea923c42fcc81ccb103e97b Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 03:32:49 -0700 Subject: [PATCH 30/89] minor update --- hw/rtl/VX_config.vh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index f43eb2581..804715aad 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -537,7 +537,7 @@ // Enable Cache Writeback `ifndef DCACHE_WRITEBACK -`define DCACHE_WRITEBACK 1 +`define DCACHE_WRITEBACK 0 `endif // LMEM Configurable Knobs //////////////////////////////////////////////////// @@ -601,7 +601,7 @@ // Enable Cache Writeback `ifndef L2_WRITEBACK -`define L2_WRITEBACK 1 +`define L2_WRITEBACK 0 `endif // L3cache Configurable Knobs ///////////////////////////////////////////////// @@ -647,7 +647,7 @@ // Enable Cache Writeback `ifndef L3_WRITEBACK -`define L3_WRITEBACK 1 +`define L3_WRITEBACK 0 `endif // ISA Extensions ///////////////////////////////////////////////////////////// From abf8d2c51ae891d6d21693fb779178574dadc578 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 05:59:50 -0700 Subject: [PATCH 31/89] minor update --- ci/trace_csv.py | 1 + hw/rtl/cache/VX_cache_flush.sv | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/trace_csv.py b/ci/trace_csv.py index c3113de85..540e1898c 100755 --- a/ci/trace_csv.py +++ b/ci/trace_csv.py @@ -60,6 +60,7 @@ def parse_simx(log_lines): instr_data["destination"] = re.search(destination_pattern, line).group(1) except Exception as e: print("Error at line {}: {}".format(lineno, e)) + instr_data = None if instr_data: entries.append(instr_data) return entries diff --git a/hw/rtl/cache/VX_cache_flush.sv b/hw/rtl/cache/VX_cache_flush.sv index 70fbf0584..7f158850c 100644 --- a/hw/rtl/cache/VX_cache_flush.sv +++ b/hw/rtl/cache/VX_cache_flush.sv @@ -76,7 +76,6 @@ module VX_cache_flush #( `UNUSED_VAR (bank_req_fire) end - reg [1:0] state, state_n; reg [NUM_BANKS-1:0] flush_done, flush_done_n; From 2bc8a881b6dc034dc68aebf20237ec3cfeb9d466 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 12:05:36 -0700 Subject: [PATCH 32/89] fixed trace log formatting --- sim/common/mem.cpp | 10 +++++----- sim/opaesim/opae_sim.cpp | 6 +++--- sim/simx/core.cpp | 4 ++-- sim/simx/dcrs.cpp | 8 ++++---- sim/simx/decode.cpp | 10 +++++----- sim/simx/emulator.cpp | 26 +++++++++++++------------- sim/simx/execute.cpp | 20 ++++++++++---------- sim/simx/instr_trace.h | 8 ++++---- sim/simx/local_mem.cpp | 4 ++-- sim/simx/types.h | 20 ++++++++++---------- 10 files changed, 58 insertions(+), 58 deletions(-) diff --git a/sim/common/mem.cpp b/sim/common/mem.cpp index ed4bcc522..a0f1884d1 100644 --- a/sim/common/mem.cpp +++ b/sim/common/mem.cpp @@ -48,7 +48,7 @@ void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) { if ((addr & (wordSize_-1)) || (addr_end & (wordSize_-1)) || (addr_end <= contents_.size())) { - std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n"; + std::cout << "lookup of 0x" << std::hex << (addr_end-1) << std::dec << " failed.\n"; throw BadAddress(); } @@ -63,7 +63,7 @@ void RamMemDevice::write(const void* data, uint64_t addr, uint64_t size) { if ((addr & (wordSize_-1)) || (addr_end & (wordSize_-1)) || (addr_end <= contents_.size())) { - std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n"; + std::cout << "lookup of 0x" << std::hex << (addr_end-1) << std::dec << " failed.\n"; throw BadAddress(); } @@ -104,7 +104,7 @@ void MemoryUnit::ADecoder::map(uint64_t start, uint64_t end, MemDevice &md) { void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) { mem_accessor_t ma; if (!this->lookup(addr, size, &ma)) { - std::cout << "lookup of 0x" << std::hex << addr << " failed.\n"; + std::cout << "lookup of 0x" << std::hex << addr << std::dec << " failed.\n"; throw BadAddress(); } ma.md->read(data, ma.addr, size); @@ -113,7 +113,7 @@ void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) { void MemoryUnit::ADecoder::write(const void* data, uint64_t addr, uint64_t size) { mem_accessor_t ma; if (!this->lookup(addr, size, &ma)) { - std::cout << "lookup of 0x" << std::hex << addr << " failed.\n"; + std::cout << "lookup of 0x" << std::hex << addr << std::dec << " failed.\n"; throw BadAddress(); } ma.md->write(data, ma.addr, size); @@ -252,7 +252,7 @@ bool ACLManager::check(uint64_t addr, uint64_t size, int flags) const { while (it != acl_map_.end() && it->first < end) { if (it->second.end > addr) { if ((it->second.flags & flags) != flags) { - std::cout << "Memory access violation from 0x" << std::hex << addr << " to 0x" << end << ", curent flags=" << it->second.flags << ", access flags=" << flags << std::endl; + std::cout << "Memory access violation from 0x" << std::hex << addr << " to 0x" << end << ", curent flags=" << it->second.flags << ", access flags=" << flags << std::dec << std::endl; return false; // Overlapping entry is missing at least one required flag bit } addr = it->second.end; // Move to the end of the current matching range diff --git a/sim/opaesim/opae_sim.cpp b/sim/opaesim/opae_sim.cpp index d6e06721d..9d43ea595 100644 --- a/sim/opaesim/opae_sim.cpp +++ b/sim/opaesim/opae_sim.cpp @@ -380,7 +380,7 @@ private: device_->vcp2af_sRxPort_c0_hdr_resp_type = 0; memcpy(device_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE); device_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; - /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata); + /*printf("%0ld: [sim] CCI Rd Rsp: addr=0x%lx, mdata=0x%x, data=0x", timestamp, cci_rd_it->addr, cci_rd_it->mdata); for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]); printf("\n");*/ @@ -398,7 +398,7 @@ private: cci_req.mdata = device_->af2cp_sTxPort_c0_hdr_mdata; auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE); - //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); + //printf("%0ld: [sim] CCI Rd Req: addr=0x%lx, mdata=0x%x\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); cci_reads_.emplace_back(cci_req); } @@ -453,7 +453,7 @@ private: } } - /*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, byte_addr); + /*printf("%0ld: [sim] MEM Wr Req: bank=%d, 0x%x, data=0x", timestamp, b, byte_addr); for (int i = 0; i < MEM_BLOCK_SIZE; i++) { printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); } diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 75aa47670..04f6abf57 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -215,7 +215,7 @@ void Core::fetch() { auto& mem_rsp = icache_rsp_port.front(); auto trace = pending_icache_.at(mem_rsp.tag); decode_latch_.push(trace); - DT(3, "icache-rsp: addr=0x" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); + DT(3, "icache-rsp: addr=0x" << std::hex << trace->PC << ", tag=0x" << mem_rsp.tag << std::dec << ", " << *trace); pending_icache_.release(mem_rsp.tag); icache_rsp_port.pop(); --pending_ifetches_; @@ -232,7 +232,7 @@ void Core::fetch() { mem_req.cid = trace->cid; mem_req.uuid = trace->uuid; icache_req_ports.at(0).push(mem_req, 2); - DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); + DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=0x" << mem_req.tag << std::dec << ", " << *trace); fetch_latch_.pop(); ++perf_stats_.ifetches; ++pending_ifetches_; diff --git a/sim/simx/dcrs.cpp b/sim/simx/dcrs.cpp index bce4639ba..242d630eb 100644 --- a/sim/simx/dcrs.cpp +++ b/sim/simx/dcrs.cpp @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,13 +16,13 @@ using namespace vortex; -void DCRS::write(uint32_t addr, uint32_t value) { +void DCRS::write(uint32_t addr, uint32_t value) { if (addr >= VX_DCR_BASE_STATE_BEGIN && addr < VX_DCR_BASE_STATE_END) { base_dcrs.write(addr, value); return; } - std::cout << std::hex << "Error: invalid global DCR addr=0x" << addr << std::endl; + std::cout << "Error: invalid global DCR addr=0x" << std::hex << addr << std::dec << std::endl; std::abort(); } \ No newline at end of file diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp index 6cefe378f..dba57c4ef 100644 --- a/sim/simx/decode.cpp +++ b/sim/simx/decode.cpp @@ -416,19 +416,19 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) { int sep = 0; if (instr.getRDType() != RegType::None) { if (sep++ != 0) { os << ", "; } else { os << " "; } - os << instr.getRDType() << std::dec << instr.getRDest(); + os << instr.getRDType() << instr.getRDest(); } for (uint32_t i = 0; i < instr.getNRSrc(); ++i) { if (sep++ != 0) { os << ", "; } else { os << " "; } if (instr.getRSType(i) != RegType::None) { - os << instr.getRSType(i) << std::dec << instr.getRSrc(i); + os << instr.getRSType(i) << instr.getRSrc(i); } else { - os << "0x" << std::hex << instr.getRSrc(0); + os << "0x" << std::hex << instr.getRSrc(0) << std::dec; } } if (instr.hasImm()) { if (sep++ != 0) { os << ", "; } else { os << " "; } - os << "0x" << std::hex << instr.getImm(); + os << "0x" << std::hex << instr.getImm() << std::dec; } return os; } @@ -450,7 +450,7 @@ std::shared_ptr Emulator::decode(uint32_t code) const { auto op_it = sc_instTable.find(op); if (op_it == sc_instTable.end()) { - std::cout << std::hex << "Error: invalid opcode: 0x" << static_cast(op) << std::endl; + std::cout << "Error: invalid opcode: 0x" << std::hex << static_cast(op) << std::dec << std::endl; return nullptr; } diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 8ef322beb..7ed9a10f9 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -138,7 +138,7 @@ instr_trace_t* Emulator::step() { // process pending wspawn if (wspawn_.valid && active_warps_.count() == 1) { - DP(3, "*** Activate " << (wspawn_.num_warps-1) << " warps at PC: " << std::hex << wspawn_.nextPC); + DP(3, "*** Activate " << (wspawn_.num_warps-1) << " warps at PC: " << std::hex << wspawn_.nextPC << std::dec); for (uint32_t i = 1; i < wspawn_.num_warps; ++i) { auto& warp = warps_.at(i); warp.PC = wspawn_.nextPC; @@ -185,11 +185,11 @@ instr_trace_t* Emulator::step() { // Decode auto instr = this->decode(instr_code); if (!instr) { - std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << warp.PC << " (#" << std::dec << uuid << ")" << std::endl; + std::cout << "Error: invalid instruction 0x" << std::hex << instr_code << ", at PC=0x" << warp.PC << " (#" << std::dec << uuid << ")" << std::endl; std::abort(); } - DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr); + DP(1, "Instr 0x" << std::hex << instr_code << ": " << std::dec << *instr); // Create trace auto trace = new instr_trace_t(uuid, arch_); @@ -199,17 +199,17 @@ instr_trace_t* Emulator::step() { DP(5, "Register state:"); for (uint32_t i = 0; i < MAX_NUM_REGS; ++i) { - DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); + DPN(5, " %r" << std::setfill('0') << std::setw(2) << i << ':' << std::hex); // Integer register file for (uint32_t j = 0; j < arch_.num_threads(); ++j) { - DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << warp.ireg_file.at(j).at(i) << std::setfill(' ') << ' '); + DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << warp.ireg_file.at(j).at(i) << std::setfill(' ') << ' '); } DPN(5, '|'); // Floating point register file for (uint32_t j = 0; j < arch_.num_threads(); ++j) { - DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << warp.freg_file.at(j).at(i) << std::setfill(' ') << ' '); + DPN(5, ' ' << std::setfill('0') << std::setw(16) << warp.freg_file.at(j).at(i) << std::setfill(' ') << ' '); } - DPN(5, std::endl); + DPN(5, std::dec << std::endl); } return trace; @@ -292,7 +292,7 @@ void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) { mmu_.read(data, addr, size, 0); } - DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl); + DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl); } void Emulator::dcache_write(const void* data, uint64_t addr, uint32_t size) { @@ -307,7 +307,7 @@ void Emulator::dcache_write(const void* data, uint64_t addr, uint32_t size) { mmu_.write(data, addr, size, 0); } } - DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl); + DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl); } void Emulator::dcache_amo_reserve(uint64_t addr) { @@ -333,7 +333,7 @@ void Emulator::writeToStdOut(const void* data, uint64_t addr, uint32_t size) { char c = *(char*)data; ss_buf << c; if (c == '\n') { - std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush; + std::cout << "#" << tid << ": " << ss_buf.str() << std::flush; ss_buf.str(""); } } @@ -458,12 +458,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { } } break; default: { - std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl; + std::cout << "Error: invalid MPM CLASS: value=" << perf_class << std::endl; std::abort(); } break; } } else { - std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl; + std::cout << "Error: invalid CSR read addr=0x"<< std::hex << addr << std::dec << std::endl; std::abort(); } } @@ -498,7 +498,7 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) { case VX_CSR_MCAUSE: break; default: { - std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl; + std::cout << "Error: invalid CSR write addr=0x" << std::hex << addr << ", value=0x" << value << std::dec << std::endl; std::abort(); } } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index a037d995c..db098726b 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -102,7 +102,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { auto reg = instr.getRSrc(i); switch (type) { case RegType::Integer: - DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={"); + DPH(2, "Src" << i << " Reg: " << type << reg << "={"); for (uint32_t t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!warp.tmask.test(t)) { @@ -110,12 +110,12 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { continue; } rsdata[t][i].u = warp.ireg_file.at(t)[reg]; - DPN(2, "0x" << std::hex << rsdata[t][i].i); + DPN(2, "0x" << std::hex << rsdata[t][i].i << std::dec); } DPN(2, "}" << std::endl); break; case RegType::Float: - DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={"); + DPH(2, "Src" << i << " Reg: " << type << reg << "={"); for (uint32_t t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!warp.tmask.test(t)) { @@ -123,7 +123,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { continue; } rsdata[t][i].u64 = warp.freg_file.at(t)[reg]; - DPN(2, "0x" << std::hex << rsdata[t][i].f); + DPN(2, "0x" << std::hex << rsdata[t][i].f << std::dec); } DPN(2, "}" << std::endl); break; @@ -633,7 +633,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { all_taken = curr_taken; } else { if (all_taken != curr_taken) { - std::cout << "divergent branch! PC=0x" << std::hex << warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::flush; + std::cout << "divergent branch! PC=0x" << std::hex << warp.PC << std::dec << " (#" << trace->uuid << ")\n" << std::flush; std::abort(); } } @@ -1338,7 +1338,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { bool is_divergent = then_tmask.any() && else_tmask.any(); if (is_divergent) { if (stack_size == ipdom_size_) { - std::cout << "IPDOM stack is full! size=" << std::dec << stack_size << ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::flush; + std::cout << "IPDOM stack is full! size=" << stack_size << ", PC=0x" << std::hex << warp.PC << std::dec << " (#" << trace->uuid << ")\n" << std::flush; std::abort(); } // set new thread mask to the larger set @@ -1425,7 +1425,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { switch (type) { case RegType::Integer: if (rdest) { - DPH(2, "Dest Reg: " << type << std::dec << rdest << "={"); + DPH(2, "Dest Reg: " << type << rdest << "={"); for (uint32_t t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!warp.tmask.test(t)) { @@ -1433,7 +1433,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { continue; } warp.ireg_file.at(t)[rdest] = rddata[t].i; - DPN(2, "0x" << std::hex << rddata[t].i); + DPN(2, "0x" << std::hex << rddata[t].i << std::dec); } DPN(2, "}" << std::endl); trace->dst_reg = {type, rdest}; @@ -1444,7 +1444,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } break; case RegType::Float: - DPH(2, "Dest Reg: " << type << std::dec << rdest << "={"); + DPH(2, "Dest Reg: " << type << rdest << "={"); for (uint32_t t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!warp.tmask.test(t)) { @@ -1452,7 +1452,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { continue; } warp.freg_file.at(t)[rdest] = rddata[t].u64; - DPN(2, "0x" << std::hex << rddata[t].f); + DPN(2, "0x" << std::hex << rddata[t].f << std::dec); } DPN(2, "}" << std::endl); trace->dst_reg = {type, rdest}; diff --git a/sim/simx/instr_trace.h b/sim/simx/instr_trace.h index 7f6b37580..bbf4eab59 100644 --- a/sim/simx/instr_trace.h +++ b/sim/simx/instr_trace.h @@ -146,14 +146,14 @@ inline std::ostream &operator<<(std::ostream &os, const instr_trace_t& trace) { for (uint32_t i = 0, n = trace.arch.num_threads(); i < n; ++i) { os << trace.tmask.test(i); } - os << ", PC=0x" << std::hex << trace.PC; + os << ", PC=0x" << std::hex << trace.PC << std::dec; os << ", wb=" << trace.wb; if (trace.dst_reg.type != RegType::None) { - os << ", rd=" << trace.dst_reg.type << std::dec << trace.dst_reg.idx; + os << ", rd=" << trace.dst_reg.type << trace.dst_reg.idx; } for (uint32_t i = 0; i < trace.src_regs.size(); ++i) { if (trace.src_regs[i].type != RegType::None) { - os << ", rs" << i << "=" << trace.src_regs[i].type << std::dec << trace.src_regs[i].idx; + os << ", rs" << i << "=" << trace.src_regs[i].type << trace.src_regs[i].idx; } } os << ", ex=" << trace.fu_type; @@ -162,7 +162,7 @@ inline std::ostream &operator<<(std::ostream &os, const instr_trace_t& trace) { os << ", sop=" << trace.sop; os << ", eop=" << trace.eop; } - os << " (#" << std::dec << trace.uuid << ")"; + os << " (#" << trace.uuid << ")"; return os; } diff --git a/sim/simx/local_mem.cpp b/sim/simx/local_mem.cpp index 195fe5300..1bab3fccb 100644 --- a/sim/simx/local_mem.cpp +++ b/sim/simx/local_mem.cpp @@ -52,13 +52,13 @@ public: void read(void* data, uint64_t addr, uint32_t size) { auto s_addr = to_local_addr(addr); - DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::endl); + DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl); ram_.read(data, s_addr, size); } void write(const void* data, uint64_t addr, uint32_t size) { auto s_addr = to_local_addr(addr); - DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::endl); + DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl); ram_.write(data, s_addr, size); } diff --git a/sim/simx/types.h b/sim/simx/types.h index 385015cc9..b452dd379 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -264,14 +264,14 @@ inline std::ostream &operator<<(std::ostream &os, const LsuReq& req) { for (size_t i = 0; i < req.mask.size(); ++i) { os << "addr" << i << "="; if (req.mask.test(i)) { - os << "0x" << std::hex << req.addrs.at(i); + os << "0x" << std::hex << req.addrs.at(i) << std::dec; } else { os << "-"; } os << ", "; } - os << std::dec << "tag=" << req.tag << ", cid=" << req.cid; - os << " (#" << std::dec << req.uuid << ")"; + os << "tag=0x" << std::hex << req.tag << std::dec << ", cid=" << req.cid; + os << " (#" << req.uuid << ")"; return os; } @@ -292,8 +292,8 @@ struct LsuRsp { }; inline std::ostream &operator<<(std::ostream &os, const LsuRsp& rsp) { - os << "mask=" << rsp.mask << ", tag=" << rsp.tag << ", cid=" << rsp.cid; - os << " (#" << std::dec << rsp.uuid << ")"; + os << "mask=" << rsp.mask << ", tag=0x" << std::hex << rsp.tag << std::dec << ", cid=" << rsp.cid; + os << " (#" << rsp.uuid << ")"; return os; } @@ -324,9 +324,9 @@ struct MemReq { inline std::ostream &operator<<(std::ostream &os, const MemReq& req) { os << "rw=" << req.write << ", "; - os << "addr=0x" << std::hex << req.addr << ", type=" << req.type; - os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid; - os << " (#" << std::dec << req.uuid << ")"; + os << "addr=0x" << std::hex << req.addr << std::dec << ", type=" << req.type; + os << ", tag=0x" << std::hex << req.tag << std::dec << ", cid=" << req.cid; + os << " (#" << req.uuid << ")"; return os; } @@ -345,8 +345,8 @@ struct MemRsp { }; inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) { - os << "tag=" << rsp.tag << ", cid=" << rsp.cid; - os << " (#" << std::dec << rsp.uuid << ")"; + os << "tag=0x" << std::hex << rsp.tag << std::dec << ", cid=" << rsp.cid; + os << " (#" << rsp.uuid << ")"; return os; } From 029609b3fde3dc0fbf5105a9c38c8c9cfd2cc731 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 14:47:08 -0700 Subject: [PATCH 33/89] disable atexit() support, not needed for static kernels. --- kernel/src/vx_start.S | 4 --- kernel/src/vx_syscalls.c | 67 +++------------------------------------- 2 files changed, 4 insertions(+), 67 deletions(-) diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S index 630856f3b..af0ef1428 100644 --- a/kernel/src/vx_start.S +++ b/kernel/src/vx_start.S @@ -51,10 +51,6 @@ _start: # la t0, trap_entry # csrw mtvec, t0 - # register global termination functions - la a0, __libc_fini_array - call atexit - # run global initialization functions call __libc_init_array diff --git a/kernel/src/vx_syscalls.c b/kernel/src/vx_syscalls.c index 6ff9fbb97..4759fe622 100644 --- a/kernel/src/vx_syscalls.c +++ b/kernel/src/vx_syscalls.c @@ -119,70 +119,11 @@ void __libc_fini_array (void) { } #endif -/* -#define MAX_CORES 64 -volatile int g_cxa_locks[MAX_CORES] = {0}; -*/ - -void __cxa_lock() { - /*int core_id = vx_core_id(); - g_cxa_locks[core_id] = 1; - vx_fence(); - for (int i = 1; i < MAX_CORES; ++i) { - int other = (core_id + i) % MAX_CORES; - while (g_cxa_locks[other]) { - vx_fence(); // cache coherence not supported, so we need to flush the caches - } - }*/ -} - -void __cxa_unlock() { - /*vx_fence(); - int core_id = vx_core_id(); - g_cxa_locks[core_id] = 0;*/ -} - -#define MAX_FEXITS 64 - -typedef struct { - void (*f[MAX_FEXITS])(void*); - void *a[MAX_FEXITS]; -} fexit_list_t; - -static fexit_list_t g_fexit_list; -static int g_num_fexits = 0; - +// This function will be called by LIBC at program exit. +// Since this platform only support statically linked programs, +// it is not required to support LIBC's exit functions registration via atexit(). void __funcs_on_exit() { - void (*func)(void *), *arg; - fexit_list_t* fexit_list = &g_fexit_list; - for (int i = 0; i < g_num_fexits; ++i) { - func = fexit_list->f[i]; - arg = fexit_list->a[i]; - func(arg); - } -} - -void __cxa_finalize(void *dso) {} - -int __cxa_atexit(void (*func)(void *), void *arg, void *dso) { - __cxa_lock(); - int num_fexits = g_num_fexits; - if (num_fexits >= MAX_FEXITS) - return -1; - fexit_list_t* fexit_list = &g_fexit_list; - fexit_list->f[num_fexits] = func; - fexit_list->a[num_fexits] = arg; - g_num_fexits = num_fexits + 1; - __cxa_unlock(); - return 0; -} - -static void call(void *p) { - ((void (*)(void))(uintptr_t)p)(); -} - -int atexit(void (*func)(void)) { - return __cxa_atexit(call, (void*)(uintptr_t)func, 0); + __libc_fini_array(); } #ifdef __cplusplus From e1c5b5277e0bc9fb12ffba843fc24408ed271405 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 17:55:21 -0700 Subject: [PATCH 34/89] minor update --- hw/rtl/core/VX_operands.sv | 14 +++--- hw/rtl/libs/VX_stream_arb.sv | 86 ++++++++++++++++++------------------ 2 files changed, 49 insertions(+), 51 deletions(-) diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 04a12e4c6..3773fbfca 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -43,8 +43,8 @@ module VX_operands import VX_gpu_pkg::*; #( localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS); localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS; - localparam METADATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS; - localparam DATAW = `UUID_WIDTH + METADATAW + 3 * `NUM_THREADS * `XLEN; + localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH; + localparam DATAW = META_DATAW + 3 * `NUM_THREADS * `XLEN; localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS); localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS; localparam XLEN_SIZE = `XLEN / 8; @@ -69,8 +69,7 @@ module VX_operands import VX_gpu_pkg::*; #( wire pipe_in_ready; reg pipe_out_valid; wire pipe_out_ready; - reg [`UUID_WIDTH-1:0] pipe_out_uuid; - reg [METADATAW-1:0] pipe_out_data; + reg [META_DATAW-1:0] pipe_out_data; reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data, src_data_n; reg [NUM_SRC_REGS-1:0] data_fetched; @@ -174,7 +173,6 @@ module VX_operands import VX_gpu_pkg::*; #( end end if (~pipe_stall) begin - pipe_out_uuid <= scoreboard_if.data.uuid; pipe_out_data <= { scoreboard_if.data.wis, scoreboard_if.data.tmask, @@ -183,7 +181,8 @@ module VX_operands import VX_gpu_pkg::*; #( scoreboard_if.data.ex_type, scoreboard_if.data.op_type, scoreboard_if.data.op_args, - scoreboard_if.data.rd + scoreboard_if.data.rd, + scoreboard_if.data.uuid }; has_collision <= has_collision_n; gpr_rd_addr <= gpr_rd_addr_n; @@ -205,14 +204,12 @@ module VX_operands import VX_gpu_pkg::*; #( .valid_in (stg_in_valid), .ready_in (stg_in_ready), .data_in ({ - pipe_out_uuid, pipe_out_data, src_data_n[0], src_data_n[1], src_data_n[2] }), .data_out ({ - operands_if.data.uuid, operands_if.data.wis, operands_if.data.tmask, operands_if.data.PC, @@ -221,6 +218,7 @@ module VX_operands import VX_gpu_pkg::*; #( operands_if.data.op_type, operands_if.data.op_args, operands_if.data.rd, + operands_if.data.uuid, operands_if.data.rs1_data, operands_if.data.rs2_data, operands_if.data.rs3_data diff --git a/hw/rtl/libs/VX_stream_arb.sv b/hw/rtl/libs/VX_stream_arb.sv index f9bb24f3d..bbffe7342 100644 --- a/hw/rtl/libs/VX_stream_arb.sv +++ b/hw/rtl/libs/VX_stream_arb.sv @@ -46,14 +46,14 @@ module VX_stream_arb #( for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - localparam BATCH_BEGIN = i * NUM_REQS; - localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_INPUTS); - localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN; + localparam SLICE_BEGIN = i * NUM_REQS; + localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_INPUTS); + localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; `RESET_RELAY (slice_reset, reset); VX_stream_arb #( - .NUM_INPUTS (BATCH_SIZE), + .NUM_INPUTS (SLICE_SIZE), .NUM_OUTPUTS (1), .DATAW (DATAW), .ARBITER (ARBITER), @@ -63,9 +63,9 @@ module VX_stream_arb #( ) arb_slice ( .clk (clk), .reset (slice_reset), - .valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]), - .ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]), - .data_in (data_in[BATCH_END-1: BATCH_BEGIN]), + .valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]), + .ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]), + .data_in (data_in[SLICE_END-1: SLICE_BEGIN]), .data_out (data_out[i]), .sel_out (sel_out[i]), .valid_out (valid_out[i]), @@ -77,28 +77,28 @@ module VX_stream_arb #( // (#inputs > max_fanout) and (#outputs == 1) - localparam NUM_BATCHES = `CDIV(NUM_INPUTS, MAX_FANOUT); + localparam NUM_SLICES = `CDIV(NUM_INPUTS, MAX_FANOUT); localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT); - localparam LOG_NUM_REQS3 = `CLOG2(NUM_BATCHES); + localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES); - wire [NUM_BATCHES-1:0] valid_tmp; - wire [NUM_BATCHES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp; - wire [NUM_BATCHES-1:0] ready_tmp; + wire [NUM_SLICES-1:0] valid_tmp; + wire [NUM_SLICES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp; + wire [NUM_SLICES-1:0] ready_tmp; - for (genvar i = 0; i < NUM_BATCHES; ++i) begin + for (genvar i = 0; i < NUM_SLICES; ++i) begin - localparam BATCH_BEGIN = i * MAX_FANOUT; - localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_INPUTS); - localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN; + localparam SLICE_BEGIN = i * MAX_FANOUT; + localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_INPUTS); + localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; wire [DATAW-1:0] data_tmp_u; - wire [`LOG2UP(BATCH_SIZE)-1:0] sel_tmp_u; + wire [`LOG2UP(SLICE_SIZE)-1:0] sel_tmp_u; `RESET_RELAY (slice_reset, reset); if (MAX_FANOUT != 1) begin VX_stream_arb #( - .NUM_INPUTS (BATCH_SIZE), + .NUM_INPUTS (SLICE_SIZE), .NUM_OUTPUTS (1), .DATAW (DATAW), .ARBITER (ARBITER), @@ -108,9 +108,9 @@ module VX_stream_arb #( ) fanout_slice_arb ( .clk (clk), .reset (slice_reset), - .valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]), - .data_in (data_in[BATCH_END-1: BATCH_BEGIN]), - .ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]), + .valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]), + .data_in (data_in[SLICE_END-1: SLICE_BEGIN]), + .ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]), .valid_out (valid_tmp[i]), .data_out (data_tmp_u), .sel_out (sel_tmp_u), @@ -125,7 +125,7 @@ module VX_stream_arb #( wire [LOG_NUM_REQS3-1:0] sel_out_u; VX_stream_arb #( - .NUM_INPUTS (NUM_BATCHES), + .NUM_INPUTS (NUM_SLICES), .NUM_OUTPUTS (1), .DATAW (DATAW + LOG_NUM_REQS2), .ARBITER (ARBITER), @@ -214,15 +214,15 @@ module VX_stream_arb #( for (genvar i = 0; i < NUM_INPUTS; ++i) begin - localparam BATCH_BEGIN = i * NUM_REQS; - localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_OUTPUTS); - localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN; + localparam SLICE_BEGIN = i * NUM_REQS; + localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_OUTPUTS); + localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; `RESET_RELAY (slice_reset, reset); VX_stream_arb #( .NUM_INPUTS (1), - .NUM_OUTPUTS (BATCH_SIZE), + .NUM_OUTPUTS (SLICE_SIZE), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), @@ -234,13 +234,13 @@ module VX_stream_arb #( .valid_in (valid_in[i]), .ready_in (ready_in[i]), .data_in (data_in[i]), - .data_out (data_out[BATCH_END-1: BATCH_BEGIN]), - .valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]), - .ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]), + .data_out (data_out[SLICE_END-1: SLICE_BEGIN]), + .valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]), + .ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]), `UNUSED_PIN (sel_out) ); - for (genvar j = BATCH_BEGIN; j < BATCH_END; ++j) begin + for (genvar j = SLICE_BEGIN; j < SLICE_END; ++j) begin assign sel_out[j] = i; end end @@ -249,15 +249,15 @@ module VX_stream_arb #( // (#inputs == 1) and (#outputs > max_fanout) - localparam NUM_BATCHES = `CDIV(NUM_OUTPUTS, MAX_FANOUT); + localparam NUM_SLICES = `CDIV(NUM_OUTPUTS, MAX_FANOUT); - wire [NUM_BATCHES-1:0] valid_tmp; - wire [NUM_BATCHES-1:0][DATAW-1:0] data_tmp; - wire [NUM_BATCHES-1:0] ready_tmp; + wire [NUM_SLICES-1:0] valid_tmp; + wire [NUM_SLICES-1:0][DATAW-1:0] data_tmp; + wire [NUM_SLICES-1:0] ready_tmp; VX_stream_arb #( .NUM_INPUTS (1), - .NUM_OUTPUTS (NUM_BATCHES), + .NUM_OUTPUTS (NUM_SLICES), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), @@ -275,17 +275,17 @@ module VX_stream_arb #( `UNUSED_PIN (sel_out) ); - for (genvar i = 0; i < NUM_BATCHES; ++i) begin + for (genvar i = 0; i < NUM_SLICES; ++i) begin - localparam BATCH_BEGIN = i * MAX_FANOUT; - localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_OUTPUTS); - localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN; + localparam SLICE_BEGIN = i * MAX_FANOUT; + localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_OUTPUTS); + localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; `RESET_RELAY (slice_reset, reset); VX_stream_arb #( .NUM_INPUTS (1), - .NUM_OUTPUTS (BATCH_SIZE), + .NUM_OUTPUTS (SLICE_SIZE), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), @@ -297,9 +297,9 @@ module VX_stream_arb #( .valid_in (valid_tmp[i]), .ready_in (ready_tmp[i]), .data_in (data_tmp[i]), - .data_out (data_out[BATCH_END-1: BATCH_BEGIN]), - .valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]), - .ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]), + .data_out (data_out[SLICE_END-1: SLICE_BEGIN]), + .valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]), + .ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]), `UNUSED_PIN (sel_out) ); end From 95ca49a85ff4e00493275b9f9d065a2421e31b39 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 20:38:06 -0700 Subject: [PATCH 35/89] writeback cache fixes --- hw/rtl/cache/VX_bank_flush.sv | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hw/rtl/cache/VX_bank_flush.sv b/hw/rtl/cache/VX_bank_flush.sv index 9a9e1796a..530547987 100644 --- a/hw/rtl/cache/VX_bank_flush.sv +++ b/hw/rtl/cache/VX_bank_flush.sv @@ -42,17 +42,15 @@ module VX_bank_flush #( localparam STATE_IDLE = 2'd0; localparam STATE_INIT = 2'd1; localparam STATE_FLUSH = 2'd2; + localparam STATE_DONE = 2'd3; reg [CTR_WIDTH-1:0] counter_r; reg [1:0] state_r, state_n; - reg flush_in_ready_r, flush_in_ready_n; always @(*) begin state_n = state_r; - flush_in_ready_n = 0; case (state_r) - // STATE_IDLE - default: begin + STATE_IDLE: begin if (flush_in_valid && mshr_empty) begin state_n = STATE_FLUSH; end @@ -63,22 +61,23 @@ module VX_bank_flush #( end end STATE_FLUSH: begin - if (counter_r == ((2 ** CTR_WIDTH)-1)) begin - state_n = STATE_IDLE; - flush_in_ready_n = 1; + if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_out_ready) begin + state_n = STATE_DONE; end end + STATE_DONE: begin + // generate a completion pulse + state_n = STATE_IDLE; + end endcase end always @(posedge clk) begin if (reset) begin - state_r <= STATE_INIT; + state_r <= STATE_INIT; counter_r <= '0; - flush_in_ready_r <= '0; end else begin state_r <= state_n; - flush_in_ready_r <= flush_in_ready_n; if (state_r != STATE_IDLE) begin if ((state_r == STATE_INIT) || flush_out_ready) begin counter_r <= counter_r + CTR_WIDTH'(1); @@ -89,7 +88,8 @@ module VX_bank_flush #( end end - assign flush_in_ready = flush_in_ready_r; + assign flush_in_ready = (state_r == STATE_DONE); + assign flush_out_init = (state_r == STATE_INIT); assign flush_out_valid = (state_r == STATE_FLUSH); assign flush_out_line = counter_r[`CS_LINE_SEL_BITS-1:0]; From 2e77c9eec2894acf60170b7d541ab40375a6ae1c Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 22:14:06 -0700 Subject: [PATCH 36/89] writeback cache fixes --- hw/rtl/cache/VX_cache_flush.sv | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/rtl/cache/VX_cache_flush.sv b/hw/rtl/cache/VX_cache_flush.sv index 7f158850c..298a9c4d6 100644 --- a/hw/rtl/cache/VX_cache_flush.sv +++ b/hw/rtl/cache/VX_cache_flush.sv @@ -118,17 +118,22 @@ module VX_cache_flush #( STATE_WAIT: begin if (no_inflight_reqs) begin state_n = STATE_FLUSH; - flush_done_n = '0; end end STATE_FLUSH: begin + // wait for all banks to finish flushing flush_done_n = flush_done | flush_ready; if (flush_done_n == {NUM_BANKS{1'b1}}) begin state_n = STATE_DONE; + flush_done_n = '0; + // only release current flush requests + // and keep normal requests locked lock_released_n = flush_req_mask; end end STATE_DONE: begin + // wait until released flush requests are issued + // when returning to IDLE state other requests will unlock lock_released_n = lock_released & ~core_bus_out_ready; if (lock_released_n == 0) begin state_n = STATE_IDLE; From 516ce43a5c41b961a02ce616a1b2d3008b13e982 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Jul 2024 22:21:10 -0700 Subject: [PATCH 37/89] testing writeback cache --- hw/rtl/VX_config.vh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 804715aad..f43eb2581 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -537,7 +537,7 @@ // Enable Cache Writeback `ifndef DCACHE_WRITEBACK -`define DCACHE_WRITEBACK 0 +`define DCACHE_WRITEBACK 1 `endif // LMEM Configurable Knobs //////////////////////////////////////////////////// @@ -601,7 +601,7 @@ // Enable Cache Writeback `ifndef L2_WRITEBACK -`define L2_WRITEBACK 0 +`define L2_WRITEBACK 1 `endif // L3cache Configurable Knobs ///////////////////////////////////////////////// @@ -647,7 +647,7 @@ // Enable Cache Writeback `ifndef L3_WRITEBACK -`define L3_WRITEBACK 0 +`define L3_WRITEBACK 1 `endif // ISA Extensions ///////////////////////////////////////////////////////////// From 609155e490751e4d0e31f6017feebf0fc6b84493 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 31 Jul 2024 00:15:32 -0700 Subject: [PATCH 38/89] fixed CSV trace converter --- ci/trace_csv.py | 62 +++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/ci/trace_csv.py b/ci/trace_csv.py index 540e1898c..4a36f5f6a 100755 --- a/ci/trace_csv.py +++ b/ci/trace_csv.py @@ -19,6 +19,8 @@ import csv import re import inspect +configs = None + def parse_args(): parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.') parser.add_argument('-t', '--type', default='simx', help='log type (rtlsim or simx)') @@ -26,6 +28,24 @@ def parse_args(): parser.add_argument('log', help='Input log file') return parser.parse_args() +def load_config(filename): + config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=0x([0-9a-fA-F]+), num_barriers=(\d+)" + with open(filename, 'r') as file: + for line in file: + config_match = re.search(config_pattern, line) + if config_match: + config = { + 'num_threads': int(config_match.group(1)), + 'num_warps': int(config_match.group(2)), + 'num_cores': int(config_match.group(3)), + 'num_clusters': int(config_match.group(4)), + 'socket_size': int(config_match.group(5)), + 'local_mem_base': int(config_match.group(6), 16), + 'num_barriers': int(config_match.group(7)), + } + return config + return None + def parse_simx(log_lines): pc_pattern = r"PC=(0x[0-9a-fA-F]+)" instr_pattern = r"Instr (0x[0-9a-fA-F]+):" @@ -46,10 +66,10 @@ def parse_simx(log_lines): instr_data = {} instr_data["lineno"] = lineno instr_data["PC"] = re.search(pc_pattern, line).group(1) - instr_data["core_id"] = re.search(core_id_pattern, line).group(1) - instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1) + instr_data["core_id"] = int(re.search(core_id_pattern, line).group(1)) + instr_data["warp_id"] = int(re.search(warp_id_pattern, line).group(1)) instr_data["tmask"] = re.search(tmask_pattern, line).group(1) - instr_data["uuid"] = re.search(uuid_pattern, line).group(1) + instr_data["uuid"] = int(re.search(uuid_pattern, line).group(1)) elif line.startswith("DEBUG Instr"): instr_data["instr"] = re.search(instr_pattern, line).group(1) instr_data["opcode"] = re.search(opcode_pattern, line).group(1) @@ -96,7 +116,7 @@ def append_value(text, reg, value, tmask_arr, sep): return text, sep def parse_rtlsim(log_lines): - config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=(\d+), num_barriers=(\d+)" + global configs line_pattern = r"\d+: cluster(\d+)-socket(\d+)-core(\d+)-(decode|issue|commit)" pc_pattern = r"PC=(0x[0-9a-fA-F]+)" instr_pattern = r"instr=(0x[0-9a-fA-F]+)" @@ -118,36 +138,20 @@ def parse_rtlsim(log_lines): uuid_pattern = r"#(\d+)" entries = [] instr_data = {} - num_threads = 0 - num_warps = 0 - num_cores = 0 - num_clusters = 0 - socket_size = 0 - local_mem_base = 0 - num_barriers = 0 - num_sockets = 0 + num_cores = configs['num_cores'] + socket_size = configs['socket_size'] + num_sockets = (num_cores + socket_size - 1) // socket_size for lineno, line in enumerate(log_lines, start=1): try: - config_match = re.search(config_pattern, line) - if config_match: - num_threads = int(config_match.group(1)) - num_warps = int(config_match.group(2)) - num_cores = int(config_match.group(3)) - num_clusters = int(config_match.group(4)) - socket_size = int(config_match.group(5)) - local_mem_base = int(config_match.group(6)) - num_barriers = int(config_match.group(7)) - num_sockets = (num_cores + socket_size - 1) // socket_size - continue line_match = re.search(line_pattern, line) if line_match: PC = re.search(pc_pattern, line).group(1) - warp_id = re.search(warp_id_pattern, line).group(1) + warp_id = int(re.search(warp_id_pattern, line).group(1)) tmask = re.search(tmask_pattern, line).group(1) - uuid = re.search(uuid_pattern, line).group(1) - cluster_id = line_match.group(1) - socket_id = line_match.group(2) - core_id = line_match.group(3) + uuid = int(re.search(uuid_pattern, line).group(1)) + cluster_id = int(line_match.group(1)) + socket_id = int(line_match.group(2)) + core_id = int(line_match.group(3)) stage = line_match.group(4) if stage == "decode": trace = {} @@ -274,7 +278,9 @@ def split_log_file(log_filename): return sublogs def main(): + global configs args = parse_args() + configs = load_config(args.log) sublogs = split_log_file(args.log) write_csv(sublogs, args.csv, args.type) From fc50b668191f9c3d37dbefc3c107fda4037d62bc Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 31 Jul 2024 00:16:13 -0700 Subject: [PATCH 39/89] regression script update --- ci/regression.sh.in | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 4c7e9967a..dfce036a3 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -143,8 +143,8 @@ debug() test_csv_trace - ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" ./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1" echo "debugging tests done!" @@ -193,8 +193,8 @@ config1() CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx # L2/L3 - ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1" ./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1" ./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1" @@ -280,7 +280,7 @@ stress() echo "begin stress tests..." # test verilator reset values - CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood + CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache echo "stress tests done!" From 3fe8f963aac81cc30bb01b06025d0d7dfb9edf45 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 31 Jul 2024 02:20:32 -0700 Subject: [PATCH 40/89] writeback cache fixes --- hw/rtl/cache/VX_bank_flush.sv | 60 +++++++++++++++++-------------- hw/rtl/cache/VX_cache.sv | 16 +++++---- hw/rtl/cache/VX_cache_bank.sv | 66 +++++++++++++++++----------------- hw/rtl/cache/VX_cache_data.sv | 2 +- hw/rtl/cache/VX_cache_flush.sv | 24 ++++++++----- 5 files changed, 92 insertions(+), 76 deletions(-) diff --git a/hw/rtl/cache/VX_bank_flush.sv b/hw/rtl/cache/VX_bank_flush.sv index 530547987..4b0b551f4 100644 --- a/hw/rtl/cache/VX_bank_flush.sv +++ b/hw/rtl/cache/VX_bank_flush.sv @@ -27,32 +27,34 @@ module VX_bank_flush #( ) ( input wire clk, input wire reset, - input wire flush_in_valid, - output wire flush_in_ready, - output wire flush_out_init, - output wire flush_out_valid, - output wire [`CS_LINE_SEL_BITS-1:0] flush_out_line, - output wire [NUM_WAYS-1:0] flush_out_way, - input wire flush_out_ready, + input wire flush_begin, + output wire flush_end, + output wire flush_init, + output wire flush_valid, + output wire [`CS_LINE_SEL_BITS-1:0] flush_line, + output wire [NUM_WAYS-1:0] flush_way, + input wire flush_ready, input wire mshr_empty ); // ways interation is only needed when eviction is enabled localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0); - localparam STATE_IDLE = 2'd0; - localparam STATE_INIT = 2'd1; - localparam STATE_FLUSH = 2'd2; - localparam STATE_DONE = 2'd3; + localparam STATE_IDLE = 0; + localparam STATE_INIT = 1; + localparam STATE_WAIT = 2; + localparam STATE_FLUSH = 3; + localparam STATE_DONE = 4; + + reg [2:0] state_r, state_n; reg [CTR_WIDTH-1:0] counter_r; - reg [1:0] state_r, state_n; always @(*) begin state_n = state_r; case (state_r) STATE_IDLE: begin - if (flush_in_valid && mshr_empty) begin - state_n = STATE_FLUSH; + if (flush_begin) begin + state_n = STATE_WAIT; end end STATE_INIT: begin @@ -60,8 +62,14 @@ module VX_bank_flush #( state_n = STATE_IDLE; end end + STATE_WAIT: begin + // wait for pending requests to complete + if (mshr_empty) begin + state_n = STATE_FLUSH; + end + end STATE_FLUSH: begin - if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_out_ready) begin + if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin state_n = STATE_DONE; end end @@ -79,7 +87,8 @@ module VX_bank_flush #( end else begin state_r <= state_n; if (state_r != STATE_IDLE) begin - if ((state_r == STATE_INIT) || flush_out_ready) begin + if ((state_r == STATE_INIT) + || ((state_r == STATE_FLUSH) && flush_ready)) begin counter_r <= counter_r + CTR_WIDTH'(1); end end else begin @@ -88,21 +97,20 @@ module VX_bank_flush #( end end - assign flush_in_ready = (state_r == STATE_DONE); - - assign flush_out_init = (state_r == STATE_INIT); - assign flush_out_valid = (state_r == STATE_FLUSH); - assign flush_out_line = counter_r[`CS_LINE_SEL_BITS-1:0]; + assign flush_end = (state_r == STATE_DONE); + assign flush_init = (state_r == STATE_INIT); + assign flush_valid = (state_r == STATE_FLUSH); + assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0]; if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin - reg [NUM_WAYS-1:0] flush_out_way_r; + reg [NUM_WAYS-1:0] flush_way_r; always @(*) begin - flush_out_way_r = '0; - flush_out_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1; + flush_way_r = '0; + flush_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1; end - assign flush_out_way = flush_out_way_r; + assign flush_way = flush_way_r; end else begin - assign flush_out_way = {NUM_WAYS{1'b1}}; + assign flush_way = {NUM_WAYS{1'b1}}; end endmodule diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index d1b4a6d17..3c70bce85 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -109,8 +109,8 @@ module VX_cache import VX_gpu_pkg::*; #( .TAG_WIDTH (TAG_WIDTH) ) core_bus2_if[NUM_REQS](); - wire [NUM_BANKS-1:0] per_bank_flush_valid; - wire [NUM_BANKS-1:0] per_bank_flush_ready; + wire [NUM_BANKS-1:0] per_bank_flush_begin; + wire [NUM_BANKS-1:0] per_bank_flush_end; wire [NUM_BANKS-1:0] per_bank_core_req_fire; @@ -127,8 +127,8 @@ module VX_cache import VX_gpu_pkg::*; #( .core_bus_in_if (core_bus_if), .core_bus_out_if (core_bus2_if), .bank_req_fire (per_bank_core_req_fire), - .flush_valid (per_bank_flush_valid), - .flush_ready (per_bank_flush_ready) + .flush_begin (per_bank_flush_begin), + .flush_end (per_bank_flush_end) ); /////////////////////////////////////////////////////////////////////////// @@ -324,6 +324,7 @@ module VX_cache import VX_gpu_pkg::*; #( .NUM_OUTPUTS (NUM_BANKS), .DATAW (CORE_REQ_DATAW), .PERF_CTR_BITS (`PERF_CTR_BITS), + .ARBITER ("F"), .OUT_BUF (REQ_XBAR_BUF) ) req_xbar ( .clk (clk), @@ -432,8 +433,8 @@ module VX_cache import VX_gpu_pkg::*; #( .mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)), .mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]), - .flush_valid (per_bank_flush_valid[bank_id]), - .flush_ready (per_bank_flush_ready[bank_id]) + .flush_begin (per_bank_flush_begin[bank_id]), + .flush_end (per_bank_flush_end[bank_id]) ); if (NUM_BANKS == 1) begin @@ -457,7 +458,8 @@ module VX_cache import VX_gpu_pkg::*; #( VX_stream_xbar #( .NUM_INPUTS (NUM_BANKS), .NUM_OUTPUTS (NUM_REQS), - .DATAW (CORE_RSP_DATAW) + .DATAW (CORE_RSP_DATAW), + .ARBITER ("F") ) rsp_xbar ( .clk (clk), .reset (rsp_xbar_reset), diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index 885fdb943..4ed523e04 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -108,8 +108,8 @@ module VX_cache_bank #( output wire mem_rsp_ready, // flush - input wire flush_valid, - output wire flush_ready + input wire flush_begin, + output wire flush_end ); localparam PIPELINE_STAGES = 2; @@ -162,11 +162,11 @@ module VX_cache_bank #( wire mshr_pending_st0, mshr_pending_st1; wire mshr_empty; - wire line_flush_valid; - wire line_flush_init; - wire [`CS_LINE_SEL_BITS-1:0] line_flush_sel; - wire [NUM_WAYS-1:0] line_flush_way; - wire line_flush_ready; + wire flush_valid; + wire init_valid; + wire [`CS_LINE_SEL_BITS-1:0] flush_sel; + wire [NUM_WAYS-1:0] flush_way; + wire flush_ready; // flush unit VX_bank_flush #( @@ -176,16 +176,16 @@ module VX_cache_bank #( .NUM_WAYS (NUM_WAYS), .WRITEBACK (WRITEBACK) ) flush_unit ( - .clk (clk), - .reset (reset), - .flush_in_valid (flush_valid), - .flush_in_ready (flush_ready), - .flush_out_init (line_flush_init), - .flush_out_valid (line_flush_valid), - .flush_out_line (line_flush_sel), - .flush_out_way (line_flush_way), - .flush_out_ready (line_flush_ready), - .mshr_empty (mshr_empty) + .clk (clk), + .reset (reset), + .flush_begin (flush_begin), + .flush_end (flush_end), + .flush_init (init_valid), + .flush_valid (flush_valid), + .flush_line (flush_sel), + .flush_way (flush_way), + .flush_ready (flush_ready), + .mshr_empty (mshr_empty) ); wire rdw_hazard1_sel; @@ -198,16 +198,16 @@ module VX_cache_bank #( // mshr replay has highest priority to maximize utilization since there is no miss. // handle memory responses next to prevent deadlock with potential memory request from a miss. // flush has precedence over core requests to ensure that the cache is in a consistent state. - wire replay_grant = ~line_flush_init; + wire replay_grant = ~init_valid; wire replay_enable = replay_grant && replay_valid; - wire fill_grant = ~line_flush_init && ~replay_enable; + wire fill_grant = ~init_valid && ~replay_enable; wire fill_enable = fill_grant && mem_rsp_valid; - wire flush_grant = ~line_flush_init && ~replay_enable && ~fill_enable; - wire flush_enable = flush_grant && line_flush_valid; + wire flush_grant = ~init_valid && ~replay_enable && ~fill_enable; + wire flush_enable = flush_grant && flush_valid; - wire creq_grant = ~line_flush_init && ~replay_enable && ~fill_enable && ~flush_enable; + wire creq_grant = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable; wire creq_enable = creq_grant && core_req_valid; assign replay_ready = replay_grant @@ -219,23 +219,23 @@ module VX_cache_bank #( && ~rdw_hazard2_sel && ~pipe_stall; - assign line_flush_ready = flush_grant - && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions - && ~rdw_hazard2_sel - && ~pipe_stall; + assign flush_ready = flush_grant + && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions + && ~rdw_hazard2_sel + && ~pipe_stall; assign core_req_ready = creq_grant && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall; - wire init_fire = line_flush_init; + wire init_fire = init_valid; wire replay_fire = replay_valid && replay_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; - wire line_flush_fire = line_flush_valid && line_flush_ready; + wire flush_fire = flush_valid && flush_ready; wire core_req_fire = core_req_valid && core_req_ready; - assign valid_sel = init_fire || replay_fire || mem_rsp_fire || line_flush_fire || core_req_fire; + assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire; assign rw_sel = replay_valid ? replay_rw : core_req_rw; assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen; assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel; @@ -243,7 +243,7 @@ module VX_cache_bank #( assign tag_sel = replay_valid ? replay_tag : core_req_tag; assign creq_flush_sel = core_req_valid && core_req_flush; - assign addr_sel = (line_flush_init | line_flush_valid) ? `CS_LINE_ADDR_WIDTH'(line_flush_sel) : + assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) : (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr)); if (WRITE_ENABLE) begin @@ -270,7 +270,7 @@ module VX_cache_bank #( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_sel, line_flush_init, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, line_flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), + .data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0}) ); @@ -663,8 +663,8 @@ module VX_cache_bank #( `ifdef DBG_TRACE_CACHE wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready; - wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || line_flush_valid) - && ~(replay_fire || mem_rsp_fire || core_req_fire || line_flush_fire); + wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid) + && ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire); always @(posedge clk) begin if (input_stall || pipe_stall) begin `TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1)); diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index d1dd8050f..da1a7fe63 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -117,7 +117,7 @@ module VX_cache_data #( end // order the data layout to perform ways multiplexing last. - // this allows converting way index to binary in parallel with BRAM read. + // this allows converting way index to binary in parallel with BRAM readaccess and way selection. wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata; wire [BYTEENW-1:0] line_wren; diff --git a/hw/rtl/cache/VX_cache_flush.sv b/hw/rtl/cache/VX_cache_flush.sv index 298a9c4d6..7a33565fc 100644 --- a/hw/rtl/cache/VX_cache_flush.sv +++ b/hw/rtl/cache/VX_cache_flush.sv @@ -26,13 +26,16 @@ module VX_cache_flush #( VX_mem_bus_if.slave core_bus_in_if [NUM_REQS], VX_mem_bus_if.master core_bus_out_if [NUM_REQS], input wire [NUM_BANKS-1:0] bank_req_fire, - output wire [NUM_BANKS-1:0] flush_valid, - input wire [NUM_BANKS-1:0] flush_ready + output wire [NUM_BANKS-1:0] flush_begin, + input wire [NUM_BANKS-1:0] flush_end ); localparam STATE_IDLE = 0; - localparam STATE_WAIT = 1; + localparam STATE_WAIT1 = 1; localparam STATE_FLUSH = 2; - localparam STATE_DONE = 3; + localparam STATE_WAIT2 = 3; + localparam STATE_DONE = 4; + + reg [2:0] state, state_n; // track in-flight core requests @@ -76,7 +79,6 @@ module VX_cache_flush #( `UNUSED_VAR (bank_req_fire) end - reg [1:0] state, state_n; reg [NUM_BANKS-1:0] flush_done, flush_done_n; wire [NUM_REQS-1:0] flush_req_mask; @@ -112,17 +114,21 @@ module VX_cache_flush #( case (state) STATE_IDLE: begin if (flush_req_enable) begin - state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT : STATE_FLUSH; + state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH; end end - STATE_WAIT: begin + STATE_WAIT1: begin if (no_inflight_reqs) begin state_n = STATE_FLUSH; end end STATE_FLUSH: begin + // generate a flush request pulse + state_n = STATE_WAIT2; + end + STATE_WAIT2: begin // wait for all banks to finish flushing - flush_done_n = flush_done | flush_ready; + flush_done_n = flush_done | flush_end; if (flush_done_n == {NUM_BANKS{1'b1}}) begin state_n = STATE_DONE; flush_done_n = '0; @@ -154,6 +160,6 @@ module VX_cache_flush #( end end - assign flush_valid = {NUM_BANKS{state == STATE_FLUSH}}; + assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}}; endmodule From 4dc34cfd2d04328af9b8e011896d979db1aaa87e Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 31 Jul 2024 10:52:57 -0700 Subject: [PATCH 41/89] hw arbitration update --- hw/rtl/core/VX_alu_muldiv.sv | 1 + hw/rtl/core/VX_alu_unit.sv | 3 +- hw/rtl/core/VX_lsu_slice.sv | 1 + hw/rtl/libs/VX_avs_adapter.sv | 2 +- hw/rtl/libs/VX_axi_adapter.sv | 60 +++++++++++++++++------------------ hw/rtl/libs/VX_stream_arb.sv | 2 +- hw/rtl/libs/VX_stream_xbar.sv | 2 +- hw/rtl/mem/VX_local_mem.sv | 2 ++ 8 files changed, 39 insertions(+), 34 deletions(-) diff --git a/hw/rtl/core/VX_alu_muldiv.sv b/hw/rtl/core/VX_alu_muldiv.sv index 460295463..1a4806705 100644 --- a/hw/rtl/core/VX_alu_muldiv.sv +++ b/hw/rtl/core/VX_alu_muldiv.sv @@ -324,6 +324,7 @@ module VX_alu_muldiv #( VX_stream_arb #( .NUM_INPUTS (2), .DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)), + .ARBITER ("F"), .OUT_BUF (1) ) rsp_buf ( .clk (clk), diff --git a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv index d8c131838..f34b0b5b1 100644 --- a/hw/rtl/core/VX_alu_unit.sv +++ b/hw/rtl/core/VX_alu_unit.sv @@ -126,7 +126,8 @@ module VX_alu_unit #( VX_stream_arb #( .NUM_INPUTS (RSP_ARB_SIZE), .DATAW (RSP_ARB_DATAW), - .OUT_BUF (PARTIAL_BW ? 1 : 3) + .OUT_BUF (PARTIAL_BW ? 1 : 3), + .ARBITER ("F") ) rsp_arb ( .clk (clk), .reset (arb_reset), diff --git a/hw/rtl/core/VX_lsu_slice.sv b/hw/rtl/core/VX_lsu_slice.sv index 120dc9f8e..8c685fca2 100644 --- a/hw/rtl/core/VX_lsu_slice.sv +++ b/hw/rtl/core/VX_lsu_slice.sv @@ -490,6 +490,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( VX_stream_arb #( .NUM_INPUTS (2), .DATAW (RSP_ARB_DATAW), + .ARBITER ("P"), // prioritize commit_rsp_if .OUT_BUF (3) ) rsp_arb ( .clk (clk), diff --git a/hw/rtl/libs/VX_avs_adapter.sv b/hw/rtl/libs/VX_avs_adapter.sv index 6e9abf597..28da07565 100644 --- a/hw/rtl/libs/VX_avs_adapter.sv +++ b/hw/rtl/libs/VX_avs_adapter.sv @@ -195,7 +195,7 @@ module VX_avs_adapter #( VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (DATA_WIDTH + TAG_WIDTH), - .ARBITER ("R"), + .ARBITER ("F"), .OUT_BUF (RSP_OUT_BUF) ) rsp_arb ( .clk (clk), diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 69e3e3adc..a1c5b5b36 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,10 +15,10 @@ `TRACING_OFF module VX_axi_adapter #( - parameter DATA_WIDTH = 512, + parameter DATA_WIDTH = 512, parameter ADDR_WIDTH = 32, parameter TAG_WIDTH = 8, - parameter NUM_BANKS = 1, + parameter NUM_BANKS = 1, parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)), parameter RSP_OUT_BUF = 0 ) ( @@ -34,13 +34,13 @@ module VX_axi_adapter #( input wire [TAG_WIDTH-1:0] mem_req_tag, output wire mem_req_ready, - // Vortex response - output wire mem_rsp_valid, + // Vortex response + output wire mem_rsp_valid, output wire [DATA_WIDTH-1:0] mem_rsp_data, output wire [TAG_WIDTH-1:0] mem_rsp_tag, input wire mem_rsp_ready, - // AXI write request address channel + // AXI write request address channel output wire m_axi_awvalid [NUM_BANKS], input wire m_axi_awready [NUM_BANKS], output wire [ADDR_WIDTH-1:0] m_axi_awaddr [NUM_BANKS], @@ -54,7 +54,7 @@ module VX_axi_adapter #( output wire [3:0] m_axi_awqos [NUM_BANKS], output wire [3:0] m_axi_awregion [NUM_BANKS], - // AXI write request data channel + // AXI write request data channel output wire m_axi_wvalid [NUM_BANKS], input wire m_axi_wready [NUM_BANKS], output wire [DATA_WIDTH-1:0] m_axi_wdata [NUM_BANKS], @@ -66,7 +66,7 @@ module VX_axi_adapter #( output wire m_axi_bready [NUM_BANKS], input wire [TAG_WIDTH-1:0] m_axi_bid [NUM_BANKS], input wire [1:0] m_axi_bresp [NUM_BANKS], - + // AXI read address channel output wire m_axi_arvalid [NUM_BANKS], input wire m_axi_arready [NUM_BANKS], @@ -74,13 +74,13 @@ module VX_axi_adapter #( output wire [TAG_WIDTH-1:0] m_axi_arid [NUM_BANKS], output wire [7:0] m_axi_arlen [NUM_BANKS], output wire [2:0] m_axi_arsize [NUM_BANKS], - output wire [1:0] m_axi_arburst [NUM_BANKS], + output wire [1:0] m_axi_arburst [NUM_BANKS], output wire [1:0] m_axi_arlock [NUM_BANKS], output wire [3:0] m_axi_arcache [NUM_BANKS], output wire [2:0] m_axi_arprot [NUM_BANKS], output wire [3:0] m_axi_arqos [NUM_BANKS], output wire [3:0] m_axi_arregion [NUM_BANKS], - + // AXI read response channel input wire m_axi_rvalid [NUM_BANKS], output wire m_axi_rready [NUM_BANKS], @@ -88,15 +88,15 @@ module VX_axi_adapter #( input wire m_axi_rlast [NUM_BANKS], input wire [TAG_WIDTH-1:0] m_axi_rid [NUM_BANKS], input wire [1:0] m_axi_rresp [NUM_BANKS] -); +); localparam AXSIZE = `CLOG2(DATA_WIDTH/8); - localparam BANK_ADDRW = `LOG2UP(NUM_BANKS); + localparam BANK_ADDRW = `LOG2UP(NUM_BANKS); localparam LOG2_NUM_BANKS = `CLOG2(NUM_BANKS); wire [BANK_ADDRW-1:0] req_bank_sel; if (NUM_BANKS > 1) begin - assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0]; + assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0]; end else begin assign req_bank_sel = '0; end @@ -108,12 +108,12 @@ module VX_axi_adapter #( for (genvar i = 0; i < NUM_BANKS; ++i) begin wire m_axi_aw_fire = m_axi_awvalid[i] && m_axi_awready[i]; - wire m_axi_w_fire = m_axi_wvalid[i] && m_axi_wready[i]; + wire m_axi_w_fire = m_axi_wvalid[i] && m_axi_wready[i]; always @(posedge clk) begin if (reset) begin m_axi_aw_ack[i] <= 0; m_axi_w_ack[i] <= 0; - end else begin + end else begin if (mem_req_fire && (req_bank_sel == i)) begin m_axi_aw_ack[i] <= 0; m_axi_w_ack[i] <= 0; @@ -127,10 +127,10 @@ module VX_axi_adapter #( end end - wire axi_write_ready [NUM_BANKS]; + wire axi_write_ready [NUM_BANKS]; for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign axi_write_ready[i] = (m_axi_awready[i] || m_axi_aw_ack[i]) + assign axi_write_ready[i] = (m_axi_awready[i] || m_axi_aw_ack[i]) && (m_axi_wready[i] || m_axi_w_ack[i]); end @@ -141,17 +141,17 @@ module VX_axi_adapter #( assign mem_req_ready = mem_req_rw ? axi_write_ready[0] : m_axi_arready[0]; end - // AXI write request address channel + // AXI write request address channel for (genvar i = 0; i < NUM_BANKS; ++i) begin assign m_axi_awvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~m_axi_aw_ack[i]; assign m_axi_awaddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE; assign m_axi_awid[i] = mem_req_tag; - assign m_axi_awlen[i] = 8'b00000000; + assign m_axi_awlen[i] = 8'b00000000; assign m_axi_awsize[i] = 3'(AXSIZE); - assign m_axi_awburst[i] = 2'b00; - assign m_axi_awlock[i] = 2'b00; + assign m_axi_awburst[i] = 2'b00; + assign m_axi_awlock[i] = 2'b00; assign m_axi_awcache[i] = 4'b0000; - assign m_axi_awprot[i] = 3'b000; + assign m_axi_awprot[i] = 3'b000; assign m_axi_awqos[i] = 4'b0000; assign m_axi_awregion[i]= 4'b0000; end @@ -170,31 +170,31 @@ module VX_axi_adapter #( `UNUSED_VAR (m_axi_bid[i]) `UNUSED_VAR (m_axi_bresp[i]) assign m_axi_bready[i] = 1'b1; - `RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time)); + `RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time)); end // AXI read request channel for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i); + assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i); assign m_axi_araddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE; assign m_axi_arid[i] = mem_req_tag; assign m_axi_arlen[i] = 8'b00000000; assign m_axi_arsize[i] = 3'(AXSIZE); - assign m_axi_arburst[i] = 2'b00; - assign m_axi_arlock[i] = 2'b00; + assign m_axi_arburst[i] = 2'b00; + assign m_axi_arlock[i] = 2'b00; assign m_axi_arcache[i] = 4'b0000; assign m_axi_arprot[i] = 3'b000; assign m_axi_arqos[i] = 4'b0000; assign m_axi_arregion[i]= 4'b0000; end - // AXI read response channel + // AXI read response channel wire [NUM_BANKS-1:0] rsp_arb_valid_in; wire [NUM_BANKS-1:0][DATA_WIDTH+TAG_WIDTH-1:0] rsp_arb_data_in; wire [NUM_BANKS-1:0] rsp_arb_ready_in; - `UNUSED_VAR (m_axi_rlast) + `UNUSED_VAR (m_axi_rlast) for (genvar i = 0; i < NUM_BANKS; ++i) begin assign rsp_arb_valid_in[i] = m_axi_rvalid[i]; @@ -207,7 +207,7 @@ module VX_axi_adapter #( VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (DATA_WIDTH + TAG_WIDTH), - .ARBITER ("R"), + .ARBITER ("F"), .OUT_BUF (RSP_OUT_BUF) ) rsp_arb ( .clk (clk), diff --git a/hw/rtl/libs/VX_stream_arb.sv b/hw/rtl/libs/VX_stream_arb.sv index bbffe7342..165f7a01d 100644 --- a/hw/rtl/libs/VX_stream_arb.sv +++ b/hw/rtl/libs/VX_stream_arb.sv @@ -18,7 +18,7 @@ module VX_stream_arb #( parameter NUM_INPUTS = 1, parameter NUM_OUTPUTS = 1, parameter DATAW = 1, - parameter `STRING ARBITER = "P", + parameter `STRING ARBITER = "R", parameter MAX_FANOUT = `MAX_FANOUT, parameter OUT_BUF = 0, parameter LUTRAM = 0, diff --git a/hw/rtl/libs/VX_stream_xbar.sv b/hw/rtl/libs/VX_stream_xbar.sv index cb0d9a179..7539121f2 100644 --- a/hw/rtl/libs/VX_stream_xbar.sv +++ b/hw/rtl/libs/VX_stream_xbar.sv @@ -20,7 +20,7 @@ module VX_stream_xbar #( parameter DATAW = 4, parameter IN_WIDTH = `LOG2UP(NUM_INPUTS), parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS), - parameter ARBITER = "P", + parameter ARBITER = "R", parameter OUT_BUF = 0, parameter LUTRAM = 0, parameter MAX_FANOUT = `MAX_FANOUT, diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index e31524927..0c97464b5 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -120,6 +120,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .NUM_OUTPUTS (NUM_BANKS), .DATAW (REQ_DATAW), .PERF_CTR_BITS (`PERF_CTR_BITS), + .ARBITER ("F"), .OUT_BUF (3) // output should be registered for the data_store addressing ) req_xbar ( .clk (clk), @@ -209,6 +210,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .NUM_INPUTS (NUM_BANKS), .NUM_OUTPUTS (NUM_REQS), .DATAW (RSP_DATAW), + .ARBITER ("F"), .OUT_BUF (OUT_BUF) ) rsp_xbar ( .clk (clk), From ef5d58dc9e3ce4618ad791c0fe97dcd9335ff052 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 31 Jul 2024 11:45:51 -0700 Subject: [PATCH 42/89] cache regression tests --- ci/regression.sh.in | 150 +++++++++++++++++++++++++------------------- hw/rtl/VX_config.vh | 6 +- 2 files changed, 87 insertions(+), 69 deletions(-) diff --git a/ci/regression.sh.in b/ci/regression.sh.in index dfce036a3..f405f8b1b 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -122,32 +122,54 @@ opencl() echo "opencl tests done!" } -test_csv_trace() +cache() { - # test CSV trace generation - make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null - make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-simx-32im > run_simx.log - make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log - ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv - ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv - diff trace_rtlsim.csv trace_simx.csv - # clean build - make -C sim/simx clean - make -C sim/rtlsim clean -} + echo "begin cache tests..." -debug() -{ - echo "begin debugging tests..." + # disable local memory + CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1 + CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1 - test_csv_trace + # disable L1 cache + CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" - CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" - ./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1" + # reduce l1 line size + CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=rtlsim --app=io_addr + CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=simx --app=io_addr + CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx - echo "debugging tests done!" + # test cache ways + CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx + + # test cache banking + CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx + CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx + + # test writeback + CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress + CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress + CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress + CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress + + # cache clustering + CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2 + + # L2/L3 + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1" + + echo "begin cache tests..." } config1() @@ -163,10 +185,12 @@ config1() ./ci/blackbox.sh --driver=simx --warps=8 --threads=16 --app=diverge # cores clustering - ./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1" + ./ci/blackbox.sh --driver=rtlsim --cores=4 --app=diverge --args="-n1" + ./ci/blackbox.sh --driver=simx --cores=4 --app=diverge --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1" + ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1" # issue width CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge @@ -192,16 +216,6 @@ config1() CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx - # L2/L3 - CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1" - CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1" - ./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1" - - # multiple L1 caches per socket - CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=8 --warps=1 --threads=2 - echo "configuration-1 tests done!" } @@ -232,37 +246,9 @@ config2() # disabling ZICOND extension CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo - # disable local memory - CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1 - CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1 - # test AXI bus AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=demo - # disable L1 cache - CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - - # reduce l1 line size - CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=rtlsim --app=io_addr - CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=simx --app=io_addr - CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx - - # test cache ways - CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx - - # test cache banking - CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx - CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx - # test 128-bit MEM block CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=demo @@ -275,12 +261,40 @@ config2() echo "configuration-2 tests done!" } +test_csv_trace() +{ + # test CSV trace generation + make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null + make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null + make -C tests/riscv/isa run-simx-32im > run_simx.log + make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log + ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv + ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv + diff trace_rtlsim.csv trace_simx.csv + # clean build + make -C sim/simx clean + make -C sim/rtlsim clean +} + +debug() +{ + echo "begin debugging tests..." + + test_csv_trace + + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" + ./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1" + + echo "debugging tests done!" +} + stress() { echo "begin stress tests..." # test verilator reset values - CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood + CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache echo "stress tests done!" @@ -299,11 +313,9 @@ synthesis() show_usage() { echo "Vortex Regression Test" - echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]" + echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]" } -start=$SECONDS - declare -a tests=() clean=0 @@ -327,6 +339,9 @@ while [ "$1" != "" ]; do --opencl ) tests+=("opencl") ;; + --cache ) + tests+=("cache") + ;; --config1 ) tests+=("config1") ;; @@ -349,6 +364,7 @@ while [ "$1" != "" ]; do tests+=("kernel") tests+=("regression") tests+=("opencl") + tests+=("cache") tests+=("config1") tests+=("config2") tests+=("debug") @@ -372,6 +388,8 @@ then make -s fi +start=$SECONDS + for test in "${tests[@]}"; do $test done diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index f43eb2581..804715aad 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -537,7 +537,7 @@ // Enable Cache Writeback `ifndef DCACHE_WRITEBACK -`define DCACHE_WRITEBACK 1 +`define DCACHE_WRITEBACK 0 `endif // LMEM Configurable Knobs //////////////////////////////////////////////////// @@ -601,7 +601,7 @@ // Enable Cache Writeback `ifndef L2_WRITEBACK -`define L2_WRITEBACK 1 +`define L2_WRITEBACK 0 `endif // L3cache Configurable Knobs ///////////////////////////////////////////////// @@ -647,7 +647,7 @@ // Enable Cache Writeback `ifndef L3_WRITEBACK -`define L3_WRITEBACK 1 +`define L3_WRITEBACK 0 `endif // ISA Extensions ///////////////////////////////////////////////////////////// From 81251b1af854fec341ef7188a243267e013140a9 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 31 Jul 2024 13:55:44 -0700 Subject: [PATCH 43/89] minor update --- hw/rtl/VX_platform.vh | 8 ++--- hw/rtl/libs/VX_elastic_buffer.sv | 4 +-- hw/rtl/libs/VX_pipe_buffer.sv | 58 ++++++++++++++++++++------------ 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 73a6edd78..59f5ef0f5 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -238,11 +238,11 @@ `define RESET_RELAY(dst, src) \ `RESET_RELAY_EX (dst, src, 1, 0) -// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2 -`define TO_OUT_BUF_SIZE(out_reg) `MIN(out_reg, 2) +// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2, 5 -> 2 +`define TO_OUT_BUF_SIZE(s) `MIN(s, 2) -// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2 -`define TO_OUT_BUF_REG(out_reg) ((out_reg & 1) + ((out_reg >> 2) << 1)) +// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 > 3 +`define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2)) `define REPEAT(n,f,s) `_REPEAT_``n(f,s) `define _REPEAT_0(f,s) diff --git a/hw/rtl/libs/VX_elastic_buffer.sv b/hw/rtl/libs/VX_elastic_buffer.sv index 01464840c..ee6f31b58 100644 --- a/hw/rtl/libs/VX_elastic_buffer.sv +++ b/hw/rtl/libs/VX_elastic_buffer.sv @@ -103,9 +103,9 @@ module VX_elastic_buffer #( assign ready_in = ~full; - VX_elastic_buffer #( + VX_pipe_buffer #( .DATAW (DATAW), - .SIZE ((OUT_REG == 2) ? 1 : 0) + .DEPTH ((OUT_REG > 0) ? (OUT_REG-1) : 0) ) out_buf ( .clk (clk), .reset (reset), diff --git a/hw/rtl/libs/VX_pipe_buffer.sv b/hw/rtl/libs/VX_pipe_buffer.sv index 75a4579a0..167235c17 100644 --- a/hw/rtl/libs/VX_pipe_buffer.sv +++ b/hw/rtl/libs/VX_pipe_buffer.sv @@ -1,11 +1,11 @@ // Copyright 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,39 +24,53 @@ `TRACING_OFF module VX_pipe_buffer #( - parameter DATAW = 1, - parameter PASSTHRU = 0 -) ( + parameter DATAW = 1, + parameter DEPTH = 1 +) ( input wire clk, input wire reset, input wire valid_in, - output wire ready_in, + output wire ready_in, input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out, input wire ready_out, output wire valid_out -); - if (PASSTHRU != 0) begin +); + if (DEPTH == 0) begin `UNUSED_VAR (clk) `UNUSED_VAR (reset) assign ready_in = ready_out; - assign valid_out = valid_in; + assign valid_out = valid_in; assign data_out = data_in; end else begin - wire stall = valid_out && ~ready_out; + wire [DEPTH:0] valid; + `IGNORE_UNOPTFLAT_BEGIN + wire [DEPTH:0] ready; + `IGNORE_UNOPTFLAT_END + wire [DEPTH:0][DATAW-1:0] data; - VX_pipe_register #( - .DATAW (1 + DATAW), - .RESETW (1) - ) pipe_register ( - .clk (clk), - .reset (reset), - .enable (~stall), - .data_in ({valid_in, data_in}), - .data_out ({valid_out, data_out}) - ); + assign valid[0] = valid_in; + assign data[0] = data_in; + assign ready_in = ready[0]; + + for (genvar i = 0; i < DEPTH; ++i) begin + assign ready[i] = (ready[i+1] || ~valid[i+1]); + VX_pipe_register #( + .DATAW (1 + DATAW), + .RESETW (1) + ) pipe_register ( + .clk (clk), + .reset (reset), + .enable (ready[i]), + .data_in ({valid[i], data[i]}), + .data_out ({valid[i+1], data[i+1]}) + ); + end + + assign valid_out = valid[DEPTH]; + assign data_out = data[DEPTH]; + assign ready[DEPTH] = ready_out; - assign ready_in = ~stall; end endmodule From 0a3035e6a7e0d15a87686cf88d9d81546393917e Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 31 Jul 2024 14:51:34 -0700 Subject: [PATCH 44/89] minor update --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8e9bbeae9..f49dd42bf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -117,7 +117,7 @@ jobs: strategy: fail-fast: false matrix: - name: [regression, opencl, config1, config2, debug, stress] + name: [regression, opencl, cache, config1, config2, debug, stress] xlen: [32, 64] steps: From e53b295eea1164172843d466f9b921a801735afd Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 31 Jul 2024 20:53:40 -0700 Subject: [PATCH 45/89] writeback cache fixes --- hw/rtl/cache/VX_bank_flush.sv | 21 ++++++++++++++++----- hw/rtl/cache/VX_cache_bank.sv | 10 ++++++++-- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/hw/rtl/cache/VX_bank_flush.sv b/hw/rtl/cache/VX_bank_flush.sv index 4b0b551f4..6c02c1e13 100644 --- a/hw/rtl/cache/VX_bank_flush.sv +++ b/hw/rtl/cache/VX_bank_flush.sv @@ -14,6 +14,7 @@ `include "VX_cache_define.vh" module VX_bank_flush #( + parameter BANK_ID = 0, // Size of cache in bytes parameter CACHE_SIZE = 1024, // Size of line inside a bank in bytes @@ -34,16 +35,18 @@ module VX_bank_flush #( output wire [`CS_LINE_SEL_BITS-1:0] flush_line, output wire [NUM_WAYS-1:0] flush_way, input wire flush_ready, - input wire mshr_empty + input wire mshr_empty, + input wire bank_empty ); // ways interation is only needed when eviction is enabled localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0); localparam STATE_IDLE = 0; localparam STATE_INIT = 1; - localparam STATE_WAIT = 2; + localparam STATE_WAIT1 = 2; localparam STATE_FLUSH = 3; - localparam STATE_DONE = 4; + localparam STATE_WAIT2 = 4; + localparam STATE_DONE = 5; reg [2:0] state_r, state_n; @@ -54,7 +57,7 @@ module VX_bank_flush #( case (state_r) STATE_IDLE: begin if (flush_begin) begin - state_n = STATE_WAIT; + state_n = STATE_WAIT1; end end STATE_INIT: begin @@ -62,7 +65,7 @@ module VX_bank_flush #( state_n = STATE_IDLE; end end - STATE_WAIT: begin + STATE_WAIT1: begin // wait for pending requests to complete if (mshr_empty) begin state_n = STATE_FLUSH; @@ -70,6 +73,14 @@ module VX_bank_flush #( end STATE_FLUSH: begin if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin + state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2; + end + end + STATE_WAIT2: begin + // ensure the bank is empty before notifying the cache flush unit, + // because the flush request to lower caches only goes through bank0 + // and it is important that request gets send out last. + if (bank_empty) begin state_n = STATE_DONE; end end diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index 4ed523e04..b2edbf918 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -120,6 +120,7 @@ module VX_cache_bank #( wire crsp_queue_stall; wire mshr_alm_full; + wire mreq_queue_empty; wire mreq_queue_alm_full; wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr; @@ -168,8 +169,12 @@ module VX_cache_bank #( wire [NUM_WAYS-1:0] flush_way; wire flush_ready; + // ensure we have no pending memory request in the bank + wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty; + // flush unit VX_bank_flush #( + .BANK_ID (BANK_ID), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), @@ -185,7 +190,8 @@ module VX_cache_bank #( .flush_line (flush_sel), .flush_way (flush_way), .flush_ready (flush_ready), - .mshr_empty (mshr_empty) + .mshr_empty (mshr_empty), + .bank_empty (no_pending_req) ); wire rdw_hazard1_sel; @@ -585,7 +591,7 @@ module VX_cache_bank #( // schedule memory request - wire mreq_queue_push, mreq_queue_pop, mreq_queue_empty; + wire mreq_queue_push, mreq_queue_pop; wire [`CS_LINE_WIDTH-1:0] mreq_queue_data; wire [LINE_SIZE-1:0] mreq_queue_byteen; wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr; From 29c5a28273f27d308a69dae8666bc042bb08d3a3 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 00:36:10 -0700 Subject: [PATCH 46/89] minor update --- hw/rtl/core/VX_operands.sv | 81 ++++++++++++++++----------------- hw/rtl/libs/VX_onehot_mux.sv | 87 +++++++++++++++++++++++++++++++++++- runtime/stub/utils.cpp | 4 +- 3 files changed, 127 insertions(+), 45 deletions(-) diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 3773fbfca..f22c540fe 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -44,7 +44,8 @@ module VX_operands import VX_gpu_pkg::*; #( localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS; localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH; - localparam DATAW = META_DATAW + 3 * `NUM_THREADS * `XLEN; + localparam REGS_DATAW = NUM_SRC_REGS * `NUM_THREADS * `XLEN; + localparam DATAW = META_DATAW + REGS_DATAW; localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS); localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS; localparam XLEN_SIZE = `XLEN / 8; @@ -69,10 +70,12 @@ module VX_operands import VX_gpu_pkg::*; #( wire pipe_in_ready; reg pipe_out_valid; wire pipe_out_ready; - reg [META_DATAW-1:0] pipe_out_data; + reg [META_DATAW-1:0] pipe_out_data, pipe_out_data_n; reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data, src_data_n; - reg [NUM_SRC_REGS-1:0] data_fetched; + wire reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n2; + + reg [NUM_SRC_REGS-1:0] data_fetched, data_fetched_n; reg has_collision, has_collision_n; wire stg_in_valid, stg_in_ready; @@ -134,6 +137,15 @@ module VX_operands import VX_gpu_pkg::*; #( end end + always @(*) begin + data_fetched_n = data_fetched; + if (scoreboard_if.ready) begin + data_fetched_n = '0; + end else begin + data_fetched_n = data_fetched | req_in_ready; + end + end + always @(*) begin src_data_n = src_data; for (integer b = 0; b < NUM_BANKS; ++b) begin @@ -143,6 +155,18 @@ module VX_operands import VX_gpu_pkg::*; #( end end + assign pipe_out_data_n = { + scoreboard_if.data.wis, + scoreboard_if.data.tmask, + scoreboard_if.data.PC, + scoreboard_if.data.wb, + scoreboard_if.data.ex_type, + scoreboard_if.data.op_type, + scoreboard_if.data.op_args, + scoreboard_if.data.rd, + scoreboard_if.data.uuid + }; + wire pipe_stall = pipe_out_valid && ~pipe_out_ready; assign pipe_in_ready = ~pipe_stall; @@ -150,45 +174,18 @@ module VX_operands import VX_gpu_pkg::*; #( wire stg_in_fire = stg_in_valid && stg_in_ready; - always @(posedge clk) begin - if (reset) begin - pipe_out_valid <= 0; - gpr_rd_valid <= '0; - data_fetched <= '0; - src_data <= '0; - end else begin - if (~pipe_stall) begin - pipe_out_valid <= scoreboard_if.valid; - gpr_rd_valid <= gpr_rd_valid_n; - if (scoreboard_if.ready) begin - data_fetched <= '0; - end else begin - data_fetched <= data_fetched | req_in_ready; - end - if (stg_in_fire) begin - src_data <= '0; - end else begin - src_data <= src_data_n; - end - end - end - if (~pipe_stall) begin - pipe_out_data <= { - scoreboard_if.data.wis, - scoreboard_if.data.tmask, - scoreboard_if.data.PC, - scoreboard_if.data.wb, - scoreboard_if.data.ex_type, - scoreboard_if.data.op_type, - scoreboard_if.data.op_args, - scoreboard_if.data.rd, - scoreboard_if.data.uuid - }; - has_collision <= has_collision_n; - gpr_rd_addr <= gpr_rd_addr_n; - gpr_rd_req_idx <= gpr_rd_req_idx_n; - end - end + assign src_data_n2 = stg_in_fire ? '0 : src_data_n; + + VX_pipe_register #( + .DATAW (1 + NUM_BANKS + NUM_SRC_REGS + REGS_DATAW + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)), + .RESETW (1 + NUM_BANKS + NUM_SRC_REGS + REGS_DATAW) + ) pipe_reg ( + .clk (clk), + .reset (reset), + .enable (~pipe_stall), + .data_in ({scoreboard_if.valid, gpr_rd_valid_n, data_fetched_n, src_data_n2, pipe_out_data_n, has_collision_n, gpr_rd_addr_n, gpr_rd_req_idx_n}), + .data_out ({pipe_out_valid, gpr_rd_valid, data_fetched, src_data, pipe_out_data, has_collision, gpr_rd_addr, gpr_rd_req_idx}) + ); assign pipe_out_ready = stg_in_ready; assign stg_in_valid = pipe_out_valid && ~has_collision; diff --git a/hw/rtl/libs/VX_onehot_mux.sv b/hw/rtl/libs/VX_onehot_mux.sv index 8d9b87c8e..cc0fffaa6 100644 --- a/hw/rtl/libs/VX_onehot_mux.sv +++ b/hw/rtl/libs/VX_onehot_mux.sv @@ -17,7 +17,8 @@ module VX_onehot_mux #( parameter DATAW = 1, parameter N = 1, - parameter MODEL = 1 + parameter MODEL = 1, + parameter LUT_OPT = 0 ) ( input wire [N-1:0][DATAW-1:0] data_in, input wire [N-1:0] sel_in, @@ -26,6 +27,90 @@ module VX_onehot_mux #( if (N == 1) begin `UNUSED_VAR (sel_in) assign data_out = data_in; + end else if (LUT_OPT && N == 2) begin + `UNUSED_VAR (sel_in) + assign data_out = sel_in[0] ? data_in[0] : data_in[1]; + end else if (LUT_OPT && N == 3) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 3'b001: data_out_r = data_in[0]; + 3'b010: data_out_r = data_in[1]; + 3'b100: data_out_r = data_in[2]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 4) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 4'b0001: data_out_r = data_in[0]; + 4'b0010: data_out_r = data_in[1]; + 4'b0100: data_out_r = data_in[2]; + 4'b1000: data_out_r = data_in[3]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 5) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 5'b00001: data_out_r = data_in[0]; + 5'b00010: data_out_r = data_in[1]; + 5'b00100: data_out_r = data_in[2]; + 5'b01000: data_out_r = data_in[3]; + 5'b10000: data_out_r = data_in[4]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 6) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 6'b000001: data_out_r = data_in[0]; + 6'b000010: data_out_r = data_in[1]; + 6'b000100: data_out_r = data_in[2]; + 6'b001000: data_out_r = data_in[3]; + 6'b010000: data_out_r = data_in[4]; + 6'b100000: data_out_r = data_in[5]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 7) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 7'b0000001: data_out_r = data_in[0]; + 7'b0000010: data_out_r = data_in[1]; + 7'b0000100: data_out_r = data_in[2]; + 7'b0001000: data_out_r = data_in[3]; + 7'b0010000: data_out_r = data_in[4]; + 7'b0100000: data_out_r = data_in[5]; + 7'b1000000: data_out_r = data_in[6]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; + end else if (LUT_OPT && N == 8) begin + reg [DATAW-1:0] data_out_r; + always @(*) begin + case (sel_in) + 8'b00000001: data_out_r = data_in[0]; + 8'b00000010: data_out_r = data_in[1]; + 8'b00000100: data_out_r = data_in[2]; + 8'b00001000: data_out_r = data_in[3]; + 8'b00010000: data_out_r = data_in[4]; + 8'b00100000: data_out_r = data_in[5]; + 8'b01000000: data_out_r = data_in[6]; + 8'b10000000: data_out_r = data_in[7]; + default: data_out_r = 'x; + endcase + end + assign data_out = data_out_r; end else if (MODEL == 1) begin wire [N-1:0][DATAW-1:0] mask; for (genvar i = 0; i < N; ++i) begin diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp index eea7691f5..9826db711 100644 --- a/runtime/stub/utils.cpp +++ b/runtime/stub/utils.cpp @@ -314,7 +314,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (num_cores > 1) { uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core; int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core); - fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n" + fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n" , core_id , scrb_stalls_per_core , scrb_percent_per_core @@ -559,7 +559,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent); fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent); fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent); - fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n" + fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n" , scrb_stalls , scrb_percent , calcAvgPercent(scrb_alu, scrb_total) From 3075c1737bf86dfb4a1375ebef84a46492990050 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 15:12:10 -0700 Subject: [PATCH 47/89] fixed bug in VX_onehot_encoder.sv (see issue #126) --- hw/rtl/libs/VX_onehot_encoder.sv | 50 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/hw/rtl/libs/VX_onehot_encoder.sv b/hw/rtl/libs/VX_onehot_encoder.sv index 92c7d1ea1..8f7ada257 100644 --- a/hw/rtl/libs/VX_onehot_encoder.sv +++ b/hw/rtl/libs/VX_onehot_encoder.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,13 +23,13 @@ module VX_onehot_encoder #( parameter MODEL = 1, parameter LN = `LOG2UP(N) ) ( - input wire [N-1:0] data_in, + input wire [N-1:0] data_in, output wire [LN-1:0] data_out, output wire valid_out -); +); if (N == 1) begin - assign data_out = data_in; + assign data_out = 0; assign valid_out = data_in; end else if (N == 2) begin @@ -37,43 +37,43 @@ module VX_onehot_encoder #( assign data_out = data_in[!REVERSE]; assign valid_out = (| data_in); - end else if (MODEL == 1) begin - localparam M = 1 << LN; - `IGNORE_UNOPTFLAT_BEGIN + end else if (MODEL == 1) begin + localparam M = 1 << LN; + `IGNORE_UNOPTFLAT_BEGIN wire [LN-1:0][M-1:0] addr; wire [LN:0][M-1:0] v; `IGNORE_UNOPTFLAT_END - + // base case, also handle padding for non-power of two inputs assign v[0] = REVERSE ? (M'(data_in) << (M - N)) : M'(data_in); - + for (genvar lvl = 1; lvl < (LN+1); ++lvl) begin localparam SN = 1 << (LN - lvl); localparam SI = M / SN; localparam SW = lvl; - + for (genvar s = 0; s < SN; ++s) begin `IGNORE_UNOPTFLAT_BEGIN wire [1:0] vs = {v[lvl-1][s*SI+(SI>>1)], v[lvl-1][s*SI]}; `IGNORE_UNOPTFLAT_END - + assign v[lvl][s*SI] = (| vs); if (lvl == 1) begin - assign addr[lvl-1][s*SI +: SW] = vs[!REVERSE]; + assign addr[lvl-1][s*SI +: SW] = vs[!REVERSE]; end else begin - assign addr[lvl-1][s*SI +: SW] = { + assign addr[lvl-1][s*SI +: SW] = { vs[!REVERSE], addr[lvl-2][s*SI +: SW-1] | addr[lvl-2][s*SI+(SI>>1) +: SW-1] }; - end - end - end - + end + end + end + assign data_out = addr[LN-1][LN-1:0]; assign valid_out = v[LN][0]; - end else if (MODEL == 2 && REVERSE == 0) begin + end else if (MODEL == 2 && REVERSE == 0) begin for (genvar j = 0; j < LN; ++j) begin wire [N-1:0] mask; @@ -90,19 +90,19 @@ module VX_onehot_encoder #( reg [LN-1:0] index_r; if (REVERSE != 0) begin - always @(*) begin - index_r = 'x; + always @(*) begin + index_r = 'x; for (integer i = N-1; i >= 0; --i) begin - if (data_in[i]) begin + if (data_in[i]) begin index_r = LN'(N-1-i); end end end end else begin - always @(*) begin - index_r = 'x; + always @(*) begin + index_r = 'x; for (integer i = 0; i < N; ++i) begin - if (data_in[i]) begin + if (data_in[i]) begin index_r = LN'(i); end end From 16c209ac0cee55487eb622864c2b0c768eaa2572 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 16:23:36 -0700 Subject: [PATCH 48/89] fixed operand collector critical path --- hw/rtl/core/VX_operands.sv | 127 +++++++++-------- hw/rtl/libs/VX_dp_ram.sv | 269 +++++++++++------------------------ hw/rtl/libs/VX_dp_ram_rst.sv | 79 +++++----- hw/syn/xilinx/README | 1 + 4 files changed, 195 insertions(+), 281 deletions(-) diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index f22c540fe..054064ec8 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -54,31 +54,28 @@ module VX_operands import VX_gpu_pkg::*; #( `UNUSED_VAR (writeback_if.data.sop) wire [NUM_SRC_REGS-1:0] src_valid; - wire [NUM_SRC_REGS-1:0] req_in_valid; - wire [NUM_SRC_REGS-1:0] req_in_ready; + wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready; wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data; wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx; - wire [NUM_BANKS-1:0] gpr_rd_valid_n, gpr_rd_ready; - reg [NUM_BANKS-1:0] gpr_rd_valid; - wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr_n; - reg [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr; - wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data; - wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx_n; - reg [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx; + wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready; + wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2; + wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1; + wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2; + wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2; - wire pipe_in_ready; - reg pipe_out_valid; - wire pipe_out_ready; - reg [META_DATAW-1:0] pipe_out_data, pipe_out_data_n; + wire pipe_valid_st1, pipe_ready_st1; + wire pipe_valid_st2, pipe_ready_st2; + wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2; - reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data, src_data_n; - wire reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n2; - - reg [NUM_SRC_REGS-1:0] data_fetched, data_fetched_n; - reg has_collision, has_collision_n; + reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n; + wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2; - wire stg_in_valid, stg_in_ready; + reg [NUM_SRC_REGS-1:0] data_fetched_n; + wire [NUM_SRC_REGS-1:0] data_fetched_st1; + + reg has_collision_n; + wire has_collision_st1; wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, @@ -98,7 +95,7 @@ module VX_operands import VX_gpu_pkg::*; #( end for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin - assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched[i]; + assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i]; end assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid; @@ -118,13 +115,20 @@ module VX_operands import VX_gpu_pkg::*; #( .data_in (req_in_data), .sel_in (req_bank_idx), .ready_in (req_in_ready), - .valid_out (gpr_rd_valid_n), - .data_out (gpr_rd_addr_n), - .sel_out (gpr_rd_req_idx_n), + .valid_out (gpr_rd_valid), + .data_out (gpr_rd_addr), + .sel_out (gpr_rd_req_idx), .ready_out (gpr_rd_ready) ); - assign gpr_rd_ready = {NUM_BANKS{stg_in_ready}}; + wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1; + + assign gpr_rd_ready = {NUM_BANKS{pipe_in_ready}}; + + assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n; + + wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1; + wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2; always @(*) begin has_collision_n = 0; @@ -138,24 +142,15 @@ module VX_operands import VX_gpu_pkg::*; #( end always @(*) begin - data_fetched_n = data_fetched; + data_fetched_n = data_fetched_st1; if (scoreboard_if.ready) begin data_fetched_n = '0; end else begin - data_fetched_n = data_fetched | req_in_ready; + data_fetched_n = data_fetched_st1 | req_in_ready; end end - always @(*) begin - src_data_n = src_data; - for (integer b = 0; b < NUM_BANKS; ++b) begin - if (gpr_rd_valid[b]) begin - src_data_n[gpr_rd_req_idx[b]] = gpr_rd_data[b]; - end - end - end - - assign pipe_out_data_n = { + assign pipe_data = { scoreboard_if.data.wis, scoreboard_if.data.tmask, scoreboard_if.data.PC, @@ -167,28 +162,42 @@ module VX_operands import VX_gpu_pkg::*; #( scoreboard_if.data.uuid }; - wire pipe_stall = pipe_out_valid && ~pipe_out_ready; - assign pipe_in_ready = ~pipe_stall; - - assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n; - - wire stg_in_fire = stg_in_valid && stg_in_ready; - - assign src_data_n2 = stg_in_fire ? '0 : src_data_n; - VX_pipe_register #( - .DATAW (1 + NUM_BANKS + NUM_SRC_REGS + REGS_DATAW + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)), - .RESETW (1 + NUM_BANKS + NUM_SRC_REGS + REGS_DATAW) - ) pipe_reg ( + .DATAW (1 + NUM_BANKS + NUM_SRC_REGS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)), + .RESETW (1 + NUM_BANKS + NUM_SRC_REGS) + ) pipe_reg1 ( .clk (clk), .reset (reset), - .enable (~pipe_stall), - .data_in ({scoreboard_if.valid, gpr_rd_valid_n, data_fetched_n, src_data_n2, pipe_out_data_n, has_collision_n, gpr_rd_addr_n, gpr_rd_req_idx_n}), - .data_out ({pipe_out_valid, gpr_rd_valid, data_fetched, src_data, pipe_out_data, has_collision, gpr_rd_addr, gpr_rd_req_idx}) + .enable (pipe_in_ready), + .data_in ({scoreboard_if.valid, gpr_rd_valid, data_fetched_n, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}), + .data_out ({pipe_valid_st1, gpr_rd_valid_st1, data_fetched_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}) ); - assign pipe_out_ready = stg_in_ready; - assign stg_in_valid = pipe_out_valid && ~has_collision; + assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2; + + assign src_data_st1 = pipe_fire_st2 ? '0 : src_data_n; + + wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1; + + VX_pipe_register #( + .DATAW (1 + NUM_BANKS + REGS_DATAW + (NUM_BANKS * `XLEN * `NUM_THREADS) + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH), + .RESETW (1 + NUM_BANKS + REGS_DATAW) + ) pipe_reg2 ( + .clk (clk), + .reset (reset), + .enable (pipe_ready_st1), + .data_in ({pipe_valid2_st1, gpr_rd_valid_st1, src_data_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}), + .data_out ({pipe_valid_st2, gpr_rd_valid_st2, src_data_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2}) + ); + + always @(*) begin + src_data_n = src_data_st2; + for (integer b = 0; b < NUM_BANKS; ++b) begin + if (gpr_rd_valid_st2[b]) begin + src_data_n[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b]; + end + end + end VX_elastic_buffer #( .DATAW (DATAW), @@ -198,10 +207,10 @@ module VX_operands import VX_gpu_pkg::*; #( ) out_buffer ( .clk (clk), .reset (reset), - .valid_in (stg_in_valid), - .ready_in (stg_in_ready), + .valid_in (pipe_valid_st2), + .ready_in (pipe_ready_st2), .data_in ({ - pipe_out_data, + pipe_data_st2, src_data_n[0], src_data_n[1], src_data_n[2] @@ -278,13 +287,13 @@ module VX_operands import VX_gpu_pkg::*; #( `ifdef GPR_RESET .reset (reset), `endif - .read (1'b1), + .read (pipe_fire_st1), .wren (wren), .write (gpr_wr_enabled), .waddr (gpr_wr_addr), .wdata (writeback_if.data.data), - .raddr (gpr_rd_addr[b]), - .rdata (gpr_rd_data[b]) + .raddr (gpr_rd_addr_st1[b]), + .rdata (gpr_rd_data_st1[b]) ); end diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index a2e323772..364a45ebf 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -58,42 +58,37 @@ module VX_dp_ram #( `RUNTIME_ASSERT(~write || (| wren), ("invalid write enable mask")); end + wire [DATAW-1:0] rdata_w; + `ifdef SYNTHESIS if (WRENW > 1) begin `ifdef QUARTUS if (LUTRAM != 0) begin - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - `USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end - end - if (read) begin - rdata_r <= ram[raddr]; + `USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) + ram[waddr][i] <= wdata[i * WSELW +: WSELW]; end end - assign rdata = rdata_r; - end else begin - `USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end else begin - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; + if (NO_RWCHECK != 0) begin + `NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) + ram[waddr][i] <= wdata[i * WSELW +: WSELW]; + end + end + end + assign rdata_w = ram[raddr]; + end else begin reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin @@ -103,37 +98,8 @@ module VX_dp_ram #( ram[waddr][i] <= wdata[i * WSELW +: WSELW]; end end - if (read) begin - rdata_r <= ram[raddr]; - end - end - assign rdata = rdata_r; - end else begin - if (NO_RWCHECK != 0) begin - `NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; - end else begin - reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end end `else @@ -141,35 +107,18 @@ module VX_dp_ram #( if (LUTRAM != 0) begin `USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end - end - if (read) begin - rdata_r <= ram[raddr]; + always @(posedge clk) begin + if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) + ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; end end - assign rdata = rdata_r; - end else begin - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end else begin - if (OUT_REG != 0) begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - reg [DATAW-1:0] rdata_r; + if (NO_RWCHECK != 0) begin + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -178,37 +127,20 @@ module VX_dp_ram #( ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; end end - if (read) begin - rdata_r <= ram[raddr]; - end end - assign rdata = rdata_r; + assign rdata_w = ram[raddr]; end else begin - if (NO_RWCHECK != 0) begin - `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end + reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) + ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; end end - assign rdata = ram[raddr]; - end else begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end end `endif @@ -217,64 +149,36 @@ module VX_dp_ram #( if (LUTRAM != 0) begin `USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; - end - if (read) begin - rdata_r <= ram[raddr]; - end + always @(posedge clk) begin + if (write) begin + ram[waddr] <= wdata; end - assign rdata = rdata_r; - end else begin - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end else begin - if (OUT_REG != 0) begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - reg [DATAW-1:0] rdata_r; + if (NO_RWCHECK != 0) begin + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin ram[waddr] <= wdata; end - if (read) begin - rdata_r <= ram[raddr]; - end end - assign rdata = rdata_r; + assign rdata_w = ram[raddr]; end else begin - if (NO_RWCHECK != 0) begin - `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; - end + reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + ram[waddr] <= wdata; end - assign rdata = ram[raddr]; - end else begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; - end - end - assign rdata = ram[raddr]; end + assign rdata_w = ram[raddr]; end end end `else - // RAM emulation + // simulation reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; `RAM_INITIALIZATION @@ -283,42 +187,43 @@ module VX_dp_ram #( assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW]; end - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin - if (write) begin - ram[waddr] <= ram_n; - end - if (read) begin - rdata_r <= ram[raddr]; - end + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_waddr; + reg prev_write; + + always @(posedge clk) begin + if (write) begin + ram[waddr] <= ram_n; end - assign rdata = rdata_r; + prev_write <= write; + prev_data <= ram[waddr]; + prev_waddr <= waddr; + end + + if (LUTRAM || !NO_RWCHECK) begin + `UNUSED_VAR (prev_write) + `UNUSED_VAR (prev_data) + `UNUSED_VAR (prev_waddr) + assign rdata_w = ram[raddr]; end else begin - reg [DATAW-1:0] prev_data; - reg [ADDRW-1:0] prev_waddr; - reg prev_write; - always @(posedge clk) begin - if (write) begin - ram[waddr] <= ram_n; - end - prev_write <= write; - prev_data <= ram[waddr]; - prev_waddr <= waddr; - end - if (LUTRAM || !NO_RWCHECK) begin - `UNUSED_VAR (prev_write) - `UNUSED_VAR (prev_data) - `UNUSED_VAR (prev_waddr) - assign rdata = ram[raddr]; - end else begin - assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; - if (RW_ASSERT) begin - `RUNTIME_ASSERT(~read || (rdata == ram[raddr]), ("read after write hazard")); - end + assign rdata_w = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; + if (RW_ASSERT) begin + `RUNTIME_ASSERT(~read || (rdata_w == ram[raddr]), ("read after write hazard")); end end `endif + if (OUT_REG != 0) begin + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read) begin + rdata_r <= rdata_w; + end + end + assign rdata = rdata_r; + end else begin + assign rdata = rdata_w; + end + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_dp_ram_rst.sv b/hw/rtl/libs/VX_dp_ram_rst.sv index e7598dbe6..6778b45d3 100644 --- a/hw/rtl/libs/VX_dp_ram_rst.sv +++ b/hw/rtl/libs/VX_dp_ram_rst.sv @@ -62,53 +62,52 @@ module VX_dp_ram_rst #( assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW]; end + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_waddr; + reg prev_write; + + always @(posedge clk) begin + if (reset) begin + for (integer i = 0; i < SIZE; ++i) begin + ram[i] <= DATAW'(INIT_VALUE); + end + prev_write <= 0; + prev_data <= '0; + prev_waddr <= '0; + end else begin + if (write) begin + ram[waddr] <= ram_n; + end + prev_write <= (| wren); + prev_data <= ram[waddr]; + prev_waddr <= waddr; + end + end + + wire [DATAW-1:0] rdata_w; + + if (LUTRAM || !NO_RWCHECK) begin + `UNUSED_VAR (prev_write) + `UNUSED_VAR (prev_data) + `UNUSED_VAR (prev_waddr) + assign rdata_w = ram[raddr]; + end else begin + assign rdata_w = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; + if (RW_ASSERT) begin + `RUNTIME_ASSERT(~read || (rdata_w == ram[raddr]), ("read after write hazard")); + end + end + if (OUT_REG != 0) begin reg [DATAW-1:0] rdata_r; always @(posedge clk) begin - if (reset) begin - for (integer i = 0; i < SIZE; ++i) begin - ram[i] <= DATAW'(INIT_VALUE); - end - rdata_r <= '0; - end else begin - if (write) begin - ram[waddr] <= ram_n; - end - if (read) begin - rdata_r <= ram[raddr]; - end + if (read) begin + rdata_r <= rdata_w; end end assign rdata = rdata_r; end else begin - reg [DATAW-1:0] prev_data; - reg [ADDRW-1:0] prev_waddr; - reg prev_write; - always @(posedge clk) begin - if (reset) begin - for (integer i = 0; i < SIZE; ++i) begin - ram[i] <= DATAW'(INIT_VALUE); - end - prev_write <= 0; - prev_data <= '0; - prev_waddr <= '0; - end else begin - if (write) begin - ram[waddr] <= ram_n; - end - prev_write <= (| wren); - prev_data <= ram[waddr]; - prev_waddr <= waddr; - end - end - if (LUTRAM || !NO_RWCHECK) begin - `UNUSED_VAR (prev_write) - `UNUSED_VAR (prev_data) - `UNUSED_VAR (prev_waddr) - assign rdata = ram[raddr]; - end else begin - assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; - end + assign rdata = rdata_w; end endmodule diff --git a/hw/syn/xilinx/README b/hw/syn/xilinx/README index b2218e65e..563c4c17e 100644 --- a/hw/syn/xilinx/README +++ b/hw/syn/xilinx/README @@ -45,6 +45,7 @@ FPGA_BIN_DIR= XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xr # build report logs /bin/vortex_afu.xclbin.info +/_x/logs/link/vivado.log # search for keyword "Very high fanout" /_x/reports/link/link/imp/impl_1_full_util_routed.rpt /_x/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt # search for keyword "VIOLATED" /_x/logs/link/syn/ulp_vortex_afu_1_0_synth_1_runme.log From c1b8ecfd1a637ad42590f2aa42d2a7eddc15b75c Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 16:39:40 -0700 Subject: [PATCH 49/89] block ram reset refactoring --- hw/rtl/cache/VX_cache_mshr.sv | 3 +- hw/rtl/core/VX_fetch.sv | 3 +- hw/rtl/core/VX_ipdom_stack.sv | 3 +- hw/rtl/core/VX_operands.sv | 6 +- hw/rtl/libs/VX_dp_ram.sv | 25 ++++++-- hw/rtl/libs/VX_dp_ram_rst.sv | 114 --------------------------------- hw/rtl/libs/VX_fifo_queue.sv | 8 ++- hw/rtl/libs/VX_index_buffer.sv | 21 +++--- 8 files changed, 43 insertions(+), 140 deletions(-) delete mode 100644 hw/rtl/libs/VX_dp_ram_rst.sv diff --git a/hw/rtl/cache/VX_cache_mshr.sv b/hw/rtl/cache/VX_cache_mshr.sv index b0e577283..fa08b65a4 100644 --- a/hw/rtl/cache/VX_cache_mshr.sv +++ b/hw/rtl/cache/VX_cache_mshr.sv @@ -232,9 +232,10 @@ module VX_cache_mshr #( .LUTRAM (1) ) entries ( .clk (clk), + .reset (1'b0), .read (1'b1), .write (allocate_valid), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (allocate_id_r), .wdata (allocate_data), .raddr (dequeue_id_r), diff --git a/hw/rtl/core/VX_fetch.sv b/hw/rtl/core/VX_fetch.sv index 59c419a83..de086219d 100644 --- a/hw/rtl/core/VX_fetch.sv +++ b/hw/rtl/core/VX_fetch.sv @@ -56,9 +56,10 @@ module VX_fetch import VX_gpu_pkg::*; #( .LUTRAM (1) ) tag_store ( .clk (clk), + .reset (1'b0), .read (1'b1), .write (icache_req_fire), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (req_tag), .wdata ({schedule_if.data.PC, schedule_if.data.tmask}), .raddr (rsp_tag), diff --git a/hw/rtl/core/VX_ipdom_stack.sv b/hw/rtl/core/VX_ipdom_stack.sv index 01d5ec78e..54c703709 100644 --- a/hw/rtl/core/VX_ipdom_stack.sv +++ b/hw/rtl/core/VX_ipdom_stack.sv @@ -72,9 +72,10 @@ module VX_ipdom_stack #( .LUTRAM (OUT_REG ? 0 : 1) ) store ( .clk (clk), + .reset (1'b0), .read (1'b1), .write (push), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (wr_ptr), .wdata ({q1, q0}), .raddr (rd_ptr), diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 054064ec8..af3c9c823 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -273,11 +273,7 @@ module VX_operands import VX_gpu_pkg::*; #( assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}}; end - `ifdef GPR_RESET - VX_dp_ram_rst #( - `else VX_dp_ram #( - `endif .DATAW (`XLEN * `NUM_THREADS), .SIZE (PER_BANK_REGS * PER_ISSUE_WARPS), .WRENW (BYTEENW), @@ -286,6 +282,8 @@ module VX_operands import VX_gpu_pkg::*; #( .clk (clk), `ifdef GPR_RESET .reset (reset), + `else + .reset (1'b0), `endif .read (pipe_fire_st1), .wren (wren), diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 364a45ebf..7fe0e3491 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -23,12 +23,14 @@ module VX_dp_ram #( parameter NO_RWCHECK = 0, parameter LUTRAM = 0, parameter RW_ASSERT = 0, + parameter RESET_RAM = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, parameter ADDRW = `LOG2UP(SIZE) ) ( input wire clk, + input wire reset, input wire read, input wire write, input wire [WRENW-1:0] wren, @@ -192,12 +194,21 @@ module VX_dp_ram #( reg prev_write; always @(posedge clk) begin - if (write) begin - ram[waddr] <= ram_n; + if (RESET_RAM && reset) begin + for (integer i = 0; i < SIZE; ++i) begin + ram[i] <= DATAW'(INIT_VALUE); + end + prev_write <= 0; + prev_data <= '0; + prev_waddr <= '0; + end else begin + if (write) begin + ram[waddr] <= ram_n; + end + prev_write <= write; + prev_data <= ram[waddr]; + prev_waddr <= waddr; end - prev_write <= write; - prev_data <= ram[waddr]; - prev_waddr <= waddr; end if (LUTRAM || !NO_RWCHECK) begin @@ -216,7 +227,9 @@ module VX_dp_ram #( if (OUT_REG != 0) begin reg [DATAW-1:0] rdata_r; always @(posedge clk) begin - if (read) begin + if (reset) begin + rdata_r <= '0; + end else if (read) begin rdata_r <= rdata_w; end end diff --git a/hw/rtl/libs/VX_dp_ram_rst.sv b/hw/rtl/libs/VX_dp_ram_rst.sv deleted file mode 100644 index 6778b45d3..000000000 --- a/hw/rtl/libs/VX_dp_ram_rst.sv +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -`include "VX_platform.vh" - -`TRACING_OFF -module VX_dp_ram_rst #( - parameter DATAW = 1, - parameter SIZE = 1, - parameter ADDR_MIN = 0, - parameter WRENW = 1, - parameter OUT_REG = 0, - parameter NO_RWCHECK = 0, - parameter LUTRAM = 0, - parameter INIT_ENABLE = 0, - parameter INIT_FILE = "", - parameter [DATAW-1:0] INIT_VALUE = 0, - parameter ADDRW = `LOG2UP(SIZE) -) ( - input wire clk, - input wire reset, - input wire read, - input wire write, - input wire [WRENW-1:0] wren, - input wire [ADDRW-1:0] waddr, - input wire [DATAW-1:0] wdata, - input wire [ADDRW-1:0] raddr, - output wire [DATAW-1:0] rdata -); - localparam WSELW = DATAW / WRENW; - `STATIC_ASSERT((WRENW * WSELW == DATAW), ("invalid parameter")) - -`define RAM_INITIALIZATION \ - if (INIT_ENABLE != 0) begin \ - if (INIT_FILE != "") begin \ - initial $readmemh(INIT_FILE, ram); \ - end else begin \ - initial \ - for (integer i = 0; i < SIZE; ++i) \ - ram[i] = INIT_VALUE; \ - end \ - end - - `UNUSED_VAR (read) - - // RAM emulation - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - - wire [DATAW-1:0] ram_n; - for (genvar i = 0; i < WRENW; ++i) begin - assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW]; - end - - reg [DATAW-1:0] prev_data; - reg [ADDRW-1:0] prev_waddr; - reg prev_write; - - always @(posedge clk) begin - if (reset) begin - for (integer i = 0; i < SIZE; ++i) begin - ram[i] <= DATAW'(INIT_VALUE); - end - prev_write <= 0; - prev_data <= '0; - prev_waddr <= '0; - end else begin - if (write) begin - ram[waddr] <= ram_n; - end - prev_write <= (| wren); - prev_data <= ram[waddr]; - prev_waddr <= waddr; - end - end - - wire [DATAW-1:0] rdata_w; - - if (LUTRAM || !NO_RWCHECK) begin - `UNUSED_VAR (prev_write) - `UNUSED_VAR (prev_data) - `UNUSED_VAR (prev_waddr) - assign rdata_w = ram[raddr]; - end else begin - assign rdata_w = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; - if (RW_ASSERT) begin - `RUNTIME_ASSERT(~read || (rdata_w == ram[raddr]), ("read after write hazard")); - end - end - - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin - if (read) begin - rdata_r <= rdata_w; - end - end - assign rdata = rdata_r; - end else begin - assign rdata = rdata_w; - end - -endmodule -`TRACING_ON diff --git a/hw/rtl/libs/VX_fifo_queue.sv b/hw/rtl/libs/VX_fifo_queue.sv index a430d32f7..565849aee 100644 --- a/hw/rtl/libs/VX_fifo_queue.sv +++ b/hw/rtl/libs/VX_fifo_queue.sv @@ -177,10 +177,11 @@ module VX_fifo_queue #( .SIZE (DEPTH), .LUTRAM (LUTRAM) ) dp_ram ( - .clk(clk), + .clk (clk), + .reset (1'b0), .read (1'b1), .write (push), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (wr_ptr_r), .wdata (data_in), .raddr (rd_ptr_r), @@ -226,9 +227,10 @@ module VX_fifo_queue #( .LUTRAM (LUTRAM) ) dp_ram ( .clk (clk), + .reset (1'b0), .read (1'b1), .write (push), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (wr_ptr_r), .wdata (data_in), .raddr (rd_ptr_n_r), diff --git a/hw/rtl/libs/VX_index_buffer.sv b/hw/rtl/libs/VX_index_buffer.sv index 9c19b9184..4d3804f28 100644 --- a/hw/rtl/libs/VX_index_buffer.sv +++ b/hw/rtl/libs/VX_index_buffer.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,17 +24,17 @@ module VX_index_buffer #( input wire reset, output wire [ADDRW-1:0] write_addr, - input wire [DATAW-1:0] write_data, + input wire [DATAW-1:0] write_data, input wire acquire_en, input wire [ADDRW-1:0] read_addr, output wire [DATAW-1:0] read_data, input wire release_en, - + output wire empty, - output wire full + output wire full ); - + VX_allocator #( .SIZE (SIZE) ) allocator ( @@ -43,9 +43,9 @@ module VX_index_buffer #( .acquire_en (acquire_en), .acquire_addr (write_addr), .release_en (release_en), - .release_addr (read_addr), + .release_addr (read_addr), .empty (empty), - .full (full) + .full (full) ); VX_dp_ram #( @@ -54,14 +54,15 @@ module VX_index_buffer #( .LUTRAM (LUTRAM) ) data_table ( .clk (clk), + .reset (1'b0), .read (1'b1), .write (acquire_en), - `UNUSED_PIN (wren), + .wren (1'b1), .waddr (write_addr), .wdata (write_data), .raddr (read_addr), .rdata (read_data) ); - + endmodule `TRACING_ON From 9c5aee5e25799bc09cbfba1c79607685b7cf4a60 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 18:13:58 -0700 Subject: [PATCH 50/89] bram reset fix --- hw/rtl/cache/VX_cache_data.sv | 4 +++- hw/rtl/cache/VX_cache_tags.sv | 3 ++- hw/rtl/libs/VX_sp_ram.sv | 4 ++++ hw/rtl/mem/VX_local_mem.sv | 1 + 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index da1a7fe63..854903065 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -91,9 +91,10 @@ module VX_cache_data #( .SIZE (`CS_LINES_PER_BANK) ) byteen_store ( .clk (clk), + .reset (1'b0), .read (write || fill || flush), .write (init || write || fill || flush), - `UNUSED_PIN (wren), + .wren (1'b1), .addr (line_sel), .wdata (bs_wdata), .rdata (bs_rdata) @@ -161,6 +162,7 @@ module VX_cache_data #( .RW_ASSERT (1) ) data_store ( .clk (clk), + .reset (1'b0), .read (line_read), .write (line_write), .wren (line_wren), diff --git a/hw/rtl/cache/VX_cache_tags.sv b/hw/rtl/cache/VX_cache_tags.sv index bdb4479ce..7e579e3be 100644 --- a/hw/rtl/cache/VX_cache_tags.sv +++ b/hw/rtl/cache/VX_cache_tags.sv @@ -130,9 +130,10 @@ module VX_cache_tags #( .RW_ASSERT (1) ) tag_store ( .clk (clk), + .reset (1'b0), .read (line_read), .write (line_write), - `UNUSED_PIN (wren), + .wren (1'b1), .addr (line_sel), .wdata (line_wdata), .rdata (line_rdata) diff --git a/hw/rtl/libs/VX_sp_ram.sv b/hw/rtl/libs/VX_sp_ram.sv index a62099b1b..4ab2a9b7a 100644 --- a/hw/rtl/libs/VX_sp_ram.sv +++ b/hw/rtl/libs/VX_sp_ram.sv @@ -23,12 +23,14 @@ module VX_sp_ram #( parameter NO_RWCHECK = 0, parameter RW_ASSERT = 0, parameter LUTRAM = 0, + parameter RESET_RAM = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, parameter ADDRW = `LOG2UP(SIZE) ) ( input wire clk, + input wire reset, input wire read, input wire write, input wire [WRENW-1:0] wren, @@ -45,12 +47,14 @@ module VX_sp_ram #( .NO_RWCHECK (NO_RWCHECK), .RW_ASSERT (RW_ASSERT), .LUTRAM (LUTRAM), + .RESET_RAM (RESET_RAM), .INIT_ENABLE (INIT_ENABLE), .INIT_FILE (INIT_FILE), .INIT_VALUE (INIT_VALUE), .ADDRW (ADDRW) ) dp_ram ( .clk (clk), + .reset (reset), .read (read), .write (write), .wren (wren), diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index 0c97464b5..cd45bbfd6 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -166,6 +166,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .WRENW (WORD_SIZE) ) data_store ( .clk (clk), + .reset (1'b0), .read (1'b1), .write (per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]), .wren (per_bank_req_byteen[i]), From f723e7baf51d507e3e353737ee975987dce0f815 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 18:15:08 -0700 Subject: [PATCH 51/89] registering local memory bram output --- hw/rtl/mem/VX_local_mem.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index cd45bbfd6..facc63634 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -182,7 +182,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( VX_elastic_buffer #( .DATAW (REQ_SEL_WIDTH + TAG_WIDTH), - .SIZE (0) + .SIZE (1) ) bank_buf ( .clk (clk), .reset (bank_reset), From 410c47e2ae06d5a62e5253ca7a04831819671844 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 18:16:50 -0700 Subject: [PATCH 52/89] adding out_buf to VX_pe_serializer + testing --- ci/regression.sh.in | 7 +++++ hw/rtl/fpu/VX_fpu_cvt.sv | 25 ++++++++-------- hw/rtl/fpu/VX_fpu_div.sv | 49 +++++++++++++++--------------- hw/rtl/fpu/VX_fpu_fma.sv | 3 +- hw/rtl/fpu/VX_fpu_ncp.sv | 25 ++++++++-------- hw/rtl/fpu/VX_fpu_sqrt.sv | 45 ++++++++++++++-------------- hw/rtl/libs/VX_pe_serializer.sv | 53 +++++++++++++++++++++++---------- 7 files changed, 120 insertions(+), 87 deletions(-) diff --git a/ci/regression.sh.in b/ci/regression.sh.in index f405f8b1b..e4fb1c999 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -210,6 +210,13 @@ config1() CONFIGS="-DISSUE_WIDTH=2 -DNUM_FPU_BLOCK=1 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx CONFIGS="-DISSUE_WIDTH=4 -DNUM_FPU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx + # FPU's PE scaling + CONFIGS="-DFMA_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfmadd" + CONFIGS="-DFCVT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tftoi" + CONFIGS="-DFDIV_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfdiv" + CONFIGS="-DFSQRT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfsqrt" + CONFIGS="-DFNCP_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfclamp" + # LSU scaling CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx diff --git a/hw/rtl/fpu/VX_fpu_cvt.sv b/hw/rtl/fpu/VX_fpu_cvt.sv index 6d74ddcb7..4c1a6e755 100644 --- a/hw/rtl/fpu/VX_fpu_cvt.sv +++ b/hw/rtl/fpu/VX_fpu_cvt.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,7 +21,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( parameter TAG_WIDTH = 1 ) ( input wire clk, - input wire reset, + input wire reset, output wire ready_in, input wire valid_in, @@ -36,7 +36,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( input wire is_signed, input wire [NUM_LANES-1:0][31:0] dataa, - output wire [NUM_LANES-1:0][31:0] result, + output wire [NUM_LANES-1:0][31:0] result, output wire has_fflags, output wire [`FP_FLAGS_BITS-1:0] fflags, @@ -45,25 +45,26 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( input wire ready_out, output wire valid_out -); +); `UNUSED_VAR (frm) - wire [NUM_LANES-1:0] mask_out; + wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; fflags_t [NUM_LANES-1:0] fflags_out; - wire pe_enable; + wire pe_enable; wire [NUM_PES-1:0][31:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; - + VX_pe_serializer #( - .NUM_LANES (NUM_LANES), - .NUM_PES (NUM_PES), + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FCVT), .DATA_IN_WIDTH(32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG (0) + .PE_REG (0), + .OUT_BUF ((`FCVT_PE_RATIO > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), @@ -94,7 +95,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .enable (pe_enable), .frm (frm), .is_itof (is_itof), - .is_signed (is_signed), + .is_signed (is_signed), .dataa (pe_data_in[i][0 +: 32]), .result (pe_data_out[i][0 +: 32]), .fflags (pe_data_out[i][32 +: `FP_FLAGS_BITS]) diff --git a/hw/rtl/fpu/VX_fpu_div.sv b/hw/rtl/fpu/VX_fpu_div.sv index 0647a8782..992f0fbe9 100644 --- a/hw/rtl/fpu/VX_fpu_div.sv +++ b/hw/rtl/fpu/VX_fpu_div.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,7 +21,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #( parameter TAG_WIDTH = 1 ) ( input wire clk, - input wire reset, + input wire reset, input wire valid_in, output wire ready_in, @@ -31,10 +31,10 @@ module VX_fpu_div import VX_fpu_pkg::*; #( input wire [TAG_WIDTH-1:0] tag_in, input wire [`INST_FRM_BITS-1:0] frm, - + input wire [NUM_LANES-1:0][31:0] dataa, input wire [NUM_LANES-1:0][31:0] datab, - output wire [NUM_LANES-1:0][31:0] result, + output wire [NUM_LANES-1:0][31:0] result, output wire has_fflags, output wire [`FP_FLAGS_BITS-1:0] fflags, @@ -47,27 +47,28 @@ module VX_fpu_div import VX_fpu_pkg::*; #( `UNUSED_VAR (frm) wire [NUM_LANES-1:0][2*32-1:0] data_in; - wire [NUM_LANES-1:0] mask_out; + wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out; - wire pe_enable; + wire pe_enable; wire [NUM_PES-1:0][2*32-1:0] pe_data_in; - wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; + wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; for (genvar i = 0; i < NUM_LANES; ++i) begin assign data_in[i][0 +: 32] = dataa[i]; assign data_in[i][32 +: 32] = datab[i]; end - + VX_pe_serializer #( - .NUM_LANES (NUM_LANES), - .NUM_PES (NUM_PES), + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FDIV), .DATA_IN_WIDTH(2*32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG (0) + .PE_REG (0), + .OUT_BUF ((`FDIV_PE_RATIO > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), @@ -92,7 +93,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #( fflags_t [NUM_LANES-1:0] per_lane_fflags; `ifdef QUARTUS - + for (genvar i = 0; i < NUM_PES; ++i) begin acl_fdiv fdiv ( .clk (clk), @@ -103,8 +104,8 @@ module VX_fpu_div import VX_fpu_pkg::*; #( .q (pe_data_out[i][0 +: 32]) ); assign pe_data_out[i][32 +: `FP_FLAGS_BITS] = 'x; - end - + end + assign has_fflags = 0; assign per_lane_fflags = 'x; `UNUSED_VAR (fflags_out) @@ -131,21 +132,21 @@ module VX_fpu_div import VX_fpu_pkg::*; #( assign has_fflags = 1; assign per_lane_fflags = fflags_out; -`else +`else for (genvar i = 0; i < NUM_PES; ++i) begin reg [63:0] r; - `UNUSED_VAR (r) + `UNUSED_VAR (r) fflags_t f; - always @(*) begin + always @(*) begin dpi_fdiv ( - pe_enable, - int'(0), - {32'hffffffff, pe_data_in[i][0 +: 32]}, - {32'hffffffff, pe_data_in[i][32 +: 32]}, - frm, - r, + pe_enable, + int'(0), + {32'hffffffff, pe_data_in[i][0 +: 32]}, + {32'hffffffff, pe_data_in[i][32 +: 32]}, + frm, + r, f ); end diff --git a/hw/rtl/fpu/VX_fpu_fma.sv b/hw/rtl/fpu/VX_fpu_fma.sv index 8151fbf55..33790dfca 100644 --- a/hw/rtl/fpu/VX_fpu_fma.sv +++ b/hw/rtl/fpu/VX_fpu_fma.sv @@ -98,7 +98,8 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( .DATA_IN_WIDTH(3*32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0) + .PE_REG ((`FMA_PE_RATIO != 1) ? 1 : 0), + .OUT_BUF ((`FMA_PE_RATIO > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_fpu_ncp.sv b/hw/rtl/fpu/VX_fpu_ncp.sv index 017738775..0479b8826 100644 --- a/hw/rtl/fpu/VX_fpu_ncp.sv +++ b/hw/rtl/fpu/VX_fpu_ncp.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -35,7 +35,7 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( input wire [NUM_LANES-1:0][31:0] dataa, input wire [NUM_LANES-1:0][31:0] datab, - output wire [NUM_LANES-1:0][31:0] result, + output wire [NUM_LANES-1:0][31:0] result, output wire has_fflags, output wire [`FP_FLAGS_BITS-1:0] fflags, @@ -44,15 +44,15 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( input wire ready_out, output wire valid_out -); +); `UNUSED_VAR (frm) wire [NUM_LANES-1:0][2*32-1:0] data_in; - wire [NUM_LANES-1:0] mask_out; + wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; fflags_t [NUM_LANES-1:0] fflags_out; - wire pe_enable; + wire pe_enable; wire [NUM_PES-1:0][2*32-1:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; @@ -60,15 +60,16 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( assign data_in[i][0 +: 32] = dataa[i]; assign data_in[i][32 +: 32] = datab[i]; end - + VX_pe_serializer #( - .NUM_LANES (NUM_LANES), - .NUM_PES (NUM_PES), + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FNCP), .DATA_IN_WIDTH(2*32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG (0) + .PE_REG (0), + .OUT_BUF ((`FNCP_PE_RATIO > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), @@ -97,8 +98,8 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( .clk (clk), .reset (reset), .enable (pe_enable), - .frm (frm), - .op_type (op_type), + .frm (frm), + .op_type (op_type), .dataa (pe_data_in[i][0 +: 32]), .datab (pe_data_in[i][32 +: 32]), .result (pe_data_out[i][0 +: 32]), diff --git a/hw/rtl/fpu/VX_fpu_sqrt.sv b/hw/rtl/fpu/VX_fpu_sqrt.sv index 03529e629..2e32077a4 100644 --- a/hw/rtl/fpu/VX_fpu_sqrt.sv +++ b/hw/rtl/fpu/VX_fpu_sqrt.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -18,10 +18,10 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( parameter NUM_LANES = 1, parameter NUM_PES = `UP(NUM_LANES /`FSQRT_PE_RATIO), - parameter TAG_WIDTH = 1 + parameter TAG_WIDTH = 1 ) ( input wire clk, - input wire reset, + input wire reset, output wire ready_in, input wire valid_in, @@ -29,11 +29,11 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( input wire [NUM_LANES-1:0] mask_in, input wire [TAG_WIDTH-1:0] tag_in, - + input wire [`INST_FRM_BITS-1:0] frm, input wire [NUM_LANES-1:0][31:0] dataa, - output wire [NUM_LANES-1:0][31:0] result, + output wire [NUM_LANES-1:0][31:0] result, output wire has_fflags, output wire [`FP_FLAGS_BITS-1:0] fflags, @@ -46,22 +46,23 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( `UNUSED_VAR (frm) - wire [NUM_LANES-1:0] mask_out; + wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out; - wire pe_enable; + wire pe_enable; wire [NUM_PES-1:0][31:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; - + VX_pe_serializer #( - .NUM_LANES (NUM_LANES), - .NUM_PES (NUM_PES), + .NUM_LANES (NUM_LANES), + .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FSQRT), .DATA_IN_WIDTH(32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG (0) + .PE_REG (0), + .OUT_BUF ((`FSQRT_PE_RATIO > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), @@ -83,10 +84,10 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS]; end - fflags_t [NUM_LANES-1:0] per_lane_fflags; + fflags_t [NUM_LANES-1:0] per_lane_fflags; `ifdef QUARTUS - + for (genvar i = 0; i < NUM_PES; ++i) begin acl_fsqrt fsqrt ( .clk (clk), @@ -105,7 +106,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( `elsif VIVADO for (genvar i = 0; i < NUM_PES; ++i) begin - wire tuser; + wire tuser; xil_fsqrt fsqrt ( .aclk (clk), @@ -130,17 +131,17 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( `UNUSED_VAR (r) fflags_t f; - always @(*) begin + always @(*) begin dpi_fsqrt ( - pe_enable, - int'(0), - {32'hffffffff, pe_data_in[i]}, - frm, - r, + pe_enable, + int'(0), + {32'hffffffff, pe_data_in[i]}, + frm, + r, f ); end - + VX_shift_register #( .DATAW (32 + $bits(fflags_t)), .DEPTH (`LATENCY_FSQRT) diff --git a/hw/rtl/libs/VX_pe_serializer.sv b/hw/rtl/libs/VX_pe_serializer.sv index 7060c258c..efaa45d03 100644 --- a/hw/rtl/libs/VX_pe_serializer.sv +++ b/hw/rtl/libs/VX_pe_serializer.sv @@ -21,7 +21,8 @@ module VX_pe_serializer #( parameter DATA_IN_WIDTH = 1, parameter DATA_OUT_WIDTH = 1, parameter TAG_WIDTH = 0, - parameter PE_REG = 0 + parameter PE_REG = 0' + parameter OUT_BUF = 0 ) ( input wire clk, input wire reset, @@ -43,6 +44,11 @@ module VX_pe_serializer #( output wire [TAG_WIDTH-1:0] tag_out, input wire ready_out ); + wire valid_out_u; + wire [NUM_LANES-1:0][DATA_OUT_WIDTH-1:0] data_out_u; + wire [TAG_WIDTH-1:0] tag_out_u; + wire ready_out_u; + wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_in_s; wire valid_out_s; wire [TAG_WIDTH-1:0] tag_out_s; @@ -105,7 +111,7 @@ module VX_pe_serializer #( reg [TAG_WIDTH-1:0] tag_out_r; wire valid_out_b = valid_out_s && batch_out_done; - wire ready_out_b = ready_out || ~valid_out; + wire ready_out_b = ready_out_u || ~valid_out_u; always @(posedge clk) begin if (reset) begin @@ -119,29 +125,44 @@ module VX_pe_serializer #( end end - assign enable = ready_out_b || ~valid_out_b; - assign ready_in = enable && batch_in_done; + assign enable = ready_out_b || ~valid_out_b; + assign ready_in = enable && batch_in_done; + assign pe_enable = enable; - assign pe_enable = enable; - - assign valid_out = valid_out_r; - assign data_out = data_out_r; - assign tag_out = tag_out_r; + assign valid_out_u = valid_out_r; + assign data_out_u = data_out_r; + assign tag_out_u = tag_out_r; end else begin assign pe_data_in_s = data_in; - assign enable = ready_out || ~valid_out; - assign ready_in = enable; + assign enable = ready_out_u || ~valid_out_u; + assign ready_in = enable; + assign pe_enable = enable; - assign pe_enable = enable; - - assign valid_out = valid_out_s; - assign data_out = pe_data_out; - assign tag_out = tag_out_s; + assign valid_out_u = valid_out_s; + assign data_out_u = pe_data_out; + assign tag_out_u = tag_out_s; end + `RESET_RELAY (out_buf_reset, reset); + + VX_elastic_buffer #( + .DATAW (DATA_OUT_WIDTH + TAG_WIDTH), + .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), + .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) + ) out_buf ( + .clk (clk), + .reset (out_buf_reset), + .valid_in (valid_out_u), + .ready_in (ready_out_u), + .data_in ({data_out_u, tag_out_u}), + .data_out ({data_out, tag_out}), + .valid_out (valid_out), + .ready_out (ready_out) + ); + endmodule `TRACING_ON From 3b81a32b126cb48fb571011caf83eb7e44c59109 Mon Sep 17 00:00:00 2001 From: Jacob Levinson Date: Fri, 2 Aug 2024 18:25:56 -0700 Subject: [PATCH 53/89] Fix invalid use of incomplete type error by including --- runtime/common/common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/common/common.h b/runtime/common/common.h index f7125064e..1f718f938 100644 --- a/runtime/common/common.h +++ b/runtime/common/common.h @@ -21,6 +21,7 @@ #include #include +#include #define CACHE_BLOCK_SIZE 64 From 067b7a8726d1fa9064072ca09775d972570b7361 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 18:57:07 -0700 Subject: [PATCH 54/89] fixed typo --- hw/rtl/libs/VX_pe_serializer.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/libs/VX_pe_serializer.sv b/hw/rtl/libs/VX_pe_serializer.sv index efaa45d03..cad51097b 100644 --- a/hw/rtl/libs/VX_pe_serializer.sv +++ b/hw/rtl/libs/VX_pe_serializer.sv @@ -21,7 +21,7 @@ module VX_pe_serializer #( parameter DATA_IN_WIDTH = 1, parameter DATA_OUT_WIDTH = 1, parameter TAG_WIDTH = 0, - parameter PE_REG = 0' + parameter PE_REG = 0, parameter OUT_BUF = 0 ) ( input wire clk, From e8cdae1225faf2276b543ccd66418de8cea5cd7b Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 19:19:57 -0700 Subject: [PATCH 55/89] minor fix in VX_local_mem.sv --- hw/rtl/mem/VX_local_mem.sv | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index facc63634..9c72907c5 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -160,6 +160,9 @@ module VX_local_mem import VX_gpu_pkg::*; #( `RESET_RELAY (bank_reset, reset); for (genvar i = 0; i < NUM_BANKS; ++i) begin + wire bank_req_valid, bank_req_ready; + wire [WORD_WIDTH-1:0] bank_rsp_data; + VX_sp_ram #( .DATAW (WORD_WIDTH), .SIZE (WORDS_PER_BANK), @@ -172,24 +175,24 @@ module VX_local_mem import VX_gpu_pkg::*; #( .wren (per_bank_req_byteen[i]), .addr (per_bank_req_addr[i]), .wdata (per_bank_req_data[i]), - .rdata (per_bank_rsp_data[i]) + .rdata (bank_rsp_data) ); // drop write response - wire per_bank_req_valid_w, per_bank_req_ready_w; - assign per_bank_req_valid_w = per_bank_req_valid[i] && ~per_bank_req_rw[i]; - assign per_bank_req_ready[i] = per_bank_req_ready_w || per_bank_req_rw[i]; + assign bank_req_valid = per_bank_req_valid[i] && ~per_bank_req_rw[i]; + assign per_bank_req_ready[i] = bank_req_ready || per_bank_req_rw[i]; + // register BRAM output VX_elastic_buffer #( - .DATAW (REQ_SEL_WIDTH + TAG_WIDTH), + .DATAW (REQ_SEL_WIDTH + WORD_WIDTH + TAG_WIDTH), .SIZE (1) ) bank_buf ( .clk (clk), .reset (bank_reset), - .valid_in (per_bank_req_valid_w), - .ready_in (per_bank_req_ready_w), - .data_in ({per_bank_req_idx[i], per_bank_req_tag[i]}), - .data_out ({per_bank_rsp_idx[i], per_bank_rsp_tag[i]}), + .valid_in (bank_req_valid), + .ready_in (bank_req_ready), + .data_in ({per_bank_req_idx[i], bank_rsp_data, per_bank_req_tag[i]}), + .data_out ({per_bank_rsp_idx[i], per_bank_rsp_data[i], per_bank_rsp_tag[i]}), .valid_out (per_bank_rsp_valid[i]), .ready_out (per_bank_rsp_ready[i]) ); @@ -197,10 +200,10 @@ module VX_local_mem import VX_gpu_pkg::*; #( // bank responses gather - wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_all; + wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data2; for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign per_bank_rsp_data_all[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]}; + assign per_bank_rsp_data2[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]}; end wire [NUM_REQS-1:0] rsp_valid_out; @@ -219,7 +222,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( `UNUSED_PIN (collisions), .sel_in (per_bank_rsp_idx), .valid_in (per_bank_rsp_valid), - .data_in (per_bank_rsp_data_all), + .data_in (per_bank_rsp_data2), .ready_in (per_bank_rsp_ready), .valid_out (rsp_valid_out), .data_out (rsp_data_out), From 76f74b8a5954f65921b04a9645bee16e542a3fba Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 19:50:34 -0700 Subject: [PATCH 56/89] minor update --- hw/rtl/libs/VX_pe_serializer.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/libs/VX_pe_serializer.sv b/hw/rtl/libs/VX_pe_serializer.sv index cad51097b..e71672041 100644 --- a/hw/rtl/libs/VX_pe_serializer.sv +++ b/hw/rtl/libs/VX_pe_serializer.sv @@ -150,7 +150,7 @@ module VX_pe_serializer #( `RESET_RELAY (out_buf_reset, reset); VX_elastic_buffer #( - .DATAW (DATA_OUT_WIDTH + TAG_WIDTH), + .DATAW (NUM_LANES * DATA_OUT_WIDTH + TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( From 52c5f1ff6bee5e816aa446be43e2f096333cd807 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 2 Aug 2024 23:32:34 -0700 Subject: [PATCH 57/89] minor update --- hw/rtl/cache/VX_cache_data.sv | 5 ++--- hw/rtl/cache/VX_cache_mshr.sv | 2 +- hw/rtl/cache/VX_cache_tags.sv | 3 +-- hw/rtl/core/VX_fetch.sv | 2 +- hw/rtl/core/VX_ipdom_stack.sv | 2 +- hw/rtl/core/VX_operands.sv | 7 +++---- hw/rtl/libs/VX_dp_ram.sv | 14 +++++++++----- hw/rtl/libs/VX_fifo_queue.sv | 4 ++-- hw/rtl/libs/VX_index_buffer.sv | 2 +- 9 files changed, 21 insertions(+), 20 deletions(-) diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index 854903065..a114e1689 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -62,7 +62,6 @@ module VX_cache_data #( `UNUSED_SPARAM (INSTANCE_ID) `UNUSED_PARAM (BANK_ID) `UNUSED_PARAM (WORD_SIZE) - `UNUSED_VAR (reset) `UNUSED_VAR (stall) `UNUSED_VAR (line_addr) `UNUSED_VAR (init) @@ -91,7 +90,7 @@ module VX_cache_data #( .SIZE (`CS_LINES_PER_BANK) ) byteen_store ( .clk (clk), - .reset (1'b0), + .reset (reset), .read (write || fill || flush), .write (init || write || fill || flush), .wren (1'b1), @@ -162,7 +161,7 @@ module VX_cache_data #( .RW_ASSERT (1) ) data_store ( .clk (clk), - .reset (1'b0), + .reset (reset), .read (line_read), .write (line_write), .wren (line_wren), diff --git a/hw/rtl/cache/VX_cache_mshr.sv b/hw/rtl/cache/VX_cache_mshr.sv index fa08b65a4..4f8163269 100644 --- a/hw/rtl/cache/VX_cache_mshr.sv +++ b/hw/rtl/cache/VX_cache_mshr.sv @@ -232,7 +232,7 @@ module VX_cache_mshr #( .LUTRAM (1) ) entries ( .clk (clk), - .reset (1'b0), + .reset (reset), .read (1'b1), .write (allocate_valid), .wren (1'b1), diff --git a/hw/rtl/cache/VX_cache_tags.sv b/hw/rtl/cache/VX_cache_tags.sv index 7e579e3be..7fef69be6 100644 --- a/hw/rtl/cache/VX_cache_tags.sv +++ b/hw/rtl/cache/VX_cache_tags.sv @@ -57,7 +57,6 @@ module VX_cache_tags #( ); `UNUSED_SPARAM (INSTANCE_ID) `UNUSED_PARAM (BANK_ID) - `UNUSED_VAR (reset) `UNUSED_VAR (lookup) // valid, dirty, tag @@ -130,7 +129,7 @@ module VX_cache_tags #( .RW_ASSERT (1) ) tag_store ( .clk (clk), - .reset (1'b0), + .reset (reset), .read (line_read), .write (line_write), .wren (1'b1), diff --git a/hw/rtl/core/VX_fetch.sv b/hw/rtl/core/VX_fetch.sv index de086219d..043a87939 100644 --- a/hw/rtl/core/VX_fetch.sv +++ b/hw/rtl/core/VX_fetch.sv @@ -56,7 +56,7 @@ module VX_fetch import VX_gpu_pkg::*; #( .LUTRAM (1) ) tag_store ( .clk (clk), - .reset (1'b0), + .reset (reset), .read (1'b1), .write (icache_req_fire), .wren (1'b1), diff --git a/hw/rtl/core/VX_ipdom_stack.sv b/hw/rtl/core/VX_ipdom_stack.sv index 54c703709..0ec05cbae 100644 --- a/hw/rtl/core/VX_ipdom_stack.sv +++ b/hw/rtl/core/VX_ipdom_stack.sv @@ -72,7 +72,7 @@ module VX_ipdom_stack #( .LUTRAM (OUT_REG ? 0 : 1) ) store ( .clk (clk), - .reset (1'b0), + .reset (reset), .read (1'b1), .write (push), .wren (1'b1), diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index af3c9c823..b438997ec 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -277,14 +277,13 @@ module VX_operands import VX_gpu_pkg::*; #( .DATAW (`XLEN * `NUM_THREADS), .SIZE (PER_BANK_REGS * PER_ISSUE_WARPS), .WRENW (BYTEENW), + `ifdef GPR_RESET + .RESET_RAM (1), + `endif .NO_RWCHECK (1) ) gpr_ram ( .clk (clk), - `ifdef GPR_RESET .reset (reset), - `else - .reset (1'b0), - `endif .read (pipe_fire_st1), .wren (wren), .write (gpr_wr_enabled), diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 7fe0e3491..6683eaecc 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -24,6 +24,7 @@ module VX_dp_ram #( parameter LUTRAM = 0, parameter RW_ASSERT = 0, parameter RESET_RAM = 0, + parameter READ_ENABLE = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, @@ -198,13 +199,16 @@ module VX_dp_ram #( for (integer i = 0; i < SIZE; ++i) begin ram[i] <= DATAW'(INIT_VALUE); end - prev_write <= 0; - prev_data <= '0; - prev_waddr <= '0; end else begin if (write) begin ram[waddr] <= ram_n; end + end + if (reset) begin + prev_write <= 0; + prev_data <= '0; + prev_waddr <= '0; + end else begin prev_write <= write; prev_data <= ram[waddr]; prev_waddr <= waddr; @@ -227,9 +231,9 @@ module VX_dp_ram #( if (OUT_REG != 0) begin reg [DATAW-1:0] rdata_r; always @(posedge clk) begin - if (reset) begin + if (READ_ENABLE && reset) begin rdata_r <= '0; - end else if (read) begin + end else if (!READ_ENABLE || read) begin rdata_r <= rdata_w; end end diff --git a/hw/rtl/libs/VX_fifo_queue.sv b/hw/rtl/libs/VX_fifo_queue.sv index 565849aee..ea00d67c7 100644 --- a/hw/rtl/libs/VX_fifo_queue.sv +++ b/hw/rtl/libs/VX_fifo_queue.sv @@ -178,7 +178,7 @@ module VX_fifo_queue #( .LUTRAM (LUTRAM) ) dp_ram ( .clk (clk), - .reset (1'b0), + .reset (reset), .read (1'b1), .write (push), .wren (1'b1), @@ -227,7 +227,7 @@ module VX_fifo_queue #( .LUTRAM (LUTRAM) ) dp_ram ( .clk (clk), - .reset (1'b0), + .reset (reset), .read (1'b1), .write (push), .wren (1'b1), diff --git a/hw/rtl/libs/VX_index_buffer.sv b/hw/rtl/libs/VX_index_buffer.sv index 4d3804f28..4e8439818 100644 --- a/hw/rtl/libs/VX_index_buffer.sv +++ b/hw/rtl/libs/VX_index_buffer.sv @@ -54,7 +54,7 @@ module VX_index_buffer #( .LUTRAM (LUTRAM) ) data_table ( .clk (clk), - .reset (1'b0), + .reset (reset), .read (1'b1), .write (acquire_en), .wren (1'b1), From 4c1b3fd88d4a1b4d2118bb88ba9706cc0f7d36fa Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 00:10:06 -0700 Subject: [PATCH 58/89] local memory area optimization --- hw/rtl/mem/VX_local_mem.sv | 53 ++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index 9c72907c5..274794b07 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -94,7 +94,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_req_idx; wire [NUM_BANKS-1:0] per_bank_req_ready; - wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_all; + wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_aos; wire [NUM_REQS-1:0] req_valid_in; wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in; @@ -111,7 +111,8 @@ module VX_local_mem import VX_gpu_pkg::*; #( req_bank_addr[i], mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, - mem_bus_if[i].req_data.tag}; + mem_bus_if[i].req_data.tag + }; assign mem_bus_if[i].req_ready = req_ready_in[i]; end @@ -135,7 +136,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .sel_in (req_bank_idx), .ready_in (req_ready_in), .valid_out (per_bank_req_valid), - .data_out (per_bank_req_data_all), + .data_out (per_bank_req_data_aos), .sel_out (per_bank_req_idx), .ready_out (per_bank_req_ready) ); @@ -146,7 +147,8 @@ module VX_local_mem import VX_gpu_pkg::*; #( per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], - per_bank_req_tag[i]} = per_bank_req_data_all[i]; + per_bank_req_tag[i] + } = per_bank_req_data_aos[i]; end // banks access @@ -160,17 +162,18 @@ module VX_local_mem import VX_gpu_pkg::*; #( `RESET_RELAY (bank_reset, reset); for (genvar i = 0; i < NUM_BANKS; ++i) begin - wire bank_req_valid, bank_req_ready; + wire bank_rsp_valid, bank_rsp_ready; wire [WORD_WIDTH-1:0] bank_rsp_data; VX_sp_ram #( .DATAW (WORD_WIDTH), .SIZE (WORDS_PER_BANK), - .WRENW (WORD_SIZE) + .WRENW (WORD_SIZE), + .NO_RWCHECK (1) ) data_store ( .clk (clk), - .reset (1'b0), - .read (1'b1), + .reset (reset), + .read (per_bank_req_valid[i] && per_bank_req_ready[i] && ~per_bank_req_rw[i]), .write (per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]), .wren (per_bank_req_byteen[i]), .addr (per_bank_req_addr[i]), @@ -178,19 +181,31 @@ module VX_local_mem import VX_gpu_pkg::*; #( .rdata (bank_rsp_data) ); - // drop write response - assign bank_req_valid = per_bank_req_valid[i] && ~per_bank_req_rw[i]; - assign per_bank_req_ready[i] = bank_req_ready || per_bank_req_rw[i]; + // read-during-write hazard detection + reg [BANK_ADDR_WIDTH-1:0] last_wr_addr; + reg last_wr_valid; + always @(posedge clk) begin + if (reset) begin + last_wr_valid <= 0; + end else begin + last_wr_valid <= per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]; + end + last_wr_addr <= per_bank_req_addr[i]; + end + wire is_rdw_hazard = last_wr_valid && ~per_bank_req_rw[i] && (per_bank_req_addr[i] == last_wr_addr); + + // drop write response and stall on read-during-write hazard + assign bank_rsp_valid = per_bank_req_valid[i] && ~per_bank_req_rw[i] && ~is_rdw_hazard; + assign per_bank_req_ready[i] = (bank_rsp_ready || per_bank_req_rw[i]) && ~is_rdw_hazard; // register BRAM output - VX_elastic_buffer #( - .DATAW (REQ_SEL_WIDTH + WORD_WIDTH + TAG_WIDTH), - .SIZE (1) + VX_pipe_buffer #( + .DATAW (REQ_SEL_WIDTH + WORD_WIDTH + TAG_WIDTH) ) bank_buf ( .clk (clk), .reset (bank_reset), - .valid_in (bank_req_valid), - .ready_in (bank_req_ready), + .valid_in (bank_rsp_valid), + .ready_in (bank_rsp_ready), .data_in ({per_bank_req_idx[i], bank_rsp_data, per_bank_req_tag[i]}), .data_out ({per_bank_rsp_idx[i], per_bank_rsp_data[i], per_bank_rsp_tag[i]}), .valid_out (per_bank_rsp_valid[i]), @@ -200,10 +215,10 @@ module VX_local_mem import VX_gpu_pkg::*; #( // bank responses gather - wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data2; + wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_aos; for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign per_bank_rsp_data2[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]}; + assign per_bank_rsp_data_aos[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]}; end wire [NUM_REQS-1:0] rsp_valid_out; @@ -222,7 +237,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( `UNUSED_PIN (collisions), .sel_in (per_bank_rsp_idx), .valid_in (per_bank_rsp_valid), - .data_in (per_bank_rsp_data2), + .data_in (per_bank_rsp_data_aos), .ready_in (per_bank_rsp_ready), .valid_out (rsp_valid_out), .data_out (rsp_data_out), From d09bce011bf7f55aec4bda41a0b4f6e6d828a257 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 00:52:41 -0700 Subject: [PATCH 59/89] local memory test update --- tests/kernel/conform/tests.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/kernel/conform/tests.cpp b/tests/kernel/conform/tests.cpp index f5f33a13e..6125a6911 100644 --- a/tests/kernel/conform/tests.cpp +++ b/tests/kernel/conform/tests.cpp @@ -46,13 +46,15 @@ int test_global_memory() { /////////////////////////////////////////////////////////////////////////////// -int* lmem_addr = (int*)LMEM_BASE_ADDR; +volatile int* lmem_addr = (int*)LMEM_BASE_ADDR; int lmem_buffer[8]; void __attribute__((noinline)) do_lmem_wr() { unsigned tid = vx_thread_id(); lmem_addr[tid] = 65 + tid; + int x = lmem_addr[tid]; + lmem_addr[tid] = x; } void __attribute__((noinline)) do_lmem_rd() { From fc0392e5e3963871df2532f2340f49a6cd91d5cd Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 00:54:17 -0700 Subject: [PATCH 60/89] fixed typo --- hw/syn/altera/opae/Makefile | 2 +- hw/syn/xilinx/xrt/Makefile | 2 +- hw/syn/yosys/Makefile | 2 +- hw/unittest/common.mk | 2 +- runtime/opae/Makefile | 2 +- runtime/rtlsim/Makefile | 2 +- runtime/simx/Makefile | 2 +- runtime/stub/Makefile | 2 +- runtime/xrt/Makefile | 2 +- sim/opaesim/Makefile | 2 +- sim/rtlsim/Makefile | 2 +- sim/simx/Makefile | 2 +- sim/xrtsim/Makefile | 2 +- tests/opencl/common.mk | 2 +- tests/regression/common.mk | 2 +- tests/unittest/common.mk | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile index 8cd37b47c..7d782fcfb 100644 --- a/hw/syn/altera/opae/Makefile +++ b/hw/syn/altera/opae/Makefile @@ -73,7 +73,7 @@ ifneq ($(TARGET), fpga) CFLAGS += -DSIMULATION endif -# Debugigng +# Debugging ifdef DEBUG ifneq ($(TARGET), fpga) CFLAGS += $(DBG_TRACE_FLAGS) diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index cbf0f4068..d5ada6062 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -111,7 +111,7 @@ ifeq ($(TARGET), hw_emu) CFLAGS += -DSIMULATION endif -# Debugigng +# Debugging ifdef DEBUG VPP_FLAGS += -g --debug.protocol all ifneq ($(TARGET), hw) diff --git a/hw/syn/yosys/Makefile b/hw/syn/yosys/Makefile index a0c4fdcc9..80bfdae02 100644 --- a/hw/syn/yosys/Makefile +++ b/hw/syn/yosys/Makefile @@ -49,7 +49,7 @@ endif RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache RTL_INCLUDE += $(FPU_INCLUDE) -# Debugigng +# Debugging ifdef DEBUG CFLAGS += $(DBG_TRACE_FLAGS) else diff --git a/hw/unittest/common.mk b/hw/unittest/common.mk index ac3e6b4ff..48aefd415 100644 --- a/hw/unittest/common.mk +++ b/hw/unittest/common.mk @@ -29,7 +29,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count() VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) -# Debugigng +# Debugging ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) diff --git a/runtime/opae/Makefile b/runtime/opae/Makefile index 3954d3f19..1a9810eca 100644 --- a/runtime/opae/Makefile +++ b/runtime/opae/Makefile @@ -30,7 +30,7 @@ else CXXFLAGS += -I$(SYN_DIR) endif -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/runtime/rtlsim/Makefile b/runtime/rtlsim/Makefile index 4523be18d..f6adbf8c8 100644 --- a/runtime/rtlsim/Makefile +++ b/runtime/rtlsim/Makefile @@ -19,7 +19,7 @@ LDFLAGS += -L$(DESTDIR) -lrtlsim SRCS := $(SRC_DIR)/vortex.cpp -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/runtime/simx/Makefile b/runtime/simx/Makefile index 7c73ca66d..c20e33b53 100644 --- a/runtime/simx/Makefile +++ b/runtime/simx/Makefile @@ -15,7 +15,7 @@ LDFLAGS += -L$(DESTDIR) -lsimx SRCS := $(SRC_DIR)/vortex.cpp -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/runtime/stub/Makefile b/runtime/stub/Makefile index 6dc8d88f8..ae6e27ed1 100644 --- a/runtime/stub/Makefile +++ b/runtime/stub/Makefile @@ -12,7 +12,7 @@ LDFLAGS += -shared -pthread -ldl SRCS := $(SRC_DIR)/vortex.cpp $(SRC_DIR)/utils.cpp -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/runtime/xrt/Makefile b/runtime/xrt/Makefile index 4a30c23cb..66d3e481b 100644 --- a/runtime/xrt/Makefile +++ b/runtime/xrt/Makefile @@ -26,7 +26,7 @@ endif PROJECT := libvortex-xrt.so -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/sim/opaesim/Makefile b/sim/opaesim/Makefile index 2a4eaf02d..2e549ca74 100644 --- a/sim/opaesim/Makefile +++ b/sim/opaesim/Makefile @@ -83,7 +83,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count() VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) -# Debugigng +# Debugging ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index e9487a2f4..3deffc759 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -65,7 +65,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count() VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) -# Debugigng +# Debugging ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 622f653dd..31fde7023 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -20,7 +20,7 @@ LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulato SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) #CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer diff --git a/sim/xrtsim/Makefile b/sim/xrtsim/Makefile index c2128f3c4..765e3e268 100644 --- a/sim/xrtsim/Makefile +++ b/sim/xrtsim/Makefile @@ -82,7 +82,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count() VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) -# Debugigng +# Debugging ifdef DEBUG VL_FLAGS += --trace --trace-structs $(DBG_FLAGS) CXXFLAGS += -g -O0 $(DBG_FLAGS) diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk index 0c559e8c5..2e287a944 100644 --- a/tests/opencl/common.mk +++ b/tests/opencl/common.mk @@ -44,7 +44,7 @@ CXXFLAGS += -I$(POCL_PATH)/include POCL_CC_FLAGS += LLVM_PREFIX=$(LLVM_VORTEX) POCL_VORTEX_BINTOOL="$(VX_BINTOOL)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 POCL_CC_FLAGS += POCL_DEBUG=all diff --git a/tests/regression/common.mk b/tests/regression/common.mk index c063fe34e..12b45e848 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -52,7 +52,7 @@ CXXFLAGS += -I$(VORTEX_RT_PATH)/include -I$(ROOT_DIR)/hw LDFLAGS += -L$(ROOT_DIR)/runtime -lvortex -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else diff --git a/tests/unittest/common.mk b/tests/unittest/common.mk index 4f94afa08..a6f6b2794 100644 --- a/tests/unittest/common.mk +++ b/tests/unittest/common.mk @@ -2,7 +2,7 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -I$(VORTEX_RT_PATH)/common -# Debugigng +# Debugging ifdef DEBUG CXXFLAGS += -g -O0 else From fce935f1c4063d10a3b52197010a9bcb14892a54 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 00:54:58 -0700 Subject: [PATCH 61/89] add debug level to FPGA makefile --- hw/syn/altera/opae/Makefile | 2 +- hw/syn/xilinx/xrt/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile index 7d782fcfb..62a9bb72c 100644 --- a/hw/syn/altera/opae/Makefile +++ b/hw/syn/altera/opae/Makefile @@ -76,7 +76,7 @@ endif # Debugging ifdef DEBUG ifneq ($(TARGET), fpga) - CFLAGS += $(DBG_TRACE_FLAGS) + CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS) else CFLAGS += -DNDEBUG endif diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index d5ada6062..38ae29f36 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -116,7 +116,7 @@ ifdef DEBUG VPP_FLAGS += -g --debug.protocol all ifneq ($(TARGET), hw) VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all - CFLAGS += $(DBG_TRACE_FLAGS) + CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS) else CFLAGS += -DNDEBUG endif From 35fb50f9a6b843432a43672c61b0b3ca7c009f62 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 10:43:08 -0700 Subject: [PATCH 62/89] minor updates --- hw/rtl/cache/VX_cache.sv | 5 +---- hw/rtl/cache/VX_cache_bank.sv | 21 ++++++++++++++------- hw/rtl/cache/VX_cache_bypass.sv | 9 +++++++-- hw/rtl/cache/VX_cache_cluster.sv | 8 +++++--- hw/rtl/mem/VX_local_mem.sv | 12 ++++++------ 5 files changed, 33 insertions(+), 22 deletions(-) diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 3c70bce85..f8833dbc2 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -114,16 +114,13 @@ module VX_cache import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0] per_bank_core_req_fire; - // this reset relay is required to sync with bank initialization - `RESET_RELAY (flush_reset, reset); - VX_cache_flush #( .NUM_REQS (NUM_REQS), .NUM_BANKS (NUM_BANKS), .BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency ) flush_unit ( .clk (clk), - .reset (flush_reset), + .reset (reset), .core_bus_in_if (core_bus_if), .core_bus_out_if (core_bus2_if), .bank_req_fire (per_bank_core_req_fire), diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index b2edbf918..ab1fdb3d3 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -172,6 +172,9 @@ module VX_cache_bank #( // ensure we have no pending memory request in the bank wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty; + // this reset relay should match pipeline during tags initialization + `RESET_RELAY (flush_reset, reset); + // flush unit VX_bank_flush #( .BANK_ID (BANK_ID), @@ -182,7 +185,7 @@ module VX_cache_bank #( .WRITEBACK (WRITEBACK) ) flush_unit ( .clk (clk), - .reset (reset), + .reset (flush_reset), .flush_begin (flush_begin), .flush_end (flush_end), .flush_init (init_valid), @@ -269,15 +272,17 @@ module VX_cache_bank #( assign req_uuid_sel = 0; end + `RESET_RELAY (pipe0_reset, reset); + VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH), .RESETW (1) ) pipe_reg0 ( .clk (clk), - .reset (reset), + .reset (pipe0_reset), .enable (~pipe_stall), - .data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), - .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0}) + .data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), + .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0}) ); if (UUID_WIDTH != 0) begin @@ -304,7 +309,7 @@ module VX_cache_bank #( wire [NUM_WAYS-1:0] evict_way_st0; wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0; - `RESET_RELAY (tag_reset, reset); + `RESET_RELAY (tags_reset, reset); VX_cache_tags #( .INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)), @@ -318,7 +323,7 @@ module VX_cache_bank #( .UUID_WIDTH (UUID_WIDTH) ) cache_tags ( .clk (clk), - .reset (tag_reset), + .reset (tags_reset), .req_uuid (req_uuid_st0), @@ -350,12 +355,14 @@ module VX_cache_bank #( assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0; + `RESET_RELAY (pipe1_reset, reset); + VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1), .RESETW (1) ) pipe_reg1 ( .clk (clk), - .reset (reset), + .reset (pipe1_reset), .enable (~pipe_stall), .data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}), .data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1}) diff --git a/hw/rtl/cache/VX_cache_bypass.sv b/hw/rtl/cache/VX_cache_bypass.sv index 379d33e8a..90ba0f30d 100644 --- a/hw/rtl/cache/VX_cache_bypass.sv +++ b/hw/rtl/cache/VX_cache_bypass.sv @@ -217,13 +217,15 @@ module VX_cache_bypass #( assign mem_bus_in_if.req_ready = mem_req_out_ready; + `RESET_RELAY (mem_req_reset, reset); + VX_elastic_buffer #( .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH), .SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( .clk (clk), - .reset (reset), + .reset (mem_req_reset), .valid_in (mem_req_out_valid), .ready_in (mem_req_out_ready), .data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}), @@ -309,13 +311,16 @@ module VX_cache_bypass #( end for (genvar i = 0; i < NUM_REQS; ++i) begin + + `RESET_RELAY (core_rsp_reset, reset); + VX_elastic_buffer #( .DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH), .SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0), .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) ) core_rsp_buf ( .clk (clk), - .reset (reset), + .reset (core_rsp_reset), .valid_in (core_rsp_in_valid[i]), .ready_in (core_rsp_in_ready[i]), .data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}), diff --git a/hw/rtl/cache/VX_cache_cluster.sv b/hw/rtl/cache/VX_cache_cluster.sv index 716e69561..0db1360d4 100644 --- a/hw/rtl/cache/VX_cache_cluster.sv +++ b/hw/rtl/cache/VX_cache_cluster.sv @@ -117,7 +117,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( `ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]); end - `RESET_RELAY (arb_reset, reset); + `RESET_RELAY (cache_arb_reset, reset); VX_mem_arb #( .NUM_INPUTS (NUM_INPUTS), @@ -130,7 +130,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0) ) cache_arb ( .clk (clk), - .reset (arb_reset), + .reset (cache_arb_reset), .bus_in_if (core_bus_tmp_if), .bus_out_if (arb_core_bus_tmp_if) ); @@ -182,6 +182,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)) ) mem_bus_tmp_if[1](); + `RESET_RELAY (mem_arb_reset, reset); + VX_mem_arb #( .NUM_INPUTS (NUM_CACHES), .DATA_SIZE (LINE_SIZE), @@ -192,7 +194,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0) ) mem_arb ( .clk (clk), - .reset (reset), + .reset (mem_arb_reset), .bus_in_if (cache_mem_bus_if), .bus_out_if (mem_bus_tmp_if) ); diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index 274794b07..88e736e4c 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -159,12 +159,12 @@ module VX_local_mem import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_rsp_tag; wire [NUM_BANKS-1:0] per_bank_rsp_ready; - `RESET_RELAY (bank_reset, reset); - for (genvar i = 0; i < NUM_BANKS; ++i) begin wire bank_rsp_valid, bank_rsp_ready; wire [WORD_WIDTH-1:0] bank_rsp_data; + `RESET_RELAY (bram_reset, reset); + VX_sp_ram #( .DATAW (WORD_WIDTH), .SIZE (WORDS_PER_BANK), @@ -172,7 +172,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .NO_RWCHECK (1) ) data_store ( .clk (clk), - .reset (reset), + .reset (bram_reset), .read (per_bank_req_valid[i] && per_bank_req_ready[i] && ~per_bank_req_rw[i]), .write (per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]), .wren (per_bank_req_byteen[i]), @@ -185,7 +185,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( reg [BANK_ADDR_WIDTH-1:0] last_wr_addr; reg last_wr_valid; always @(posedge clk) begin - if (reset) begin + if (bram_reset) begin last_wr_valid <= 0; end else begin last_wr_valid <= per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]; @@ -201,9 +201,9 @@ module VX_local_mem import VX_gpu_pkg::*; #( // register BRAM output VX_pipe_buffer #( .DATAW (REQ_SEL_WIDTH + WORD_WIDTH + TAG_WIDTH) - ) bank_buf ( + ) bram_buf ( .clk (clk), - .reset (bank_reset), + .reset (bram_reset), .valid_in (bank_rsp_valid), .ready_in (bank_rsp_ready), .data_in ({per_bank_req_idx[i], bank_rsp_data, per_bank_req_tag[i]}), From 4b93c9ffb56053dcf8a8846e4dabf89ff6da16d5 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 11:49:12 -0700 Subject: [PATCH 63/89] minor updates --- hw/rtl/core/VX_alu_muldiv.sv | 2 +- hw/rtl/core/VX_operands.sv | 22 ++++++++---- hw/rtl/libs/VX_avs_adapter.sv | 18 +++++++--- hw/rtl/libs/VX_axi_adapter.sv | 4 ++- hw/rtl/libs/VX_mem_adapter.sv | 62 ++++++++++++++++++--------------- hw/rtl/libs/VX_mem_coalescer.sv | 52 +++++++++++---------------- hw/rtl/libs/VX_mem_scheduler.sv | 12 +++++-- 7 files changed, 97 insertions(+), 75 deletions(-) diff --git a/hw/rtl/core/VX_alu_muldiv.sv b/hw/rtl/core/VX_alu_muldiv.sv index 1a4806705..3beb035f4 100644 --- a/hw/rtl/core/VX_alu_muldiv.sv +++ b/hw/rtl/core/VX_alu_muldiv.sv @@ -83,7 +83,7 @@ module VX_alu_muldiv #( .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( - .clk(clk), + .clk (clk), .reset (reset), .enable (mul_ready_in), .data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, mul_result_tmp}), diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index b438997ec..afb5546dd 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -100,6 +100,8 @@ module VX_operands import VX_gpu_pkg::*; #( assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid; + `RESET_RELAY (req_xbar_reset, reset); + VX_stream_xbar #( .NUM_INPUTS (NUM_SRC_REGS), .NUM_OUTPUTS (NUM_BANKS), @@ -109,7 +111,7 @@ module VX_operands import VX_gpu_pkg::*; #( .OUT_BUF (0) // no output buffering ) req_xbar ( .clk (clk), - .reset (reset), + .reset (req_xbar_reset), `UNUSED_PIN(collisions), .valid_in (req_in_valid), .data_in (req_in_data), @@ -162,12 +164,14 @@ module VX_operands import VX_gpu_pkg::*; #( scoreboard_if.data.uuid }; + `RESET_RELAY (pipe1_reset, reset); + VX_pipe_register #( .DATAW (1 + NUM_BANKS + NUM_SRC_REGS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)), .RESETW (1 + NUM_BANKS + NUM_SRC_REGS) ) pipe_reg1 ( .clk (clk), - .reset (reset), + .reset (pipe1_reset), .enable (pipe_in_ready), .data_in ({scoreboard_if.valid, gpr_rd_valid, data_fetched_n, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}), .data_out ({pipe_valid_st1, gpr_rd_valid_st1, data_fetched_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}) @@ -179,12 +183,14 @@ module VX_operands import VX_gpu_pkg::*; #( wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1; + `RESET_RELAY (pipe2_reset, reset); + VX_pipe_register #( .DATAW (1 + NUM_BANKS + REGS_DATAW + (NUM_BANKS * `XLEN * `NUM_THREADS) + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH), .RESETW (1 + NUM_BANKS + REGS_DATAW) ) pipe_reg2 ( .clk (clk), - .reset (reset), + .reset (pipe2_reset), .enable (pipe_ready_st1), .data_in ({pipe_valid2_st1, gpr_rd_valid_st1, src_data_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}), .data_out ({pipe_valid_st2, gpr_rd_valid_st2, src_data_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2}) @@ -199,14 +205,16 @@ module VX_operands import VX_gpu_pkg::*; #( end end + `RESET_RELAY (out_buf_reset, reset); + VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)), .LUTRAM (1) - ) out_buffer ( + ) out_buf ( .clk (clk), - .reset (reset), + .reset (out_buf_reset), .valid_in (pipe_valid_st2), .ready_in (pipe_ready_st2), .data_in ({ @@ -273,6 +281,8 @@ module VX_operands import VX_gpu_pkg::*; #( assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}}; end + `RESET_RELAY (bram_reset, reset); + VX_dp_ram #( .DATAW (`XLEN * `NUM_THREADS), .SIZE (PER_BANK_REGS * PER_ISSUE_WARPS), @@ -283,7 +293,7 @@ module VX_operands import VX_gpu_pkg::*; #( .NO_RWCHECK (1) ) gpr_ram ( .clk (clk), - .reset (reset), + .reset (bram_reset), .read (pipe_fire_st1), .wren (wren), .write (gpr_wr_enabled), diff --git a/hw/rtl/libs/VX_avs_adapter.sv b/hw/rtl/libs/VX_avs_adapter.sv index 28da07565..e2fefcd35 100644 --- a/hw/rtl/libs/VX_avs_adapter.sv +++ b/hw/rtl/libs/VX_avs_adapter.sv @@ -82,11 +82,14 @@ module VX_avs_adapter #( end for (genvar i = 0; i < NUM_BANKS; ++i) begin + + `RESET_RELAY (rd_req_reset, reset); + VX_pending_size #( .SIZE (RD_QUEUE_SIZE) ) pending_size ( .clk (clk), - .reset (reset), + .reset (rd_req_reset), .incr (req_queue_push[i]), .decr (req_queue_pop[i]), `UNUSED_PIN (empty), @@ -102,7 +105,7 @@ module VX_avs_adapter #( .DEPTH (RD_QUEUE_SIZE) ) rd_req_queue ( .clk (clk), - .reset (reset), + .reset (rd_req_reset), .push (req_queue_push[i]), .pop (req_queue_pop[i]), .data_in (mem_req_tag), @@ -126,13 +129,15 @@ module VX_avs_adapter #( wire valid_out_w = mem_req_valid && ~req_queue_going_full[i] && (req_bank_sel == i); wire ready_out_w; + `RESET_RELAY (req_out_reset, reset); + VX_elastic_buffer #( .DATAW (1 + DATA_SIZE + BANK_OFFSETW + DATA_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)) ) req_out_buf ( .clk (clk), - .reset (reset), + .reset (req_out_reset), .valid_in (valid_out_w), .ready_in (ready_out_w), .data_in ({mem_req_rw, mem_req_byteen, req_bank_off, mem_req_data}), @@ -168,12 +173,15 @@ module VX_avs_adapter #( wire [NUM_BANKS-1:0] rsp_queue_empty; for (genvar i = 0; i < NUM_BANKS; ++i) begin + + `RESET_RELAY (rd_rsp_reset, reset); + VX_fifo_queue #( .DATAW (DATA_WIDTH), .DEPTH (RD_QUEUE_SIZE) ) rd_rsp_queue ( .clk (clk), - .reset (reset), + .reset (rd_rsp_reset), .push (avs_readdatavalid[i]), .pop (req_queue_pop[i]), .data_in (avs_readdata[i]), @@ -192,6 +200,8 @@ module VX_avs_adapter #( assign req_queue_pop[i] = rsp_arb_valid_in[i] && rsp_arb_ready_in[i]; end + `RESET_RELAY (rsp_arb_reset, reset); + VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (DATA_WIDTH + TAG_WIDTH), diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index a1c5b5b36..14e930d74 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -204,6 +204,8 @@ module VX_axi_adapter #( `RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rresp[i] == 0, ("%t: *** AXI response error", $time)); end + `RESET_RELAY (rsp_arb_reset, reset); + VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (DATA_WIDTH + TAG_WIDTH), @@ -211,7 +213,7 @@ module VX_axi_adapter #( .OUT_BUF (RSP_OUT_BUF) ) rsp_arb ( .clk (clk), - .reset (reset), + .reset (rsp_arb_reset), .valid_in (rsp_arb_valid_in), .data_in (rsp_arb_data_in), .ready_in (rsp_arb_ready_in), diff --git a/hw/rtl/libs/VX_mem_adapter.sv b/hw/rtl/libs/VX_mem_adapter.sv index b447bcc35..988ae606c 100644 --- a/hw/rtl/libs/VX_mem_adapter.sv +++ b/hw/rtl/libs/VX_mem_adapter.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,10 +15,10 @@ `TRACING_OFF module VX_mem_adapter #( - parameter SRC_DATA_WIDTH = 1, - parameter SRC_ADDR_WIDTH = 1, - parameter DST_DATA_WIDTH = 1, - parameter DST_ADDR_WIDTH = 1, + parameter SRC_DATA_WIDTH = 1, + parameter SRC_ADDR_WIDTH = 1, + parameter DST_DATA_WIDTH = 1, + parameter DST_ADDR_WIDTH = 1, parameter SRC_TAG_WIDTH = 1, parameter DST_TAG_WIDTH = 1, parameter REQ_OUT_BUF = 0, @@ -35,9 +35,9 @@ module VX_mem_adapter #( input wire [SRC_TAG_WIDTH-1:0] mem_req_tag_in, output wire mem_req_ready_in, - output wire mem_rsp_valid_in, - output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in, - output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in, + output wire mem_rsp_valid_in, + output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in, + output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in, input wire mem_rsp_ready_in, output wire mem_req_valid_out, @@ -48,12 +48,12 @@ module VX_mem_adapter #( output wire [DST_TAG_WIDTH-1:0] mem_req_tag_out, input wire mem_req_ready_out, - input wire mem_rsp_valid_out, - input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_out, + input wire mem_rsp_valid_out, + input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_out, input wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_out, output wire mem_rsp_ready_out -); - `STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!")) +); + `STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!")) localparam DST_DATA_SIZE = (DST_DATA_WIDTH / 8); localparam DST_LDATAW = `CLOG2(DST_DATA_WIDTH); @@ -69,7 +69,7 @@ module VX_mem_adapter #( wire [DST_TAG_WIDTH-1:0] mem_req_tag_out_w; wire mem_req_ready_out_w; - wire mem_rsp_valid_in_w; + wire mem_rsp_valid_in_w; wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in_w; wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in_w; wire mem_rsp_ready_in_w; @@ -80,7 +80,7 @@ module VX_mem_adapter #( `UNUSED_VAR (clk) `UNUSED_VAR (reset) - + wire [D-1:0] req_idx = mem_req_addr_in[D-1:0]; wire [D-1:0] rsp_idx = mem_rsp_tag_out[D-1:0]; @@ -99,31 +99,31 @@ module VX_mem_adapter #( assign mem_req_valid_out_w = mem_req_valid_in; assign mem_req_rw_out_w = mem_req_rw_in; - assign mem_req_byteen_out_w = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3)); + assign mem_req_byteen_out_w = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3)); assign mem_req_data_out_w = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW); assign mem_req_tag_out_w = DST_TAG_WIDTH'({mem_req_tag_in, req_idx}); assign mem_req_ready_in = mem_req_ready_out_w; assign mem_rsp_valid_in_w = mem_rsp_valid_out; - assign mem_rsp_data_in_w = mem_rsp_data_out_w[rsp_idx]; + assign mem_rsp_data_in_w = mem_rsp_data_out_w[rsp_idx]; assign mem_rsp_tag_in_w = SRC_TAG_WIDTH'(mem_rsp_tag_out[SRC_TAG_WIDTH+D-1:D]); assign mem_rsp_ready_out = mem_rsp_ready_in_w; end else if (DST_LDATAW < SRC_LDATAW) begin - + reg [D-1:0] req_ctr, rsp_ctr; reg [P-1:0][DST_DATA_WIDTH-1:0] mem_rsp_data_out_r, mem_rsp_data_out_n; wire mem_req_out_fire = mem_req_valid_out && mem_req_ready_out; - wire mem_rsp_in_fire = mem_rsp_valid_out && mem_rsp_ready_out; + wire mem_rsp_in_fire = mem_rsp_valid_out && mem_rsp_ready_out; wire [P-1:0][DST_DATA_WIDTH-1:0] mem_req_data_in_w = mem_req_data_in; wire [P-1:0][DST_DATA_SIZE-1:0] mem_req_byteen_in_w = mem_req_byteen_in; always @(*) begin mem_rsp_data_out_n = mem_rsp_data_out_r; - if (mem_rsp_in_fire) begin + if (mem_rsp_in_fire) begin mem_rsp_data_out_n[rsp_ctr] = mem_rsp_data_out; end end @@ -139,24 +139,24 @@ module VX_mem_adapter #( if (mem_rsp_in_fire) begin rsp_ctr <= rsp_ctr + 1; end - end + end mem_rsp_data_out_r <= mem_rsp_data_out_n; end reg [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_r; wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_x; - + always @(posedge clk) begin if (mem_rsp_in_fire) begin mem_rsp_tag_in_r <= mem_rsp_tag_out; - end + end end assign mem_rsp_tag_in_x = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_out; - `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_x == mem_rsp_tag_out), + `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_x == mem_rsp_tag_out), ("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_x, mem_rsp_tag_out)) wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr}; - + if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin `UNUSED_VAR (mem_req_addr_in_qual) assign mem_req_addr_out_w = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0]; @@ -181,8 +181,8 @@ module VX_mem_adapter #( end else begin `UNUSED_VAR (clk) - `UNUSED_VAR (reset) - + `UNUSED_VAR (reset) + if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin `UNUSED_VAR (mem_req_addr_in) assign mem_req_addr_out_w = mem_req_addr_in[DST_ADDR_WIDTH-1:0]; @@ -206,13 +206,15 @@ module VX_mem_adapter #( end + `RESET_RELAY (req_out_reset, reset); + VX_elastic_buffer #( .DATAW (1 + DST_DATA_SIZE + DST_ADDR_WIDTH + DST_DATA_WIDTH + DST_TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)) ) req_out_buf ( .clk (clk), - .reset (reset), + .reset (req_out_reset), .valid_in (mem_req_valid_out_w), .ready_in (mem_req_ready_out_w), .data_in ({mem_req_rw_out_w, mem_req_byteen_out_w, mem_req_addr_out_w, mem_req_data_out_w, mem_req_tag_out_w}), @@ -221,13 +223,15 @@ module VX_mem_adapter #( .ready_out (mem_req_ready_out) ); + `RESET_RELAY (rsp_in_reset, reset); + VX_elastic_buffer #( .DATAW (SRC_DATA_WIDTH + SRC_TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(RSP_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(RSP_OUT_BUF)) ) rsp_in_buf ( .clk (clk), - .reset (reset), + .reset (rsp_in_reset), .valid_in (mem_rsp_valid_in_w), .ready_in (mem_rsp_ready_in_w), .data_in ({mem_rsp_data_in_w, mem_rsp_tag_in_w}), diff --git a/hw/rtl/libs/VX_mem_coalescer.sv b/hw/rtl/libs/VX_mem_coalescer.sv index 17eb01642..4881ed1ef 100644 --- a/hw/rtl/libs/VX_mem_coalescer.sv +++ b/hw/rtl/libs/VX_mem_coalescer.sv @@ -89,14 +89,14 @@ module VX_mem_coalescer #( reg state_r, state_n; - reg out_req_valid_r, out_req_valid_n; - reg out_req_rw_r, out_req_rw_n; - reg [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n; - reg [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n; - reg [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n; - reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n; - reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n; - reg [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n; + logic out_req_valid_r, out_req_valid_n; + logic out_req_rw_r, out_req_rw_n; + logic [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n; + logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n; + logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n; + logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n; + logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n; + logic [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n; reg in_req_ready_n; @@ -149,29 +149,6 @@ module VX_mem_coalescer #( end end - always @(posedge clk) begin - if (reset) begin - state_r <= STATE_SETUP; - processed_mask_r <= '0; - out_req_valid_r <= 0; - end else begin - state_r <= state_n; - batch_valid_r <= batch_valid_n; - seed_addr_r <= seed_addr_n; - seed_atype_r <= seed_atype_n; - addr_matches_r <= addr_matches_n; - out_req_valid_r <= out_req_valid_n; - out_req_mask_r <= out_req_mask_n; - out_req_rw_r <= out_req_rw_n; - out_req_addr_r <= out_req_addr_n; - out_req_atype_r <= out_req_atype_n; - out_req_byteen_r <= out_req_byteen_n; - out_req_data_r <= out_req_data_n; - out_req_tag_r <= out_req_tag_n; - processed_mask_r <= processed_mask_n; - end - end - wire [NUM_REQS-1:0] current_pmask = in_req_mask & addr_matches_r; reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] req_byteen_merged; @@ -248,6 +225,19 @@ module VX_mem_coalescer #( endcase end + `RESET_RELAY (pipe_reset, reset); + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + ATYPE_WIDTH + OUT_ADDR_WIDTH + ATYPE_WIDTH + DATA_OUT_SIZE + DATA_OUT_WIDTH + OUT_TAG_WIDTH)), + .RESETW (1 + NUM_REQS + 1) + ) pipe_reg ( + .clk (clk), + .reset (pipe_reset), + .enable (1'b1), + .data_in ({state_n, processed_mask_n, out_req_valid_n, out_req_rw_n, addr_matches_n, batch_valid_n, out_req_mask_n, seed_addr_n, seed_atype_n, out_req_addr_n, out_req_atype_n, out_req_byteen_n, out_req_data_n, out_req_tag_n}), + .data_out ({state_r, processed_mask_r, out_req_valid_r, out_req_rw_r, addr_matches_r, batch_valid_r, out_req_mask_r, seed_addr_r, seed_atype_r, out_req_addr_r, out_req_atype_r, out_req_byteen_r, out_req_data_r, out_req_tag_r}) + ); + wire out_rsp_fire = out_rsp_valid && out_rsp_ready; wire out_rsp_eop; diff --git a/hw/rtl/libs/VX_mem_scheduler.sv b/hw/rtl/libs/VX_mem_scheduler.sv index aa3ef9b2f..bd0c2de9c 100644 --- a/hw/rtl/libs/VX_mem_scheduler.sv +++ b/hw/rtl/libs/VX_mem_scheduler.sv @@ -167,13 +167,15 @@ module VX_mem_scheduler #( assign reqq_tag_u = ibuf_waddr; end + `RESET_RELAY (reqq_reset, reset); + VX_elastic_buffer #( .DATAW (1 + CORE_REQS * (1 + WORD_SIZE + ADDR_WIDTH + ATYPE_WIDTH + WORD_WIDTH) + REQQ_TAG_WIDTH), .SIZE (CORE_QUEUE_SIZE), .OUT_REG (1) ) req_queue ( .clk (clk), - .reset (reset), + .reset (reqq_reset), .valid_in (reqq_valid_in), .ready_in (reqq_ready_in), .data_in ({core_req_rw, core_req_mask, core_req_byteen, core_req_addr, core_req_atype, core_req_data, reqq_tag_u}), @@ -389,13 +391,15 @@ module VX_mem_scheduler #( assign reqq_ready_s = req_sent_all; + `RESET_RELAY (mem_req_reset, reset); + VX_elastic_buffer #( .DATAW (MEM_CHANNELS + 1 + MEM_CHANNELS * (LINE_SIZE + MEM_ADDR_WIDTH + ATYPE_WIDTH + LINE_WIDTH) + MEM_TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( .clk (clk), - .reset (reset), + .reset (mem_req_reset), .valid_in (mem_req_valid_s), .ready_in (mem_req_ready_s), .data_in ({mem_req_mask_s, mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_atype_s, mem_req_data_s, mem_req_tag_s}), @@ -509,13 +513,15 @@ module VX_mem_scheduler #( // Send response to caller + `RESET_RELAY (crsp_reset, reset); + VX_elastic_buffer #( .DATAW (CORE_REQS + 1 + 1 + (CORE_REQS * WORD_WIDTH) + TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(CORE_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) ) rsp_buf ( .clk (clk), - .reset (reset), + .reset (crsp_reset), .valid_in (crsp_valid), .ready_in (crsp_ready), .data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag}), From 07981a585c841ae9c904bce0869a5ae207572af4 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 13:00:34 -0700 Subject: [PATCH 64/89] minor update --- hw/rtl/libs/VX_avs_adapter.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/libs/VX_avs_adapter.sv b/hw/rtl/libs/VX_avs_adapter.sv index e2fefcd35..a5fb976ab 100644 --- a/hw/rtl/libs/VX_avs_adapter.sv +++ b/hw/rtl/libs/VX_avs_adapter.sv @@ -209,7 +209,7 @@ module VX_avs_adapter #( .OUT_BUF (RSP_OUT_BUF) ) rsp_arb ( .clk (clk), - .reset (reset), + .reset (rsp_arb_reset), .valid_in (rsp_arb_valid_in), .data_in (rsp_arb_data_in), .ready_in (rsp_arb_ready_in), From 4b6a48c7161840d3fd79610ca437e2b01bb15126 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 13:37:01 -0700 Subject: [PATCH 65/89] minor update --- hw/rtl/libs/VX_mem_coalescer.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/rtl/libs/VX_mem_coalescer.sv b/hw/rtl/libs/VX_mem_coalescer.sv index 4881ed1ef..bf9f698fe 100644 --- a/hw/rtl/libs/VX_mem_coalescer.sv +++ b/hw/rtl/libs/VX_mem_coalescer.sv @@ -87,7 +87,7 @@ module VX_mem_coalescer #( localparam STATE_SETUP = 0; localparam STATE_SEND = 1; - reg state_r, state_n; + logic state_r, state_n; logic out_req_valid_r, out_req_valid_n; logic out_req_rw_r, out_req_rw_n; @@ -228,7 +228,7 @@ module VX_mem_coalescer #( `RESET_RELAY (pipe_reset, reset); VX_pipe_register #( - .DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + ATYPE_WIDTH + OUT_ADDR_WIDTH + ATYPE_WIDTH + DATA_OUT_SIZE + DATA_OUT_WIDTH + OUT_TAG_WIDTH)), + .DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + ATYPE_WIDTH + OUT_ADDR_WIDTH + ATYPE_WIDTH + DATA_OUT_SIZE + DATA_OUT_WIDTH) + OUT_TAG_WIDTH), .RESETW (1 + NUM_REQS + 1) ) pipe_reg ( .clk (clk), From cb1e49d3f60302c6141587a0b15e52a1efdf0be0 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 17:08:16 -0700 Subject: [PATCH 66/89] minor update --- hw/rtl/fpu/VX_fpu_cvt.sv | 2 +- hw/rtl/fpu/VX_fpu_div.sv | 2 +- hw/rtl/fpu/VX_fpu_fma.sv | 4 ++-- hw/rtl/fpu/VX_fpu_ncp.sv | 2 +- hw/rtl/fpu/VX_fpu_sqrt.sv | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hw/rtl/fpu/VX_fpu_cvt.sv b/hw/rtl/fpu/VX_fpu_cvt.sv index 4c1a6e755..37a2ab419 100644 --- a/hw/rtl/fpu/VX_fpu_cvt.sv +++ b/hw/rtl/fpu/VX_fpu_cvt.sv @@ -64,7 +64,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), .PE_REG (0), - .OUT_BUF ((`FCVT_PE_RATIO > 2) ? 1 : 0) + .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_fpu_div.sv b/hw/rtl/fpu/VX_fpu_div.sv index 992f0fbe9..81fc8f022 100644 --- a/hw/rtl/fpu/VX_fpu_div.sv +++ b/hw/rtl/fpu/VX_fpu_div.sv @@ -68,7 +68,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #( .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), .PE_REG (0), - .OUT_BUF ((`FDIV_PE_RATIO > 2) ? 1 : 0) + .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_fpu_fma.sv b/hw/rtl/fpu/VX_fpu_fma.sv index 33790dfca..3522d8a1e 100644 --- a/hw/rtl/fpu/VX_fpu_fma.sv +++ b/hw/rtl/fpu/VX_fpu_fma.sv @@ -98,8 +98,8 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( .DATA_IN_WIDTH(3*32), .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG ((`FMA_PE_RATIO != 1) ? 1 : 0), - .OUT_BUF ((`FMA_PE_RATIO > 2) ? 1 : 0) + .PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0), // must be registered for DSPs + .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_fpu_ncp.sv b/hw/rtl/fpu/VX_fpu_ncp.sv index 0479b8826..34b822d89 100644 --- a/hw/rtl/fpu/VX_fpu_ncp.sv +++ b/hw/rtl/fpu/VX_fpu_ncp.sv @@ -69,7 +69,7 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), .PE_REG (0), - .OUT_BUF ((`FNCP_PE_RATIO > 2) ? 1 : 0) + .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_fpu_sqrt.sv b/hw/rtl/fpu/VX_fpu_sqrt.sv index 2e32077a4..a6e6dda9a 100644 --- a/hw/rtl/fpu/VX_fpu_sqrt.sv +++ b/hw/rtl/fpu/VX_fpu_sqrt.sv @@ -62,7 +62,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), .PE_REG (0), - .OUT_BUF ((`FSQRT_PE_RATIO > 2) ? 1 : 0) + .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) ) pe_serializer ( .clk (clk), .reset (reset), From 668b590876b09c0024a4f58eb511fdcacac8943a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 3 Aug 2024 18:25:24 -0700 Subject: [PATCH 67/89] minor update --- kernel/src/vx_start.S | 2 ++ kernel/src/vx_syscalls.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S index af0ef1428..799967432 100644 --- a/kernel/src/vx_start.S +++ b/kernel/src/vx_start.S @@ -51,8 +51,10 @@ _start: # la t0, trap_entry # csrw mtvec, t0 +#ifdef HAVE_INITFINI_ARRAY # run global initialization functions call __libc_init_array +#endif # call main program routine call main diff --git a/kernel/src/vx_syscalls.c b/kernel/src/vx_syscalls.c index 4759fe622..6f9c829ad 100644 --- a/kernel/src/vx_syscalls.c +++ b/kernel/src/vx_syscalls.c @@ -122,8 +122,10 @@ void __libc_fini_array (void) { // This function will be called by LIBC at program exit. // Since this platform only support statically linked programs, // it is not required to support LIBC's exit functions registration via atexit(). -void __funcs_on_exit() { +void __funcs_on_exit (void) { +#ifdef HAVE_INITFINI_ARRAY __libc_fini_array(); +#endif } #ifdef __cplusplus From 74579fd4dc863349cf37120c57afc165e1b26bee Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 4 Aug 2024 14:13:26 -0700 Subject: [PATCH 68/89] minor update --- hw/rtl/core/VX_operands.sv | 16 ++++++++-------- hw/rtl/libs/VX_onehot_mux.sv | 2 +- hw/rtl/mem/VX_local_mem.sv | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index afb5546dd..acf3f0755 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -167,14 +167,14 @@ module VX_operands import VX_gpu_pkg::*; #( `RESET_RELAY (pipe1_reset, reset); VX_pipe_register #( - .DATAW (1 + NUM_BANKS + NUM_SRC_REGS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)), - .RESETW (1 + NUM_BANKS + NUM_SRC_REGS) + .DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)), + .RESETW (1 + NUM_SRC_REGS) ) pipe_reg1 ( .clk (clk), .reset (pipe1_reset), .enable (pipe_in_ready), - .data_in ({scoreboard_if.valid, gpr_rd_valid, data_fetched_n, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}), - .data_out ({pipe_valid_st1, gpr_rd_valid_st1, data_fetched_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}) + .data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}), + .data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}) ); assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2; @@ -186,14 +186,14 @@ module VX_operands import VX_gpu_pkg::*; #( `RESET_RELAY (pipe2_reset, reset); VX_pipe_register #( - .DATAW (1 + NUM_BANKS + REGS_DATAW + (NUM_BANKS * `XLEN * `NUM_THREADS) + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH), - .RESETW (1 + NUM_BANKS + REGS_DATAW) + .DATAW (1 + REGS_DATAW + NUM_BANKS + (NUM_BANKS * `XLEN * `NUM_THREADS) + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH), + .RESETW (1 + REGS_DATAW) ) pipe_reg2 ( .clk (clk), .reset (pipe2_reset), .enable (pipe_ready_st1), - .data_in ({pipe_valid2_st1, gpr_rd_valid_st1, src_data_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}), - .data_out ({pipe_valid_st2, gpr_rd_valid_st2, src_data_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2}) + .data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}), + .data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2}) ); always @(*) begin diff --git a/hw/rtl/libs/VX_onehot_mux.sv b/hw/rtl/libs/VX_onehot_mux.sv index cc0fffaa6..5b4cf3f38 100644 --- a/hw/rtl/libs/VX_onehot_mux.sv +++ b/hw/rtl/libs/VX_onehot_mux.sv @@ -18,7 +18,7 @@ module VX_onehot_mux #( parameter DATAW = 1, parameter N = 1, parameter MODEL = 1, - parameter LUT_OPT = 0 + parameter LUT_OPT = 1 ) ( input wire [N-1:0][DATAW-1:0] data_in, input wire [N-1:0] sel_in, diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index 88e736e4c..fea967d8c 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -229,7 +229,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .NUM_INPUTS (NUM_BANKS), .NUM_OUTPUTS (NUM_REQS), .DATAW (RSP_DATAW), - .ARBITER ("F"), + .ARBITER ("P"), // this priority arbiter has negligeable impact om performance .OUT_BUF (OUT_BUF) ) rsp_xbar ( .clk (clk), From 59108525e1d655baa02b4096ea94c4815a0f8c73 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 4 Aug 2024 14:16:08 -0700 Subject: [PATCH 69/89] minor update --- hw/syn/altera/quartus/project.sdc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/syn/altera/quartus/project.sdc b/hw/syn/altera/quartus/project.sdc index f6373a643..6ea508531 100644 --- a/hw/syn/altera/quartus/project.sdc +++ b/hw/syn/altera/quartus/project.sdc @@ -1 +1 @@ -create_clock -name {clk} -period "220 MHz" -waveform { 0.000 1.0 } [get_ports {clk}] \ No newline at end of file +create_clock -name {clk} -period "200 MHz" -waveform { 0.000 1.0 } [get_ports {clk}] \ No newline at end of file From 42c62001ec22277080e9a3f3933383ef0b4e856a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 4 Aug 2024 22:13:47 -0700 Subject: [PATCH 70/89] fair arbiter speed optimization --- hw/rtl/libs/VX_fair_arbiter.sv | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/hw/rtl/libs/VX_fair_arbiter.sv b/hw/rtl/libs/VX_fair_arbiter.sv index 838563dd8..82bcfc5c6 100644 --- a/hw/rtl/libs/VX_fair_arbiter.sv +++ b/hw/rtl/libs/VX_fair_arbiter.sv @@ -38,17 +38,16 @@ module VX_fair_arbiter #( end else begin - reg [NUM_REQS-1:0] grant_mask; + reg [NUM_REQS-1:0] requests_r; - wire [NUM_REQS-1:0] requests_rem = requests & ~grant_mask; - wire rem_valid = (| requests_rem); - wire [NUM_REQS-1:0] requests_qual = rem_valid ? requests_rem : requests; + wire [NUM_REQS-1:0] requests_sel = requests_r & requests; + wire [NUM_REQS-1:0] requests_qual = (| requests_sel) ? requests_sel : requests; always @(posedge clk) begin if (reset) begin - grant_mask <= '0; + requests_r <= '0; end else if (grant_ready) begin - grant_mask <= rem_valid ? (grant_mask | grant_onehot) : grant_onehot; + requests_r <= requests_qual & ~grant_onehot; end end From b81ae8e431939ea4aa0aabfa166900f8d33d2f2f Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 4 Aug 2024 22:50:28 -0700 Subject: [PATCH 71/89] reset network cleanup --- hw/rtl/VX_socket.sv | 4 +-- hw/rtl/afu/opae/vortex_afu.sv | 8 ++---- hw/rtl/cache/VX_cache.sv | 18 ++++-------- hw/rtl/cache/VX_cache_bank.sv | 33 ++++++---------------- hw/rtl/cache/VX_cache_bypass.sv | 9 ++---- hw/rtl/cache/VX_cache_cluster.sv | 10 +++---- hw/rtl/core/VX_alu_unit.sv | 14 +++------- hw/rtl/core/VX_dispatch_unit.sv | 10 +++---- hw/rtl/core/VX_fpu_unit.sv | 18 ++++-------- hw/rtl/core/VX_gather_unit.sv | 4 +-- hw/rtl/core/VX_lmem_unit.sv | 14 ++++------ hw/rtl/core/VX_operands.sv | 28 +++++++------------ hw/rtl/core/VX_schedule.sv | 6 ++-- hw/rtl/libs/VX_avs_adapter.sv | 20 +++++-------- hw/rtl/libs/VX_axi_adapter.sv | 6 ++-- hw/rtl/libs/VX_mem_adapter.sv | 8 ++---- hw/rtl/libs/VX_mem_coalescer.sv | 4 +-- hw/rtl/libs/VX_mem_scheduler.sv | 12 ++------ hw/rtl/libs/VX_pe_serializer.sv | 4 +-- hw/rtl/libs/VX_reset_relay.sv | 10 +++---- hw/rtl/libs/VX_stream_arb.sv | 10 +++---- hw/rtl/libs/VX_stream_switch.sv | 48 +++++++++++++++----------------- hw/rtl/libs/VX_stream_xbar.sv | 7 ++--- hw/rtl/mem/VX_local_mem.sv | 2 +- 24 files changed, 111 insertions(+), 196 deletions(-) diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index df2b284eb..694edfe9c 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -179,8 +179,6 @@ module VX_socket import VX_gpu_pkg::*; #( `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH); `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); - `RESET_RELAY (mem_arb_reset, reset); - VX_mem_arb #( .NUM_INPUTS (2), .DATA_SIZE (`L1_LINE_SIZE), @@ -191,7 +189,7 @@ module VX_socket import VX_gpu_pkg::*; #( .RSP_OUT_BUF (2) ) mem_arb ( .clk (clk), - .reset (mem_arb_reset), + .reset (reset), .bus_in_if (l1_mem_bus_if), .bus_out_if (l1_mem_arb_bus_if) ); diff --git a/hw/rtl/afu/opae/vortex_afu.sv b/hw/rtl/afu/opae/vortex_afu.sv index 63a7a6ed1..93f63c48d 100644 --- a/hw/rtl/afu/opae/vortex_afu.sv +++ b/hw/rtl/afu/opae/vortex_afu.sv @@ -580,8 +580,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ .TAG_WIDTH (AVS_REQ_TAGW+1) ) mem_bus_if[1](); - `RESET_RELAY (mem_arb_reset, reset); - VX_mem_arb #( .NUM_INPUTS (2), .DATA_SIZE (LMEM_DATA_SIZE), @@ -592,7 +590,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ .RSP_OUT_BUF (0) ) mem_arb ( .clk (clk), - .reset (mem_arb_reset), + .reset (reset), .bus_in_if (cci_vx_mem_bus_if), .bus_out_if (mem_bus_if) ); @@ -778,14 +776,12 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ end end - `RESET_RELAY (cci_rdq_reset, reset); - VX_fifo_queue #( .DATAW (CCI_RD_QUEUE_DATAW), .DEPTH (CCI_RD_QUEUE_SIZE) ) cci_rd_req_queue ( .clk (clk), - .reset (cci_rdq_reset), + .reset (reset), .push (cci_rdq_push), .pop (cci_rdq_pop), .data_in (cci_rdq_din), diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index f8833dbc2..ae0747690 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -136,9 +136,9 @@ module VX_cache import VX_gpu_pkg::*; #( wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s; wire [NUM_REQS-1:0] core_rsp_ready_s; - for (genvar i = 0; i < NUM_REQS; ++i) begin + `RESET_RELAY_EX (core_rsp_reset, reset, NUM_REQS, `MAX_FANOUT); - `RESET_RELAY (core_rsp_reset, reset); + for (genvar i = 0; i < NUM_REQS; ++i) begin VX_elastic_buffer #( .DATAW (`CS_WORD_WIDTH + TAG_WIDTH), @@ -146,7 +146,7 @@ module VX_cache import VX_gpu_pkg::*; #( .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) ) core_rsp_buf ( .clk (clk), - .reset (core_rsp_reset), + .reset (core_rsp_reset[i]), .valid_in (core_rsp_valid_s[i]), .ready_in (core_rsp_ready_s[i]), .data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}), @@ -170,15 +170,13 @@ module VX_cache import VX_gpu_pkg::*; #( wire mem_bus_if_flush; - `RESET_RELAY (mem_req_reset, reset); - VX_elastic_buffer #( .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1), .SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( .clk (clk), - .reset (mem_req_reset), + .reset (reset), .valid_in (mem_req_valid_s), .ready_in (mem_req_ready_s), .data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}), @@ -197,15 +195,13 @@ module VX_cache import VX_gpu_pkg::*; #( wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s; wire mem_rsp_ready_s; - `RESET_RELAY (mem_rsp_reset, reset); - VX_elastic_buffer #( .DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH), .SIZE (MRSQ_SIZE), .OUT_REG (MRSQ_SIZE > 2) ) mem_rsp_queue ( .clk (clk), - .reset (mem_rsp_reset), + .reset (reset), .valid_in (mem_bus_if.rsp_valid), .ready_in (mem_bus_if.rsp_ready), .data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}), @@ -502,15 +498,13 @@ module VX_cache import VX_gpu_pkg::*; #( }; end - `RESET_RELAY (mem_arb_reset, reset); - VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1), .ARBITER ("F") ) mem_req_arb ( .clk (clk), - .reset (mem_arb_reset), + .reset (reset), .valid_in (per_bank_mem_req_valid), .ready_in (per_bank_mem_req_ready), .data_in (data_in), diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index ab1fdb3d3..dbbb4aba3 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -172,9 +172,6 @@ module VX_cache_bank #( // ensure we have no pending memory request in the bank wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty; - // this reset relay should match pipeline during tags initialization - `RESET_RELAY (flush_reset, reset); - // flush unit VX_bank_flush #( .BANK_ID (BANK_ID), @@ -185,7 +182,7 @@ module VX_cache_bank #( .WRITEBACK (WRITEBACK) ) flush_unit ( .clk (clk), - .reset (flush_reset), + .reset (reset), .flush_begin (flush_begin), .flush_end (flush_end), .flush_init (init_valid), @@ -272,14 +269,12 @@ module VX_cache_bank #( assign req_uuid_sel = 0; end - `RESET_RELAY (pipe0_reset, reset); - VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH), .RESETW (1) ) pipe_reg0 ( .clk (clk), - .reset (pipe0_reset), + .reset (reset), .enable (~pipe_stall), .data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0}) @@ -309,8 +304,6 @@ module VX_cache_bank #( wire [NUM_WAYS-1:0] evict_way_st0; wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0; - `RESET_RELAY (tags_reset, reset); - VX_cache_tags #( .INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)), .BANK_ID (BANK_ID), @@ -323,7 +316,7 @@ module VX_cache_bank #( .UUID_WIDTH (UUID_WIDTH) ) cache_tags ( .clk (clk), - .reset (tags_reset), + .reset (reset), .req_uuid (req_uuid_st0), @@ -355,14 +348,12 @@ module VX_cache_bank #( assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0; - `RESET_RELAY (pipe1_reset, reset); - VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1), .RESETW (1) ) pipe_reg1 ( .clk (clk), - .reset (pipe1_reset), + .reset (reset), .enable (~pipe_stall), .data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}), .data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1}) @@ -433,8 +424,6 @@ module VX_cache_bank #( assign write_byteen_st1 = byteen_st1; end - `RESET_RELAY (data_reset, reset); - VX_cache_data #( .INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)), .BANK_ID (BANK_ID), @@ -449,7 +438,7 @@ module VX_cache_bank #( .UUID_WIDTH (UUID_WIDTH) ) cache_data ( .clk (clk), - .reset (data_reset), + .reset (reset), .req_uuid (req_uuid_st1), @@ -502,8 +491,6 @@ module VX_cache_bank #( `UNUSED_PIN (size) ); - `RESET_RELAY (mshr_reset, reset); - VX_cache_mshr #( .INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)), .BANK_ID (BANK_ID), @@ -514,7 +501,7 @@ module VX_cache_bank #( .DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH) ) cache_mshr ( .clk (clk), - .reset (mshr_reset), + .reset (reset), .deq_req_uuid (req_uuid_sel), .lkp_req_uuid (req_uuid_st0), @@ -577,15 +564,13 @@ module VX_cache_bank #( assign crsp_queue_data = read_data_st1; assign crsp_queue_tag = tag_st1; - `RESET_RELAY (crsp_queue_reset, reset); - VX_elastic_buffer #( .DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH), .SIZE (CRSQ_SIZE), .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) ) core_rsp_queue ( .clk (clk), - .reset (crsp_queue_reset), + .reset (reset), .valid_in (crsp_queue_valid && ~rdw_hazard3_st1), .ready_in (crsp_queue_ready), .data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}), @@ -643,8 +628,6 @@ module VX_cache_bank #( `UNUSED_VAR (dirty_byteen_st1) end - `RESET_RELAY (mreq_queue_reset, reset); - VX_fifo_queue #( .DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1), .DEPTH (MREQ_SIZE), @@ -652,7 +635,7 @@ module VX_cache_bank #( .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_queue ( .clk (clk), - .reset (mreq_queue_reset), + .reset (reset), .push (mreq_queue_push), .pop (mreq_queue_pop), .data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}), diff --git a/hw/rtl/cache/VX_cache_bypass.sv b/hw/rtl/cache/VX_cache_bypass.sv index 90ba0f30d..379d33e8a 100644 --- a/hw/rtl/cache/VX_cache_bypass.sv +++ b/hw/rtl/cache/VX_cache_bypass.sv @@ -217,15 +217,13 @@ module VX_cache_bypass #( assign mem_bus_in_if.req_ready = mem_req_out_ready; - `RESET_RELAY (mem_req_reset, reset); - VX_elastic_buffer #( .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH), .SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( .clk (clk), - .reset (mem_req_reset), + .reset (reset), .valid_in (mem_req_out_valid), .ready_in (mem_req_out_ready), .data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}), @@ -311,16 +309,13 @@ module VX_cache_bypass #( end for (genvar i = 0; i < NUM_REQS; ++i) begin - - `RESET_RELAY (core_rsp_reset, reset); - VX_elastic_buffer #( .DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH), .SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0), .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) ) core_rsp_buf ( .clk (clk), - .reset (core_rsp_reset), + .reset (reset), .valid_in (core_rsp_in_valid[i]), .ready_in (core_rsp_in_ready[i]), .data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}), diff --git a/hw/rtl/cache/VX_cache_cluster.sv b/hw/rtl/cache/VX_cache_cluster.sv index 0db1360d4..939768b63 100644 --- a/hw/rtl/cache/VX_cache_cluster.sv +++ b/hw/rtl/cache/VX_cache_cluster.sv @@ -102,6 +102,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .TAG_WIDTH (ARB_TAG_WIDTH) ) arb_core_bus_if[NUM_CACHES * NUM_REQS](); + `RESET_RELAY_EX (cache_arb_reset, reset, NUM_REQS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_REQS; ++i) begin VX_mem_bus_if #( .DATA_SIZE (WORD_SIZE), @@ -117,8 +119,6 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( `ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]); end - `RESET_RELAY (cache_arb_reset, reset); - VX_mem_arb #( .NUM_INPUTS (NUM_INPUTS), .NUM_OUTPUTS (NUM_CACHES), @@ -130,7 +130,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0) ) cache_arb ( .clk (clk), - .reset (cache_arb_reset), + .reset (cache_arb_reset[i]), .bus_in_if (core_bus_tmp_if), .bus_out_if (arb_core_bus_tmp_if) ); @@ -182,8 +182,6 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)) ) mem_bus_tmp_if[1](); - `RESET_RELAY (mem_arb_reset, reset); - VX_mem_arb #( .NUM_INPUTS (NUM_CACHES), .DATA_SIZE (LINE_SIZE), @@ -194,7 +192,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0) ) mem_arb ( .clk (clk), - .reset (mem_arb_reset), + .reset (reset), .bus_in_if (cache_mem_bus_if), .bus_out_if (mem_bus_tmp_if) ); diff --git a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv index f34b0b5b1..86bcaf05e 100644 --- a/hw/rtl/core/VX_alu_unit.sv +++ b/hw/rtl/core/VX_alu_unit.sv @@ -57,7 +57,7 @@ module VX_alu_unit #( for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin - `RESET_RELAY (block_reset, reset); + `RESET_RELAY_EN (block_reset, reset,(BLOCK_SIZE > 1)); wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV); @@ -72,15 +72,13 @@ module VX_alu_unit #( assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op; assign int_execute_if.data = per_block_execute_if[block_idx].data; - `RESET_RELAY (int_reset, block_reset); - VX_alu_int #( .INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)), .BLOCK_IDX (block_idx), .NUM_LANES (NUM_LANES) ) alu_int ( .clk (clk), - .reset (int_reset), + .reset (block_reset), .execute_if (int_execute_if), .branch_ctl_if (branch_ctl_if[block_idx]), .commit_if (int_commit_if) @@ -99,14 +97,12 @@ module VX_alu_unit #( assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op; assign muldiv_execute_if.data = per_block_execute_if[block_idx].data; - `RESET_RELAY (muldiv_reset, block_reset); - VX_alu_muldiv #( .INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)), .NUM_LANES (NUM_LANES) ) muldiv_unit ( .clk (clk), - .reset (muldiv_reset), + .reset (block_reset), .execute_if (muldiv_execute_if), .commit_if (muldiv_commit_if) ); @@ -121,8 +117,6 @@ module VX_alu_unit #( // send response - `RESET_RELAY (arb_reset, block_reset); - VX_stream_arb #( .NUM_INPUTS (RSP_ARB_SIZE), .DATAW (RSP_ARB_DATAW), @@ -130,7 +124,7 @@ module VX_alu_unit #( .ARBITER ("F") ) rsp_arb ( .clk (clk), - .reset (arb_reset), + .reset (block_reset), .valid_in ({ `ifdef EXT_M_ENABLE muldiv_commit_if.valid, diff --git a/hw/rtl/core/VX_dispatch_unit.sv b/hw/rtl/core/VX_dispatch_unit.sv index 4adde52ab..618ea1221 100644 --- a/hw/rtl/core/VX_dispatch_unit.sv +++ b/hw/rtl/core/VX_dispatch_unit.sv @@ -40,7 +40,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH); localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN); localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1; - localparam FANOUT_ENABLE= (`NUM_THREADS > MAX_FANOUT); + localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2)); localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS); localparam DATA_REGS_OFF = 0; @@ -85,6 +85,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx); assign issue_indices[block_idx] = issue_idx; + `RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1)); + wire valid_p, ready_p; if (`NUM_THREADS != NUM_LANES) begin @@ -100,7 +102,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( wire fire_eop = fire_p && is_last_p; always @(posedge clk) begin - if (reset) begin + if (block_reset) begin sent_mask_p <= '0; is_first_p <= 1; end else begin @@ -215,8 +217,6 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( assign isw = block_idx; end - `RESET_RELAY(buf_out_reset, reset); - wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw); VX_elastic_buffer #( @@ -225,7 +225,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) buf_out ( .clk (clk), - .reset (buf_out_reset), + .reset (block_reset), .valid_in (valid_p), .ready_in (ready_p), .data_in ({ diff --git a/hw/rtl/core/VX_fpu_unit.sv b/hw/rtl/core/VX_fpu_unit.sv index 8622db490..496b24e29 100644 --- a/hw/rtl/core/VX_fpu_unit.sv +++ b/hw/rtl/core/VX_fpu_unit.sv @@ -57,7 +57,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( `UNUSED_VAR (per_block_execute_if[block_idx].data.tid) `UNUSED_VAR (per_block_execute_if[block_idx].data.wb) - `RESET_RELAY (block_reset, reset); + `RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1)); // Store request info wire fpu_req_valid, fpu_req_ready; @@ -84,14 +84,12 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( wire execute_fire = per_block_execute_if[block_idx].valid && per_block_execute_if[block_idx].ready; wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready; - `RESET_RELAY (ibuf_reset, block_reset); - VX_index_buffer #( .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + PID_WIDTH + 1 + 1), .SIZE (`FPUQ_SIZE) ) tag_store ( .clk (clk), - .reset (ibuf_reset), + .reset (block_reset), .acquire_en (execute_fire), .write_addr (fpu_req_tag), .write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}), @@ -113,8 +111,6 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( assign fpu_req_valid = per_block_execute_if[block_idx].valid && ~mdata_full; assign per_block_execute_if[block_idx].ready = fpu_req_ready && ~mdata_full; - `RESET_RELAY (fpu_reset, block_reset); - `ifdef FPU_DPI VX_fpu_dpi #( @@ -123,7 +119,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .OUT_BUF (PARTIAL_BW ? 1 : 3) ) fpu_dpi ( .clk (clk), - .reset (fpu_reset), + .reset (block_reset), .valid_in (fpu_req_valid), .mask_in (per_block_execute_if[block_idx].data.tmask), @@ -152,7 +148,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .OUT_BUF (PARTIAL_BW ? 1 : 3) ) fpu_fpnew ( .clk (clk), - .reset (fpu_reset), + .reset (block_reset), .valid_in (fpu_req_valid), .mask_in (per_block_execute_if[block_idx].data.tmask), @@ -181,7 +177,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .OUT_BUF (PARTIAL_BW ? 1 : 3) ) fpu_dsp ( .clk (clk), - .reset (fpu_reset), + .reset (block_reset), .valid_in (fpu_req_valid), .mask_in (per_block_execute_if[block_idx].data.tmask), @@ -228,14 +224,12 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( // send response - `RESET_RELAY (rsp_reset, block_reset); - VX_elastic_buffer #( .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1), .SIZE (0) ) rsp_buf ( .clk (clk), - .reset (rsp_reset), + .reset (block_reset), .valid_in (fpu_rsp_valid), .ready_in (fpu_rsp_ready), .data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}), diff --git a/hw/rtl/core/VX_gather_unit.sv b/hw/rtl/core/VX_gather_unit.sv index 98d362056..293495eba 100644 --- a/hw/rtl/core/VX_gather_unit.sv +++ b/hw/rtl/core/VX_gather_unit.sv @@ -79,15 +79,13 @@ module VX_gather_unit import VX_gpu_pkg::*; #( .NUM_LANES (NUM_LANES) ) commit_tmp_if(); - `RESET_RELAY(commit_out_reset, reset); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (commit_out_reset), + .reset (reset), .valid_in (commit_out_valid[i]), .ready_in (commit_out_ready[i]), .data_in (commit_out_data[i]), diff --git a/hw/rtl/core/VX_lmem_unit.sv b/hw/rtl/core/VX_lmem_unit.sv index e896b4000..9a3be3dc6 100644 --- a/hw/rtl/core/VX_lmem_unit.sv +++ b/hw/rtl/core/VX_lmem_unit.sv @@ -39,6 +39,8 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .TAG_WIDTH (LSU_TAG_WIDTH) ) lsu_switch_if[`NUM_LSU_BLOCKS](); + `RESET_RELAY_EX (block_reset, reset, `NUM_LSU_BLOCKS, 1); + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin wire [`NUM_LSU_LANES-1:0] is_addr_local_mask; @@ -52,15 +54,13 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( wire req_global_ready; wire req_local_ready; - `RESET_RELAY (switch_reset, reset); - VX_elastic_buffer #( .DATAW (REQ_DATAW), .SIZE (2), .OUT_REG (1) ) req_global_buf ( .clk (clk), - .reset (switch_reset), + .reset (block_reset[i]), .valid_in (lsu_mem_in_if[i].req_valid && is_addr_global), .data_in ({ lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask, @@ -91,7 +91,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .OUT_REG (0) ) req_local_buf ( .clk (clk), - .reset (switch_reset), + .reset (block_reset[i]), .valid_in (lsu_mem_in_if[i].req_valid && is_addr_local), .data_in ({ lsu_mem_in_if[i].req_data.mask & is_addr_local_mask, @@ -126,7 +126,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .OUT_BUF (1) ) rsp_arb ( .clk (clk), - .reset (switch_reset), + .reset (block_reset[i]), .valid_in ({ lsu_switch_if[i].rsp_valid, lsu_mem_out_if[i].rsp_valid @@ -157,8 +157,6 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .TAG_WIDTH (LSU_TAG_WIDTH) ) lmem_bus_tmp_if[`NUM_LSU_LANES](); - `RESET_RELAY (adapter_reset, reset); - VX_lsu_adapter #( .NUM_LANES (`NUM_LSU_LANES), .DATA_SIZE (LSU_WORD_SIZE), @@ -168,7 +166,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .RSP_OUT_BUF (0) ) lsu_adapter ( .clk (clk), - .reset (adapter_reset), + .reset (block_reset[i]), .lsu_mem_if (lsu_switch_if[i]), .mem_bus_if (lmem_bus_tmp_if) ); diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index acf3f0755..e3df0c1fa 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -44,8 +44,8 @@ module VX_operands import VX_gpu_pkg::*; #( localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS; localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH; - localparam REGS_DATAW = NUM_SRC_REGS * `NUM_THREADS * `XLEN; - localparam DATAW = META_DATAW + REGS_DATAW; + localparam REGS_DATAW = `XLEN * `NUM_THREADS; + localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW; localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS); localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS; localparam XLEN_SIZE = `XLEN / 8; @@ -100,8 +100,6 @@ module VX_operands import VX_gpu_pkg::*; #( assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid; - `RESET_RELAY (req_xbar_reset, reset); - VX_stream_xbar #( .NUM_INPUTS (NUM_SRC_REGS), .NUM_OUTPUTS (NUM_BANKS), @@ -111,7 +109,7 @@ module VX_operands import VX_gpu_pkg::*; #( .OUT_BUF (0) // no output buffering ) req_xbar ( .clk (clk), - .reset (req_xbar_reset), + .reset (reset), `UNUSED_PIN(collisions), .valid_in (req_in_valid), .data_in (req_in_data), @@ -164,14 +162,12 @@ module VX_operands import VX_gpu_pkg::*; #( scoreboard_if.data.uuid }; - `RESET_RELAY (pipe1_reset, reset); - VX_pipe_register #( .DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)), .RESETW (1 + NUM_SRC_REGS) ) pipe_reg1 ( .clk (clk), - .reset (pipe1_reset), + .reset (reset), .enable (pipe_in_ready), .data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}), .data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}) @@ -183,11 +179,11 @@ module VX_operands import VX_gpu_pkg::*; #( wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1; - `RESET_RELAY (pipe2_reset, reset); + `RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW VX_pipe_register #( - .DATAW (1 + REGS_DATAW + NUM_BANKS + (NUM_BANKS * `XLEN * `NUM_THREADS) + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH), - .RESETW (1 + REGS_DATAW) + .DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH), + .RESETW (1 + NUM_SRC_REGS * REGS_DATAW) ) pipe_reg2 ( .clk (clk), .reset (pipe2_reset), @@ -205,8 +201,6 @@ module VX_operands import VX_gpu_pkg::*; #( end end - `RESET_RELAY (out_buf_reset, reset); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), @@ -214,7 +208,7 @@ module VX_operands import VX_gpu_pkg::*; #( .LUTRAM (1) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (reset), .valid_in (pipe_valid_st2), .ready_in (pipe_ready_st2), .data_in ({ @@ -281,10 +275,8 @@ module VX_operands import VX_gpu_pkg::*; #( assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}}; end - `RESET_RELAY (bram_reset, reset); - VX_dp_ram #( - .DATAW (`XLEN * `NUM_THREADS), + .DATAW (REGS_DATAW), .SIZE (PER_BANK_REGS * PER_ISSUE_WARPS), .WRENW (BYTEENW), `ifdef GPR_RESET @@ -293,7 +285,7 @@ module VX_operands import VX_gpu_pkg::*; #( .NO_RWCHECK (1) ) gpr_ram ( .clk (clk), - .reset (bram_reset), + .reset (reset), .read (pipe_fire_st1), .wren (wren), .write (gpr_wr_enabled), diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index 6bc748745..71a74c6ac 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -383,16 +383,16 @@ module VX_schedule import VX_gpu_pkg::*; #( wire [`NUM_WARPS-1:0] pending_warp_empty; wire [`NUM_WARPS-1:0] pending_warp_alm_empty; - for (genvar i = 0; i < `NUM_WARPS; ++i) begin + `RESET_RELAY_EX (pending_instr_reset, reset, `NUM_WARPS, `MAX_FANOUT); - `RESET_RELAY (pending_instr_reset, reset); + for (genvar i = 0; i < `NUM_WARPS; ++i) begin VX_pending_size #( .SIZE (4096), .ALM_EMPTY (1) ) counter ( .clk (clk), - .reset (pending_instr_reset), + .reset (pending_instr_reset[i]), .incr (per_warp_incr[i]), .decr (commit_sched_if.committed_warps[i]), .empty (pending_warp_empty[i]), diff --git a/hw/rtl/libs/VX_avs_adapter.sv b/hw/rtl/libs/VX_avs_adapter.sv index a5fb976ab..35d329c7b 100644 --- a/hw/rtl/libs/VX_avs_adapter.sv +++ b/hw/rtl/libs/VX_avs_adapter.sv @@ -81,15 +81,15 @@ module VX_avs_adapter #( assign req_queue_push[i] = mem_req_valid && ~mem_req_rw && bank_req_ready[i] && (req_bank_sel == i); end - for (genvar i = 0; i < NUM_BANKS; ++i) begin + `RESET_RELAY_EX (bank_reset, reset, NUM_BANKS, 1); - `RESET_RELAY (rd_req_reset, reset); + for (genvar i = 0; i < NUM_BANKS; ++i) begin VX_pending_size #( .SIZE (RD_QUEUE_SIZE) ) pending_size ( .clk (clk), - .reset (rd_req_reset), + .reset (bank_reset[i]), .incr (req_queue_push[i]), .decr (req_queue_pop[i]), `UNUSED_PIN (empty), @@ -105,7 +105,7 @@ module VX_avs_adapter #( .DEPTH (RD_QUEUE_SIZE) ) rd_req_queue ( .clk (clk), - .reset (rd_req_reset), + .reset (bank_reset[i]), .push (req_queue_push[i]), .pop (req_queue_pop[i]), .data_in (mem_req_tag), @@ -129,15 +129,13 @@ module VX_avs_adapter #( wire valid_out_w = mem_req_valid && ~req_queue_going_full[i] && (req_bank_sel == i); wire ready_out_w; - `RESET_RELAY (req_out_reset, reset); - VX_elastic_buffer #( .DATAW (1 + DATA_SIZE + BANK_OFFSETW + DATA_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)) ) req_out_buf ( .clk (clk), - .reset (req_out_reset), + .reset (bank_reset[i]), .valid_in (valid_out_w), .ready_in (ready_out_w), .data_in ({mem_req_rw, mem_req_byteen, req_bank_off, mem_req_data}), @@ -174,14 +172,12 @@ module VX_avs_adapter #( for (genvar i = 0; i < NUM_BANKS; ++i) begin - `RESET_RELAY (rd_rsp_reset, reset); - VX_fifo_queue #( .DATAW (DATA_WIDTH), .DEPTH (RD_QUEUE_SIZE) ) rd_rsp_queue ( .clk (clk), - .reset (rd_rsp_reset), + .reset (bank_reset[i]), .push (avs_readdatavalid[i]), .pop (req_queue_pop[i]), .data_in (avs_readdata[i]), @@ -200,8 +196,6 @@ module VX_avs_adapter #( assign req_queue_pop[i] = rsp_arb_valid_in[i] && rsp_arb_ready_in[i]; end - `RESET_RELAY (rsp_arb_reset, reset); - VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (DATA_WIDTH + TAG_WIDTH), @@ -209,7 +203,7 @@ module VX_avs_adapter #( .OUT_BUF (RSP_OUT_BUF) ) rsp_arb ( .clk (clk), - .reset (rsp_arb_reset), + .reset (reset), .valid_in (rsp_arb_valid_in), .data_in (rsp_arb_data_in), .ready_in (rsp_arb_ready_in), diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 14e930d74..7fffb9be2 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -203,9 +203,7 @@ module VX_axi_adapter #( `RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rlast[i] == 1, ("%t: *** AXI response error", $time)); `RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rresp[i] == 0, ("%t: *** AXI response error", $time)); end - - `RESET_RELAY (rsp_arb_reset, reset); - + VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), .DATAW (DATA_WIDTH + TAG_WIDTH), @@ -213,7 +211,7 @@ module VX_axi_adapter #( .OUT_BUF (RSP_OUT_BUF) ) rsp_arb ( .clk (clk), - .reset (rsp_arb_reset), + .reset (reset), .valid_in (rsp_arb_valid_in), .data_in (rsp_arb_data_in), .ready_in (rsp_arb_ready_in), diff --git a/hw/rtl/libs/VX_mem_adapter.sv b/hw/rtl/libs/VX_mem_adapter.sv index 988ae606c..263df0159 100644 --- a/hw/rtl/libs/VX_mem_adapter.sv +++ b/hw/rtl/libs/VX_mem_adapter.sv @@ -206,15 +206,13 @@ module VX_mem_adapter #( end - `RESET_RELAY (req_out_reset, reset); - VX_elastic_buffer #( .DATAW (1 + DST_DATA_SIZE + DST_ADDR_WIDTH + DST_DATA_WIDTH + DST_TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)) ) req_out_buf ( .clk (clk), - .reset (req_out_reset), + .reset (reset), .valid_in (mem_req_valid_out_w), .ready_in (mem_req_ready_out_w), .data_in ({mem_req_rw_out_w, mem_req_byteen_out_w, mem_req_addr_out_w, mem_req_data_out_w, mem_req_tag_out_w}), @@ -223,15 +221,13 @@ module VX_mem_adapter #( .ready_out (mem_req_ready_out) ); - `RESET_RELAY (rsp_in_reset, reset); - VX_elastic_buffer #( .DATAW (SRC_DATA_WIDTH + SRC_TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(RSP_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(RSP_OUT_BUF)) ) rsp_in_buf ( .clk (clk), - .reset (rsp_in_reset), + .reset (reset), .valid_in (mem_rsp_valid_in_w), .ready_in (mem_rsp_ready_in_w), .data_in ({mem_rsp_data_in_w, mem_rsp_tag_in_w}), diff --git a/hw/rtl/libs/VX_mem_coalescer.sv b/hw/rtl/libs/VX_mem_coalescer.sv index bf9f698fe..d9c13691f 100644 --- a/hw/rtl/libs/VX_mem_coalescer.sv +++ b/hw/rtl/libs/VX_mem_coalescer.sv @@ -225,14 +225,12 @@ module VX_mem_coalescer #( endcase end - `RESET_RELAY (pipe_reset, reset); - VX_pipe_register #( .DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + ATYPE_WIDTH + OUT_ADDR_WIDTH + ATYPE_WIDTH + DATA_OUT_SIZE + DATA_OUT_WIDTH) + OUT_TAG_WIDTH), .RESETW (1 + NUM_REQS + 1) ) pipe_reg ( .clk (clk), - .reset (pipe_reset), + .reset (reset), .enable (1'b1), .data_in ({state_n, processed_mask_n, out_req_valid_n, out_req_rw_n, addr_matches_n, batch_valid_n, out_req_mask_n, seed_addr_n, seed_atype_n, out_req_addr_n, out_req_atype_n, out_req_byteen_n, out_req_data_n, out_req_tag_n}), .data_out ({state_r, processed_mask_r, out_req_valid_r, out_req_rw_r, addr_matches_r, batch_valid_r, out_req_mask_r, seed_addr_r, seed_atype_r, out_req_addr_r, out_req_atype_r, out_req_byteen_r, out_req_data_r, out_req_tag_r}) diff --git a/hw/rtl/libs/VX_mem_scheduler.sv b/hw/rtl/libs/VX_mem_scheduler.sv index bd0c2de9c..aa3ef9b2f 100644 --- a/hw/rtl/libs/VX_mem_scheduler.sv +++ b/hw/rtl/libs/VX_mem_scheduler.sv @@ -167,15 +167,13 @@ module VX_mem_scheduler #( assign reqq_tag_u = ibuf_waddr; end - `RESET_RELAY (reqq_reset, reset); - VX_elastic_buffer #( .DATAW (1 + CORE_REQS * (1 + WORD_SIZE + ADDR_WIDTH + ATYPE_WIDTH + WORD_WIDTH) + REQQ_TAG_WIDTH), .SIZE (CORE_QUEUE_SIZE), .OUT_REG (1) ) req_queue ( .clk (clk), - .reset (reqq_reset), + .reset (reset), .valid_in (reqq_valid_in), .ready_in (reqq_ready_in), .data_in ({core_req_rw, core_req_mask, core_req_byteen, core_req_addr, core_req_atype, core_req_data, reqq_tag_u}), @@ -391,15 +389,13 @@ module VX_mem_scheduler #( assign reqq_ready_s = req_sent_all; - `RESET_RELAY (mem_req_reset, reset); - VX_elastic_buffer #( .DATAW (MEM_CHANNELS + 1 + MEM_CHANNELS * (LINE_SIZE + MEM_ADDR_WIDTH + ATYPE_WIDTH + LINE_WIDTH) + MEM_TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( .clk (clk), - .reset (mem_req_reset), + .reset (reset), .valid_in (mem_req_valid_s), .ready_in (mem_req_ready_s), .data_in ({mem_req_mask_s, mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_atype_s, mem_req_data_s, mem_req_tag_s}), @@ -513,15 +509,13 @@ module VX_mem_scheduler #( // Send response to caller - `RESET_RELAY (crsp_reset, reset); - VX_elastic_buffer #( .DATAW (CORE_REQS + 1 + 1 + (CORE_REQS * WORD_WIDTH) + TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(CORE_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) ) rsp_buf ( .clk (clk), - .reset (crsp_reset), + .reset (reset), .valid_in (crsp_valid), .ready_in (crsp_ready), .data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag}), diff --git a/hw/rtl/libs/VX_pe_serializer.sv b/hw/rtl/libs/VX_pe_serializer.sv index e71672041..eac1eddcb 100644 --- a/hw/rtl/libs/VX_pe_serializer.sv +++ b/hw/rtl/libs/VX_pe_serializer.sv @@ -147,15 +147,13 @@ module VX_pe_serializer #( end - `RESET_RELAY (out_buf_reset, reset); - VX_elastic_buffer #( .DATAW (NUM_LANES * DATA_OUT_WIDTH + TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (reset), .valid_in (valid_out_u), .ready_in (ready_out_u), .data_in ({data_out_u, tag_out_u}), diff --git a/hw/rtl/libs/VX_reset_relay.sv b/hw/rtl/libs/VX_reset_relay.sv index 23cc32f2f..d7e735c25 100644 --- a/hw/rtl/libs/VX_reset_relay.sv +++ b/hw/rtl/libs/VX_reset_relay.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,8 +21,8 @@ module VX_reset_relay #( input wire clk, input wire reset, output wire [N-1:0] reset_o -); - if (MAX_FANOUT >= 0 && N > MAX_FANOUT) begin +); + if (MAX_FANOUT >= 0 && N > (MAX_FANOUT + MAX_FANOUT/2)) begin localparam F = `UP(MAX_FANOUT); localparam R = N / F; `PRESERVE_NET reg [R-1:0] reset_r; @@ -38,6 +38,6 @@ module VX_reset_relay #( `UNUSED_VAR (clk) assign reset_o = {N{reset}}; end - + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_stream_arb.sv b/hw/rtl/libs/VX_stream_arb.sv index 165f7a01d..65b0605b4 100644 --- a/hw/rtl/libs/VX_stream_arb.sv +++ b/hw/rtl/libs/VX_stream_arb.sv @@ -73,7 +73,7 @@ module VX_stream_arb #( ); end - end else if (MAX_FANOUT != 0 && (NUM_INPUTS > MAX_FANOUT)) begin + end else if (MAX_FANOUT != 0 && (NUM_INPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin // (#inputs > max_fanout) and (#outputs == 1) @@ -245,7 +245,7 @@ module VX_stream_arb #( end end - end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > MAX_FANOUT)) begin + end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin // (#inputs == 1) and (#outputs > max_fanout) @@ -357,9 +357,9 @@ module VX_stream_arb #( // #Inputs == #Outputs - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); - `RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1)); + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin VX_elastic_buffer #( .DATAW (DATAW), @@ -368,7 +368,7 @@ module VX_stream_arb #( .LUTRAM (LUTRAM) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (out_buf_reset[i]), .valid_in (valid_in[i]), .ready_in (ready_in[i]), .data_in (data_in[i]), diff --git a/hw/rtl/libs/VX_stream_switch.sv b/hw/rtl/libs/VX_stream_switch.sv index f73929071..3a905cb1d 100644 --- a/hw/rtl/libs/VX_stream_switch.sv +++ b/hw/rtl/libs/VX_stream_switch.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -33,7 +33,7 @@ module VX_stream_switch #( output wire [NUM_INPUTS-1:0] ready_in, output wire [NUM_OUTPUTS-1:0] valid_out, - output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out, + output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out, input wire [NUM_OUTPUTS-1:0] ready_out ); if (NUM_INPUTS > NUM_OUTPUTS) begin @@ -52,7 +52,7 @@ module VX_stream_switch #( assign data_in_r[i][j] = '0; end end - end + end wire [NUM_OUTPUTS-1:0] valid_out_r; wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_r; @@ -65,25 +65,24 @@ module VX_stream_switch #( for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin for (genvar j = 0; j < NUM_REQS; ++j) begin - localparam ii = i * NUM_REQS + j; - if (ii < NUM_INPUTS) begin + localparam ii = i * NUM_REQS + j; + if (ii < NUM_INPUTS) begin assign ready_in[ii] = ready_out_r[i] & (sel_in[i] == LOG_NUM_REQS'(j)); end end end + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - - `RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1)); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset), - .valid_in (valid_out_r[i]), + .reset (out_buf_reset[i]), + .valid_in (valid_out_r[i]), .ready_in (ready_out_r[i]), .data_in (data_out_r[i]), .data_out (data_out[i]), @@ -93,7 +92,7 @@ module VX_stream_switch #( end end else if (NUM_OUTPUTS > NUM_INPUTS) begin - + wire [NUM_INPUTS-1:0][NUM_REQS-1:0] valid_out_r; wire [NUM_INPUTS-1:0][NUM_REQS-1:0] ready_out_r; @@ -104,51 +103,50 @@ module VX_stream_switch #( assign ready_in[i] = ready_out_r[i][sel_in[i]]; end + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_INPUTS; ++i) begin for (genvar j = 0; j < NUM_REQS; ++j) begin localparam ii = i * NUM_REQS + j; if (ii < NUM_OUTPUTS) begin - - `RESET_RELAY (out_buf_reset, reset); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (out_buf_reset[ii]), .valid_in (valid_out_r[i][j]), .ready_in (ready_out_r[i][j]), - .data_in (data_in[i]), + .data_in (data_in[i]), .data_out (data_out[ii]), .valid_out (valid_out[ii]), .ready_out (ready_out[ii]) ); end else begin + `UNUSED_VAR (out_buf_reset[ii]) `UNUSED_VAR (valid_out_r[i][j]) assign ready_out_r[i][j] = '0; - end + end end end - + end else begin // #Inputs == #Outputs - + `UNUSED_VAR (sel_in) + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - - `RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1)); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (out_buf_reset[i]), .valid_in (valid_in[i]), .ready_in (ready_in[i]), .data_in (data_in[i]), @@ -159,6 +157,6 @@ module VX_stream_switch #( end end - + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_stream_xbar.sv b/hw/rtl/libs/VX_stream_xbar.sv index 7539121f2..b7bdcbf5e 100644 --- a/hw/rtl/libs/VX_stream_xbar.sv +++ b/hw/rtl/libs/VX_stream_xbar.sv @@ -126,10 +126,9 @@ module VX_stream_xbar #( assign data_out_r = {NUM_OUTPUTS{data_in}}; assign ready_in = ready_out_r[sel_in]; + `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - - `RESET_RELAY (out_buf_reset, reset); - VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), @@ -137,7 +136,7 @@ module VX_stream_xbar #( .LUTRAM (LUTRAM) ) out_buf ( .clk (clk), - .reset (out_buf_reset), + .reset (out_buf_reset[i]), .valid_in (valid_out_r[i]), .ready_in (ready_out_r[i]), .data_in (data_out_r[i]), diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index fea967d8c..3dce0ec43 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -163,7 +163,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( wire bank_rsp_valid, bank_rsp_ready; wire [WORD_WIDTH-1:0] bank_rsp_data; - `RESET_RELAY (bram_reset, reset); + `RESET_RELAY_EN (bram_reset, reset, (NUM_BANKS > 1)); VX_sp_ram #( .DATAW (WORD_WIDTH), From fbedc567e52e8591e47b985cb6d3f10511c6cfbd Mon Sep 17 00:00:00 2001 From: Jacob Levinson Date: Sun, 4 Aug 2024 23:39:13 -0700 Subject: [PATCH 72/89] Updated prints and code spacing --- tests/regression/stencil3d/main.cpp | 30 ++++++++++------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/tests/regression/stencil3d/main.cpp b/tests/regression/stencil3d/main.cpp index a47f94710..0536effc0 100644 --- a/tests/regression/stencil3d/main.cpp +++ b/tests/regression/stencil3d/main.cpp @@ -120,29 +120,19 @@ static void stencil_cpu(TYPE *out, const TYPE *in, uint32_t width, uint32_t heig // Check bounds and replicate the boundary values if (nx < 0) - { - nx = 0; - } + {nx = 0;} else if (nx >= (int)width) - { - nx = width - 1; - } + {nx = width - 1;} + if (ny < 0) - { - ny = 0; - } + {ny = 0;} else if (ny >= (int)height) - { - ny = height - 1; - } + {ny = height - 1;} + if (nz < 0) - { - nz = 0; - } + {nz = 0;} else if (nz >= (int)depth) - { - nz = depth - 1; - } + {nz = depth - 1;} // Sum up the values sum += in[nz * width * height + ny * width + nx]; @@ -238,8 +228,8 @@ int main(int argc, char *argv[]) uint32_t buf_size = size_cubed * sizeof(TYPE); std::cout << "data type: " << Comparator::type_str() << std::endl; - std::cout << "matrix size: " << size << "x" << size << std::endl; - std::cout << "block size: " << block_size << "x" << block_size << std::endl; + std::cout << "matrix size: " << size << "x" << size << "x" << size << std::endl; + std::cout << "block size: " << block_size << "x" << block_size << "x" << block_size << std::endl; kernel_arg.grid_dim[0] = size / block_size; kernel_arg.grid_dim[1] = size / block_size; From 9dcb377b67aaffd5d871805cbca132e4006afdeb Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 6 Aug 2024 12:32:02 -0700 Subject: [PATCH 73/89] Moving from one-hot to binary muxing optimization FPGA synthesis is suboptimal with one-hot muxing, particularly Xilinx Vivado. This change fixed Xilinx synthesis for 256-thread cores. --- hw/rtl/libs/VX_stream_arb.sv | 10 +--------- hw/rtl/libs/VX_stream_pack.sv | 18 +++++------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/hw/rtl/libs/VX_stream_arb.sv b/hw/rtl/libs/VX_stream_arb.sv index 65b0605b4..98fed5859 100644 --- a/hw/rtl/libs/VX_stream_arb.sv +++ b/hw/rtl/libs/VX_stream_arb.sv @@ -174,17 +174,9 @@ module VX_stream_arb #( ); assign valid_in_r = arb_valid; + assign data_in_r = data_in[arb_index]; assign arb_ready = ready_in_r; - VX_onehot_mux #( - .DATAW (DATAW), - .N (NUM_REQS) - ) onehot_mux ( - .data_in (data_in), - .sel_in (arb_onehot), - .data_out (data_in_r) - ); - for (genvar i = 0; i < NUM_REQS; ++i) begin assign ready_in[i] = ready_in_r && arb_onehot[i]; end diff --git a/hw/rtl/libs/VX_stream_pack.sv b/hw/rtl/libs/VX_stream_pack.sv index df0000307..7f024b184 100644 --- a/hw/rtl/libs/VX_stream_pack.sv +++ b/hw/rtl/libs/VX_stream_pack.sv @@ -39,8 +39,9 @@ module VX_stream_pack #( input wire ready_out ); if (NUM_REQS > 1) begin + localparam LOG_NUM_REQS = `CLOG2(NUM_REQS); - wire [NUM_REQS-1:0] grant_onehot; + wire [LOG_NUM_REQS-1:0] grant_index; wire grant_valid; wire grant_ready; @@ -52,21 +53,12 @@ module VX_stream_pack #( .reset (reset), .requests (valid_in), .grant_valid (grant_valid), - `UNUSED_PIN (grant_index), - .grant_onehot(grant_onehot), + .grant_index (grant_index), + `UNUSED_PIN (grant_onehot), .grant_ready (grant_ready) ); - wire [TAG_WIDTH-1:0] tag_sel; - - VX_onehot_mux #( - .DATAW (TAG_WIDTH), - .N (NUM_REQS) - ) onehot_mux ( - .data_in (tag_in), - .sel_in (grant_onehot), - .data_out (tag_sel) - ); + wire [TAG_WIDTH-1:0] tag_sel = tag_in[grant_index]; wire [NUM_REQS-1:0] tag_matches; From 0096e60f032f3d23ee722974bc4205cf6220de86 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 6 Aug 2024 12:38:30 -0700 Subject: [PATCH 74/89] Making LUT optimization optional --- hw/rtl/libs/VX_onehot_mux.sv | 2 +- hw/rtl/libs/VX_rr_arbiter.sv | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/hw/rtl/libs/VX_onehot_mux.sv b/hw/rtl/libs/VX_onehot_mux.sv index 5b4cf3f38..cc0fffaa6 100644 --- a/hw/rtl/libs/VX_onehot_mux.sv +++ b/hw/rtl/libs/VX_onehot_mux.sv @@ -18,7 +18,7 @@ module VX_onehot_mux #( parameter DATAW = 1, parameter N = 1, parameter MODEL = 1, - parameter LUT_OPT = 1 + parameter LUT_OPT = 0 ) ( input wire [N-1:0][DATAW-1:0] data_in, input wire [N-1:0] sel_in, diff --git a/hw/rtl/libs/VX_rr_arbiter.sv b/hw/rtl/libs/VX_rr_arbiter.sv index 5c5f7b3b4..52a981184 100644 --- a/hw/rtl/libs/VX_rr_arbiter.sv +++ b/hw/rtl/libs/VX_rr_arbiter.sv @@ -15,9 +15,10 @@ `TRACING_OFF module VX_rr_arbiter #( - parameter NUM_REQS = 1, - parameter MODEL = 1, - parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS) + parameter NUM_REQS = 1, + parameter MODEL = 1, + parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS), + parameter LUT_OPT = 0 ) ( input wire clk, input wire reset, @@ -37,7 +38,7 @@ module VX_rr_arbiter #( assign grant_onehot = requests; assign grant_valid = requests[0]; - end else if (NUM_REQS == 2) begin + end else if (LUT_OPT && NUM_REQS == 2) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -63,7 +64,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end /*else if (NUM_REQS == 3) begin + end else if (LUT_OPT && NUM_REQS == 3) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -93,7 +94,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end */else if (NUM_REQS == 4) begin + end else if (LUT_OPT && NUM_REQS == 4) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -129,7 +130,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end /*else if (NUM_REQS == 5) begin + end else if (LUT_OPT && NUM_REQS == 5) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -173,7 +174,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end else if (NUM_REQS == 6) begin + end else if (LUT_OPT && NUM_REQS == 6) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -227,7 +228,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end else if (NUM_REQS == 7) begin + end else if (LUT_OPT && NUM_REQS == 7) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; @@ -293,7 +294,7 @@ module VX_rr_arbiter #( assign grant_onehot = grant_onehot_r; assign grant_valid = (| requests); - end */else if (NUM_REQS == 8) begin + end else if (LUT_OPT && NUM_REQS == 8) begin reg [LOG_NUM_REQS-1:0] grant_index_r; reg [NUM_REQS-1:0] grant_onehot_r; From 50b12ef754093d03e67d0f2f42250cbb52cf107c Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 6 Aug 2024 12:46:19 -0700 Subject: [PATCH 75/89] fixed memory block size configuration --- ci/regression.sh.in | 30 ++++++++++++++++++++---------- hw/rtl/libs/VX_mem_coalescer.sv | 6 +++++- sim/common/util.h | 26 +++++++++++++++++++++++++- sim/rtlsim/processor.cpp | 5 +++-- 4 files changed, 53 insertions(+), 14 deletions(-) diff --git a/ci/regression.sh.in b/ci/regression.sh.in index e4fb1c999..ba8e0f2bb 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -21,6 +21,8 @@ rm -f blackbox.*.cache XLEN=${XLEN:=@XLEN@} +XSIZE=$(XLEN/8) + echo "Vortex Regression Test: XLEN=$XLEN" unittest() @@ -97,11 +99,11 @@ regression() # test global barrier CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2 - CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tgbar" --cores=2 + CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tgbar" --cores=2 # test local barrier ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar" - ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar" + ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tbar" echo "regression tests done!" } @@ -137,10 +139,10 @@ cache() CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx # reduce l1 line size - CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=rtlsim --app=io_addr - CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=simx --app=io_addr - CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx + CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr + CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr + CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx # test cache ways CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx @@ -254,16 +256,24 @@ config2() CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo # test AXI bus - AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=demo + AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=mstress # test 128-bit MEM block - CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=demo + CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress + + # test XLEN-bit MEM block + CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress + CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress + + # test memory coalescing + CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8 + CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8 # test single-bank DRAM - CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=demo + CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress # test 27-bit DRAM address - CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=demo + CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=mstress echo "configuration-2 tests done!" } diff --git a/hw/rtl/libs/VX_mem_coalescer.sv b/hw/rtl/libs/VX_mem_coalescer.sv index d9c13691f..d1ffde09a 100644 --- a/hw/rtl/libs/VX_mem_coalescer.sv +++ b/hw/rtl/libs/VX_mem_coalescer.sv @@ -135,7 +135,11 @@ module VX_mem_coalescer #( `UNUSED_PIN (onehot), .valid_out (batch_valid_n[i]) ); - assign seed_idx[i] = NUM_REQS_W'(i * DATA_RATIO) + NUM_REQS_W'(batch_idx); + if (OUT_REQS > 1) begin + assign seed_idx[i] = {(NUM_REQS_W-DATA_RATIO_W)'(i), batch_idx}; + end else begin + assign seed_idx[i] = batch_idx; + end end for (genvar i = 0; i < OUT_REQS; ++i) begin diff --git a/sim/common/util.h b/sim/common/util.h index 83fdee7df..fd234d279 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -70,4 +70,28 @@ const char* fileExtension(const char* filepath); #endif void *aligned_malloc(size_t size, size_t alignment); -void aligned_free(void *ptr); \ No newline at end of file +void aligned_free(void *ptr); + +namespace vortex { + +// Verilator data type casting +template +class VDataCast; +template +class VDataCast 8)>::type> { +public: + template + static R get(T& obj) { + return reinterpret_cast(obj.data()); + } +}; +template +class VDataCast::type> { +public: + template + static R get(T& obj) { + return reinterpret_cast(&obj); + } +}; + +} \ No newline at end of file diff --git a/sim/rtlsim/processor.cpp b/sim/rtlsim/processor.cpp index e8ce35329..e5e00f49e 100644 --- a/sim/rtlsim/processor.cpp +++ b/sim/rtlsim/processor.cpp @@ -39,6 +39,7 @@ typedef VVortex Device; #include #include +#include #ifndef MEMORY_BANKS #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS @@ -469,7 +470,7 @@ private: } printf("\n"); */ - memcpy(device_->mem_rsp_data.data(), mem_rsp->block.data(), MEM_BLOCK_SIZE); + memcpy(VDataCast::get(device_->mem_rsp_data), mem_rsp->block.data(), MEM_BLOCK_SIZE); device_->mem_rsp_tag = mem_rsp->tag; pending_mem_reqs_.erase(mem_rsp_it); mem_rd_rsp_active_ = true; @@ -484,7 +485,7 @@ private: uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE); if (device_->mem_req_rw) { auto byteen = device_->mem_req_byteen; - auto data = (uint8_t*)(device_->mem_req_data.data()); + auto data = VDataCast::get(device_->mem_req_data); if (byte_addr >= uint64_t(IO_COUT_ADDR) && byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) { From d276875ab9d86213e35d086409588aa75c8e990b Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 6 Aug 2024 12:47:05 -0700 Subject: [PATCH 76/89] fixed memory block size configuration --- hw/rtl/VX_config.vh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 804715aad..8d1c280fd 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -202,7 +202,7 @@ `ifndef IO_COUT_ADDR `define IO_COUT_ADDR `IO_BASE_ADDR `endif -`define IO_COUT_SIZE `MEM_BLOCK_SIZE +`define IO_COUT_SIZE 64 `ifndef IO_MPM_ADDR `define IO_MPM_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE) From c265ff97b8dc9c839c78cc48fa0aa64a8edbdaf5 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 6 Aug 2024 12:58:58 -0700 Subject: [PATCH 77/89] minor updates --- hw/rtl/core/VX_core.sv | 1 + hw/rtl/core/VX_lmem_unit.sv | 1 + hw/rtl/libs/VX_elastic_buffer.sv | 40 ++++++++++- hw/rtl/libs/VX_pipe_register.sv | 111 +++++++++++++++++++------------ 4 files changed, 108 insertions(+), 45 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 090f47199..4c82db812 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -313,6 +313,7 @@ module VX_core import VX_gpu_pkg::*; #( .DATA_SIZE (DCACHE_WORD_SIZE), .TAG_WIDTH (DCACHE_TAG_WIDTH), .TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH), + .ARBITER ("P"), .REQ_OUT_BUF (0), .RSP_OUT_BUF (0) ) lsu_adapter ( diff --git a/hw/rtl/core/VX_lmem_unit.sv b/hw/rtl/core/VX_lmem_unit.sv index 9a3be3dc6..accb7a586 100644 --- a/hw/rtl/core/VX_lmem_unit.sv +++ b/hw/rtl/core/VX_lmem_unit.sv @@ -162,6 +162,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #( .DATA_SIZE (LSU_WORD_SIZE), .TAG_WIDTH (LSU_TAG_WIDTH), .TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH), + .ARBITER ("P"), .REQ_OUT_BUF (3), .RSP_OUT_BUF (0) ) lsu_adapter ( diff --git a/hw/rtl/libs/VX_elastic_buffer.sv b/hw/rtl/libs/VX_elastic_buffer.sv index ee6f31b58..9213572d3 100644 --- a/hw/rtl/libs/VX_elastic_buffer.sv +++ b/hw/rtl/libs/VX_elastic_buffer.sv @@ -18,7 +18,8 @@ module VX_elastic_buffer #( parameter DATAW = 1, parameter SIZE = 1, parameter OUT_REG = 0, - parameter LUTRAM = 0 + parameter LUTRAM = 0, + parameter MAX_FANOUT = 0 ) ( input wire clk, input wire reset, @@ -40,6 +41,43 @@ module VX_elastic_buffer #( assign data_out = data_in; assign ready_in = ready_out; + end else if (MAX_FANOUT != 0 && (DATAW > (MAX_FANOUT + MAX_FANOUT/2))) begin + + localparam NUM_SLICES = `CDIV(DATAW, MAX_FANOUT); + localparam N_DATAW = DATAW / NUM_SLICES; + + for (genvar i = 0; i < NUM_SLICES; ++i) begin + + localparam S_DATAW = (i == NUM_SLICES-1) ? (DATAW - i * N_DATAW) : N_DATAW; + + wire valid_out_t, ready_in_t; + `UNUSED_VAR (valid_out_t) + `UNUSED_VAR (ready_in_t) + + `RESET_RELAY (slice_reset, reset); + + VX_elastic_buffer #( + .DATAW (S_DATAW), + .SIZE (SIZE), + .OUT_REG (OUT_REG), + .LUTRAM (LUTRAM) + ) buffer_slice ( + .clk (clk), + .reset (slice_reset), + .valid_in (valid_in), + .data_in (data_in[i * N_DATAW +: S_DATAW]), + .ready_in (ready_in_t), + .valid_out (valid_out_t), + .data_out (data_out[i * N_DATAW +: S_DATAW]), + .ready_out (ready_out) + ); + + if (i == 0) begin + assign ready_in = ready_in_t; + assign valid_out = valid_out_t; + end + end + end else if (SIZE == 1) begin VX_pipe_buffer #( diff --git a/hw/rtl/libs/VX_pipe_register.sv b/hw/rtl/libs/VX_pipe_register.sv index f8537ba78..707438abd 100644 --- a/hw/rtl/libs/VX_pipe_register.sv +++ b/hw/rtl/libs/VX_pipe_register.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,10 +14,11 @@ `include "VX_platform.vh" `TRACING_OFF -module VX_pipe_register #( - parameter DATAW = 1, - parameter RESETW = 0, - parameter DEPTH = 1 +module VX_pipe_register #( + parameter DATAW = 1, + parameter RESETW = 0, + parameter DEPTH = 1, + parameter MAX_FANOUT = 0 ) ( input wire clk, input wire reset, @@ -25,54 +26,76 @@ module VX_pipe_register #( input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out ); - if (DEPTH == 0) begin + if (DEPTH == 0) begin `UNUSED_VAR (clk) `UNUSED_VAR (reset) `UNUSED_VAR (enable) - assign data_out = data_in; - end else if (DEPTH == 1) begin - if (RESETW == 0) begin - `UNUSED_VAR (reset) - reg [DATAW-1:0] value; - - always @(posedge clk) begin - if (enable) begin - value <= data_in; - end + assign data_out = data_in; + end else if (DEPTH == 1) begin + if (MAX_FANOUT != 0 && (DATAW > (MAX_FANOUT + MAX_FANOUT/2))) begin + localparam NUM_SLICES = `CDIV(DATAW, MAX_FANOUT); + localparam N_DATAW = DATAW / NUM_SLICES; + for (genvar i = 0; i < NUM_SLICES; ++i) begin + localparam SLICE_START = i * N_DATAW; + localparam SLICE_END = SLICE_START + S_DATAW - 1; + localparam S_DATAW = (i == NUM_SLICES-1) ? (DATAW - SLICE_START) : N_DATAW; + localparam S_RESETW = (SLICE_END >= (DATAW - RESETW)) ? + ((SLICE_START >= (DATAW - RESETW)) ? S_DATAW : (SLICE_END - (DATAW - RESETW) + 1)) : 0; + VX_pipe_register #( + .DATAW (S_DATAW), + .RESETW (S_RESETW) + ) pipe_register_slice ( + .clk (clk), + .reset (reset), + .enable (enable), + .data_in (data_in[i * N_DATAW +: S_DATAW]), + .data_out (data_out[i * N_DATAW +: S_DATAW]) + ); end - assign data_out = value; - end else if (RESETW == DATAW) begin - reg [DATAW-1:0] value; - - always @(posedge clk) begin - if (reset) begin - value <= RESETW'(0); - end else if (enable) begin - value <= data_in; - end - end - assign data_out = value; end else begin - reg [DATAW-RESETW-1:0] value_d; - reg [RESETW-1:0] value_r; + if (RESETW == 0) begin + `UNUSED_VAR (reset) + reg [DATAW-1:0] value; - always @(posedge clk) begin - if (reset) begin - value_r <= RESETW'(0); - end else if (enable) begin - value_r <= data_in[DATAW-1:DATAW-RESETW]; + always @(posedge clk) begin + if (enable) begin + value <= data_in; + end end + assign data_out = value; + end else if (RESETW == DATAW) begin + reg [DATAW-1:0] value; + + always @(posedge clk) begin + if (reset) begin + value <= RESETW'(0); + end else if (enable) begin + value <= data_in; + end + end + assign data_out = value; + end else begin + reg [DATAW-RESETW-1:0] value_d; + reg [RESETW-1:0] value_r; + + always @(posedge clk) begin + if (reset) begin + value_r <= RESETW'(0); + end else if (enable) begin + value_r <= data_in[DATAW-1:DATAW-RESETW]; + end + end + + always @(posedge clk) begin + if (enable) begin + value_d <= data_in[DATAW-RESETW-1:0]; + end + end + assign data_out = {value_r, value_d}; end - - always @(posedge clk) begin - if (enable) begin - value_d <= data_in[DATAW-RESETW-1:0]; - end - end - assign data_out = {value_r, value_d}; end end else begin - wire [DEPTH:0][DATAW-1:0] data_delayed; + wire [DEPTH:0][DATAW-1:0] data_delayed; assign data_delayed[0] = data_in; for (genvar i = 1; i <= DEPTH; ++i) begin VX_pipe_register #( From df8355ac76fea73e50d705c33b4df19224559584 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 6 Aug 2024 13:11:28 -0700 Subject: [PATCH 78/89] fixed minor typo --- ci/regression.sh.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/regression.sh.in b/ci/regression.sh.in index ba8e0f2bb..3cd46a463 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -21,7 +21,7 @@ rm -f blackbox.*.cache XLEN=${XLEN:=@XLEN@} -XSIZE=$(XLEN/8) +XSIZE=$((XLEN / 8)) echo "Vortex Regression Test: XLEN=$XLEN" From 7cdfac8ea1a37306db29b6511d42cced4401f52d Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 6 Aug 2024 17:20:01 -0700 Subject: [PATCH 79/89] fixed kernel lib dependency --- kernel/src/vx_start.S | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S index 799967432..290b68058 100644 --- a/kernel/src/vx_start.S +++ b/kernel/src/vx_start.S @@ -13,6 +13,7 @@ #include #include +#include #include "common.h" .section .init, "ax" From c94c3651ec71a3e2e1d8ff8485da01c1f85da33d Mon Sep 17 00:00:00 2001 From: sij814 Date: Sun, 11 Aug 2024 14:47:43 -0700 Subject: [PATCH 80/89] configure change 22.04 --- configure | 1 + 1 file changed, 1 insertion(+) diff --git a/configure b/configure index 62975784b..cab5142c5 100755 --- a/configure +++ b/configure @@ -26,6 +26,7 @@ detect_osversion() { case "$VERSION_CODENAME" in bionic) osversion="ubuntu/bionic";; focal) osversion="ubuntu/focal";; + jammy) osversion="ubuntu/focal";; # Add new versions as needed esac ;; From de81baaabf79b2816339e232dfa2c5006ea8602a Mon Sep 17 00:00:00 2001 From: sij814 Date: Mon, 12 Aug 2024 02:52:47 -0700 Subject: [PATCH 81/89] hbm for vortex 2.2 --- hw/rtl/VX_config.vh | 2 +- hw/rtl/VX_types.vh | 5 +++ runtime/include/vortex.h | 1 + runtime/simx/vortex.cpp | 3 ++ runtime/stub/utils.cpp | 21 ++++++++++ sim/common/dram_sim.cpp | 1 + sim/simx/cache_cluster.h | 4 +- sim/simx/cache_sim.cpp | 82 +++++++++++++++++++++++++++++++-------- sim/simx/cache_sim.h | 4 +- sim/simx/cluster.cpp | 4 +- sim/simx/constants.h | 2 +- sim/simx/emulator.cpp | 3 ++ sim/simx/mem_sim.cpp | 78 ++++++++++++++++++++----------------- sim/simx/mem_sim.h | 18 ++++++--- sim/simx/processor.cpp | 29 ++++++++------ sim/simx/processor_impl.h | 1 + 16 files changed, 180 insertions(+), 78 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 8d1c280fd..615c1ae6d 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -617,7 +617,7 @@ // Number of Banks `ifndef L3_NUM_BANKS -`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS) +`define L3_NUM_BANKS `MIN(8, `NUM_CLUSTERS) `endif // Core Response Queue Size diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 927ffae96..685051b6c 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -173,6 +173,11 @@ `define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C `define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts `define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D +// PERF: hbm +`define VX_CSR_HBM_BANK_CNTR 12'hB1E // hbm banks +`define VX_CSR_HBM_BANK_CNTR_H 12'hB9E +`define VX_CSR_HBM_BANK_TICK 12'hB1F // hbm ticks +`define VX_CSR_HBM_BANK_TICK_H 12'hB9F // Machine Performance-monitoring memory counters (class 3) /////////////////// // diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h index 8481002e1..bf263da09 100644 --- a/runtime/include/vortex.h +++ b/runtime/include/vortex.h @@ -34,6 +34,7 @@ typedef void* vx_buffer_h; #define VX_CAPS_GLOBAL_MEM_SIZE 0x5 #define VX_CAPS_LOCAL_MEM_SIZE 0x6 #define VX_CAPS_ISA_FLAGS 0x7 +#define VX_CAPS_L3CACHE_NUM_BANKS 0x8 // device isa flags #define VX_ISA_STD_A (1ull << ISA_STD_A) diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index 89856f3a0..be7173fc3 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -81,6 +81,9 @@ public: case VX_CAPS_ISA_FLAGS: _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; + case VX_CAPS_L3CACHE_NUM_BANKS: + _value = L3_NUM_BANKS; + break; default: std::cout << "invalid caps id: " << caps_id << std::endl; std::abort(); diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp index 9826db711..ae894fcbb 100644 --- a/runtime/stub/utils.cpp +++ b/runtime/stub/utils.cpp @@ -211,6 +211,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t mem_reads = 0; uint64_t mem_writes = 0; uint64_t mem_lat = 0; + + // PERF: hbm + uint64_t hbm_counter = 0; + uint64_t hbm_ticks = 0; uint64_t num_cores; CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), { @@ -222,6 +226,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { return err; }); + uint64_t l3cache_banks; + CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_L3CACHE_NUM_BANKS, &l3cache_banks), { + return err; + }); + bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE; bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE; bool l2cache_enable = isa_flags & VX_ISA_EXT_L2CACHE; @@ -522,6 +531,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MSHR_ST, core_id, &l3cache_mshr_stalls), { return err; }); + + // PERF: HBM + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_HBM_BANK_CNTR, core_id, &hbm_counter), { + return err; + }); + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_HBM_BANK_TICK, core_id, &hbm_ticks), { + return err; + }); } // PERF: memory CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_READS, core_id, &mem_reads), { @@ -606,6 +623,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio); fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, bank_utilization); fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization); + + // HBM + float util = (float)hbm_counter / (hbm_ticks * l3cache_banks) * 100; + fprintf(stream, "PERF: hbm bank utilization=%f\n", util); } int mem_avg_lat = caclAverage(mem_lat, mem_reads); diff --git a/sim/common/dram_sim.cpp b/sim/common/dram_sim.cpp index f7cfa8a32..c2a9e9ee0 100644 --- a/sim/common/dram_sim.cpp +++ b/sim/common/dram_sim.cpp @@ -41,6 +41,7 @@ public: dram_config["MemorySystem"]["DRAM"]["impl"] = "HBM2"; dram_config["MemorySystem"]["DRAM"]["org"]["preset"] = "HBM2_8Gb"; dram_config["MemorySystem"]["DRAM"]["org"]["density"] = 8192; + dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = 8; dram_config["MemorySystem"]["DRAM"]["timing"]["preset"] = "HBM2_2Gbps"; dram_config["MemorySystem"]["Controller"]["impl"] = "Generic"; dram_config["MemorySystem"]["Controller"]["Scheduler"]["impl"] = "FRFCFS"; diff --git a/sim/simx/cache_cluster.h b/sim/simx/cache_cluster.h index 63016577b..2ba26dc21 100644 --- a/sim/simx/cache_cluster.h +++ b/sim/simx/cache_cluster.h @@ -77,8 +77,8 @@ public: caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i)); } - caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i)); - cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort); + caches_.at(i)->MemReqPorts.at(0).bind(&cache_arb->ReqIn.at(i)); + cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPorts.at(0)); } cache_arb->ReqOut.at(0).bind(&this->MemReqPort); diff --git a/sim/simx/cache_sim.cpp b/sim/simx/cache_sim.cpp index 65a8da70b..d7d1727f6 100644 --- a/sim/simx/cache_sim.cpp +++ b/sim/simx/cache_sim.cpp @@ -19,6 +19,7 @@ #include #include #include +#include using namespace vortex; @@ -315,27 +316,74 @@ public: simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i)); bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i)); } - bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0)); + bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPorts.at(0)); + simobject->MemRspPorts.at(0).bind(&bypass_switch_->RspOut.at(0)); return; } - bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); - bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0)); + if (strcmp(simobject->name().c_str(), "l3cache")) { + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); + bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPorts.at(0)); + simobject->MemRspPorts.at(0).bind(&bypass_switch_->RspOut.at(0)); - if (config.B != 0) { - snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); - bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B)); - for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { - mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); - bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); + if (config.B != 0) { + snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); + bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B)); + for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { + mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); + bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); + } + bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0)); + } else { + mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); } - bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0)); } else { - mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); + uint32_t max = MAX(2, config_.num_inputs); + //printf("%s connecting\n", simobject_->name().c_str()); + //3 + if (config.B != 0) { + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, max, max); + for (uint32_t i = 0; i < max; ++i) { + //printf("%s connecting input=%d to MemPorts\n", simobject_->name().c_str(), i); + bypass_switch_->ReqOut.at(i).bind(&simobject->MemReqPorts.at(i % (1 << config.B))); + simobject->MemRspPorts.at(i % (1 << config.B)).bind(&bypass_switch_->RspOut.at(i)); + } + } else { + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); + bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPorts.at(0)); + simobject->MemRspPorts.at(0).bind(&bypass_switch_->RspOut.at(0)); + } + + if (config.B != 0) + { + snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); + bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B), (1 << config.B)); + for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) + { + //1 + //printf("%s Connecting memory ports to bank=%d\n", simobject_->name().c_str(), i); + mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); + bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); + } + //2 + if (config_.num_inputs > 1) { + for (uint32_t i = 0; i < max; ++i) { + //printf("%s connecting bank and bypass port=%d\n", simobject_->name().c_str(), i); + bank_switch_->ReqOut.at(i % (1 << config.B)).bind(&bypass_switch_->ReqIn.at(i)); + bypass_switch_->RspIn.at(i).bind(&bank_switch_->RspOut.at(i % (1 << config.B))); + } + } else { + bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0)); + } + } + else + { + mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); + } } // calculate cache initialization cycles @@ -673,8 +721,8 @@ CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config : SimObject(ctx, name) , CoreReqPorts(config.num_inputs, this) , CoreRspPorts(config.num_inputs, this) - , MemReqPort(this) - , MemRspPort(this) + , MemReqPorts((1 << config.B), this) + , MemRspPorts((1 << config.B), this) , impl_(new Impl(this, config)) {} diff --git a/sim/simx/cache_sim.h b/sim/simx/cache_sim.h index df62bf854..aad489546 100644 --- a/sim/simx/cache_sim.h +++ b/sim/simx/cache_sim.h @@ -75,8 +75,8 @@ public: std::vector> CoreReqPorts; std::vector> CoreRspPorts; - SimPort MemReqPort; - SimPort MemRspPort; + std::vector> MemReqPorts; + std::vector> MemRspPorts; CacheSim(const SimContext& ctx, const char* name, const Config& config); ~CacheSim(); diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp index ec5e3f2b6..e23df448b 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -76,8 +76,8 @@ Cluster::Cluster(const SimContext& ctx, 2, // pipeline latency }); - l2cache_->MemReqPort.bind(&this->mem_req_port); - this->mem_rsp_port.bind(&l2cache_->MemRspPort); + l2cache_->MemReqPorts.at(0).bind(&this->mem_req_port); + this->mem_rsp_port.bind(&l2cache_->MemRspPorts.at(0)); icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0)); l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0)); diff --git a/sim/simx/constants.h b/sim/simx/constants.h index 09a509ce1..81a626b84 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -22,7 +22,7 @@ #endif #ifndef MEMORY_BANKS -#define MEMORY_BANKS 2 +#define MEMORY_BANKS 8 #endif #define LSU_WORD_SIZE (XLEN / 8) diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 7ed9a10f9..d76113249 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -455,6 +455,9 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { CSR_READ_64(VX_CSR_MPM_LMEM_READS, lmem_perf.reads); CSR_READ_64(VX_CSR_MPM_LMEM_WRITES, lmem_perf.writes); CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls); + + CSR_READ_64(VX_CSR_HBM_BANK_CNTR, proc_perf.memsim.counter); + CSR_READ_64(VX_CSR_HBM_BANK_TICK, proc_perf.memsim.ticks); } } break; default: { diff --git a/sim/simx/mem_sim.cpp b/sim/simx/mem_sim.cpp index a12713fea..6d8015d1f 100644 --- a/sim/simx/mem_sim.cpp +++ b/sim/simx/mem_sim.cpp @@ -33,6 +33,7 @@ private: struct DramCallbackArgs { MemSim* simobject; MemReq request; + uint32_t i; }; public: @@ -56,46 +57,49 @@ public: void tick() { dram_sim_.tick(); + uint32_t counter = 0; - if (simobject_->MemReqPort.empty()) - return; + for (uint32_t i = 0; i < L3_NUM_BANKS; ++i) { + if (simobject_->MemReqPorts.at(i).empty()) + continue; - auto& mem_req = simobject_->MemReqPort.front(); + auto& mem_req = simobject_->MemReqPorts.at(i).front(); - // try to enqueue the request to the memory system - auto req_args = new DramCallbackArgs{simobject_, mem_req}; - auto enqueue_success = dram_sim_.send_request( - mem_req.write, - mem_req.addr, - 0, - [](void* arg) { - auto rsp_args = reinterpret_cast(arg); - // only send a response for read requests - if (!rsp_args->request.write) { - MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid}; - rsp_args->simobject->MemRspPort.push(mem_rsp, 1); - DT(3, rsp_args->simobject->name() << " mem-rsp: " << mem_rsp); - } - delete rsp_args; - }, - req_args - ); + // try to enqueue the request to the memory system + auto req_args = new DramCallbackArgs{simobject_, mem_req, i}; + auto enqueue_success = dram_sim_.send_request( + mem_req.write, + mem_req.addr, + i, + [](void* arg) { + auto rsp_args = reinterpret_cast(arg); + // only send a response for read requests + if (!rsp_args->request.write) { + MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid}; + rsp_args->simobject->MemRspPorts.at(rsp_args->i).push(mem_rsp, 1); + DT(3, rsp_args->simobject->name() << " mem-rsp: " << mem_rsp); + } + delete rsp_args; + }, + req_args + ); - // check if the request was enqueued successfully - if (!enqueue_success) { - delete req_args; - return; + // check if the request was enqueued successfully + if (!enqueue_success) { + delete req_args; + continue; + } + + DT(3, simobject_->name() << " mem-req: " << mem_req << " bank: " << i); + + simobject_->MemReqPorts.at(i).pop(); + counter++; } - if (mem_req.write) { - ++perf_stats_.writes; - } else { - ++perf_stats_.reads; + perf_stats_.counter += counter; + if (counter > 0) { + ++perf_stats_.ticks; } - - DT(3, simobject_->name() << " mem-req: " << mem_req); - - simobject_->MemReqPort.pop(); } }; @@ -103,8 +107,8 @@ public: MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config) : SimObject(ctx, name) - , MemReqPort(this) - , MemRspPort(this) + , MemReqPorts(L3_NUM_BANKS, this) + , MemRspPorts(L3_NUM_BANKS, this) , impl_(new Impl(this, config)) {} @@ -118,4 +122,8 @@ void MemSim::reset() { void MemSim::tick() { impl_->tick(); +} + +const MemSim::PerfStats &MemSim::perf_stats() const { + return impl_->perf_stats(); } \ No newline at end of file diff --git a/sim/simx/mem_sim.h b/sim/simx/mem_sim.h index 3f4d9801e..2f4f96187 100644 --- a/sim/simx/mem_sim.h +++ b/sim/simx/mem_sim.h @@ -26,17 +26,23 @@ public: }; struct PerfStats { - uint64_t reads; - uint64_t writes; + uint64_t counter; + uint64_t ticks; PerfStats() - : reads(0) - , writes(0) + : counter(0) + , ticks(0) {} + + PerfStats& operator+=(const PerfStats& rhs) { + this->counter += rhs.counter; + this->ticks += rhs.ticks; + return *this; + } }; - SimPort MemReqPort; - SimPort MemRspPort; + std::vector> MemReqPorts; + std::vector> MemRspPorts; MemSim(const SimContext& ctx, const char* name, const Config& config); ~MemSim(); diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index 3807fa5e8..b3664f3fa 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -47,8 +47,10 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) ); // connect L3 memory ports - l3cache_->MemReqPort.bind(&memsim_->MemReqPort); - memsim_->MemRspPort.bind(&l3cache_->MemRspPort); + for (uint32_t i = 0; i < L3_NUM_BANKS; ++i) { + l3cache_->MemReqPorts.at(i).bind(&memsim_->MemReqPorts.at(i)); + memsim_->MemRspPorts.at(i).bind(&l3cache_->MemRspPorts.at(i)); + } // create clusters for (uint32_t i = 0; i < arch.num_clusters(); ++i) { @@ -59,16 +61,18 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) } // set up memory profiling - memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){ - __unused (cycle); - perf_mem_reads_ += !req.write; - perf_mem_writes_ += req.write; - perf_mem_pending_reads_ += !req.write; - }); - memsim_->MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){ - __unused (cycle); - --perf_mem_pending_reads_; - }); + for (uint32_t i = 0; i < L3_NUM_BANKS; ++i) { + memsim_->MemReqPorts.at(i).tx_callback([&](const MemReq& req, uint64_t cycle){ + __unused (cycle); + perf_mem_reads_ += !req.write; + perf_mem_writes_ += req.write; + perf_mem_pending_reads_ += !req.write; + }); + memsim_->MemRspPorts.at(i).tx_callback([&](const MemRsp&, uint64_t cycle){ + __unused (cycle); + --perf_mem_pending_reads_; + }); + } #ifndef NDEBUG // dump device configuration @@ -131,6 +135,7 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const { perf.mem_writes = perf_mem_writes_; perf.mem_latency = perf_mem_latency_; perf.l3cache = l3cache_->perf_stats(); + perf.memsim = memsim_->perf_stats(); return perf; } diff --git a/sim/simx/processor_impl.h b/sim/simx/processor_impl.h index dcfba84d7..cffeffbfe 100644 --- a/sim/simx/processor_impl.h +++ b/sim/simx/processor_impl.h @@ -25,6 +25,7 @@ class ProcessorImpl { public: struct PerfStats { CacheSim::PerfStats l3cache; + MemSim::PerfStats memsim; uint64_t mem_reads; uint64_t mem_writes; uint64_t mem_latency; From bab9496117a2438e54ebc1429969a3eb281f539c Mon Sep 17 00:00:00 2001 From: sij814 Date: Mon, 12 Aug 2024 03:52:48 -0700 Subject: [PATCH 82/89] debugging segmentation fault with 8 clusters --- sim/common/dram_sim.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sim/common/dram_sim.cpp b/sim/common/dram_sim.cpp index c2a9e9ee0..aa6f882e1 100644 --- a/sim/common/dram_sim.cpp +++ b/sim/common/dram_sim.cpp @@ -46,7 +46,6 @@ public: dram_config["MemorySystem"]["Controller"]["impl"] = "Generic"; dram_config["MemorySystem"]["Controller"]["Scheduler"]["impl"] = "FRFCFS"; dram_config["MemorySystem"]["Controller"]["RefreshManager"]["impl"] = "AllBank"; - dram_config["MemorySystem"]["Controller"]["RefreshManager"]["impl"] = "AllBank"; dram_config["MemorySystem"]["Controller"]["RowPolicy"]["impl"] = "OpenRowPolicy"; { YAML::Node draw_plugin; @@ -67,7 +66,9 @@ public: auto original_buf = std::cout.rdbuf(); std::cout.rdbuf(nullstream.rdbuf()); ramulator_frontend_->finalize(); - ramulator_memorysystem_->finalize(); + ramulator_memorysystem_->finalize(); + delete ramulator_frontend_; + delete ramulator_memorysystem_; std::cout.rdbuf(original_buf); } From 47427ab22e20984d615f4574afde1a6cdd4f053d Mon Sep 17 00:00:00 2001 From: sij814 Date: Mon, 12 Aug 2024 16:22:30 -0700 Subject: [PATCH 83/89] regression test with source_id 0 --- sim/common/dram_sim.cpp | 2 -- sim/simx/cache_sim.cpp | 1 + sim/simx/mem_sim.cpp | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sim/common/dram_sim.cpp b/sim/common/dram_sim.cpp index aa6f882e1..684dd6f7d 100644 --- a/sim/common/dram_sim.cpp +++ b/sim/common/dram_sim.cpp @@ -67,8 +67,6 @@ public: std::cout.rdbuf(nullstream.rdbuf()); ramulator_frontend_->finalize(); ramulator_memorysystem_->finalize(); - delete ramulator_frontend_; - delete ramulator_memorysystem_; std::cout.rdbuf(original_buf); } diff --git a/sim/simx/cache_sim.cpp b/sim/simx/cache_sim.cpp index d7d1727f6..ca98c1e5f 100644 --- a/sim/simx/cache_sim.cpp +++ b/sim/simx/cache_sim.cpp @@ -340,6 +340,7 @@ public: bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); } } else { + // TODO: Change this into a crossbar uint32_t max = MAX(2, config_.num_inputs); //printf("%s connecting\n", simobject_->name().c_str()); //3 diff --git a/sim/simx/mem_sim.cpp b/sim/simx/mem_sim.cpp index 6d8015d1f..04395683a 100644 --- a/sim/simx/mem_sim.cpp +++ b/sim/simx/mem_sim.cpp @@ -70,7 +70,7 @@ public: auto enqueue_success = dram_sim_.send_request( mem_req.write, mem_req.addr, - i, + 0, [](void* arg) { auto rsp_args = reinterpret_cast(arg); // only send a response for read requests From 1a9a04ac7602ed7c88f799af5a179bc900ca0efe Mon Sep 17 00:00:00 2001 From: donghanyuan Date: Tue, 13 Aug 2024 18:06:53 +0800 Subject: [PATCH 84/89] replace local static allocator to global static Ensure MemoryPool construct before SimPlatform, thus MemoryPool destruct after SimPlatform. Avoid use-after-free issue clearing events_ of SimPlatform after SimPortEvent's allocator is destructed. --- sim/common/simobject.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sim/common/simobject.h b/sim/common/simobject.h index f4c84e3f3..31fc4c0e6 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -168,23 +168,23 @@ public: {} void* operator new(size_t /*size*/) { - return allocator().allocate(); + return allocator_.allocate(); } void operator delete(void* ptr) { - allocator().deallocate(ptr); + allocator_.deallocate(ptr); } protected: Func func_; Pkt pkt_; - static MemoryPool>& allocator() { - static MemoryPool> instance(64); - return instance; - } + static MemoryPool> allocator_; }; +template +MemoryPool> SimCallEvent::allocator_(64); + /////////////////////////////////////////////////////////////////////////////// template @@ -201,23 +201,23 @@ public: {} void* operator new(size_t /*size*/) { - return allocator().allocate(); + return allocator_.allocate(); } void operator delete(void* ptr) { - allocator().deallocate(ptr); + allocator_.deallocate(ptr); } protected: const SimPort* port_; Pkt pkt_; - static MemoryPool>& allocator() { - static MemoryPool> instance(64); - return instance; - } + static MemoryPool> allocator_; }; +template +MemoryPool> SimPortEvent::allocator_(64); + /////////////////////////////////////////////////////////////////////////////// class SimContext; From ea34239b4361c356c93495e2a3a7b0dfd335f9f1 Mon Sep 17 00:00:00 2001 From: sij814 Date: Tue, 13 Aug 2024 16:52:27 -0700 Subject: [PATCH 85/89] changes made for initial feedback --- hw/rtl/VX_config.vh | 9 +++++++++ hw/rtl/VX_types.vh | 9 ++++----- runtime/include/vortex.h | 2 +- runtime/simx/vortex.cpp | 4 ++-- runtime/stub/utils.cpp | 34 ++++++++++++++-------------------- sim/simx/cache_sim.cpp | 4 ++-- sim/simx/constants.h | 4 ---- sim/simx/emulator.cpp | 5 ++--- sim/simx/mem_sim.cpp | 6 +++--- sim/simx/processor.cpp | 4 ++-- 10 files changed, 39 insertions(+), 42 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 615c1ae6d..3ff9e3a54 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -650,6 +650,15 @@ `define L3_WRITEBACK 0 `endif +`ifndef MEMORY_BANKS +`define MEMORY_BANKS 8 +`endif + +// Number of Memory Ports from LLC +`ifndef NUM_MEM_PORTS +`define NUM_MEM_PORTS `MIN(`MEMORY_BANKS, `L3_NUM_BANKS) +`endif + // ISA Extensions ///////////////////////////////////////////////////////////// `ifdef EXT_A_ENABLE diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 685051b6c..2eac22a5a 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -166,6 +166,10 @@ `define VX_CSR_MPM_MEM_WRITES_H 12'hB99 `define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency `define VX_CSR_MPM_MEM_LT_H 12'hB9A +`define VX_CSR_MPM_MEM_BANK_CNTR 12'hB1E // memory bank requests +`define VX_CSR_MPM_MEM_BANK_CNTR_H 12'hB9E +`define VX_CSR_MPM_MEM_BANK_TICK 12'hB1F // memory ticks +`define VX_CSR_MPM_MEM_BANK_TICK_H 12'hB9F // PERF: lmem `define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads `define VX_CSR_MPM_LMEM_READS_H 12'hB9B @@ -173,11 +177,6 @@ `define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C `define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts `define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D -// PERF: hbm -`define VX_CSR_HBM_BANK_CNTR 12'hB1E // hbm banks -`define VX_CSR_HBM_BANK_CNTR_H 12'hB9E -`define VX_CSR_HBM_BANK_TICK 12'hB1F // hbm ticks -`define VX_CSR_HBM_BANK_TICK_H 12'hB9F // Machine Performance-monitoring memory counters (class 3) /////////////////// // diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h index bf263da09..853da5994 100644 --- a/runtime/include/vortex.h +++ b/runtime/include/vortex.h @@ -34,7 +34,7 @@ typedef void* vx_buffer_h; #define VX_CAPS_GLOBAL_MEM_SIZE 0x5 #define VX_CAPS_LOCAL_MEM_SIZE 0x6 #define VX_CAPS_ISA_FLAGS 0x7 -#define VX_CAPS_L3CACHE_NUM_BANKS 0x8 +#define VX_CAPS_NUM_MEM_BANKS 0x8 // device isa flags #define VX_ISA_STD_A (1ull << ISA_STD_A) diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index be7173fc3..70ceb7fc4 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -81,8 +81,8 @@ public: case VX_CAPS_ISA_FLAGS: _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; - case VX_CAPS_L3CACHE_NUM_BANKS: - _value = L3_NUM_BANKS; + case VX_CAPS_NUM_MEM_BANKS: + _value = MEMORY_BANKS; break; default: std::cout << "invalid caps id: " << caps_id << std::endl; diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp index ae894fcbb..c1f75f092 100644 --- a/runtime/stub/utils.cpp +++ b/runtime/stub/utils.cpp @@ -211,10 +211,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t mem_reads = 0; uint64_t mem_writes = 0; uint64_t mem_lat = 0; - - // PERF: hbm - uint64_t hbm_counter = 0; - uint64_t hbm_ticks = 0; + uint64_t mem_req_counter = 0; + uint64_t mem_ticks = 0; uint64_t num_cores; CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), { @@ -225,9 +223,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), { return err; }); - - uint64_t l3cache_banks; - CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_L3CACHE_NUM_BANKS, &l3cache_banks), { + + uint64_t num_mem_bank_ports; + CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_MEM_BANKS, &num_mem_bank_ports), { return err; }); @@ -531,14 +529,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MSHR_ST, core_id, &l3cache_mshr_stalls), { return err; }); - - // PERF: HBM - CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_HBM_BANK_CNTR, core_id, &hbm_counter), { - return err; - }); - CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_HBM_BANK_TICK, core_id, &hbm_ticks), { - return err; - }); } // PERF: memory CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_READS, core_id, &mem_reads), { @@ -550,6 +540,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_CNTR, core_id, &mem_req_counter), { + return err; + }); + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_TICK, core_id, &mem_ticks), { + return err; + }); } } break; default: @@ -616,22 +612,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads); int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes); int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls); - int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls); + int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls); fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads); fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes); fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio); fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio); fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, bank_utilization); fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization); - - // HBM - float util = (float)hbm_counter / (hbm_ticks * l3cache_banks) * 100; - fprintf(stream, "PERF: hbm bank utilization=%f\n", util); } int mem_avg_lat = caclAverage(mem_lat, mem_reads); + int memory_bank_port_utilization = calcAvgPercent(mem_req_counter, (mem_ticks * num_mem_bank_ports)); fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat); + fprintf(stream, "PERF: memory bank port utilization=%d%%\n", memory_bank_port_utilization); } break; default: break; diff --git a/sim/simx/cache_sim.cpp b/sim/simx/cache_sim.cpp index ca98c1e5f..4f357f195 100644 --- a/sim/simx/cache_sim.cpp +++ b/sim/simx/cache_sim.cpp @@ -722,8 +722,8 @@ CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config : SimObject(ctx, name) , CoreReqPorts(config.num_inputs, this) , CoreRspPorts(config.num_inputs, this) - , MemReqPorts((1 << config.B), this) - , MemRspPorts((1 << config.B), this) + , MemReqPorts(NUM_MEM_PORTS, this) + , MemRspPorts(NUM_MEM_PORTS, this) , impl_(new Impl(this, config)) {} diff --git a/sim/simx/constants.h b/sim/simx/constants.h index 81a626b84..0c707b55c 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -21,10 +21,6 @@ #define MEM_CLOCK_RATIO 1 #endif -#ifndef MEMORY_BANKS -#define MEMORY_BANKS 8 -#endif - #define LSU_WORD_SIZE (XLEN / 8) #define LSU_CHANNELS NUM_LSU_LANES #define LSU_NUM_REQS (NUM_LSU_BLOCKS * LSU_CHANNELS) diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index d76113249..3dfdf420b 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -451,13 +451,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads); CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes); CSR_READ_64(VX_CSR_MPM_MEM_LT, proc_perf.mem_latency); + CSR_READ_64(VX_CSR_MPM_MEM_BANK_CNTR, proc_perf.memsim.counter); + CSR_READ_64(VX_CSR_MPM_MEM_BANK_TICK, proc_perf.memsim.ticks); CSR_READ_64(VX_CSR_MPM_LMEM_READS, lmem_perf.reads); CSR_READ_64(VX_CSR_MPM_LMEM_WRITES, lmem_perf.writes); CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls); - - CSR_READ_64(VX_CSR_HBM_BANK_CNTR, proc_perf.memsim.counter); - CSR_READ_64(VX_CSR_HBM_BANK_TICK, proc_perf.memsim.ticks); } } break; default: { diff --git a/sim/simx/mem_sim.cpp b/sim/simx/mem_sim.cpp index 04395683a..c1ff87680 100644 --- a/sim/simx/mem_sim.cpp +++ b/sim/simx/mem_sim.cpp @@ -59,7 +59,7 @@ public: dram_sim_.tick(); uint32_t counter = 0; - for (uint32_t i = 0; i < L3_NUM_BANKS; ++i) { + for (uint32_t i = 0; i < NUM_MEM_PORTS; ++i) { if (simobject_->MemReqPorts.at(i).empty()) continue; @@ -107,8 +107,8 @@ public: MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config) : SimObject(ctx, name) - , MemReqPorts(L3_NUM_BANKS, this) - , MemRspPorts(L3_NUM_BANKS, this) + , MemReqPorts(NUM_MEM_PORTS, this) + , MemRspPorts(NUM_MEM_PORTS, this) , impl_(new Impl(this, config)) {} diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index b3664f3fa..58fabf14c 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -47,7 +47,7 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) ); // connect L3 memory ports - for (uint32_t i = 0; i < L3_NUM_BANKS; ++i) { + for (uint32_t i = 0; i < NUM_MEM_PORTS; ++i) { l3cache_->MemReqPorts.at(i).bind(&memsim_->MemReqPorts.at(i)); memsim_->MemRspPorts.at(i).bind(&l3cache_->MemRspPorts.at(i)); } @@ -61,7 +61,7 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) } // set up memory profiling - for (uint32_t i = 0; i < L3_NUM_BANKS; ++i) { + for (uint32_t i = 0; i < NUM_MEM_PORTS; ++i) { memsim_->MemReqPorts.at(i).tx_callback([&](const MemReq& req, uint64_t cycle){ __unused (cycle); perf_mem_reads_ += !req.write; From d7e8fd74ff33046e60c7580592642b8073424d96 Mon Sep 17 00:00:00 2001 From: sij814 Date: Thu, 15 Aug 2024 19:40:52 -0700 Subject: [PATCH 86/89] source_id = 0 --- sim/simx/mem_sim.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sim/simx/mem_sim.cpp b/sim/simx/mem_sim.cpp index c1ff87680..a38f4c01c 100644 --- a/sim/simx/mem_sim.cpp +++ b/sim/simx/mem_sim.cpp @@ -77,7 +77,7 @@ public: if (!rsp_args->request.write) { MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid}; rsp_args->simobject->MemRspPorts.at(rsp_args->i).push(mem_rsp, 1); - DT(3, rsp_args->simobject->name() << " mem-rsp: " << mem_rsp); + DT(3, rsp_args->simobject->name() << " mem-rsp: " << mem_rsp << " bank: " << rsp_args->i); } delete rsp_args; }, From a523afbebe248f0de5c425a4d0440555722d01e0 Mon Sep 17 00:00:00 2001 From: sij814 Date: Thu, 15 Aug 2024 22:30:32 -0700 Subject: [PATCH 87/89] removed jammy --- configure | 1 - 1 file changed, 1 deletion(-) diff --git a/configure b/configure index cab5142c5..62975784b 100755 --- a/configure +++ b/configure @@ -26,7 +26,6 @@ detect_osversion() { case "$VERSION_CODENAME" in bionic) osversion="ubuntu/bionic";; focal) osversion="ubuntu/focal";; - jammy) osversion="ubuntu/focal";; # Add new versions as needed esac ;; From 7a61b67170373cb4da487552d7e3607b18290c17 Mon Sep 17 00:00:00 2001 From: sij814 Date: Fri, 16 Aug 2024 15:47:03 -0700 Subject: [PATCH 88/89] added CAPS --- runtime/opae/vortex.cpp | 3 +++ runtime/rtlsim/vortex.cpp | 3 +++ runtime/xrt/vortex.cpp | 3 +++ 3 files changed, 9 insertions(+) diff --git a/runtime/opae/vortex.cpp b/runtime/opae/vortex.cpp index 390d5acc4..06458fa1f 100755 --- a/runtime/opae/vortex.cpp +++ b/runtime/opae/vortex.cpp @@ -232,6 +232,9 @@ public: case VX_CAPS_ISA_FLAGS: _value = isa_caps_; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = MEMORY_BANKS; + break; default: fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id); std::abort(); diff --git a/runtime/rtlsim/vortex.cpp b/runtime/rtlsim/vortex.cpp index c75a6c12f..91df7f7e8 100644 --- a/runtime/rtlsim/vortex.cpp +++ b/runtime/rtlsim/vortex.cpp @@ -77,6 +77,9 @@ public: case VX_CAPS_ISA_FLAGS: _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = MEMORY_BANKS; + break; default: std::cout << "invalid caps id: " << caps_id << std::endl; std::abort(); diff --git a/runtime/xrt/vortex.cpp b/runtime/xrt/vortex.cpp index 408bf23ed..5f4e27ff2 100644 --- a/runtime/xrt/vortex.cpp +++ b/runtime/xrt/vortex.cpp @@ -404,6 +404,9 @@ public: case VX_CAPS_ISA_FLAGS: _value = isa_caps_; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = MEMORY_BANKS; + break; default: fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id); std::abort(); From e34e4b790a623536d02db13228d570f850a9b564 Mon Sep 17 00:00:00 2001 From: sij814 Date: Fri, 16 Aug 2024 16:53:18 -0700 Subject: [PATCH 89/89] forced memory bank change in opae --- sim/opaesim/opae_sim.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sim/opaesim/opae_sim.cpp b/sim/opaesim/opae_sim.cpp index 9d43ea595..7a1bae3e4 100644 --- a/sim/opaesim/opae_sim.cpp +++ b/sim/opaesim/opae_sim.cpp @@ -35,13 +35,13 @@ #include #include -#ifndef MEMORY_BANKS +//#ifndef MEMORY_BANKS #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS #else #define MEMORY_BANKS 2 #endif -#endif +//#endif #ifndef MEM_CLOCK_RATIO #define MEM_CLOCK_RATIO 1