mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 13:57:17 -04:00
Merge pull request #226 from MichaelJSr/simx-vpu
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
Merge tensor_unit into simx-vpu (vector_unit)
This commit is contained in:
commit
403f7da2b8
63 changed files with 2035 additions and 3582 deletions
12
.github/workflows/ci.yml
vendored
12
.github/workflows/ci.yml
vendored
|
@ -27,7 +27,7 @@ jobs:
|
|||
|
||||
- name: Cache Toolchain Directory
|
||||
id: cache-toolchain
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools
|
||||
key: ${{ runner.os }}-toolchain-v0.1
|
||||
|
@ -36,7 +36,7 @@ jobs:
|
|||
|
||||
- name: Cache Third Party Directory
|
||||
id: cache-thirdparty
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: third_party
|
||||
key: ${{ runner.os }}-thirdparty-v0.1
|
||||
|
@ -79,7 +79,7 @@ jobs:
|
|||
|
||||
- name: Cache Toolchain Directory
|
||||
id: cache-toolchain
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools
|
||||
key: ${{ runner.os }}-toolchain-v0.1
|
||||
|
@ -88,7 +88,7 @@ jobs:
|
|||
|
||||
- name: Cache Third Party Directory
|
||||
id: cache-thirdparty
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: third_party
|
||||
key: ${{ runner.os }}-thirdparty-v0.1
|
||||
|
@ -130,7 +130,7 @@ jobs:
|
|||
|
||||
- name: Cache Toolchain Directory
|
||||
id: cache-toolchain
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools
|
||||
key: ${{ runner.os }}-toolchain-v0.1
|
||||
|
@ -139,7 +139,7 @@ jobs:
|
|||
|
||||
- name: Cache Third Party Directory
|
||||
id: cache-thirdparty
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: third_party
|
||||
key: ${{ runner.os }}-thirdparty-v0.1
|
||||
|
|
|
@ -104,9 +104,6 @@ regression()
|
|||
# test temp driver mode for
|
||||
./ci/blackbox.sh --driver=simx --app=vecadd --rebuild=3
|
||||
|
||||
# test for matmul
|
||||
CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1"
|
||||
|
||||
echo "regression tests done!"
|
||||
}
|
||||
|
||||
|
@ -301,11 +298,11 @@ config2()
|
|||
|
||||
# test single-bank memory
|
||||
if [ "$XLEN" == "64" ]; then
|
||||
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=48" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=48" ./ci/blackbox.sh --driver=xrt --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
|
||||
else
|
||||
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32" ./ci/blackbox.sh --driver=xrt --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
|
||||
fi
|
||||
|
||||
# test larger memory address
|
||||
|
@ -322,10 +319,10 @@ config2()
|
|||
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
|
||||
# test memory ports
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=opae --app=mstress --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=xrt --app=mstress --threads=8
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifndef EXT_M_DISABLE
|
||||
`define EXT_M_ENABLE
|
||||
`endif
|
||||
|
@ -113,24 +114,6 @@
|
|||
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
|
||||
`endif
|
||||
|
||||
// Size of Tensor Core
|
||||
`ifndef TC_SIZE
|
||||
`define TC_SIZE 8
|
||||
`endif
|
||||
|
||||
// Number of TCs per Warp
|
||||
`ifndef TC_NUM
|
||||
`define TC_NUM 4
|
||||
`endif
|
||||
|
||||
`ifndef NUM_TCU_LANES
|
||||
`define NUM_TCU_LANES `TC_NUM
|
||||
`endif
|
||||
|
||||
`ifndef NUM_TCU_BLOCKS
|
||||
`define NUM_TCU_BLOCKS `ISSUE_WIDTH
|
||||
`endif
|
||||
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_ENABLED 1
|
||||
`else
|
||||
|
@ -172,8 +155,26 @@
|
|||
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_BANKS
|
||||
`define PLATFORM_MEMORY_BANKS 2
|
||||
// Platform memory parameters
|
||||
|
||||
`ifndef PLATFORM_MEMORY_NUM_BANKS
|
||||
`define PLATFORM_MEMORY_NUM_BANKS 2
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
|
||||
`ifdef XLEN_64
|
||||
`define PLATFORM_MEMORY_ADDR_WIDTH 48
|
||||
`else
|
||||
`define PLATFORM_MEMORY_ADDR_WIDTH 32
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_DATA_SIZE
|
||||
`define PLATFORM_MEMORY_DATA_SIZE 64
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_INTERLEAVE
|
||||
`define PLATFORM_MEMORY_INTERLEAVE 1
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
|
@ -299,7 +300,8 @@
|
|||
`define MEM_PAGE_LOG2_SIZE (12)
|
||||
`endif
|
||||
|
||||
// Virtual Memory Configuration ///////////////////////////////////////////////////////
|
||||
// Virtual Memory Configuration ///////////////////////////////////////////////
|
||||
|
||||
`ifdef VM_ENABLE
|
||||
`ifdef XLEN_32
|
||||
`ifndef VM_ADDR_MODE
|
||||
|
@ -527,6 +529,12 @@
|
|||
`define FNCP_PE_RATIO 2
|
||||
`endif
|
||||
|
||||
// Tensore Units //////////////////////////////////////////////////////////////
|
||||
|
||||
`ifndef NUM_TENSOR_CORES
|
||||
`define NUM_TENSOR_CORES `ISSUE_WIDTH
|
||||
`endif
|
||||
|
||||
// Icache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
||||
// Cache Enable
|
||||
|
@ -656,9 +664,9 @@
|
|||
// Number of Memory Ports
|
||||
`ifndef L1_MEM_PORTS
|
||||
`ifdef L1_DISABLE
|
||||
`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_BANKS)
|
||||
`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`else
|
||||
`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_BANKS)
|
||||
`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
@ -735,9 +743,9 @@
|
|||
// Number of Memory Ports
|
||||
`ifndef L2_MEM_PORTS
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_BANKS)
|
||||
`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`else
|
||||
`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_BANKS)
|
||||
`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
@ -796,9 +804,9 @@
|
|||
// Number of Memory Ports
|
||||
`ifndef L3_MEM_PORTS
|
||||
`ifdef L3_ENABLE
|
||||
`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_BANKS)
|
||||
`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`else
|
||||
`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_BANKS)
|
||||
`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
@ -846,6 +854,12 @@
|
|||
`define EXT_ZICOND_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef EXT_TPU_ENABLE
|
||||
`define EXT_TPU_ENABLED 1
|
||||
`else
|
||||
`define EXT_TPU_ENABLED 0
|
||||
`endif
|
||||
|
||||
`define ISA_STD_A 0
|
||||
`define ISA_STD_C 2
|
||||
`define ISA_STD_D 3
|
||||
|
|
|
@ -229,10 +229,4 @@
|
|||
`define VX_CSR_NUM_CORES 12'hFC2
|
||||
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
|
||||
|
||||
`define VX_MAT_MUL_SIZE 12'hFC4 // VX_MAT_MUL_SIZE = Matrix Size / TC Size
|
||||
`define VX_TC_NUM 12'hFC5
|
||||
`define VX_TC_SIZE 12'hFC6
|
||||
|
||||
|
||||
|
||||
`endif // VX_TYPES_VH
|
||||
|
|
|
@ -193,7 +193,7 @@ module Vortex_axi import VX_gpu_pkg::*; #(
|
|||
.TAG_WIDTH_OUT (AXI_TID_WIDTH),
|
||||
.NUM_PORTS_IN (`VX_MEM_PORTS),
|
||||
.NUM_BANKS_OUT (AXI_NUM_BANKS),
|
||||
.INTERLEAVE (0),
|
||||
.INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
|
||||
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
|
||||
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 2 : 0)
|
||||
) axi_adapter (
|
||||
|
|
|
@ -28,18 +28,18 @@
|
|||
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
// POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
//`include "platform_afu_top_config.vh"
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH (`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_DATA_WIDTH/8))
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH ((`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS)) - $clog2(`PLATFORM_MEMORY_DATA_SIZE))
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH `PLATFORM_MEMORY_DATA_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH (`PLATFORM_MEMORY_DATA_SIZE * 8)
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH `PLATFORM_MEMORY_BURST_CNT_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
|
||||
`endif
|
||||
|
||||
package local_mem_cfg_pkg;
|
||||
|
|
|
@ -11,18 +11,14 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifndef NOPAE
|
||||
`include "afu_json_info.vh"
|
||||
`else
|
||||
`include "vortex_afu.vh"
|
||||
`endif
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifndef PLATFORM_MEMORY_INTERLEAVE
|
||||
`define PLATFORM_MEMORY_INTERLEAVE 1
|
||||
`endif
|
||||
|
||||
module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_gpu_pkg::*; #(
|
||||
parameter NUM_LOCAL_MEM_BANKS = 2
|
||||
) (
|
||||
|
|
|
@ -134,10 +134,12 @@ module VX_afu_ctrl #(
|
|||
RSTATE_RESP = 2'd2,
|
||||
RSTATE_WIDTH = 2;
|
||||
|
||||
localparam MEMORY_BANK_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - `CLOG2(`PLATFORM_MEMORY_NUM_BANKS);
|
||||
|
||||
// device caps
|
||||
wire [63:0] dev_caps = {8'b0,
|
||||
5'(`PLATFORM_MEMORY_ADDR_WIDTH-20),
|
||||
3'(`CLOG2(`PLATFORM_MEMORY_BANKS)),
|
||||
5'(MEMORY_BANK_ADDR_WIDTH-20),
|
||||
3'(`CLOG2(`PLATFORM_MEMORY_NUM_BANKS)),
|
||||
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
|
||||
16'(`NUM_CORES * `NUM_CLUSTERS),
|
||||
8'(`NUM_WARPS),
|
||||
|
|
|
@ -16,12 +16,16 @@
|
|||
`include "vortex_afu.vh"
|
||||
|
||||
module VX_afu_wrap #(
|
||||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = 512,
|
||||
parameter C_M_AXI_MEM_ADDR_WIDTH = 25,
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = 2
|
||||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_SIZE * 8,
|
||||
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
|
||||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = 1
|
||||
`else
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
|
||||
`endif
|
||||
) (
|
||||
// System signals
|
||||
input wire clk,
|
||||
|
@ -31,7 +35,7 @@ module VX_afu_wrap #(
|
|||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`else
|
||||
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`endif
|
||||
// AXI4-Lite slave interface
|
||||
input wire s_axi_ctrl_awvalid,
|
||||
|
@ -58,11 +62,7 @@ module VX_afu_wrap #(
|
|||
|
||||
output wire interrupt
|
||||
);
|
||||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH + $clog2(`PLATFORM_MEMORY_BANKS);
|
||||
`else
|
||||
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH;
|
||||
`endif
|
||||
|
||||
typedef enum logic [1:0] {
|
||||
STATE_IDLE = 0,
|
||||
|
@ -71,8 +71,8 @@ module VX_afu_wrap #(
|
|||
STATE_DONE = 3
|
||||
} state_e;
|
||||
|
||||
localparam PENDING_SIZEW = 12; // max outstanding requests size
|
||||
localparam C_M_AXI_MEM_NUM_BANKS_SW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
|
||||
localparam PENDING_WR_SIZEW = 12; // max outstanding requests size
|
||||
localparam NUM_MEM_BANKS_SIZEW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
|
||||
|
||||
wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
@ -108,11 +108,11 @@ module VX_afu_wrap #(
|
|||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
`REPEAT (1, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
|
||||
`else
|
||||
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
|
||||
`endif
|
||||
|
||||
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
|
||||
reg [PENDING_SIZEW-1:0] vx_pending_writes;
|
||||
reg [PENDING_WR_SIZEW-1:0] vx_pending_writes;
|
||||
reg vx_reset = 1; // asserted at initialization
|
||||
wire vx_busy;
|
||||
|
||||
|
@ -200,7 +200,7 @@ module VX_afu_wrap #(
|
|||
end
|
||||
|
||||
wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire;
|
||||
wire [C_M_AXI_MEM_NUM_BANKS_SW-1:0] cur_wr_reqs, cur_wr_rsps;
|
||||
wire [NUM_MEM_BANKS_SIZEW-1:0] cur_wr_reqs, cur_wr_rsps;
|
||||
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire
|
||||
VX_axi_write_ack axi_write_ack (
|
||||
|
@ -224,14 +224,14 @@ module VX_afu_wrap #(
|
|||
`POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire);
|
||||
`POP_COUNT(cur_wr_rsps, m_axi_wr_rsp_fire);
|
||||
|
||||
wire signed [C_M_AXI_MEM_NUM_BANKS_SW:0] reqs_sub = (C_M_AXI_MEM_NUM_BANKS_SW+1)'(cur_wr_reqs) -
|
||||
(C_M_AXI_MEM_NUM_BANKS_SW+1)'(cur_wr_rsps);
|
||||
wire signed [NUM_MEM_BANKS_SIZEW:0] reqs_sub = (NUM_MEM_BANKS_SIZEW+1)'(cur_wr_reqs) -
|
||||
(NUM_MEM_BANKS_SIZEW+1)'(cur_wr_rsps);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
vx_pending_writes <= '0;
|
||||
end else begin
|
||||
vx_pending_writes <= vx_pending_writes + PENDING_SIZEW'(reqs_sub);
|
||||
vx_pending_writes <= vx_pending_writes + PENDING_WR_SIZEW'(reqs_sub);
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -270,7 +270,7 @@ module VX_afu_wrap #(
|
|||
.ap_ready (ap_ready),
|
||||
.ap_idle (ap_idle),
|
||||
.interrupt (interrupt),
|
||||
|
||||
|
||||
.ap_ctrl_read (ap_ctrl_read),
|
||||
|
||||
`ifdef SCOPE
|
||||
|
@ -287,9 +287,8 @@ module VX_afu_wrap #(
|
|||
wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_u [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_addressing
|
||||
localparam [C_M_AXI_MEM_ADDR_WIDTH-1:0] BANK_OFFSET = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET) + C_M_AXI_MEM_ADDR_WIDTH'(i) << M_AXI_MEM_ADDR_WIDTH;
|
||||
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + BANK_OFFSET;
|
||||
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + BANK_OFFSET;
|
||||
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET);
|
||||
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET);
|
||||
end
|
||||
|
||||
`SCOPE_IO_SWITCH (2);
|
||||
|
|
|
@ -17,12 +17,12 @@ module vortex_afu #(
|
|||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
|
||||
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
|
||||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = 1
|
||||
`else
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
|
||||
`endif
|
||||
) (
|
||||
// System signals
|
||||
|
@ -33,7 +33,7 @@ module vortex_afu #(
|
|||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`else
|
||||
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`endif
|
||||
|
||||
// AXI4-Lite slave interface
|
||||
|
@ -75,7 +75,7 @@ module vortex_afu #(
|
|||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
`REPEAT (1, AXI_MEM_ARGS, REPEAT_COMMA),
|
||||
`else
|
||||
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
|
||||
`endif
|
||||
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
|
||||
.s_axi_ctrl_awready (s_axi_ctrl_awready),
|
||||
|
@ -94,7 +94,7 @@ module vortex_afu #(
|
|||
.s_axi_ctrl_rready (s_axi_ctrl_rready),
|
||||
.s_axi_ctrl_rdata (s_axi_ctrl_rdata),
|
||||
.s_axi_ctrl_rresp (s_axi_ctrl_rresp),
|
||||
|
||||
|
||||
.s_axi_ctrl_bvalid (s_axi_ctrl_bvalid),
|
||||
.s_axi_ctrl_bready (s_axi_ctrl_bready),
|
||||
.s_axi_ctrl_bresp (s_axi_ctrl_bresp),
|
||||
|
|
|
@ -14,18 +14,6 @@
|
|||
`ifndef VORTEX_AFU_VH
|
||||
`define VORTEX_AFU_VH
|
||||
|
||||
`ifndef PLATFORM_MEMORY_BANKS
|
||||
`define PLATFORM_MEMORY_BANKS 2
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
|
||||
`define PLATFORM_MEMORY_ADDR_WIDTH 31
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_DATA_WIDTH
|
||||
`define PLATFORM_MEMORY_DATA_WIDTH 512
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_OFFSET
|
||||
`define PLATFORM_MEMORY_OFFSET 0
|
||||
`endif
|
||||
|
|
|
@ -221,7 +221,7 @@ module VX_async_ram_patch #(
|
|||
VX_placeholder #(
|
||||
.O (1)
|
||||
) placeholder2 (
|
||||
.in (),
|
||||
.in (1'b0),
|
||||
.out (is_raddr_reg)
|
||||
);
|
||||
wire [DATAW-1:0] rdata_a;
|
||||
|
|
|
@ -251,7 +251,13 @@ module VX_axi_adapter #(
|
|||
// AXI write address channel
|
||||
|
||||
assign m_axi_awvalid[i] = req_xbar_valid_out[i] && xbar_rw_out && ~m_axi_aw_ack;
|
||||
assign m_axi_awaddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE;
|
||||
|
||||
if (INTERLEAVE) begin : g_m_axi_awaddr_i
|
||||
assign m_axi_awaddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << (BANK_SEL_BITS + LOG2_DATA_SIZE)) | (ADDR_WIDTH_OUT'(i) << LOG2_DATA_SIZE);
|
||||
end else begin : g_m_axi_awaddr_ni
|
||||
assign m_axi_awaddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE) | (ADDR_WIDTH_OUT'(i) << (BANK_ADDR_WIDTH + LOG2_DATA_SIZE));
|
||||
end
|
||||
|
||||
assign m_axi_awid[i] = TAG_WIDTH_OUT'(xbar_tag_out);
|
||||
assign m_axi_awlen[i] = 8'b00000000;
|
||||
assign m_axi_awsize[i] = 3'(LOG2_DATA_SIZE);
|
||||
|
@ -280,7 +286,13 @@ module VX_axi_adapter #(
|
|||
end
|
||||
|
||||
assign m_axi_arvalid[i] = req_xbar_valid_out[i] && ~xbar_rw_out;
|
||||
assign m_axi_araddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE;
|
||||
|
||||
// convert address to byte-addressable space
|
||||
if (INTERLEAVE) begin : g_m_axi_araddr_i
|
||||
assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << (BANK_SEL_BITS + LOG2_DATA_SIZE)) | (ADDR_WIDTH_OUT'(i) << LOG2_DATA_SIZE);
|
||||
end else begin : g_m_axi_araddr_ni
|
||||
assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE) | (ADDR_WIDTH_OUT'(i) << (BANK_ADDR_WIDTH + LOG2_DATA_SIZE));
|
||||
end
|
||||
assign m_axi_arid[i] = TAG_WIDTH_OUT'(xbar_tag_r_out);
|
||||
assign m_axi_arlen[i] = 8'b00000000;
|
||||
assign m_axi_arsize[i] = 3'(LOG2_DATA_SIZE);
|
||||
|
|
|
@ -7,22 +7,6 @@ include ../../common.mk
|
|||
# AFU parameters
|
||||
CONFIGS += -DNOPAE
|
||||
CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
|
||||
ifeq ($(XLEN),64)
|
||||
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
|
||||
else
|
||||
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
|
||||
endif
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_BURST_CNT_WIDTH=4
|
||||
endif
|
||||
|
||||
#CONFIGS += -DNUM_CORES=2
|
||||
#CONFIGS += -DNUM_WARPS=32
|
||||
|
|
|
@ -99,7 +99,7 @@ ifdef PERF
|
|||
endif
|
||||
|
||||
# ast dump flags
|
||||
XML_CFLAGS = $(filter-out -DSYNTHESIS -DQUARTUS, $(CFLAGS)) $(RTL_PKGS) -I$(AFU_DIR)/ccip -I$(DPI_DIR) -DPLATFORM_PROVIDES_LOCAL_MEMORY -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32 -DPLATFORM_MEMORY_DATA_WIDTH=512 -DPLATFORM_MEMORY_BURST_CNT_WIDTH=4 -DNOPAE -DSV_DPI
|
||||
XML_CFLAGS = $(filter-out -DSYNTHESIS -DQUARTUS, $(CFLAGS)) $(RTL_PKGS) -I$(AFU_DIR)/ccip -I$(DPI_DIR) -DPLATFORM_PROVIDES_LOCAL_MEMORY -DPLATFORM_MEMORY_NUM_BANKS=1 -DNOPAE -DSV_DPI
|
||||
|
||||
all: swconfig ip-gen setup build
|
||||
|
||||
|
|
|
@ -52,7 +52,7 @@ foreach def $vdefines_list {
|
|||
if { $name == "CHIPSCOPE" } {
|
||||
set chipscope 1
|
||||
}
|
||||
if { $name == "PLATFORM_MEMORY_BANKS" } {
|
||||
if { $name == "PLATFORM_MEMORY_NUM_BANKS" } {
|
||||
set num_banks [lindex $fields 1]
|
||||
}
|
||||
if { $name == "PLATFORM_MERGED_MEMORY_INTERFACE" } {
|
||||
|
|
|
@ -5,31 +5,36 @@ CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
|
|||
|
||||
ifeq ($(DEV_ARCH), zynquplus)
|
||||
# zynquplus
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
|
||||
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
|
||||
else ifeq ($(DEV_ARCH), versal)
|
||||
# versal
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
|
||||
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
|
||||
ifneq ($(findstring xilinx_vck5000,$(XSA)),)
|
||||
CONFIGS += -DPLATFORM_MEMORY_OFFSET=40'hC000000000
|
||||
endif
|
||||
else
|
||||
# alveo
|
||||
ifneq ($(findstring xilinx_u55c,$(XSA)),)
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=29
|
||||
# 16 GB of HBM2 with 32 channels (512 MB per channel)
|
||||
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=34
|
||||
CONFIGS += -DPLATFORM_MERGED_MEMORY_INTERFACE
|
||||
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
|
||||
#VPP_FLAGS += $(foreach i,$(shell seq 0 31), --connectivity.sp vortex_afu_1.m_axi_mem_$(i):HBM[$(i)])
|
||||
else ifneq ($(findstring xilinx_u50,$(XSA)),)
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=28
|
||||
# 8 GB of HBM2 with 32 channels (256 MB per channel)
|
||||
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33
|
||||
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
|
||||
else ifneq ($(findstring xilinx_u280,$(XSA)),)
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=28
|
||||
# 8 GB of HBM2 with 32 channels (256 MB per channel)
|
||||
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33
|
||||
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
|
||||
else ifneq ($(findstring xilinx_u250,$(XSA)),)
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=34
|
||||
# 64 GB of DDR4 with 4 channels (16 GB per channel)
|
||||
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36
|
||||
else ifneq ($(findstring xilinx_u200,$(XSA)),)
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=34
|
||||
# 64 GB of DDR4 with 4 channels (16 GB per channel)
|
||||
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36
|
||||
else
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
|
||||
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
|
||||
endif
|
||||
endif
|
||||
|
|
|
@ -221,22 +221,28 @@ inline void vx_fence() {
|
|||
__asm__ volatile ("fence iorw, iorw");
|
||||
}
|
||||
|
||||
//Matrix load
|
||||
inline void vx_matrix_load(unsigned dest, unsigned addr)
|
||||
{
|
||||
asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
|
||||
inline unsigned vx_u4_mmadd(unsigned a, unsigned b, unsigned c) {
|
||||
unsigned ret;
|
||||
asm volatile (".insn r4 %1, 0, %2, %0, %3, %4, %5" : "=r"(ret) : "i"(RISCV_CUSTOM1), "i"(0), "r"(a), "r"(b), "r"(c));
|
||||
return ret;
|
||||
}
|
||||
|
||||
//Matrix Store
|
||||
inline void vx_matrix_store(unsigned addr)
|
||||
{
|
||||
asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
|
||||
inline unsigned vx_u8_mmadd(unsigned a, unsigned b, unsigned c) {
|
||||
unsigned ret;
|
||||
asm volatile (".insn r4 %1, 0, %2, %0, %3, %4, %5" : "=r"(ret) : "i"(RISCV_CUSTOM1), "i"(1), "r"(a), "r"(b), "r"(c));
|
||||
return ret;
|
||||
}
|
||||
|
||||
//Matrix Mul
|
||||
inline void vx_matrix_mul()
|
||||
{
|
||||
asm volatile (".insn i 0x7b, 2, x0, 0(x0)");
|
||||
inline unsigned vx_f16_mmadd(unsigned a, unsigned b, unsigned c) {
|
||||
unsigned ret;
|
||||
asm volatile (".insn r4 %1, 0, %2, %0, %3, %4, %5" : "=r"(ret) : "i"(RISCV_CUSTOM1), "i"(2), "r"(a), "r"(b), "r"(c));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline unsigned vx_bf16_mmadd(unsigned a, unsigned b, unsigned c) {
|
||||
unsigned ret;
|
||||
asm volatile (".insn r4 %1, 0, %2, %0, %3, %4, %5" : "=r"(ret) : "i"(RISCV_CUSTOM1), "i"(3), "r"(a), "r"(b), "r"(c));
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -36,8 +36,6 @@ typedef void* vx_buffer_h;
|
|||
#define VX_CAPS_ISA_FLAGS 0x7
|
||||
#define VX_CAPS_NUM_MEM_BANKS 0x8
|
||||
#define VX_CAPS_MEM_BANK_SIZE 0x9
|
||||
#define VX_CAPS_TC_SIZE 0xA
|
||||
#define VX_CAPS_TC_NUM 0xB
|
||||
|
||||
// device isa flags
|
||||
#define VX_ISA_STD_A (1ull << ISA_STD_A)
|
||||
|
|
|
@ -78,10 +78,10 @@ public:
|
|||
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
|
||||
break;
|
||||
case VX_CAPS_NUM_MEM_BANKS:
|
||||
_value = PLATFORM_MEMORY_BANKS;
|
||||
_value = PLATFORM_MEMORY_NUM_BANKS;
|
||||
break;
|
||||
case VX_CAPS_MEM_BANK_SIZE:
|
||||
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_BANKS);
|
||||
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS);
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
|
|
|
@ -94,12 +94,6 @@ public:
|
|||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_TC_SIZE:
|
||||
_value = TC_SIZE;
|
||||
break;
|
||||
case VX_CAPS_TC_NUM:
|
||||
_value = TC_NUM;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
|
@ -113,10 +107,10 @@ public:
|
|||
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
|
||||
break;
|
||||
case VX_CAPS_NUM_MEM_BANKS:
|
||||
_value = PLATFORM_MEMORY_BANKS;
|
||||
_value = PLATFORM_MEMORY_NUM_BANKS;
|
||||
break;
|
||||
case VX_CAPS_MEM_BANK_SIZE:
|
||||
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_BANKS);
|
||||
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS);
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
|
|
|
@ -66,22 +66,22 @@ constexpr unsigned ceil2(T value) {
|
|||
return (sizeof(T) * 8) - count_leading_zeros<T>(value);
|
||||
}
|
||||
|
||||
inline uint64_t bit_clr(uint64_t bits, uint32_t index) {
|
||||
constexpr uint64_t bit_clr(uint64_t bits, uint32_t index) {
|
||||
assert(index <= 63);
|
||||
return bits & ~(1ull << index);
|
||||
}
|
||||
|
||||
inline uint64_t bit_set(uint64_t bits, uint32_t index) {
|
||||
constexpr uint64_t bit_set(uint64_t bits, uint32_t index) {
|
||||
assert(index <= 63);
|
||||
return bits | (1ull << index);
|
||||
}
|
||||
|
||||
inline bool bit_get(uint64_t bits, uint32_t index) {
|
||||
constexpr bool bit_get(uint64_t bits, uint32_t index) {
|
||||
assert(index <= 63);
|
||||
return (bits >> index) & 0x1;
|
||||
}
|
||||
|
||||
inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
|
||||
constexpr uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
|
||||
assert(end >= start);
|
||||
assert(end <= 63);
|
||||
uint32_t shift = 63 - end;
|
||||
|
@ -89,7 +89,7 @@ inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
|
|||
return bits & ~mask;
|
||||
}
|
||||
|
||||
inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
|
||||
constexpr uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
|
||||
assert(end >= start);
|
||||
assert(end <= 63);
|
||||
uint32_t shift = 63 - end;
|
||||
|
@ -97,14 +97,14 @@ inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t v
|
|||
return bit_clrw(bits, start, end) | dirty;
|
||||
}
|
||||
|
||||
inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
|
||||
constexpr uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
|
||||
assert(end >= start);
|
||||
assert(end <= 63);
|
||||
uint32_t shift = 63 - end;
|
||||
return (bits << shift) >> (shift + start);
|
||||
}
|
||||
|
||||
inline uint64_t bit_reverse(uint64_t bits) {
|
||||
constexpr uint64_t bit_reverse(uint64_t bits) {
|
||||
bits = ((bits & 0xAAAAAAAAAAAAAAAA) >> 1) | ((bits & 0x5555555555555555) << 1);
|
||||
bits = ((bits & 0xCCCCCCCCCCCCCCCC) >> 2) | ((bits & 0x3333333333333333) << 2);
|
||||
bits = ((bits & 0xF0F0F0F0F0F0F0F0) >> 4) | ((bits & 0x0F0F0F0F0F0F0F0F) << 4);
|
||||
|
@ -114,7 +114,7 @@ inline uint64_t bit_reverse(uint64_t bits) {
|
|||
return bits;
|
||||
}
|
||||
|
||||
inline uint64_t bit_reverse(uint64_t bits, uint32_t width) {
|
||||
constexpr uint64_t bit_reverse(uint64_t bits, uint32_t width) {
|
||||
assert(width <= 64);
|
||||
uint64_t reversed(0);
|
||||
for (uint32_t i = 0; i < width; ++i) {
|
||||
|
@ -126,7 +126,7 @@ inline uint64_t bit_reverse(uint64_t bits, uint32_t width) {
|
|||
}
|
||||
|
||||
template <typename T = uint32_t>
|
||||
T sext(const T& word, uint32_t width) {
|
||||
constexpr T sext(const T& word, uint32_t width) {
|
||||
assert(width > 1);
|
||||
assert(width <= (sizeof(T) * 8));
|
||||
if (width == (sizeof(T) * 8))
|
||||
|
@ -136,7 +136,7 @@ T sext(const T& word, uint32_t width) {
|
|||
}
|
||||
|
||||
template <typename T = uint32_t>
|
||||
T zext(const T& word, uint32_t width) {
|
||||
constexpr T zext(const T& word, uint32_t width) {
|
||||
assert(width > 1);
|
||||
assert(width <= (sizeof(T) * 8));
|
||||
if (width == (sizeof(T) * 8))
|
||||
|
@ -144,3 +144,8 @@ T zext(const T& word, uint32_t width) {
|
|||
T mask((static_cast<T>(1) << width) - 1);
|
||||
return word & mask;
|
||||
}
|
||||
|
||||
constexpr int pow2_sqrt(int x) {
|
||||
assert(ispow2(x));
|
||||
return 1 << (count_trailing_zeros(x) / 2);
|
||||
}
|
|
@ -29,19 +29,54 @@ using namespace vortex;
|
|||
|
||||
class DramSim::Impl {
|
||||
private:
|
||||
struct mem_req_t {
|
||||
uint64_t addr;
|
||||
bool is_write;
|
||||
ResponseCallback callback;
|
||||
void* arg;
|
||||
};
|
||||
|
||||
Ramulator::IFrontEnd* ramulator_frontend_;
|
||||
Ramulator::IMemorySystem* ramulator_memorysystem_;
|
||||
uint32_t cpu_channel_size_;
|
||||
uint64_t cpu_cycles_;
|
||||
uint32_t scaled_dram_cycles_;
|
||||
static const uint32_t tick_cycles_ = 1000;
|
||||
static const uint32_t dram_channel_size_ = 16; // 128 bits
|
||||
std::queue<mem_req_t> pending_reqs_;
|
||||
|
||||
void handle_pending_requests() {
|
||||
if (pending_reqs_.empty())
|
||||
return;
|
||||
auto& req = pending_reqs_.front();
|
||||
auto req_type = req.is_write ? Ramulator::Request::Type::Write : Ramulator::Request::Type::Read;
|
||||
std::function<void(Ramulator::Request&)> callback = nullptr;
|
||||
if (req.callback) {
|
||||
callback = [req_callback = std::move(req.callback), req_arg = std::move(req.arg)](Ramulator::Request& /*dram_req*/) {
|
||||
req_callback(req_arg);
|
||||
};
|
||||
}
|
||||
if (ramulator_frontend_->receive_external_requests(req_type, req.addr, 0, callback)) {
|
||||
if (req.is_write) {
|
||||
// Ramulator does not handle write responses, so we fire the callback ourselves.
|
||||
if (req.callback) {
|
||||
req.callback(req.arg);
|
||||
}
|
||||
}
|
||||
pending_reqs_.pop();
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
Impl(int clock_ratio) {
|
||||
Impl(uint32_t num_channels, uint32_t channel_size, float clock_ratio) {
|
||||
YAML::Node dram_config;
|
||||
dram_config["Frontend"]["impl"] = "GEM5";
|
||||
dram_config["MemorySystem"]["impl"] = "GenericDRAM";
|
||||
dram_config["MemorySystem"]["clock_ratio"] = clock_ratio;
|
||||
dram_config["MemorySystem"]["clock_ratio"] = 1;
|
||||
dram_config["MemorySystem"]["DRAM"]["impl"] = "HBM2";
|
||||
dram_config["MemorySystem"]["DRAM"]["org"]["preset"] = "HBM2_8Gb";
|
||||
dram_config["MemorySystem"]["DRAM"]["org"]["density"] = 8192;
|
||||
dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = 8;
|
||||
dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = num_channels;
|
||||
dram_config["MemorySystem"]["DRAM"]["timing"]["preset"] = "HBM2_2Gbps";
|
||||
dram_config["MemorySystem"]["Controller"]["impl"] = "Generic";
|
||||
dram_config["MemorySystem"]["Controller"]["Scheduler"]["impl"] = "FRFCFS";
|
||||
|
@ -59,6 +94,10 @@ public:
|
|||
ramulator_memorysystem_ = Ramulator::Factory::create_memory_system(dram_config);
|
||||
ramulator_frontend_->connect_memory_system(ramulator_memorysystem_);
|
||||
ramulator_memorysystem_->connect_frontend(ramulator_frontend_);
|
||||
|
||||
cpu_channel_size_ = channel_size;
|
||||
scaled_dram_cycles_ = static_cast<uint64_t>(clock_ratio * tick_cycles_);
|
||||
this->reset();
|
||||
}
|
||||
|
||||
~Impl() {
|
||||
|
@ -66,41 +105,49 @@ public:
|
|||
auto original_buf = std::cout.rdbuf();
|
||||
std::cout.rdbuf(nullstream.rdbuf());
|
||||
ramulator_frontend_->finalize();
|
||||
ramulator_memorysystem_->finalize();
|
||||
ramulator_memorysystem_->finalize();
|
||||
std::cout.rdbuf(original_buf);
|
||||
}
|
||||
|
||||
void reset() {
|
||||
//--
|
||||
cpu_cycles_ = 0;
|
||||
}
|
||||
|
||||
void tick() {
|
||||
ramulator_memorysystem_->tick();
|
||||
cpu_cycles_ += tick_cycles_;
|
||||
while (cpu_cycles_ >= scaled_dram_cycles_) {
|
||||
this->handle_pending_requests();
|
||||
ramulator_memorysystem_->tick();
|
||||
cpu_cycles_ -= scaled_dram_cycles_;
|
||||
}
|
||||
}
|
||||
|
||||
bool send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback response_cb, void* arg) {
|
||||
if (!ramulator_frontend_->receive_external_requests(
|
||||
is_write ? Ramulator::Request::Type::Write : Ramulator::Request::Type::Read,
|
||||
addr,
|
||||
source_id,
|
||||
[callback_ = std::move(response_cb), arg_ = std::move(arg)](Ramulator::Request& /*dram_req*/) {
|
||||
callback_(arg_);
|
||||
void send_request(uint64_t addr, bool is_write, ResponseCallback response_cb, void* arg) {
|
||||
// enqueue the request
|
||||
if (cpu_channel_size_ > dram_channel_size_) {
|
||||
uint32_t n = cpu_channel_size_ / dram_channel_size_;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
uint64_t dram_byte_addr = (addr / cpu_channel_size_) * dram_channel_size_ + (i * dram_channel_size_);
|
||||
if (i == 0) {
|
||||
pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg});
|
||||
} else {
|
||||
pending_reqs_.push({dram_byte_addr, is_write, nullptr, nullptr});
|
||||
}
|
||||
}
|
||||
)) {
|
||||
return false;
|
||||
} else if (cpu_channel_size_ < dram_channel_size_) {
|
||||
uint64_t dram_byte_addr = (addr / cpu_channel_size_) * dram_channel_size_;
|
||||
pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg});
|
||||
} else {
|
||||
uint64_t dram_byte_addr = addr;
|
||||
pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg});
|
||||
}
|
||||
if (is_write) {
|
||||
// Ramulator does not handle write responses, so we call the callback ourselves
|
||||
response_cb(arg);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
DramSim::DramSim(int clock_ratio)
|
||||
: impl_(new Impl(clock_ratio))
|
||||
DramSim::DramSim(uint32_t num_channels, uint32_t channel_size, float clock_ratio)
|
||||
: impl_(new Impl(num_channels, channel_size, clock_ratio))
|
||||
{}
|
||||
|
||||
DramSim::~DramSim() {
|
||||
|
@ -115,6 +162,6 @@ void DramSim::tick() {
|
|||
impl_->tick();
|
||||
}
|
||||
|
||||
bool DramSim::send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback callback, void* arg) {
|
||||
return impl_->send_request(is_write, addr, source_id, callback, arg);
|
||||
void DramSim::send_request(uint64_t addr, bool is_write, ResponseCallback callback, void* arg) {
|
||||
impl_->send_request(addr, is_write, callback, arg);
|
||||
}
|
|
@ -19,14 +19,15 @@ class DramSim {
|
|||
public:
|
||||
typedef void (*ResponseCallback)(void *arg);
|
||||
|
||||
DramSim(int clock_ratio);
|
||||
DramSim(uint32_t num_channels, uint32_t channel_size, float clock_ratio);
|
||||
~DramSim();
|
||||
|
||||
void reset();
|
||||
|
||||
void tick();
|
||||
|
||||
bool send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback response_cb, void* arg);
|
||||
// addr: per-channel block address
|
||||
void send_request(uint64_t addr, bool is_write, ResponseCallback response_cb, void* arg);
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
|
|
|
@ -24,9 +24,13 @@ extern "C" {
|
|||
#define F32_SIGN 0x80000000
|
||||
#define F64_SIGN 0x8000000000000000
|
||||
|
||||
inline float16_t to_float16_t(uint16_t x) { return float16_t{x}; }
|
||||
inline bfloat16_t to_bfloat16_t(uint16_t x) { return bfloat16_t{x}; }
|
||||
inline float32_t to_float32_t(uint32_t x) { return float32_t{x}; }
|
||||
inline float64_t to_float64_t(uint64_t x) { return float64_t{x}; }
|
||||
|
||||
inline uint16_t from_float16_t(float16_t x) { return uint16_t(x.v); }
|
||||
inline uint16_t from_bfloat16_t(bfloat16_t x) { return uint16_t(x.v); }
|
||||
inline uint32_t from_float32_t(float32_t x) { return uint32_t(x.v); }
|
||||
inline uint64_t from_float64_t(float64_t x) { return uint64_t(x.v); }
|
||||
|
||||
|
@ -530,6 +534,34 @@ uint64_t rv_ftod(uint32_t a) {
|
|||
return from_float64_t(r);
|
||||
}
|
||||
|
||||
uint32_t rv_htof_s(uint16_t a, uint32_t frm, uint32_t* fflags) {
|
||||
rv_init(frm);
|
||||
auto r = f16_to_f32(to_float16_t(a));
|
||||
if (fflags) { *fflags = softfloat_exceptionFlags; }
|
||||
return from_float32_t(r);
|
||||
}
|
||||
|
||||
uint16_t rv_ftoh_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
|
||||
rv_init(frm);
|
||||
auto r = f32_to_f16(to_float32_t(a));
|
||||
if (fflags) { *fflags = softfloat_exceptionFlags; }
|
||||
return from_float16_t(r);
|
||||
}
|
||||
|
||||
uint32_t rv_btof_s(uint16_t a, uint32_t frm, uint32_t* fflags) {
|
||||
rv_init(frm);
|
||||
auto r = bf16_to_f32(to_bfloat16_t(a));
|
||||
if (fflags) { *fflags = softfloat_exceptionFlags; }
|
||||
return from_float32_t(r);
|
||||
}
|
||||
|
||||
uint16_t rv_ftob_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
|
||||
rv_init(frm);
|
||||
auto r = f32_to_bf16(to_float32_t(a));
|
||||
if (fflags) { *fflags = softfloat_exceptionFlags; }
|
||||
return from_bfloat16_t(r);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -92,6 +92,12 @@ uint32_t rv_dtof(uint64_t a);
|
|||
uint32_t rv_dtof_r(uint64_t a, uint32_t frm);
|
||||
uint64_t rv_ftod(uint32_t a);
|
||||
|
||||
uint32_t rv_htof_s(uint16_t a, uint32_t frm, uint32_t* fflags);
|
||||
uint16_t rv_ftoh_s(uint32_t a, uint32_t frm, uint32_t* fflags);
|
||||
|
||||
uint32_t rv_btof_s(uint16_t a, uint32_t frm, uint32_t* fflags);
|
||||
uint16_t rv_ftob_s(uint32_t a, uint32_t frm, uint32_t* fflags);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -31,24 +31,6 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
|
|||
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
|
||||
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
|
||||
|
||||
# Platform parameters
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
|
||||
ifeq ($(XLEN),64)
|
||||
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
|
||||
else
|
||||
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
|
||||
endif
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_BURST_CNT_WIDTH=4
|
||||
endif
|
||||
|
||||
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
|
||||
|
||||
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
|
||||
|
|
|
@ -35,8 +35,6 @@
|
|||
#include <unordered_map>
|
||||
#include <util.h>
|
||||
|
||||
#define PLATFORM_MEMORY_DATA_SIZE (PLATFORM_MEMORY_DATA_WIDTH/8)
|
||||
|
||||
#ifndef MEM_CLOCK_RATIO
|
||||
#define MEM_CLOCK_RATIO 1
|
||||
#endif
|
||||
|
@ -66,6 +64,8 @@
|
|||
|
||||
using namespace vortex;
|
||||
|
||||
static uint32_t g_mem_bank_addr_width = (PLATFORM_MEMORY_ADDR_WIDTH - log2ceil(PLATFORM_MEMORY_NUM_BANKS));
|
||||
|
||||
static uint64_t timestamp = 0;
|
||||
|
||||
double sc_time_stamp() {
|
||||
|
@ -95,7 +95,7 @@ public:
|
|||
Impl()
|
||||
: device_(nullptr)
|
||||
, ram_(nullptr)
|
||||
, dram_sim_(MEM_CLOCK_RATIO)
|
||||
, dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO)
|
||||
, stop_(false)
|
||||
, host_buffer_ids_(0)
|
||||
#ifdef VCD_OUTPUT
|
||||
|
@ -146,9 +146,6 @@ public:
|
|||
// allocate RAM
|
||||
ram_ = new RAM(0, RAM_PAGE_SIZE);
|
||||
|
||||
// calculate memory bank size
|
||||
mem_bank_size_ = 1ull << PLATFORM_MEMORY_ADDR_WIDTH;
|
||||
|
||||
// reset the device
|
||||
this->reset();
|
||||
|
||||
|
@ -274,16 +271,15 @@ private:
|
|||
|
||||
if (!dram_queue_.empty()) {
|
||||
auto mem_req = dram_queue_.front();
|
||||
if (dram_sim_.send_request(mem_req->write, mem_req->addr, mem_req->bank_id, [](void* arg) {
|
||||
dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) {
|
||||
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
|
||||
if (orig_req->ready) {
|
||||
delete orig_req;
|
||||
} else {
|
||||
orig_req->ready = true;
|
||||
}
|
||||
}, mem_req)) {
|
||||
dram_queue_.pop();
|
||||
}
|
||||
}, mem_req);
|
||||
dram_queue_.pop();
|
||||
}
|
||||
|
||||
dram_sim_.tick();
|
||||
|
@ -407,14 +403,14 @@ private:
|
|||
}
|
||||
|
||||
void avs_bus_reset() {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
device_->avs_readdatavalid[b] = 0;
|
||||
device_->avs_waitrequest[b] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void avs_bus_eval() {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
// process memory responses
|
||||
device_->avs_readdatavalid[b] = 0;
|
||||
if (!pending_mem_reqs_[b].empty()
|
||||
|
@ -430,7 +426,12 @@ private:
|
|||
|
||||
// process memory requests
|
||||
assert(!device_->avs_read[b] || !device_->avs_write[b]);
|
||||
uint64_t byte_addr = b * mem_bank_size_ + uint64_t(device_->avs_address[b]) * PLATFORM_MEMORY_DATA_SIZE;
|
||||
#if PLATFORM_MEMORY_INTERLEAVE == 1
|
||||
uint64_t byte_addr = (uint64_t(device_->avs_address[b]) * PLATFORM_MEMORY_NUM_BANKS + b) * PLATFORM_MEMORY_DATA_SIZE;
|
||||
#else
|
||||
uint64_t byte_addr = (uint64_t(device_->avs_address[b]) + (b << g_mem_bank_addr_width)) * PLATFORM_MEMORY_DATA_SIZE;
|
||||
#endif
|
||||
|
||||
if (device_->avs_write[b]) {
|
||||
// process write request
|
||||
uint64_t byteen = device_->avs_byteenable[b];
|
||||
|
@ -515,9 +516,8 @@ private:
|
|||
|
||||
std::unordered_map<int64_t, host_buffer_t> host_buffers_;
|
||||
uint64_t host_buffer_ids_;
|
||||
uint64_t mem_bank_size_;
|
||||
|
||||
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS];
|
||||
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS];
|
||||
|
||||
std::list<cci_rd_req_t> cci_reads_;
|
||||
std::list<cci_wr_req_t> cci_writes_;
|
||||
|
|
|
@ -78,22 +78,22 @@ module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; (
|
|||
output t_ccip_mmioData af2cp_sTxPort_c2_data,
|
||||
|
||||
// Avalon signals for local memory access
|
||||
output t_local_mem_data avs_writedata [`PLATFORM_MEMORY_BANKS],
|
||||
input t_local_mem_data avs_readdata [`PLATFORM_MEMORY_BANKS],
|
||||
output t_local_mem_addr avs_address [`PLATFORM_MEMORY_BANKS],
|
||||
input logic avs_waitrequest [`PLATFORM_MEMORY_BANKS],
|
||||
output logic avs_write [`PLATFORM_MEMORY_BANKS],
|
||||
output logic avs_read [`PLATFORM_MEMORY_BANKS],
|
||||
output t_local_mem_byte_mask avs_byteenable [`PLATFORM_MEMORY_BANKS],
|
||||
output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_MEMORY_BANKS],
|
||||
input avs_readdatavalid [`PLATFORM_MEMORY_BANKS]
|
||||
output t_local_mem_data avs_writedata [`PLATFORM_MEMORY_NUM_BANKS],
|
||||
input t_local_mem_data avs_readdata [`PLATFORM_MEMORY_NUM_BANKS],
|
||||
output t_local_mem_addr avs_address [`PLATFORM_MEMORY_NUM_BANKS],
|
||||
input logic avs_waitrequest [`PLATFORM_MEMORY_NUM_BANKS],
|
||||
output logic avs_write [`PLATFORM_MEMORY_NUM_BANKS],
|
||||
output logic avs_read [`PLATFORM_MEMORY_NUM_BANKS],
|
||||
output t_local_mem_byte_mask avs_byteenable [`PLATFORM_MEMORY_NUM_BANKS],
|
||||
output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_MEMORY_NUM_BANKS],
|
||||
input avs_readdatavalid [`PLATFORM_MEMORY_NUM_BANKS]
|
||||
);
|
||||
|
||||
t_if_ccip_Rx cp2af_sRxPort;
|
||||
t_if_ccip_Tx af2cp_sTxPort;
|
||||
|
||||
vortex_afu #(
|
||||
.NUM_LOCAL_MEM_BANKS(`PLATFORM_MEMORY_BANKS)
|
||||
.NUM_LOCAL_MEM_BANKS(`PLATFORM_MEMORY_NUM_BANKS)
|
||||
) afu (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
|
|
|
@ -24,21 +24,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
|
|||
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR
|
||||
|
||||
# Platform parameters
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
|
||||
ifeq ($(XLEN),64)
|
||||
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
|
||||
else
|
||||
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
|
||||
endif
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
|
||||
endif
|
||||
|
||||
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
|
||||
|
||||
RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
|
||||
|
|
|
@ -35,8 +35,6 @@
|
|||
#include <dram_sim.h>
|
||||
#include <util.h>
|
||||
|
||||
#define PLATFORM_MEMORY_DATA_SIZE (PLATFORM_MEMORY_DATA_WIDTH/8)
|
||||
|
||||
#ifndef MEM_CLOCK_RATIO
|
||||
#define MEM_CLOCK_RATIO 1
|
||||
#endif
|
||||
|
@ -66,6 +64,8 @@ typedef uint64_t Word;
|
|||
|
||||
using namespace vortex;
|
||||
|
||||
static uint32_t g_mem_bank_addr_width = (PLATFORM_MEMORY_ADDR_WIDTH - log2ceil(PLATFORM_MEMORY_NUM_BANKS));
|
||||
|
||||
static uint64_t timestamp = 0;
|
||||
|
||||
double sc_time_stamp() {
|
||||
|
@ -93,7 +93,7 @@ void sim_trace_enable(bool enable) {
|
|||
|
||||
class Processor::Impl {
|
||||
public:
|
||||
Impl() : dram_sim_(MEM_CLOCK_RATIO) {
|
||||
Impl() : dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO) {
|
||||
// force random values for uninitialized signals
|
||||
Verilated::randReset(VERILATOR_RESET_VALUE);
|
||||
Verilated::randSeed(50);
|
||||
|
@ -154,7 +154,7 @@ public:
|
|||
|
||||
// start
|
||||
device_->reset = 0;
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
device_->mem_req_ready[b] = 1;
|
||||
}
|
||||
|
||||
|
@ -195,7 +195,7 @@ private:
|
|||
reqs.clear();
|
||||
}
|
||||
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
std::queue<mem_req_t*> empty;
|
||||
std::swap(dram_queue_[b], empty);
|
||||
}
|
||||
|
@ -224,17 +224,15 @@ private:
|
|||
|
||||
dram_sim_.tick();
|
||||
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
if (!dram_queue_[b].empty()) {
|
||||
auto mem_req = dram_queue_[b].front();
|
||||
if (dram_sim_.send_request(mem_req->write, mem_req->addr, b, [](void* arg) {
|
||||
dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) {
|
||||
// mark completed request as ready
|
||||
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
|
||||
orig_req->ready = true;
|
||||
}, mem_req)) {
|
||||
// was successfully sent to dram, remove from queue
|
||||
dram_queue_[b].pop();
|
||||
}
|
||||
}, mem_req);
|
||||
dram_queue_[b].pop();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -254,7 +252,7 @@ private:
|
|||
}
|
||||
|
||||
void mem_bus_reset() {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
device_->mem_req_ready[b] = 0;
|
||||
device_->mem_rsp_valid[b] = 0;
|
||||
}
|
||||
|
@ -262,13 +260,13 @@ private:
|
|||
|
||||
void mem_bus_eval(bool clk) {
|
||||
if (!clk) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
mem_rd_rsp_ready_[b] = device_->mem_rsp_ready[b];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
// process memory responses
|
||||
if (device_->mem_rsp_valid[b] && mem_rd_rsp_ready_[b]) {
|
||||
device_->mem_rsp_valid[b] = 0;
|
||||
|
@ -293,11 +291,16 @@ private:
|
|||
|
||||
// process memory requests
|
||||
if (device_->mem_req_valid[b] && device_->mem_req_ready[b]) {
|
||||
uint64_t byte_addr = (device_->mem_req_addr[b] * PLATFORM_MEMORY_DATA_SIZE);
|
||||
#if PLATFORM_MEMORY_INTERLEAVE == 1
|
||||
uint64_t byte_addr = (uint64_t(device_->mem_req_addr[b]) * PLATFORM_MEMORY_NUM_BANKS + b) * PLATFORM_MEMORY_DATA_SIZE;
|
||||
#else
|
||||
uint64_t byte_addr = (uint64_t(device_->mem_req_addr[b]) + (b << g_mem_bank_addr_width)) * PLATFORM_MEMORY_DATA_SIZE;
|
||||
#endif
|
||||
// check read/write
|
||||
if (device_->mem_req_rw[b]) {
|
||||
auto byteen = device_->mem_req_byteen[b];
|
||||
auto data = VDataCast<uint8_t*, PLATFORM_MEMORY_DATA_SIZE>::get(device_->mem_req_data[b]);
|
||||
// check address range
|
||||
// check if console output address
|
||||
if (byte_addr >= uint64_t(IO_COUT_ADDR)
|
||||
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
// process console output
|
||||
|
@ -313,21 +316,23 @@ private:
|
|||
}
|
||||
}
|
||||
} else {
|
||||
// process writes
|
||||
// process memory writes
|
||||
/*printf("%0ld: [sim] MEM Wr Req[%d]: addr=0x%0lx, tag=0x%0lx, byteen=0x", timestamp, b, byte_addr, device_->mem_req_tag[b]);
|
||||
for (int i = (PLATFORM_MEMORY_DATA_SIZE/4)-1; i >= 0; --i) {
|
||||
printf("%x", (int)((byteen >> (4 * i)) & 0xf));
|
||||
}
|
||||
printf(", data=0x");
|
||||
for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) {
|
||||
printf("%d=%02x,", i, data[i]);
|
||||
printf("%02x", data[i]);
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
(*ram_)[byte_addr + i] = data[i];
|
||||
}
|
||||
}
|
||||
|
||||
auto mem_req = new mem_req_t();
|
||||
mem_req->tag = device_->mem_req_tag[b];
|
||||
mem_req->addr = byte_addr;
|
||||
|
@ -341,7 +346,7 @@ private:
|
|||
pending_mem_reqs_[b].emplace_back(mem_req);
|
||||
}
|
||||
} else {
|
||||
// process reads
|
||||
// process memory reads
|
||||
auto mem_req = new mem_req_t();
|
||||
mem_req->tag = device_->mem_req_tag[b];
|
||||
mem_req->addr = byte_addr;
|
||||
|
@ -388,11 +393,11 @@ private:
|
|||
|
||||
std::unordered_map<int, std::stringstream> print_bufs_;
|
||||
|
||||
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS];
|
||||
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS];
|
||||
|
||||
std::queue<mem_req_t*> dram_queue_[PLATFORM_MEMORY_BANKS];
|
||||
std::queue<mem_req_t*> dram_queue_[PLATFORM_MEMORY_NUM_BANKS];
|
||||
|
||||
std::array<bool, PLATFORM_MEMORY_BANKS> mem_rd_rsp_ready_;
|
||||
std::array<bool, PLATFORM_MEMORY_NUM_BANKS> mem_rd_rsp_ready_;
|
||||
|
||||
DramSim dram_sim_;
|
||||
|
||||
|
|
|
@ -14,9 +14,9 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module rtlsim_shim import VX_gpu_pkg::*; #(
|
||||
parameter MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH,
|
||||
parameter MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH,
|
||||
parameter MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS,
|
||||
parameter MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
|
||||
parameter MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS),
|
||||
parameter MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS,
|
||||
parameter MEM_TAG_WIDTH = 64
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
@ -159,7 +159,7 @@ module rtlsim_shim import VX_gpu_pkg::*; #(
|
|||
.TAG_WIDTH_OUT (MEM_TAG_WIDTH),
|
||||
.NUM_PORTS_IN (`VX_MEM_PORTS),
|
||||
.NUM_BANKS_OUT (MEM_NUM_BANKS),
|
||||
.INTERLEAVE (0),
|
||||
.INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
|
||||
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
|
||||
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || MEM_NUM_BANKS > 1) ? 2 : 0)
|
||||
) mem_bank_adapter (
|
||||
|
|
|
@ -25,6 +25,11 @@ ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
|
|||
SRCS += $(SRC_DIR)/vec_unit.cpp $(SRC_DIR)/vpu.cpp
|
||||
endif
|
||||
|
||||
# Add TPU extension sources
|
||||
ifneq ($(findstring -DEXT_TPU_ENABLE, $(CONFIGS)),)
|
||||
SRCS += $(SRC_DIR)/tensor_unit.cpp
|
||||
endif
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <VX_config.h>
|
||||
#include <bitmanip.h>
|
||||
|
||||
#ifndef RAM_PAGE_SIZE
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
@ -23,6 +24,8 @@
|
|||
#define MEM_CLOCK_RATIO 1
|
||||
#endif
|
||||
|
||||
inline constexpr int VLENB = (VLEN / 8);
|
||||
|
||||
inline constexpr int LSU_WORD_SIZE = (XLEN / 8);
|
||||
inline constexpr int LSU_CHANNELS = NUM_LSU_LANES;
|
||||
inline constexpr int LSU_NUM_REQS = (NUM_LSU_BLOCKS * LSU_CHANNELS);
|
||||
|
@ -38,4 +41,6 @@ inline constexpr int L2_NUM_REQS = NUM_SOCKETS * L1_MEM_PORTS;
|
|||
|
||||
inline constexpr int L3_NUM_REQS = NUM_CLUSTERS * L2_MEM_PORTS;
|
||||
|
||||
inline constexpr int PER_ISSUE_WARPS = NUM_WARPS / ISSUE_WIDTH;
|
||||
inline constexpr int PER_ISSUE_WARPS = NUM_WARPS / ISSUE_WIDTH;
|
||||
|
||||
inline constexpr int TENSOR_TILE_SIZE = pow2_sqrt(NUM_THREADS);
|
|
@ -55,8 +55,15 @@ Core::Core(const SimContext& ctx,
|
|||
{
|
||||
char sname[100];
|
||||
|
||||
#ifdef EXT_TPU_ENABLE
|
||||
{
|
||||
snprintf(sname, 100, "%s-tpu", this->name().c_str());
|
||||
tensor_unit_ = TensorUnit::Create(sname, TENSOR_TILE_SIZE);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
operands_.at(i) = SimPlatform::instance().create_object<Operand>();
|
||||
operands_.at(i) = Operand::Create();
|
||||
}
|
||||
|
||||
// create the memory coalescer
|
||||
|
@ -135,14 +142,12 @@ Core::Core(const SimContext& ctx,
|
|||
dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
|
||||
dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES);
|
||||
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES);
|
||||
dispatchers_.at((int)FUType::TCU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_TCU_BLOCKS, NUM_TCU_LANES);
|
||||
|
||||
// initialize execute units
|
||||
func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
|
||||
func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
|
||||
func_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
|
||||
func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
|
||||
func_units_.at((int)FUType::TCU) = SimPlatform::instance().create_object<TcuUnit>(this);
|
||||
|
||||
// bind commit arbiters
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
|
|
|
@ -141,6 +141,12 @@ public:
|
|||
return mem_coalescers_.at(idx);
|
||||
}
|
||||
|
||||
#ifdef EXT_TPU_ENABLE
|
||||
TensorUnit::Ptr& tensor_unit() {
|
||||
return tensor_unit_;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
VecUnit::Ptr& vec_unit() {
|
||||
return vec_unit_;
|
||||
|
@ -166,6 +172,10 @@ private:
|
|||
Socket* socket_;
|
||||
const Arch& arch_;
|
||||
|
||||
#ifdef EXT_TPU_ENABLE
|
||||
TensorUnit::Ptr tensor_unit_;
|
||||
#endif
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
VecUnit::Ptr vec_unit_;
|
||||
#endif
|
||||
|
@ -200,7 +210,6 @@ private:
|
|||
friend class AluUnit;
|
||||
friend class FpuUnit;
|
||||
friend class SfuUnit;
|
||||
friend class TcuUnit;
|
||||
};
|
||||
|
||||
} // namespace vortex
|
||||
|
|
|
@ -51,8 +51,7 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
|
|||
{Opcode::EXT1, InstType::R},
|
||||
{Opcode::EXT2, InstType::R4},
|
||||
{Opcode::R_W, InstType::R},
|
||||
{Opcode::I_W, InstType::I},
|
||||
{Opcode::TCU, InstType::I},
|
||||
{Opcode::I_W, InstType::I}
|
||||
};
|
||||
|
||||
static const char* op_string(const Instr &instr) {
|
||||
|
@ -390,15 +389,22 @@ static const char* op_string(const Instr &instr) {
|
|||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
||||
case Opcode::TCU:
|
||||
switch(func3)
|
||||
{
|
||||
case 0: return "ML"; // Matrix Load
|
||||
case 1: return "MS"; // Matrix Store
|
||||
case 2: return "MATMUL"; // Matrix Multiply
|
||||
case Opcode::EXT2:
|
||||
switch(func3) {
|
||||
case 0: // reserved
|
||||
case 1: // reserved
|
||||
std::abort();
|
||||
case 2:
|
||||
switch (func2) {
|
||||
case 0: return "MMADD.u4_i32";
|
||||
case 1: return "MMADD.u8_i32";
|
||||
case 2: return "MMADD.f16_f32";
|
||||
case 3: return "MMADD.bf16_f32";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
|
@ -455,12 +461,12 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
|
|||
if (sep++ != 0) { os << ", "; } else { os << " "; }
|
||||
os << "0x" << std::hex << instr.getImm() << std::dec;
|
||||
}
|
||||
#ifdef EXT_V_ENABLE
|
||||
if (instr.getOpcode() == Opcode::SYS && instr.getFunc3() >= 5) {
|
||||
// CSRs with immediate values
|
||||
if (sep++ != 0) { os << ", "; } else { os << " "; }
|
||||
os << "0x" << std::hex << instr.getRSrc(0);
|
||||
}
|
||||
#ifdef EXT_V_ENABLE
|
||||
// Log vector-specific attributes
|
||||
if (instr.getVattrMask() != 0) {
|
||||
print_vec_attr(os, instr);
|
||||
|
@ -592,14 +598,6 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
|
|||
|
||||
case InstType::I: {
|
||||
switch (op) {
|
||||
case Opcode::TCU: {
|
||||
instr->setDestReg(rs1, RegType::Integer);
|
||||
instr->addSrcReg(rs1, RegType::Integer);
|
||||
instr->setFunc3(func3);
|
||||
instr->setFunc7(func7);
|
||||
auto imm = code >> shift_rs2;
|
||||
instr->setImm(sext(imm, width_i_imm));
|
||||
} break;
|
||||
case Opcode::I:
|
||||
case Opcode::I_W:
|
||||
case Opcode::JALR:
|
||||
|
@ -706,10 +704,59 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
|
|||
case InstType::V:
|
||||
switch (op) {
|
||||
case Opcode::VSET: {
|
||||
instr->setDestReg(rd, RegType::Integer);
|
||||
instr->setFunc3(func3);
|
||||
switch (func3) {
|
||||
case 0: { // OPIVV
|
||||
instr->setDestReg(rd, RegType::Vector);
|
||||
instr->addSrcReg(rs1, RegType::Vector);
|
||||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setFunc6(func6);
|
||||
} break;
|
||||
case 1: { // OPFVV
|
||||
instr->setDestReg(rd, (func6 == 16) ? RegType::Float : RegType::Vector);
|
||||
instr->addSrcReg(rs1, RegType::Vector);
|
||||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setFunc6(func6);
|
||||
} break;
|
||||
case 2: { // OPMVV
|
||||
instr->setDestReg(rd, (func6 == 16) ? RegType::Integer : RegType::Vector);
|
||||
instr->addSrcReg(rs1, RegType::Vector);
|
||||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setFunc6(func6);
|
||||
} break;
|
||||
case 3: { // OPIVI
|
||||
instr->setDestReg(rd, RegType::Vector);
|
||||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
instr->setImm(rs1);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setFunc6(func6);
|
||||
} break;
|
||||
case 4: { // OPIVX
|
||||
instr->setDestReg(rd, RegType::Vector);
|
||||
instr->addSrcReg(rs1, RegType::Integer);
|
||||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setFunc6(func6);
|
||||
} break;
|
||||
case 5: { // OPFVF
|
||||
instr->setDestReg(rd, RegType::Vector);
|
||||
instr->addSrcReg(rs1, RegType::Float);
|
||||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setFunc6(func6);
|
||||
} break;
|
||||
case 6: { // POMVX
|
||||
instr->setDestReg(rd, (func6 == 16) ? RegType::Integer : RegType::Vector);
|
||||
instr->addSrcReg(rs1, RegType::Integer);
|
||||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setFunc6(func6);
|
||||
} break;
|
||||
case 7: {
|
||||
instr->setDestReg(rd, RegType::Integer);
|
||||
if (code >> (shift_vset - 1) == 0b10) { // vsetvl
|
||||
instr->addSrcReg(rs1, RegType::Integer);
|
||||
instr->addSrcReg(rs2, RegType::Integer);
|
||||
|
@ -727,13 +774,6 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
|
|||
}
|
||||
}
|
||||
} break;
|
||||
case 3: { // Vector - immediate arithmetic instructions
|
||||
instr->setDestReg(rd, RegType::Vector);
|
||||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
instr->setImm(rs1);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setFunc6(func6);
|
||||
} break;
|
||||
default: { // Vector - vector/scalar arithmetic instructions
|
||||
if (func3 == 1 && func6 == 16) {
|
||||
instr->setDestReg(rd, RegType::Float);
|
||||
|
@ -750,9 +790,10 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
|
|||
}
|
||||
} break;
|
||||
case Opcode::FL:
|
||||
case Opcode::FS: {
|
||||
instr->addSrcReg(rs1, RegType::Integer);
|
||||
instr->setVmop((code >> shift_vmop) & 0b11);
|
||||
switch (instr->getVmop()) {
|
||||
uint32_t vmop = (code >> shift_vmop) & 0b11;
|
||||
switch (vmop) {
|
||||
case 0b00:
|
||||
instr->setVumop(rs2);
|
||||
break;
|
||||
|
@ -764,36 +805,17 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
|
|||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
break;
|
||||
}
|
||||
instr->setVsew(func3 & 0x3);
|
||||
instr->setDestReg(rd, RegType::Vector);
|
||||
instr->setVlsWidth(func3);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setVnf((code >> shift_vnf) & mask_func3);
|
||||
break;
|
||||
|
||||
case Opcode::FS:
|
||||
instr->addSrcReg(rs1, RegType::Integer);
|
||||
instr->setVmop((code >> shift_vmop) & 0b11);
|
||||
switch (instr->getVmop()) {
|
||||
case 0b00:
|
||||
instr->setVumop(rs2);
|
||||
break;
|
||||
case 0b10:
|
||||
instr->addSrcReg(rs2, RegType::Integer);
|
||||
break;
|
||||
case 0b01:
|
||||
case 0b11:
|
||||
instr->addSrcReg(rs2, RegType::Vector);
|
||||
break;
|
||||
if (op == Opcode::FL) {
|
||||
instr->setDestReg(rd, RegType::Vector);
|
||||
} else {
|
||||
instr->addSrcReg(rd, RegType::Vector);
|
||||
}
|
||||
instr->setVsew(func3 & 0x3);
|
||||
instr->addSrcReg(rd, RegType::Vector);
|
||||
instr->setVlsWidth(func3);
|
||||
instr->setVmask((code >> shift_func7) & 0x1);
|
||||
instr->setVmop((code >> shift_vmop) & 0b11);
|
||||
instr->setVmop(vmop);
|
||||
instr->setVsew(func3 & 0x3);
|
||||
instr->setVnf((code >> shift_vnf) & mask_func3);
|
||||
break;
|
||||
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
|
|
@ -30,16 +30,17 @@
|
|||
|
||||
using namespace vortex;
|
||||
|
||||
Emulator::warp_t::warp_t(const Arch& arch)
|
||||
: ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
|
||||
, freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
|
||||
warp_t::warp_t(uint32_t num_threads)
|
||||
: ireg_file(MAX_NUM_REGS, std::vector<Word>(num_threads))
|
||||
, freg_file(MAX_NUM_REGS, std::vector<uint64_t>(num_threads))
|
||||
#ifdef EXT_V_ENABLE
|
||||
, vreg_file(MAX_NUM_REGS, std::vector<Byte>(VLEN / 8))
|
||||
, vreg_file(num_threads, std::vector(MAX_NUM_REGS, std::vector<Byte>(VLENB)))
|
||||
, vcsrs(num_threads)
|
||||
#endif
|
||||
, uuid(0)
|
||||
{}
|
||||
|
||||
void Emulator::warp_t::clear(uint64_t startup_addr) {
|
||||
void warp_t::clear(uint64_t startup_addr) {
|
||||
this->PC = startup_addr;
|
||||
this->tmask.reset();
|
||||
this->uuid = 0;
|
||||
|
@ -53,7 +54,11 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
|
|||
reg = std::rand();
|
||||
#endif
|
||||
}
|
||||
reg_file.at(0) = 0; // r0 = 0
|
||||
}
|
||||
|
||||
// set x0 to zero
|
||||
for (auto& reg : this->ireg_file.at(0)) {
|
||||
reg = 0;
|
||||
}
|
||||
|
||||
for (auto& reg_file : this->freg_file) {
|
||||
|
@ -69,13 +74,24 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
|
|||
#ifdef EXT_V_ENABLE
|
||||
for (auto& reg_file : this->vreg_file) {
|
||||
for (auto& reg : reg_file) {
|
||||
#ifndef NDEBUG
|
||||
reg = 0;
|
||||
#else
|
||||
reg = std::rand();
|
||||
#endif
|
||||
for (auto& elm : reg) {
|
||||
#ifndef NDEBUG
|
||||
elm = 0;
|
||||
#else
|
||||
elm = std::rand();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto& vcsrs : this->vcsrs) {
|
||||
vcsrs.vstart = 0;
|
||||
vcsrs.vxsat = 0;
|
||||
vcsrs.vxrm = 0;
|
||||
vcsrs.vcsr = 0;
|
||||
vcsrs.vlenb = 0;
|
||||
vcsrs.vtype = 0;
|
||||
vcsrs.vl = 0;
|
||||
}
|
||||
this->vtype = {0, 0, 0, 0, 0};
|
||||
this->vl = 0;
|
||||
this->vlmax = 0;
|
||||
|
@ -88,26 +104,17 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
|
|||
: arch_(arch)
|
||||
, dcrs_(dcrs)
|
||||
, core_(core)
|
||||
, warps_(arch.num_warps(), arch)
|
||||
#ifdef EXT_TPU_ENABLE
|
||||
, tensor_unit_(core->tensor_unit())
|
||||
#endif
|
||||
, warps_(arch.num_warps(), arch.num_threads())
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, ipdom_size_(arch.num_threads()-1)
|
||||
// [TBC] Currently, tradeoff between scratchpad size & performance has not been evaluated. Scratchpad is
|
||||
// considered to be big enough to hold input tiles for one output tile.
|
||||
// In future versions, scratchpad size should be fixed to an appropriate value.
|
||||
, scratchpad(std::vector<Word>(32 * 32 * 32768))
|
||||
#ifdef EXT_V_ENABLE
|
||||
, vec_unit_(core->vec_unit())
|
||||
, csrs_(arch.num_warps())
|
||||
#endif
|
||||
{
|
||||
std::srand(50);
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
|
||||
csrs_.at(i).resize(arch.num_threads());
|
||||
}
|
||||
#endif
|
||||
|
||||
this->clear();
|
||||
}
|
||||
|
||||
|
@ -147,10 +154,6 @@ void Emulator::clear() {
|
|||
active_warps_.set(0);
|
||||
warps_[0].tmask.set(0);
|
||||
wspawn_.valid = false;
|
||||
|
||||
for (auto& reg : scratchpad) {
|
||||
reg = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void Emulator::attach_ram(RAM* ram) {
|
||||
|
@ -230,12 +233,12 @@ instr_trace_t* Emulator::step() {
|
|||
DPN(5, " %r" << std::setfill('0') << std::setw(2) << i << ':' << std::hex);
|
||||
// Integer register file
|
||||
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
|
||||
DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << warp.ireg_file.at(j).at(i) << std::setfill(' ') << ' ');
|
||||
DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << warp.ireg_file.at(i).at(j) << std::setfill(' ') << ' ');
|
||||
}
|
||||
DPN(5, '|');
|
||||
// Floating point register file
|
||||
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
|
||||
DPN(5, ' ' << std::setfill('0') << std::setw(16) << warp.freg_file.at(j).at(i) << std::setfill(' ') << ' ');
|
||||
DPN(5, ' ' << std::setfill('0') << std::setw(16) << warp.freg_file.at(i).at(j) << std::setfill(' ') << ' ');
|
||||
}
|
||||
DPN(5, std::dec << std::endl);
|
||||
}
|
||||
|
@ -248,7 +251,7 @@ bool Emulator::running() const {
|
|||
}
|
||||
|
||||
int Emulator::get_exitcode() const {
|
||||
return warps_.at(0).ireg_file.at(0).at(3);
|
||||
return warps_.at(0).ireg_file.at(3).at(0);
|
||||
}
|
||||
|
||||
void Emulator::suspend(uint32_t wid) {
|
||||
|
@ -454,18 +457,6 @@ void Emulator::cout_flush() {
|
|||
case (addr + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((value >> 32) & 0xFFFFFFFF)
|
||||
#endif
|
||||
|
||||
Word Emulator::get_tiles() {
|
||||
return mat_size;
|
||||
}
|
||||
|
||||
Word Emulator::get_tc_size() {
|
||||
return tc_size;
|
||||
}
|
||||
|
||||
Word Emulator::get_tc_num() {
|
||||
return tc_num;
|
||||
}
|
||||
|
||||
Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
auto core_perf = core_->perf_stats();
|
||||
switch (addr) {
|
||||
|
@ -487,35 +478,26 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_MCAUSE:
|
||||
return 0;
|
||||
|
||||
case VX_CSR_FFLAGS: return warps_.at(wid).fcsr & 0x1F;
|
||||
case VX_CSR_FRM: return (warps_.at(wid).fcsr >> 5);
|
||||
case VX_CSR_FCSR: return warps_.at(wid).fcsr;
|
||||
case VX_CSR_FFLAGS: return warps_.at(wid).fcsr & 0x1F;
|
||||
case VX_CSR_FRM: return (warps_.at(wid).fcsr >> 5);
|
||||
case VX_CSR_FCSR: return warps_.at(wid).fcsr;
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
// Vector CRSs
|
||||
case VX_CSR_VSTART:
|
||||
return csrs_.at(wid).at(tid)[VX_CSR_VSTART];
|
||||
return warps_.at(wid).vcsrs.at(tid).vstart;
|
||||
case VX_CSR_VXSAT:
|
||||
return csrs_.at(wid).at(tid)[VX_CSR_VXSAT];
|
||||
return warps_.at(wid).vcsrs.at(tid).vxsat;
|
||||
case VX_CSR_VXRM:
|
||||
return csrs_.at(wid).at(tid)[VX_CSR_VXRM];
|
||||
case VX_CSR_VCSR: {
|
||||
Word vxsat = csrs_.at(wid).at(tid)[VX_CSR_VXSAT];
|
||||
Word vxrm = csrs_.at(wid).at(tid)[VX_CSR_VXRM];
|
||||
return (vxrm << 1) | vxsat;
|
||||
}
|
||||
return warps_.at(wid).vcsrs.at(tid).vxrm;
|
||||
case VX_CSR_VCSR:
|
||||
return ( warps_.at(wid).vcsrs.at(tid).vxrm << 1) | warps_.at(wid).vcsrs.at(tid).vxsat;
|
||||
case VX_CSR_VL:
|
||||
return csrs_.at(wid).at(tid)[VX_CSR_VL];
|
||||
return warps_.at(wid).vcsrs.at(tid).vl;
|
||||
case VX_CSR_VTYPE:
|
||||
return csrs_.at(wid).at(tid)[VX_CSR_VTYPE];
|
||||
return warps_.at(wid).vcsrs.at(tid).vtype;
|
||||
case VX_CSR_VLENB:
|
||||
return VLEN / 8;
|
||||
case VX_CSR_VCYCLE:
|
||||
return csrs_.at(wid).at(tid)[VX_CSR_VCYCLE];
|
||||
case VX_CSR_VTIME:
|
||||
return csrs_.at(wid).at(tid)[VX_CSR_VTIME];
|
||||
case VX_CSR_VINSTRET:
|
||||
return csrs_.at(wid).at(tid)[VX_CSR_VINSTRET];
|
||||
return VLENB;
|
||||
#endif
|
||||
|
||||
case VX_CSR_MHARTID: return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
|
||||
|
@ -529,9 +511,6 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_NUM_CORES: return uint32_t(arch_.num_cores()) * arch_.num_clusters();
|
||||
case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base();
|
||||
case VX_CSR_MSCRATCH: return csr_mscratch_;
|
||||
case VX_MAT_MUL_SIZE: return mat_size;
|
||||
case VX_TC_NUM: return tc_num;
|
||||
case VX_TC_SIZE: return tc_size;
|
||||
|
||||
CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
|
||||
CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);
|
||||
|
@ -656,34 +635,34 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
|
|||
#ifdef EXT_V_ENABLE
|
||||
// Vector CRSs
|
||||
case VX_CSR_VSTART:
|
||||
csrs_.at(wid).at(tid)[VX_CSR_VSTART] = value;
|
||||
warps_.at(wid).vcsrs.at(tid).vstart = value;
|
||||
break;
|
||||
case VX_CSR_VXSAT:
|
||||
csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1;
|
||||
warps_.at(wid).vcsrs.at(tid).vxsat = value & 0b1;
|
||||
break;
|
||||
case VX_CSR_VXRM:
|
||||
csrs_.at(wid).at(tid)[VX_CSR_VXRM] = value & 0b11;
|
||||
warps_.at(wid).vcsrs.at(tid).vxrm = value & 0b11;
|
||||
break;
|
||||
case VX_CSR_VCSR:
|
||||
csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1;
|
||||
csrs_.at(wid).at(tid)[VX_CSR_VXRM] = (value >> 1) & 0b11;
|
||||
warps_.at(wid).vcsrs.at(tid).vxsat = value & 0b1;
|
||||
warps_.at(wid).vcsrs.at(tid).vxrm = (value >> 1) & 0b11;
|
||||
break;
|
||||
case VX_CSR_VL: // read only, written by vset(i)vl(i)
|
||||
csrs_.at(wid).at(tid)[VX_CSR_VL] = value;
|
||||
case VX_CSR_VL:
|
||||
warps_.at(wid).vcsrs.at(tid).vl = value;
|
||||
break;
|
||||
case VX_CSR_VTYPE: // read only, written by vset(i)vl(i)
|
||||
csrs_.at(wid).at(tid)[VX_CSR_VTYPE] = value;
|
||||
case VX_CSR_VTYPE:
|
||||
warps_.at(wid).vcsrs.at(tid).vtype = value;
|
||||
break;
|
||||
case VX_CSR_VLENB: // read only
|
||||
std::abort();
|
||||
break;
|
||||
case VX_CSR_VLENB: // read only, set to VLEN / 8
|
||||
#endif
|
||||
|
||||
case VX_CSR_SATP:
|
||||
#ifdef VM_ENABLE
|
||||
// warps_.at(wid).fcsr = (warps_.at(wid).fcsr & ~0x1F) | (value & 0x1F);
|
||||
// csrs_.at(wid).at(tid)[addr] = value; //what is wid and tid?
|
||||
mmu_.set_satp(value);
|
||||
break;
|
||||
#endif
|
||||
break;
|
||||
case VX_CSR_MSTATUS:
|
||||
case VX_CSR_MEDELEG:
|
||||
case VX_CSR_MIDELEG:
|
||||
|
@ -695,18 +674,10 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_MNSTATUS:
|
||||
case VX_CSR_MCAUSE:
|
||||
break;
|
||||
case VX_MAT_MUL_SIZE:
|
||||
mat_size = value;
|
||||
break;
|
||||
case VX_TC_NUM:
|
||||
tc_num = value;
|
||||
break;
|
||||
case VX_TC_SIZE:
|
||||
tc_size = value;
|
||||
break;
|
||||
|
||||
default: {
|
||||
std::cout << "Error: invalid CSR write addr=0x" << std::hex << addr << ", value=0x" << value << std::dec << std::endl;
|
||||
std::flush(std::cout);
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <stack>
|
||||
#include <mem.h>
|
||||
#include "types.h"
|
||||
#include "tensor_unit.h"
|
||||
#ifdef EXT_V_ENABLE
|
||||
#include "vec_unit.h"
|
||||
#endif
|
||||
|
@ -31,11 +32,77 @@ class Core;
|
|||
class Instr;
|
||||
class instr_trace_t;
|
||||
|
||||
struct ipdom_entry_t {
|
||||
ipdom_entry_t(const ThreadMask &orig_tmask, const ThreadMask &else_tmask, Word PC)
|
||||
: orig_tmask (orig_tmask)
|
||||
, else_tmask (else_tmask)
|
||||
, PC (PC)
|
||||
, fallthrough(false)
|
||||
{}
|
||||
|
||||
ThreadMask orig_tmask;
|
||||
ThreadMask else_tmask;
|
||||
Word PC;
|
||||
bool fallthrough;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct vtype_t {
|
||||
uint32_t vill;
|
||||
uint32_t vma;
|
||||
uint32_t vta;
|
||||
uint32_t vsew;
|
||||
uint32_t vlmul;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct vcsrs_t {
|
||||
uint32_t vstart;
|
||||
uint32_t vxsat;
|
||||
uint32_t vxrm;
|
||||
uint32_t vcsr;
|
||||
uint32_t vl;
|
||||
uint32_t vtype;
|
||||
uint32_t vlenb;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct warp_t {
|
||||
warp_t(uint32_t num_threads);
|
||||
void clear(uint64_t startup_addr);
|
||||
|
||||
Word PC;
|
||||
ThreadMask tmask;
|
||||
std::vector<std::vector<Word>> ireg_file;
|
||||
std::vector<std::vector<uint64_t>>freg_file;
|
||||
std::stack<ipdom_entry_t> ipdom_stack;
|
||||
Byte fcsr;
|
||||
#ifdef EXT_V_ENABLE
|
||||
std::vector<std::vector<std::vector<Byte>>> vreg_file;
|
||||
std::vector<vcsrs_t> vcsrs;
|
||||
vtype_t vtype;
|
||||
uint32_t vl;
|
||||
uint32_t vlmax;
|
||||
#endif
|
||||
uint32_t uuid;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct wspawn_t {
|
||||
bool valid;
|
||||
uint32_t num_warps;
|
||||
Word nextPC;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class Emulator {
|
||||
public:
|
||||
Emulator(const Arch &arch,
|
||||
const DCRS &dcrs,
|
||||
Core* core);
|
||||
Emulator(const Arch &arch, const DCRS &dcrs, Core* core);
|
||||
|
||||
~Emulator();
|
||||
|
||||
|
@ -60,83 +127,20 @@ public:
|
|||
|
||||
int get_exitcode() const;
|
||||
|
||||
Word get_tiles();
|
||||
Word get_tc_size();
|
||||
Word get_tc_num();
|
||||
|
||||
void dcache_read(void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
void dcache_write(const void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
private:
|
||||
|
||||
struct ipdom_entry_t {
|
||||
ipdom_entry_t(const ThreadMask &orig_tmask, const ThreadMask &else_tmask, Word PC)
|
||||
: orig_tmask (orig_tmask)
|
||||
, else_tmask (else_tmask)
|
||||
, PC (PC)
|
||||
, fallthrough(false)
|
||||
{}
|
||||
|
||||
ThreadMask orig_tmask;
|
||||
ThreadMask else_tmask;
|
||||
Word PC;
|
||||
bool fallthrough;
|
||||
};
|
||||
|
||||
struct vtype_t {
|
||||
uint32_t vill;
|
||||
uint32_t vma;
|
||||
uint32_t vta;
|
||||
uint32_t vsew;
|
||||
uint32_t vlmul;
|
||||
};
|
||||
|
||||
union reg_data_t {
|
||||
Word u;
|
||||
WordI i;
|
||||
WordF f;
|
||||
float f32;
|
||||
double f64;
|
||||
uint32_t u32;
|
||||
uint64_t u64;
|
||||
int32_t i32;
|
||||
int64_t i64;
|
||||
};
|
||||
|
||||
struct warp_t {
|
||||
warp_t(const Arch& arch);
|
||||
void clear(uint64_t startup_addr);
|
||||
|
||||
Word PC;
|
||||
ThreadMask tmask;
|
||||
std::vector<std::vector<Word>> ireg_file;
|
||||
std::vector<std::vector<uint64_t>>freg_file;
|
||||
std::stack<ipdom_entry_t> ipdom_stack;
|
||||
Byte fcsr;
|
||||
#ifdef EXT_V_ENABLE
|
||||
std::vector<std::vector<Byte>> vreg_file;
|
||||
vtype_t vtype;
|
||||
uint32_t vl;
|
||||
Word vlmax;
|
||||
#endif
|
||||
uint32_t uuid;
|
||||
};
|
||||
|
||||
struct wspawn_t {
|
||||
bool valid;
|
||||
uint32_t num_warps;
|
||||
Word nextPC;
|
||||
};
|
||||
|
||||
std::shared_ptr<Instr> decode(uint32_t code) const;
|
||||
|
||||
void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace);
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
void loadVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
|
||||
void storeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
|
||||
void executeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);
|
||||
void loadVector(const Instr &instr, uint32_t wid, uint32_t tid, const std::vector<reg_data_t>& rs1_data, const std::vector<reg_data_t>& rs2_data);
|
||||
void storeVector(const Instr &instr, uint32_t wid, uint32_t tid, const std::vector<reg_data_t>& rs1_data, const std::vector<reg_data_t>& rs2_data);
|
||||
bool executeVector(const Instr &instr, uint32_t wid, uint32_t tid, const std::vector<reg_data_t>& rs1_data, const std::vector<reg_data_t>& rs2_data, std::vector<reg_data_t>& rd_data);
|
||||
#endif
|
||||
|
||||
void icache_read(void* data, uint64_t addr, uint32_t size);
|
||||
|
@ -165,6 +169,11 @@ private:
|
|||
const Arch& arch_;
|
||||
const DCRS& dcrs_;
|
||||
Core* core_;
|
||||
|
||||
#ifdef EXT_TPU_ENABLE
|
||||
TensorUnit::Ptr tensor_unit_;
|
||||
#endif
|
||||
|
||||
std::vector<warp_t> warps_;
|
||||
WarpMask active_warps_;
|
||||
WarpMask stalled_warps_;
|
||||
|
@ -174,13 +183,9 @@ private:
|
|||
uint32_t ipdom_size_;
|
||||
Word csr_mscratch_;
|
||||
wspawn_t wspawn_;
|
||||
std::vector<Word> scratchpad;
|
||||
uint32_t mat_size;
|
||||
uint32_t tc_size;
|
||||
uint32_t tc_num;
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
VecUnit::Ptr vec_unit_;
|
||||
std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -166,7 +166,7 @@ void LsuUnit::tick() {
|
|||
continue;
|
||||
}
|
||||
|
||||
bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE));
|
||||
bool is_write = (trace->lsu_type == LsuType::STORE);
|
||||
|
||||
// check pending queue capacity
|
||||
if (!is_write && state.pending_rd_reqs.full()) {
|
||||
|
@ -222,96 +222,6 @@ void LsuUnit::tick() {
|
|||
input.pop();
|
||||
}
|
||||
}
|
||||
/* TO BE FIXED:Tensor_core code
|
||||
send_request is not used anymore. Need to be modified number of load
|
||||
*/
|
||||
/*
|
||||
int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
|
||||
int count = 0;
|
||||
|
||||
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
|
||||
bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE));
|
||||
|
||||
uint16_t req_per_thread = 1;
|
||||
if ((trace->lsu_type == LsuType::TCU_LOAD) || (trace->lsu_type == LsuType::TCU_STORE))
|
||||
{
|
||||
req_per_thread= (1>(trace_data->mem_addrs.at(0).size)/4)? 1: ((trace_data->mem_addrs.at(0).size)/4);
|
||||
}
|
||||
|
||||
auto t0 = trace->pid * NUM_LSU_LANES;
|
||||
|
||||
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
|
||||
uint32_t t = t0 + i;
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
|
||||
int req_idx = block_idx * LSU_CHANNELS + (i % LSU_CHANNELS);
|
||||
auto& dcache_req_port = core_->lmem_switch_.at(req_idx)->ReqIn;
|
||||
|
||||
auto mem_addr = trace_data->mem_addrs.at(t);
|
||||
auto type = get_addr_type(mem_addr.addr);
|
||||
// DT(3, "addr_type = " << type << ", " << *trace);
|
||||
uint32_t mem_bytes = 1;
|
||||
for (int i = 0; i < req_per_thread; i++)
|
||||
{
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr + (i*mem_bytes);
|
||||
mem_req.write = is_write;
|
||||
mem_req.type = type;
|
||||
mem_req.tag = tag;
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
|
||||
dcache_req_port.push(mem_req, 1);
|
||||
DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
|
||||
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
|
||||
|
||||
if (is_write) {
|
||||
++core_->perf_stats_.stores;
|
||||
} else {
|
||||
++core_->perf_stats_.loads;
|
||||
++pending_loads_;
|
||||
}
|
||||
|
||||
++count;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
*/
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TcuUnit::TcuUnit(const SimContext& ctx, Core* core)
|
||||
: FuncUnit(ctx, core, "TCU")
|
||||
{}
|
||||
|
||||
void TcuUnit::tick() {
|
||||
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto& input = Inputs.at(i);
|
||||
if (input.empty())
|
||||
continue;
|
||||
auto& output = Outputs.at(i);
|
||||
auto trace = input.front();
|
||||
uint32_t n_tiles = core_->emulator_.get_tiles();
|
||||
uint32_t tc_size = core_->emulator_.get_tc_size();
|
||||
|
||||
switch (trace->tcu_type) {
|
||||
case TCUType::TCU_MUL:
|
||||
{ //mat size = n_tiles * tc_size
|
||||
int matmul_latency = (n_tiles * tc_size) + tc_size + tc_size;
|
||||
output.push(trace, matmul_latency);
|
||||
DT(3, "matmul_latency = " << matmul_latency << ", " << *trace);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, "pipeline-execute: op=" << trace->tcu_type << ", " << *trace);
|
||||
input.pop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -354,6 +264,14 @@ void SfuUnit::tick() {
|
|||
release_warp = core_->barrier(trace_data->arg1, trace_data->arg2, trace->wid);
|
||||
}
|
||||
} break;
|
||||
#ifdef EXT_TPU_ENABLE
|
||||
case SfuType::MMADD: {
|
||||
if (trace->eop) {
|
||||
auto trace_data = std::dynamic_pointer_cast<TensorUnit::TraceData>(trace->data);
|
||||
output.push(trace, trace_data->latency + delay);
|
||||
}
|
||||
} break;
|
||||
#endif
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
|
|
@ -98,14 +98,6 @@ private:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class TcuUnit : public FuncUnit {
|
||||
public:
|
||||
TcuUnit(const SimContext& ctx, Core*);
|
||||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class SfuUnit : public FuncUnit {
|
||||
public:
|
||||
SfuUnit(const SimContext& ctx, Core*);
|
||||
|
|
|
@ -48,7 +48,7 @@ enum class Opcode {
|
|||
EXT1 = 0x0b,
|
||||
EXT2 = 0x2b,
|
||||
EXT3 = 0x5b,
|
||||
TCU = 0x7b
|
||||
EXT4 = 0x7b
|
||||
};
|
||||
|
||||
enum class InstType {
|
||||
|
@ -73,11 +73,11 @@ enum DecodeConstants {
|
|||
width_vmask = 1,
|
||||
width_i_imm = 12,
|
||||
width_j_imm = 20,
|
||||
width_v_zimm = 11,
|
||||
width_v_ma = 1,
|
||||
width_v_ta = 1,
|
||||
width_v_zimm= 11,
|
||||
width_v_ma = 1,
|
||||
width_v_ta = 1,
|
||||
width_v_sew = 3,
|
||||
width_v_lmul = 3,
|
||||
width_v_lmul= 3,
|
||||
width_aq = 1,
|
||||
width_rl = 1,
|
||||
|
||||
|
@ -142,9 +142,9 @@ public:
|
|||
, func7_(0)
|
||||
, vmask_(0)
|
||||
, vlsWidth_(0)
|
||||
, vMop_(0)
|
||||
, vUmop_(0)
|
||||
, vNf_(0)
|
||||
, vmop_(0)
|
||||
, vumop_(0)
|
||||
, vnf_(0)
|
||||
, vs3_(0)
|
||||
, has_zimm_(false)
|
||||
, vlmul_(0)
|
||||
|
@ -189,10 +189,10 @@ public:
|
|||
|
||||
// Attributes for Vector instructions
|
||||
void setVlsWidth(uint32_t width) { vlsWidth_ = width; vattr_mask_ |= vattr_vlswidth; }
|
||||
void setVmop(uint32_t mop) { vMop_ = mop; vattr_mask_ |= vattr_vmop; }
|
||||
void setVumop(uint32_t umop) { vUmop_ = umop; vattr_mask_ |= vattr_vumop; }
|
||||
void setVnf(uint32_t nf) { vNf_ = nf; vattr_mask_ |= vattr_vnf; }
|
||||
void setVmask(uint32_t mask) { vmask_ = mask; vattr_mask_ |= vattr_vmask; }
|
||||
void setVmop(uint32_t mop) { vmop_ = mop; vattr_mask_ |= vattr_vmop; }
|
||||
void setVumop(uint32_t umop) { vumop_ = umop; vattr_mask_ |= vattr_vumop; }
|
||||
void setVnf(uint32_t nf) { vnf_ = nf; vattr_mask_ |= vattr_vnf; }
|
||||
void setVmask(uint32_t vmask) { vmask_ = vmask; vattr_mask_ |= vattr_vmask; }
|
||||
void setVs3(uint32_t vs) { vs3_ = vs; vattr_mask_ |= vattr_vs3; }
|
||||
void setZimm(bool has_zimm) { has_zimm_ = has_zimm; vattr_mask_ |= vattr_zimm; }
|
||||
void setVlmul(uint32_t lmul) { vlmul_ = lmul; vattr_mask_ |= vattr_vlmul; }
|
||||
|
@ -218,10 +218,11 @@ public:
|
|||
uint32_t getFunc6() const { return func6_; }
|
||||
uint32_t getFunc7() const { return func7_; }
|
||||
|
||||
// Vector
|
||||
uint32_t getVlsWidth() const { return vlsWidth_; }
|
||||
uint32_t getVmop() const { return vMop_; }
|
||||
uint32_t getVumop() const { return vUmop_; }
|
||||
uint32_t getVnf() const { return vNf_; }
|
||||
uint32_t getVmop() const { return vmop_; }
|
||||
uint32_t getVumop() const { return vumop_; }
|
||||
uint32_t getVnf() const { return vnf_; }
|
||||
uint32_t getVmask() const { return vmask_; }
|
||||
uint32_t getVs3() const { return vs3_; }
|
||||
bool hasZimm() const { return has_zimm_; }
|
||||
|
@ -254,9 +255,9 @@ private:
|
|||
// Vector
|
||||
uint32_t vmask_;
|
||||
uint32_t vlsWidth_;
|
||||
uint32_t vMop_;
|
||||
uint32_t vUmop_;
|
||||
uint32_t vNf_;
|
||||
uint32_t vmop_;
|
||||
uint32_t vumop_;
|
||||
uint32_t vnf_;
|
||||
uint32_t vs3_;
|
||||
bool has_zimm_;
|
||||
uint32_t vlmul_;
|
||||
|
|
|
@ -87,7 +87,6 @@ public:
|
|||
#ifdef EXT_V_ENABLE
|
||||
VpuType vpu_type;
|
||||
#endif
|
||||
TCUType tcu_type;
|
||||
};
|
||||
|
||||
ITraceData::Ptr data;
|
||||
|
@ -177,4 +176,9 @@ inline std::ostream &operator<<(std::ostream &os, const instr_trace_t& trace) {
|
|||
return os;
|
||||
}
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, instr_trace_t* trace) {
|
||||
os << *trace;
|
||||
return os;
|
||||
}
|
||||
|
||||
}
|
|
@ -43,8 +43,13 @@ public:
|
|||
|
||||
char sname[100];
|
||||
snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
|
||||
uint32_t wsel_bits = log2ceil(config_.line_size);
|
||||
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, (1 << config.B), wsel_bits);
|
||||
uint32_t lg2_line_size = log2ceil(config_.line_size);
|
||||
uint32_t num_banks = 1 << config.B;
|
||||
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, num_banks, 1,
|
||||
[lg2_line_size, num_banks](const MemCrossBar::ReqType& req) {
|
||||
// Custom logic to calculate the output index using bank interleaving
|
||||
return (uint32_t)((req.addr >> lg2_line_size) & (num_banks-1));
|
||||
});
|
||||
for (uint32_t i = 0; i < config.num_reqs; ++i) {
|
||||
simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i));
|
||||
mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i));
|
||||
|
|
|
@ -30,7 +30,6 @@ private:
|
|||
MemCrossBar::Ptr mem_xbar_;
|
||||
DramSim dram_sim_;
|
||||
mutable PerfStats perf_stats_;
|
||||
|
||||
struct DramCallbackArgs {
|
||||
MemSim::Impl* memsim;
|
||||
MemReq request;
|
||||
|
@ -41,11 +40,15 @@ public:
|
|||
Impl(MemSim* simobject, const Config& config)
|
||||
: simobject_(simobject)
|
||||
, config_(config)
|
||||
, dram_sim_(MEM_CLOCK_RATIO)
|
||||
, dram_sim_(config.num_banks, config.block_size, config.clock_ratio)
|
||||
{
|
||||
char sname[100];
|
||||
snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
|
||||
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_ports, config.num_banks);
|
||||
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_ports, config.num_banks, 1,
|
||||
[lg2_block_size = log2ceil(config.block_size), num_banks = config.num_banks](const MemCrossBar::ReqType& req) {
|
||||
// Custom logic to calculate the output index using bank interleaving
|
||||
return (uint32_t)((req.addr >> lg2_block_size) & (num_banks-1));
|
||||
});
|
||||
for (uint32_t i = 0; i < config.num_ports; ++i) {
|
||||
simobject->MemReqPorts.at(i).bind(&mem_xbar_->ReqIn.at(i));
|
||||
mem_xbar_->RspIn.at(i).bind(&simobject->MemRspPorts.at(i));
|
||||
|
@ -74,16 +77,15 @@ public:
|
|||
|
||||
auto& mem_req = mem_xbar_->ReqOut.at(i).front();
|
||||
|
||||
// try to enqueue the request to the memory system
|
||||
// enqueue the request to the memory system
|
||||
auto req_args = new DramCallbackArgs{this, mem_req, i};
|
||||
auto enqueue_success = dram_sim_.send_request(
|
||||
mem_req.write,
|
||||
dram_sim_.send_request(
|
||||
mem_req.addr,
|
||||
0,
|
||||
mem_req.write,
|
||||
[](void* arg) {
|
||||
auto rsp_args = reinterpret_cast<const DramCallbackArgs*>(arg);
|
||||
// only send a response for read requests
|
||||
if (!rsp_args->request.write) {
|
||||
// only send a response for read requests
|
||||
MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid};
|
||||
rsp_args->memsim->mem_xbar_->RspOut.at(rsp_args->bank_id).push(mem_rsp, 1);
|
||||
DT(3, rsp_args->memsim->simobject_->name() << "-mem-rsp[" << rsp_args->bank_id << "]: " << mem_rsp);
|
||||
|
@ -93,14 +95,7 @@ public:
|
|||
req_args
|
||||
);
|
||||
|
||||
// check if the request was enqueued successfully
|
||||
if (!enqueue_success) {
|
||||
delete req_args;
|
||||
continue;
|
||||
}
|
||||
|
||||
DT(3, simobject_->name() << "-mem-req[" << i << "]: " << mem_req);
|
||||
|
||||
mem_xbar_->ReqOut.at(i).pop();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,6 +23,8 @@ public:
|
|||
struct Config {
|
||||
uint32_t num_banks;
|
||||
uint32_t num_ports;
|
||||
uint32_t block_size;
|
||||
float clock_ratio;
|
||||
};
|
||||
|
||||
struct PerfStats {
|
||||
|
|
|
@ -22,10 +22,14 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
|
|||
{
|
||||
SimPlatform::instance().initialize();
|
||||
|
||||
assert(PLATFORM_MEMORY_DATA_SIZE == MEM_BLOCK_SIZE);
|
||||
|
||||
// create memory simulator
|
||||
memsim_ = MemSim::Create("dram", MemSim::Config{
|
||||
PLATFORM_MEMORY_BANKS,
|
||||
L3_MEM_PORTS
|
||||
PLATFORM_MEMORY_NUM_BANKS,
|
||||
L3_MEM_PORTS,
|
||||
MEM_BLOCK_SIZE,
|
||||
MEM_CLOCK_RATIO
|
||||
});
|
||||
|
||||
// create clusters
|
||||
|
|
264
sim/simx/tensor_unit.cpp
Normal file
264
sim/simx/tensor_unit.cpp
Normal file
|
@ -0,0 +1,264 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tensor_unit.h"
|
||||
#include "mem.h"
|
||||
#include <VX_config.h>
|
||||
#include <rvfloats.h>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
union flaot_uint32_t {
|
||||
float f;
|
||||
uint32_t u;
|
||||
};
|
||||
|
||||
inline uint32_t read_element(const std::vector<reg_data_t>& reg_data, int index, TensorFormat format) {
|
||||
switch (format) {
|
||||
case TensorFormat::Int4: {
|
||||
return reg_data.at(index / 8).u >> (index % 8);
|
||||
}
|
||||
case TensorFormat::Int8: {
|
||||
return reg_data.at(index / 4).u >> (index % 4);
|
||||
}
|
||||
case TensorFormat::FP16: {
|
||||
return reg_data.at(index / 2).u >> (index % 2);
|
||||
}
|
||||
case TensorFormat::BF16: {
|
||||
return reg_data.at(index / 2).u >> (index % 2);
|
||||
}
|
||||
default: assert(false);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
inline void write_element(std::vector<reg_data_t>& reg_data, int index, uint32_t value, TensorFormat format) {
|
||||
switch (format) {
|
||||
case TensorFormat::Int32:
|
||||
case TensorFormat::FP32: {
|
||||
reg_data.at(index).i = value;
|
||||
break;
|
||||
}
|
||||
default: assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
inline float type_to_float(uint32_t value, TensorFormat format) {
|
||||
switch (format) {
|
||||
case TensorFormat::Int4: {
|
||||
flaot_uint32_t u2f;
|
||||
u2f.u = rv_itof_s(value, 0, nullptr);
|
||||
return u2f.f;
|
||||
}
|
||||
case TensorFormat::Int8: {
|
||||
flaot_uint32_t u2f;
|
||||
u2f.u = rv_itof_s(value, 0, nullptr);
|
||||
return u2f.f;
|
||||
}
|
||||
case TensorFormat::FP16: {
|
||||
flaot_uint32_t u2f;
|
||||
u2f.u = rv_htof_s(value, 0, nullptr);
|
||||
return u2f.f;
|
||||
}
|
||||
case TensorFormat::BF16: {
|
||||
flaot_uint32_t u2f;
|
||||
u2f.u = rv_btof_s(value, 0, nullptr);
|
||||
return u2f.f;
|
||||
}
|
||||
default: assert(false);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
inline uint32_t float_to_type(float value, TensorFormat format) {
|
||||
switch (format) {
|
||||
case TensorFormat::Int32: {
|
||||
flaot_uint32_t f2u;
|
||||
f2u.f = value;
|
||||
return rv_ftoi_s(f2u.u, 0, nullptr);
|
||||
}
|
||||
case TensorFormat::FP32: {
|
||||
flaot_uint32_t f2u;
|
||||
f2u.f = value;
|
||||
return f2u.u;
|
||||
}
|
||||
default: assert(false);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
class TensorCore : public SimObject<TensorCore> {
|
||||
public:
|
||||
struct PerfStats {
|
||||
uint64_t latency;
|
||||
|
||||
PerfStats()
|
||||
: latency(0)
|
||||
{}
|
||||
|
||||
PerfStats& operator+=(const PerfStats& rhs) {
|
||||
this->latency += rhs.latency;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
SimPort<instr_trace_t*> Input;
|
||||
SimPort<instr_trace_t*> Output;
|
||||
|
||||
TensorCore(const SimContext& ctx, const char* name, uint32_t tile_size)
|
||||
: SimObject<TensorCore>(ctx, name)
|
||||
, Input(this)
|
||||
, Output(this)
|
||||
, tile_size_(tile_size)
|
||||
{}
|
||||
|
||||
~TensorCore() {
|
||||
this->reset();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
//--
|
||||
}
|
||||
|
||||
void tick() {
|
||||
//--
|
||||
}
|
||||
|
||||
void mmadd(TensorFormat from_format,
|
||||
TensorFormat to_format,
|
||||
const std::vector<reg_data_t>& rs1_data,
|
||||
const std::vector<reg_data_t>& rs2_data,
|
||||
const std::vector<reg_data_t>& rs3_data,
|
||||
std::vector<reg_data_t>& rd_data,
|
||||
TensorUnit::TraceData::Ptr trace_data) {
|
||||
assert(rd_data.size() <= tile_size_);
|
||||
trace_data->latency = 2 + tile_size_;
|
||||
// matrix multiplication and accumulation
|
||||
for (uint32_t i = 0; i < tile_size_; i++) {
|
||||
for (uint32_t j = 0; j < tile_size_; j++) {
|
||||
float sum = type_to_float(read_element(rs3_data, i * tile_size_ + j, to_format), to_format);
|
||||
for (uint32_t k = 0; k < tile_size_; k++) {
|
||||
auto a = type_to_float(read_element(rs1_data, i * tile_size_ + k, from_format), from_format);
|
||||
auto b = type_to_float(read_element(rs2_data, k * tile_size_ + j, from_format), from_format);
|
||||
sum += a * b;
|
||||
}
|
||||
write_element(rd_data, i * tile_size_ + j, float_to_type(sum, to_format), to_format);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
PerfStats perf_stats_;
|
||||
uint32_t tile_size_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class TensorUnit::Impl {
|
||||
public:
|
||||
|
||||
Impl(TensorUnit* simobject, uint32_t tile_size)
|
||||
: simobject_(simobject)
|
||||
, tensor_cores_(NUM_TENSOR_CORES)
|
||||
, tc_sel_(0)
|
||||
{
|
||||
char sname[100];
|
||||
for (uint32_t i = 0; i < NUM_TENSOR_CORES; i++) {
|
||||
snprintf(sname, 100, "%s-core%d", simobject->name().c_str(), i);
|
||||
tensor_cores_[i] = TensorCore::Create(sname, tile_size);
|
||||
}
|
||||
|
||||
this->reset();
|
||||
}
|
||||
|
||||
~Impl() {}
|
||||
|
||||
void reset() {
|
||||
//--
|
||||
}
|
||||
|
||||
void tick() {
|
||||
// forward input to tensor cores
|
||||
auto& input = simobject_->Input;
|
||||
if (input.empty())
|
||||
return;
|
||||
auto trace = input.front();
|
||||
auto trace_data = std::dynamic_pointer_cast<TraceData>(trace->data);
|
||||
tensor_cores_.at(trace_data->tc_idx)->Input.push(trace, 1);
|
||||
input.pop();
|
||||
}
|
||||
|
||||
void mmadd(TensorFormat from_format,
|
||||
TensorFormat to_format,
|
||||
const std::vector<reg_data_t>& rs1_data,
|
||||
const std::vector<reg_data_t>& rs2_data,
|
||||
const std::vector<reg_data_t>& rs3_data,
|
||||
std::vector<reg_data_t>& rd_data,
|
||||
TensorUnit::TraceData::Ptr trace_data) {
|
||||
tensor_cores_.at(tc_sel_)->mmadd(from_format, to_format, rs1_data, rs2_data, rs3_data, rd_data, trace_data);
|
||||
trace_data->tc_idx = tc_sel_;
|
||||
tc_sel_ = (tc_sel_ + 1) % NUM_TENSOR_CORES;
|
||||
}
|
||||
|
||||
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
TensorUnit* simobject_;
|
||||
std::vector<TensorCore::Ptr> tensor_cores_;
|
||||
uint32_t tc_sel_;
|
||||
PerfStats perf_stats_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TensorUnit::TensorUnit(const SimContext& ctx, const char* name, uint32_t tile_size)
|
||||
: SimObject<TensorUnit>(ctx, name)
|
||||
, Input(this)
|
||||
, Output(this)
|
||||
, impl_(new Impl(this, tile_size))
|
||||
{}
|
||||
|
||||
TensorUnit::~TensorUnit() {
|
||||
delete impl_;
|
||||
}
|
||||
|
||||
void TensorUnit::reset() {
|
||||
impl_->reset();
|
||||
}
|
||||
|
||||
void TensorUnit::tick() {
|
||||
impl_->tick();
|
||||
}
|
||||
|
||||
void TensorUnit::mmadd(TensorFormat from_format,
|
||||
TensorFormat to_format,
|
||||
const std::vector<reg_data_t>& rs1_data,
|
||||
const std::vector<reg_data_t>& rs2_data,
|
||||
const std::vector<reg_data_t>& rs3_data,
|
||||
std::vector<reg_data_t>& rd_data,
|
||||
TensorUnit::TraceData::Ptr trace_data) {
|
||||
impl_->mmadd(from_format, to_format, rs1_data, rs2_data, rs3_data, rd_data, trace_data);
|
||||
}
|
||||
|
||||
const TensorUnit::PerfStats& TensorUnit::perf_stats() const {
|
||||
return impl_->perf_stats();
|
||||
}
|
81
sim/simx/tensor_unit.h
Normal file
81
sim/simx/tensor_unit.h
Normal file
|
@ -0,0 +1,81 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <simobject.h>
|
||||
#include "pipeline.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
enum class TensorFormat : int {
|
||||
Int4 = 0,
|
||||
Int8 = 1,
|
||||
Int16 = 2,
|
||||
Int32 = 3,
|
||||
Int64 = 4,
|
||||
FP16 = 5,
|
||||
FP32 = 6,
|
||||
FP64 = 7,
|
||||
BF16 = 8,
|
||||
_MAX = 9
|
||||
};
|
||||
|
||||
class TensorUnit : public SimObject<TensorUnit> {
|
||||
public:
|
||||
struct TraceData : public ITraceData {
|
||||
using Ptr = std::shared_ptr<TraceData>;
|
||||
uint32_t tc_idx;
|
||||
uint32_t latency;
|
||||
};
|
||||
|
||||
struct PerfStats {
|
||||
uint64_t latency;
|
||||
|
||||
PerfStats()
|
||||
: latency(0)
|
||||
{}
|
||||
|
||||
PerfStats& operator+=(const PerfStats& rhs) {
|
||||
this->latency += rhs.latency;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
SimPort<instr_trace_t*> Input;
|
||||
SimPort<instr_trace_t*> Output;
|
||||
|
||||
TensorUnit(const SimContext& ctx, const char* name, uint32_t tile_size);
|
||||
~TensorUnit();
|
||||
|
||||
void reset();
|
||||
|
||||
void tick();
|
||||
|
||||
void mmadd(TensorFormat from_format,
|
||||
TensorFormat to_format,
|
||||
const std::vector<reg_data_t>& rs1_data,
|
||||
const std::vector<reg_data_t>& rs2_data,
|
||||
const std::vector<reg_data_t>& rs3_data,
|
||||
std::vector<reg_data_t>& rd_data,
|
||||
TensorUnit::TraceData::Ptr trace_data);
|
||||
|
||||
const PerfStats& perf_stats() const;
|
||||
|
||||
private:
|
||||
|
||||
class Impl;
|
||||
Impl* impl_;
|
||||
};
|
||||
|
||||
}
|
|
@ -24,8 +24,9 @@
|
|||
#include <VX_types.h>
|
||||
#include <simobject.h>
|
||||
#include <bitvector.h>
|
||||
#include "debug.h"
|
||||
#include <iostream>
|
||||
#include "debug.h"
|
||||
#include "constants.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
|
@ -35,13 +36,11 @@ typedef uint32_t Word;
|
|||
typedef int32_t WordI;
|
||||
typedef uint64_t DWord;
|
||||
typedef int64_t DWordI;
|
||||
typedef uint32_t WordF;
|
||||
#elif (XLEN == 64)
|
||||
typedef uint64_t Word;
|
||||
typedef int64_t WordI;
|
||||
typedef __uint128_t DWord;
|
||||
typedef __int128_t DWordI;
|
||||
typedef uint64_t WordF;
|
||||
#else
|
||||
#error unsupported XLEN
|
||||
#endif
|
||||
|
@ -59,6 +58,21 @@ typedef std::bitset<MAX_NUM_WARPS> WarpMask;
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
union reg_data_t {
|
||||
uint8_t u8;
|
||||
uint16_t u16;
|
||||
Word u;
|
||||
WordI i;
|
||||
float f32;
|
||||
double f64;
|
||||
uint32_t u32;
|
||||
uint64_t u64;
|
||||
int32_t i32;
|
||||
int64_t i64;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class ThreadMaskOS {
|
||||
public:
|
||||
ThreadMaskOS(const ThreadMask& mask, int size)
|
||||
|
@ -106,7 +120,6 @@ enum class FUType {
|
|||
LSU,
|
||||
FPU,
|
||||
SFU,
|
||||
TCU,
|
||||
Count
|
||||
};
|
||||
|
||||
|
@ -116,7 +129,6 @@ inline std::ostream &operator<<(std::ostream &os, const FUType& type) {
|
|||
case FUType::LSU: os << "LSU"; break;
|
||||
case FUType::FPU: os << "FPU"; break;
|
||||
case FUType::SFU: os << "SFU"; break;
|
||||
case FUType::TCU: os << "TCU"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
|
@ -148,30 +160,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
|
|||
|
||||
enum class LsuType {
|
||||
LOAD,
|
||||
TCU_LOAD,
|
||||
STORE,
|
||||
TCU_STORE,
|
||||
FENCE
|
||||
};
|
||||
|
||||
enum class TCUType {
|
||||
TCU_MUL
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const TCUType& type) {
|
||||
switch (type) {
|
||||
case TCUType::TCU_MUL: os << "TCU MUL"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
|
||||
switch (type) {
|
||||
case LsuType::LOAD: os << "LOAD"; break;
|
||||
case LsuType::TCU_LOAD: os << "TCU_LOAD"; break;
|
||||
case LsuType::STORE: os << "STORE"; break;
|
||||
case LsuType::TCU_STORE: os << "TCU_STORE"; break;
|
||||
case LsuType::FENCE: os << "FENCE"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
|
@ -248,7 +244,10 @@ enum class SfuType {
|
|||
PRED,
|
||||
CSRRW,
|
||||
CSRRS,
|
||||
CSRRC
|
||||
CSRRC,
|
||||
#ifdef EXT_TPU_ENABLE
|
||||
MMADD,
|
||||
#endif
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
|
||||
|
@ -262,6 +261,9 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
|
|||
case SfuType::CSRRW: os << "CSRRW"; break;
|
||||
case SfuType::CSRRS: os << "CSRRS"; break;
|
||||
case SfuType::CSRRC: os << "CSRRC"; break;
|
||||
#ifdef EXT_TPU_ENABLE
|
||||
case SfuType::MMADD: os << "MMADD"; break;
|
||||
#endif
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
|
@ -440,6 +442,8 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
|
|||
template <typename T>
|
||||
class HashTable {
|
||||
public:
|
||||
typedef T DataType;
|
||||
|
||||
HashTable(uint32_t capacity)
|
||||
: entries_(capacity)
|
||||
, size_(0)
|
||||
|
@ -512,6 +516,8 @@ private:
|
|||
template <typename Type>
|
||||
class Arbiter : public SimObject<Arbiter<Type>> {
|
||||
public:
|
||||
typedef Type ReqType;
|
||||
|
||||
std::vector<SimPort<Type>> Inputs;
|
||||
std::vector<SimPort<Type>> Outputs;
|
||||
|
||||
|
@ -598,6 +604,8 @@ protected:
|
|||
template <typename Type>
|
||||
class CrossBar : public SimObject<CrossBar<Type>> {
|
||||
public:
|
||||
typedef Type ReqType;
|
||||
|
||||
std::vector<SimPort<Type>> Inputs;
|
||||
std::vector<SimPort<Type>> Outputs;
|
||||
|
||||
|
@ -607,8 +615,8 @@ public:
|
|||
ArbiterType type,
|
||||
uint32_t num_inputs,
|
||||
uint32_t num_outputs = 1,
|
||||
uint32_t addr_start = 0,
|
||||
uint32_t delay = 1
|
||||
uint32_t delay = 1,
|
||||
std::function<uint32_t(const Type& req)> output_sel = nullptr
|
||||
)
|
||||
: SimObject<CrossBar<Type>>(ctx, name)
|
||||
, Inputs(num_inputs, this)
|
||||
|
@ -618,12 +626,18 @@ public:
|
|||
, grants_(num_outputs, 0)
|
||||
, lg2_inputs_(log2ceil(num_inputs))
|
||||
, lg2_outputs_(log2ceil(num_outputs))
|
||||
, addr_start_(addr_start)
|
||||
, collisions_(0) {
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 64);
|
||||
assert(num_outputs <= 64);
|
||||
assert(ispow2(num_outputs));
|
||||
if (output_sel != nullptr) {
|
||||
output_sel_ = output_sel;
|
||||
} else {
|
||||
output_sel_ = [this](const Type& req) {
|
||||
return (uint32_t)bit_getw(req.addr, 0, (lg2_outputs_-1));
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void reset() {
|
||||
|
@ -651,7 +665,8 @@ public:
|
|||
auto& req = req_in.front();
|
||||
uint32_t output_idx = 0;
|
||||
if (lg2_outputs_ != 0) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
|
||||
// select output index
|
||||
output_idx = output_sel_(req);
|
||||
// skip if input is not going to current output
|
||||
if (output_idx != o)
|
||||
continue;
|
||||
|
@ -691,7 +706,7 @@ protected:
|
|||
std::vector<uint32_t> grants_;
|
||||
uint32_t lg2_inputs_;
|
||||
uint32_t lg2_outputs_;
|
||||
uint32_t addr_start_;
|
||||
std::function<uint32_t(const Type& req)> output_sel_;
|
||||
uint64_t collisions_;
|
||||
};
|
||||
|
||||
|
@ -700,6 +715,9 @@ protected:
|
|||
template <typename Req, typename Rsp>
|
||||
class TxArbiter : public SimObject<TxArbiter<Req, Rsp>> {
|
||||
public:
|
||||
typedef Req ReqType;
|
||||
typedef Rsp RspType;
|
||||
|
||||
std::vector<SimPort<Req>> ReqIn;
|
||||
std::vector<SimPort<Rsp>> RspIn;
|
||||
|
||||
|
@ -813,6 +831,9 @@ protected:
|
|||
template <typename Req, typename Rsp>
|
||||
class TxCrossBar : public SimObject<TxCrossBar<Req, Rsp>> {
|
||||
public:
|
||||
typedef Req ReqType;
|
||||
typedef Rsp RspType;
|
||||
|
||||
std::vector<SimPort<Req>> ReqIn;
|
||||
std::vector<SimPort<Rsp>> RspIn;
|
||||
|
||||
|
@ -825,8 +846,8 @@ public:
|
|||
ArbiterType type,
|
||||
uint32_t num_inputs,
|
||||
uint32_t num_outputs = 1,
|
||||
uint32_t addr_start = 0,
|
||||
uint32_t delay = 1
|
||||
uint32_t delay = 1,
|
||||
std::function<uint32_t(const Req& req)> output_sel = nullptr
|
||||
)
|
||||
: SimObject<TxCrossBar<Req, Rsp>>(ctx, name)
|
||||
, ReqIn(num_inputs, this)
|
||||
|
@ -839,7 +860,6 @@ public:
|
|||
, rsp_grants_(num_inputs, 0)
|
||||
, lg2_inputs_(log2ceil(num_inputs))
|
||||
, lg2_outputs_(log2ceil(num_outputs))
|
||||
, addr_start_(addr_start)
|
||||
, req_collisions_(0)
|
||||
, rsp_collisions_(0) {
|
||||
assert(delay != 0);
|
||||
|
@ -847,6 +867,13 @@ public:
|
|||
assert(num_outputs <= 64);
|
||||
assert(ispow2(num_inputs));
|
||||
assert(ispow2(num_outputs));
|
||||
if (output_sel != nullptr) {
|
||||
output_sel_ = output_sel;
|
||||
} else {
|
||||
output_sel_ = [this](const Req& req) {
|
||||
return (uint32_t)bit_getw(req.addr, 0, (lg2_outputs_-1));
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void reset() {
|
||||
|
@ -917,7 +944,8 @@ public:
|
|||
auto& req = req_in.front();
|
||||
uint32_t output_idx = 0;
|
||||
if (lg2_outputs_ != 0) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
|
||||
// select output index
|
||||
output_idx = output_sel_(req);
|
||||
// skip if request is not going to current output
|
||||
if (output_idx != o)
|
||||
continue;
|
||||
|
@ -971,7 +999,7 @@ protected:
|
|||
std::vector<uint32_t> rsp_grants_;
|
||||
uint32_t lg2_inputs_;
|
||||
uint32_t lg2_outputs_;
|
||||
uint32_t addr_start_;
|
||||
std::function<uint32_t(const Req& req)> output_sel_;
|
||||
uint64_t req_collisions_;
|
||||
uint64_t rsp_collisions_;
|
||||
};
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#ifdef EXT_V_ENABLE
|
||||
|
||||
#include "vec_unit.h"
|
||||
#include "emulator.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
|
|
|
@ -9,26 +9,6 @@
|
|||
|
||||
namespace vortex {
|
||||
|
||||
struct vtype_t {
|
||||
uint32_t vill;
|
||||
uint32_t vma;
|
||||
uint32_t vta;
|
||||
uint32_t vsew;
|
||||
uint32_t vlmul;
|
||||
};
|
||||
|
||||
union reg_data_t {
|
||||
Word u;
|
||||
WordI i;
|
||||
WordF f;
|
||||
float f32;
|
||||
double f64;
|
||||
uint32_t u32;
|
||||
uint64_t u64;
|
||||
int32_t i32;
|
||||
int64_t i64;
|
||||
};
|
||||
|
||||
class VecUnit : public SimObject<VecUnit> {
|
||||
public:
|
||||
struct PerfStats {
|
||||
|
|
2200
sim/simx/vpu.cpp
2200
sim/simx/vpu.cpp
File diff suppressed because it is too large
Load diff
350
sim/simx/vpu.h
350
sim/simx/vpu.h
|
@ -1,7 +1,18 @@
|
|||
#ifdef EXT_V_ENABLE
|
||||
#pragma once
|
||||
|
||||
using namespace vortex;
|
||||
#include <stdlib.h>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <bitmanip.h>
|
||||
#include <rvfloats.h>
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
template <typename T, typename R>
|
||||
class Add {
|
||||
|
@ -1120,222 +1131,90 @@ public:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
bool isMasked(std::vector<std::vector<Byte>> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) {
|
||||
bool isMasked(const std::vector<std::vector<Byte>> &vreg_file, uint32_t maskVreg, uint32_t byteI, uint32_t vmask) {
|
||||
if (vmask == 1)
|
||||
return false; // unmasked
|
||||
auto &mask = vreg_file.at(maskVreg);
|
||||
uint8_t emask = *(uint8_t *)(mask.data() + byteI / 8);
|
||||
uint8_t value = (emask >> (byteI % 8)) & 0x1;
|
||||
DP(4, "Masking enabled: " << +!vmask << " mask element: " << +value);
|
||||
return !vmask && value == 0;
|
||||
DP(4, "Masking enabled: " << +value);
|
||||
return (value == 0);
|
||||
}
|
||||
|
||||
template <typename DT>
|
||||
uint32_t getVreg(uint32_t baseVreg, uint32_t byteI) {
|
||||
uint32_t vsew = sizeof(DT) * 8;
|
||||
return (baseVreg + (byteI / (VLEN / vsew))) % 32;
|
||||
DT getVregData(const std::vector<Byte>& reg_data, uint32_t eltIndex) {
|
||||
assert(eltIndex < (VLENB / sizeof(DT)));
|
||||
return *reinterpret_cast<const DT*>(reg_data.data() + eltIndex * sizeof(DT));
|
||||
}
|
||||
|
||||
template <typename DT>
|
||||
DT &getVregData(std::vector<vortex::Byte> &baseVregVec, uint32_t byteI) {
|
||||
uint32_t vsew = sizeof(DT) * 8;
|
||||
return *(DT *)(baseVregVec.data() + (byteI % (VLEN / vsew)) * vsew / 8);
|
||||
void setVregData(std::vector<Byte>& reg_data, uint32_t eltIndex, DT value) {
|
||||
assert(eltIndex < (VLENB / sizeof(DT)));
|
||||
*reinterpret_cast<DT*>(reg_data.data() + eltIndex * sizeof(DT)) = value;
|
||||
}
|
||||
|
||||
template <typename DT>
|
||||
DT &getVregData(std::vector<std::vector<vortex::Byte>> &vreg_file, uint32_t baseVreg, uint32_t byteI) {
|
||||
auto &vr1 = vreg_file.at(getVreg<DT>(baseVreg, byteI));
|
||||
return getVregData<DT>(vr1, byteI);
|
||||
uint32_t getVregNo(uint32_t baseVreg, uint32_t eltIndex) {
|
||||
uint32_t num_elts = VLENB / sizeof(DT);
|
||||
uint32_t grp_index = eltIndex / num_elts;
|
||||
assert(baseVreg + grp_index < 32);
|
||||
return baseVreg + grp_index;
|
||||
}
|
||||
|
||||
template <typename DT>
|
||||
void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
|
||||
uint32_t vsew = sizeof(DT) * 8;
|
||||
uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
|
||||
if (nfields * emul > 8) {
|
||||
std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
for (uint32_t i = 0; i < vl * nfields; i++) {
|
||||
if (isMasked(vreg_file, 0, i / nfields, vmask))
|
||||
continue;
|
||||
|
||||
uint32_t nfields_strided = strided ? nfields : 1;
|
||||
Word mem_addr = (base_addr & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
|
||||
Word mem_data = 0;
|
||||
emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
|
||||
DP(4, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
|
||||
DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
|
||||
DP(4, "Previous data: " << +result);
|
||||
result = (DT)mem_data;
|
||||
}
|
||||
uint32_t getVregElt(uint32_t eltIndex) {
|
||||
uint32_t num_elts = VLENB / sizeof(DT);
|
||||
return eltIndex % num_elts;
|
||||
}
|
||||
|
||||
void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
|
||||
template <typename DT>
|
||||
DT getVregData(const std::vector<std::vector<Byte>> &vreg_file, uint32_t baseVreg, uint32_t eltIndex) {
|
||||
uint32_t reg_no = getVregNo<DT>(baseVreg, eltIndex);
|
||||
uint32_t reg_elt = getVregElt<DT>(eltIndex);
|
||||
auto value = getVregData<DT>(vreg_file.at(reg_no), reg_elt);
|
||||
DP(4, "VRF Read: v[" << reg_no << "][" << reg_elt * sizeof(DT) << "]=0x" << std::hex << +value << std::dec);
|
||||
return value;
|
||||
}
|
||||
|
||||
template <typename DT>
|
||||
void setVregData(std::vector<std::vector<Byte>> &vreg_file, uint32_t baseVreg, uint32_t eltIndex, DT value) {
|
||||
uint32_t reg_no = getVregNo<DT>(baseVreg, eltIndex);
|
||||
uint32_t reg_elt = getVregElt<DT>(eltIndex);
|
||||
DP(4, "VRF Write: v[" << reg_no << "][" << reg_elt * sizeof(DT) << "]=0x" << std::hex << +value << std::dec);
|
||||
setVregData<DT>(vreg_file.at(reg_no), reg_elt, value);
|
||||
}
|
||||
|
||||
inline uint64_t getVregData(uint32_t vsew, const std::vector<std::vector<Byte>> &vreg_file, uint32_t baseVreg, uint32_t eltIndex) {
|
||||
switch (vsew) {
|
||||
case 8:
|
||||
vector_op_vix_load<uint8_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
|
||||
break;
|
||||
return getVregData<uint8_t>(vreg_file, baseVreg, eltIndex);
|
||||
case 16:
|
||||
vector_op_vix_load<uint16_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
|
||||
break;
|
||||
return getVregData<uint16_t>(vreg_file, baseVreg, eltIndex);
|
||||
case 32:
|
||||
vector_op_vix_load<uint32_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
|
||||
break;
|
||||
return getVregData<uint32_t>(vreg_file, baseVreg, eltIndex);
|
||||
case 64:
|
||||
vector_op_vix_load<uint64_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
|
||||
break;
|
||||
return getVregData<uint64_t>(vreg_file, baseVreg, eltIndex);
|
||||
default:
|
||||
std::cout << "Failed to execute VLE for vsew: " << vsew << std::endl;
|
||||
std::abort();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DT>
|
||||
void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
|
||||
uint32_t vsew = sizeof(DT) * 8;
|
||||
uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
|
||||
if (nfields * emul > 8) {
|
||||
std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
for (uint32_t i = 0; i < vl * nfields; i++) {
|
||||
if (isMasked(vreg_file, 0, i / nfields, vmask))
|
||||
continue;
|
||||
|
||||
Word offset = 0;
|
||||
switch (iSew) {
|
||||
case 8:
|
||||
offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
|
||||
break;
|
||||
case 16:
|
||||
offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
|
||||
break;
|
||||
case 32:
|
||||
offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
|
||||
break;
|
||||
case 64:
|
||||
offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
|
||||
break;
|
||||
default:
|
||||
std::cout << "Unsupported iSew: " << iSew << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
|
||||
Word mem_addr = (base_addr & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT);
|
||||
Word mem_data = 0;
|
||||
emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
|
||||
DP(4, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
|
||||
DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
|
||||
DP(4, "Previous data: " << +result);
|
||||
result = (DT)mem_data;
|
||||
}
|
||||
}
|
||||
|
||||
void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
|
||||
inline void setVregData(uint32_t vsew, std::vector<std::vector<Byte>> &vreg_file, uint32_t baseVreg, uint32_t eltIndex, uint64_t value) {
|
||||
switch (vsew) {
|
||||
case 8:
|
||||
vector_op_vv_load<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
|
||||
setVregData<uint8_t>(vreg_file, baseVreg, eltIndex, value);
|
||||
break;
|
||||
case 16:
|
||||
vector_op_vv_load<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
|
||||
setVregData<uint16_t>(vreg_file, baseVreg, eltIndex, value);
|
||||
break;
|
||||
case 32:
|
||||
vector_op_vv_load<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
|
||||
setVregData<uint32_t>(vreg_file, baseVreg, eltIndex, value);
|
||||
break;
|
||||
case 64:
|
||||
vector_op_vv_load<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
|
||||
setVregData<uint64_t>(vreg_file, baseVreg, eltIndex, value);
|
||||
break;
|
||||
default:
|
||||
std::cout << "Failed to execute VLUX/VLOX for vsew: " << vsew << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DT>
|
||||
void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
|
||||
uint32_t vsew = sizeof(DT) * 8;
|
||||
uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
|
||||
for (uint32_t i = 0; i < vl * nfields; i++) {
|
||||
if (isMasked(vreg_file, 0, i / nfields, vmask))
|
||||
continue;
|
||||
|
||||
uint32_t nfields_strided = strided ? nfields : 1;
|
||||
Word mem_addr = base_addr + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
|
||||
Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
|
||||
DP(4, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
|
||||
emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
|
||||
}
|
||||
}
|
||||
|
||||
void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
|
||||
switch (vsew) {
|
||||
case 8:
|
||||
vector_op_vix_store<uint8_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
|
||||
break;
|
||||
case 16:
|
||||
vector_op_vix_store<uint16_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
|
||||
break;
|
||||
case 32:
|
||||
vector_op_vix_store<uint32_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
|
||||
break;
|
||||
case 64:
|
||||
vector_op_vix_store<uint64_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
|
||||
break;
|
||||
default:
|
||||
std::cout << "Failed to execute VSE for vsew: " << vsew << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DT>
|
||||
void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
|
||||
uint32_t vsew = sizeof(DT) * 8;
|
||||
uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
|
||||
for (uint32_t i = 0; i < vl * nfields; i++) {
|
||||
if (isMasked(vreg_file, 0, i / nfields, vmask))
|
||||
continue;
|
||||
|
||||
Word offset = 0;
|
||||
switch (iSew) {
|
||||
case 8:
|
||||
offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
|
||||
break;
|
||||
case 16:
|
||||
offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
|
||||
break;
|
||||
case 32:
|
||||
offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
|
||||
break;
|
||||
case 64:
|
||||
offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
|
||||
break;
|
||||
default:
|
||||
std::cout << "Unsupported iSew: " << iSew << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
|
||||
Word mem_addr = base_addr + offset + (i % nfields) * sizeof(DT);
|
||||
Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
|
||||
DP(4, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
|
||||
emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
|
||||
}
|
||||
}
|
||||
|
||||
void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
|
||||
switch (vsew) {
|
||||
case 8:
|
||||
vector_op_vv_store<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
|
||||
break;
|
||||
case 16:
|
||||
vector_op_vv_store<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
|
||||
break;
|
||||
case 32:
|
||||
vector_op_vv_store<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
|
||||
break;
|
||||
case 64:
|
||||
vector_op_vv_store<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
|
||||
break;
|
||||
default:
|
||||
std::cout << "Failed to execute VSUX/VSOX for vsew: " << vsew << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
@ -1345,12 +1224,11 @@ void vector_op_vix(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT second = getVregData<DT>(vreg_file, rsrc0, i);
|
||||
DT third = getVregData<DT>(vreg_file, rdest, i);
|
||||
DT result = OP<DT, DT>::apply(first, second, third);
|
||||
DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
getVregData<DT>(vreg_file, rdest, i) = result;
|
||||
setVregData<DT>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1382,7 +1260,7 @@ void vector_op_vix_carry(DT first, std::vector<std::vector<Byte>> &vreg_file, ui
|
|||
bool third = !isMasked(vreg_file, 0, i, false);
|
||||
DT result = OP<DT, DT>::apply(first, second, third);
|
||||
DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
getVregData<DT>(vreg_file, rdest, i) = result;
|
||||
setVregData<DT>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1414,10 +1292,11 @@ void vector_op_vix_carry_out(DT first, std::vector<std::vector<Byte>> &vreg_file
|
|||
bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
|
||||
bool result = OP<DT, DTR>::apply(first, second, third);
|
||||
DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
auto old_value = getVregData<uint8_t>(vreg_file, rdest, i / 8);
|
||||
if (result) {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value | (1 << (i % 8)));
|
||||
} else {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value & ~(1 << (i % 8)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1448,7 +1327,7 @@ void vector_op_vix_merge(DT first, std::vector<std::vector<Byte>> &vreg_file, ui
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
DT result = isMasked(vreg_file, 0, i, vmask) ? getVregData<DT>(vreg_file, rsrc0, i) : first;
|
||||
DP(4, "Merge - Choosing result: " << +result);
|
||||
getVregData<DT>(vreg_file, rdest, i) = result;
|
||||
setVregData<DT>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1474,23 +1353,23 @@ void vector_op_vix_merge(Word src1, std::vector<std::vector<Byte>> &vreg_file, u
|
|||
}
|
||||
|
||||
template <typename DT>
|
||||
void vector_op_scalar(DT &dest, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t vsew) {
|
||||
void vector_op_scalar(DT *dest, const std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t vsew) {
|
||||
if (rsrc0 != 0) {
|
||||
std::cout << "Vwxunary0/Vwfunary0 has unsupported value for vs2: " << rsrc0 << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
switch (vsew) {
|
||||
case 8:
|
||||
dest = getVregData<uint8_t>(vreg_file, rsrc1, 0);
|
||||
*dest = getVregData<uint8_t>(vreg_file, rsrc1, 0);
|
||||
break;
|
||||
case 16:
|
||||
dest = getVregData<uint16_t>(vreg_file, rsrc1, 0);
|
||||
*dest = getVregData<uint16_t>(vreg_file, rsrc1, 0);
|
||||
break;
|
||||
case 32:
|
||||
dest = getVregData<uint32_t>(vreg_file, rsrc1, 0);
|
||||
*dest = getVregData<uint32_t>(vreg_file, rsrc1, 0);
|
||||
break;
|
||||
case 64:
|
||||
dest = getVregData<uint64_t>(vreg_file, rsrc1, 0);
|
||||
*dest = getVregData<uint64_t>(vreg_file, rsrc1, 0);
|
||||
break;
|
||||
default:
|
||||
std::cout << "Failed to execute vmv.x.s/vfmv.f.s for vsew: " << vsew << std::endl;
|
||||
|
@ -1503,12 +1382,11 @@ void vector_op_vix_w(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT second = getVregData<DT>(vreg_file, rsrc0, i);
|
||||
DTR third = getVregData<DTR>(vreg_file, rdest, i);
|
||||
DTR result = OP<DT, DTR>::apply(first, second, third);
|
||||
DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
getVregData<DTR>(vreg_file, rdest, i) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1553,11 +1431,10 @@ void vector_op_vix_n(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT second = getVregData<DT>(vreg_file, rsrc0, i);
|
||||
DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
|
||||
DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
|
||||
getVregData<DTR>(vreg_file, rdest, i) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1584,11 +1461,10 @@ void vector_op_vix_sat(DTR first, std::vector<std::vector<Byte>> &vreg_file, uin
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT second = getVregData<DTR>(vreg_file, rsrc0, i);
|
||||
DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
|
||||
DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
|
||||
getVregData<DTR>(vreg_file, rdest, i) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1701,14 +1577,14 @@ void vector_op_vix_mask(DT first, std::vector<std::vector<Byte>> &vreg_file, uin
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT second = getVregData<DT>(vreg_file, rsrc0, i);
|
||||
bool result = OP<DT, bool>::apply(first, second, 0);
|
||||
DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
|
||||
auto old_value = getVregData<uint8_t>(vreg_file, rdest, i / 8);
|
||||
if (result) {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value | (1 << (i % 8)));
|
||||
} else {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value & ~(1 << (i % 8)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1743,7 +1619,7 @@ void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file,
|
|||
// so first is our scalar value and we need to overwrite it with 1 for later computations
|
||||
if (scalar && vl && !isMasked(vreg_file, 0, scalarPos, vmask)) {
|
||||
DP(4, "Slide - Moving scalar value " << +first << " to position " << +scalarPos);
|
||||
getVregData<DT>(vreg_file, rdest, scalarPos) = first;
|
||||
setVregData<DT>(vreg_file, rdest, scalarPos, first);
|
||||
}
|
||||
first = scalar ? 1 : first;
|
||||
|
||||
|
@ -1754,7 +1630,7 @@ void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file,
|
|||
__uint128_t iSrc = slideDown ? (__uint128_t)i + (__uint128_t)first : (__uint128_t)i - (__uint128_t)first; // prevent overflows/underflows
|
||||
DT value = (!slideDown || iSrc < vlmax) ? getVregData<DT>(vreg_file, rsrc0, iSrc) : 0;
|
||||
DP(4, "Slide - Moving value " << +value << " from position " << (uint64_t)iSrc << " to position " << +i);
|
||||
getVregData<DT>(vreg_file, rdest, i) = value;
|
||||
setVregData<DT>(vreg_file, rdest, i, value);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1784,10 +1660,9 @@ void vector_op_vix_gather(Word first, std::vector<std::vector<Byte>> &vreg_file,
|
|||
for (Word i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc0, first) : 0;
|
||||
DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
|
||||
getVregData<DT>(vreg_file, rdest, i) = value;
|
||||
setVregData<DT>(vreg_file, rdest, i, value);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1817,13 +1692,12 @@ void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uin
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT first = getVregData<DT>(vreg_file, rsrc0, i);
|
||||
DT second = getVregData<DT>(vreg_file, rsrc1, i);
|
||||
DT third = getVregData<DT>(vreg_file, rdest, i);
|
||||
DT result = OP<DT, DT>::apply(first, second, third);
|
||||
DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
getVregData<DT>(vreg_file, rdest, i) = result;
|
||||
setVregData<DT>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1856,7 +1730,7 @@ void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc
|
|||
bool third = !isMasked(vreg_file, 0, i, false);
|
||||
DT result = OP<DT, DT>::apply(first, second, third);
|
||||
DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
getVregData<DT>(vreg_file, rdest, i) = result;
|
||||
setVregData<DT>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1889,10 +1763,11 @@ void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t
|
|||
bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
|
||||
bool result = OP<DT, DTR>::apply(first, second, third);
|
||||
DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
auto old_value = getVregData<uint8_t>(vreg_file, rdest, i / 8);
|
||||
if (result) {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value | (1 << (i % 8)));
|
||||
} else {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value & ~(1 << (i % 8)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1924,7 +1799,7 @@ void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc
|
|||
uint32_t rsrc = isMasked(vreg_file, 0, i, vmask) ? rsrc1 : rsrc0;
|
||||
DT result = getVregData<DT>(vreg_file, rsrc, i);
|
||||
DP(4, "Merge - Choosing result: " << +result);
|
||||
getVregData<DT>(vreg_file, rdest, i) = result;
|
||||
setVregData<DT>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1954,11 +1829,10 @@ void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsr
|
|||
for (Word i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
uint32_t first = ei16 ? getVregData<uint16_t>(vreg_file, rsrc0, i) : getVregData<DT>(vreg_file, rsrc0, i);
|
||||
DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc1, first) : 0;
|
||||
DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
|
||||
getVregData<DT>(vreg_file, rdest, i) = value;
|
||||
setVregData<DT>(vreg_file, rdest, i, value);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1988,13 +1862,12 @@ void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, u
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT first = getVregData<DT>(vreg_file, rsrc0, i);
|
||||
DT second = getVregData<DT>(vreg_file, rsrc1, i);
|
||||
DTR third = getVregData<DTR>(vreg_file, rdest, i);
|
||||
DTR result = OP<DT, DTR>::apply(first, second, third);
|
||||
DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
getVregData<DTR>(vreg_file, rdest, i) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2021,13 +1894,12 @@ void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT first = getVregData<DT>(vreg_file, rsrc0, i);
|
||||
DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
|
||||
DTR third = getVregData<DTR>(vreg_file, rdest, i);
|
||||
DTR result = OP<DTR, DTR>::apply(first, second, third);
|
||||
DP(4, "Widening wv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
getVregData<DTR>(vreg_file, rdest, i) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2054,13 +1926,12 @@ void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT first = getVregData<DT>(vreg_file, rsrc0, i);
|
||||
DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
|
||||
DTR third = getVregData<DTR>(vreg_file, rdest, i);
|
||||
DTR result = OP<DTR, DTR>::apply(rv_ftod(first), second, third);
|
||||
DP(4, "Widening wfv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
|
||||
getVregData<DTR>(vreg_file, rdest, i) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2079,12 +1950,11 @@ void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, u
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DTR first = getVregData<DTR>(vreg_file, rsrc0, i);
|
||||
DT second = getVregData<DT>(vreg_file, rsrc1, i);
|
||||
DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
|
||||
DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
|
||||
getVregData<DTR>(vreg_file, rdest, i) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2111,12 +1981,11 @@ void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT first = getVregData<DTR>(vreg_file, rsrc0, i);
|
||||
DT second = getVregData<DTR>(vreg_file, rsrc1, i);
|
||||
DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
|
||||
DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
|
||||
getVregData<DTR>(vreg_file, rdest, i) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, i, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2167,16 +2036,15 @@ void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
// use rdest as accumulator
|
||||
if (i == 0) {
|
||||
getVregData<DT>(vreg_file, rdest, 0) = getVregData<DT>(vreg_file, rsrc0, 0);
|
||||
setVregData<DT>(vreg_file, rdest, 0, getVregData<DT>(vreg_file, rsrc0, 0));
|
||||
}
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT first = getVregData<DT>(vreg_file, rdest, 0);
|
||||
DT second = getVregData<DT>(vreg_file, rsrc1, i);
|
||||
DT result = OP<DT, DT>::apply(first, second, 0);
|
||||
DP(4, "Reduction " << (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
|
||||
getVregData<DT>(vreg_file, rdest, 0) = result;
|
||||
setVregData<DT>(vreg_file, rdest, 0, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2206,17 +2074,16 @@ void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
// use rdest as accumulator
|
||||
if (i == 0) {
|
||||
getVregData<DTR>(vreg_file, rdest, 0) = getVregData<DTR>(vreg_file, rsrc0, 0);
|
||||
setVregData<DTR>(vreg_file, rdest, 0, getVregData<DTR>(vreg_file, rsrc0, 0));
|
||||
}
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DTR first = getVregData<DTR>(vreg_file, rdest, 0);
|
||||
DT second = getVregData<DT>(vreg_file, rsrc1, i);
|
||||
DTR second_w = std::is_signed<DT>() ? sext((DTR)second, sizeof(DT) * 8) : zext((DTR)second, sizeof(DT) * 8);
|
||||
DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
|
||||
DP(4, "Widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
|
||||
getVregData<DTR>(vreg_file, rdest, 0) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, 0, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2243,17 +2110,16 @@ void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsr
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
// use rdest as accumulator
|
||||
if (i == 0) {
|
||||
getVregData<DTR>(vreg_file, rdest, 0) = getVregData<DTR>(vreg_file, rsrc0, 0);
|
||||
setVregData<DTR>(vreg_file, rdest, 0, getVregData<DTR>(vreg_file, rsrc0, 0));
|
||||
}
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DTR first = getVregData<DTR>(vreg_file, rdest, 0);
|
||||
DT second = getVregData<DT>(vreg_file, rsrc1, i);
|
||||
DTR second_w = rv_ftod(second);
|
||||
DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
|
||||
DP(4, "Float widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
|
||||
getVregData<DTR>(vreg_file, rdest, 0) = result;
|
||||
setVregData<DTR>(vreg_file, rdest, 0, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2272,9 +2138,8 @@ void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, ui
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DP(4, "Element Index = " << +i);
|
||||
getVregData<DT>(vreg_file, rdest, i) = i;
|
||||
setVregData<DT>(vreg_file, rdest, i, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2303,15 +2168,15 @@ void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0
|
|||
for (uint32_t i = 0; i < vl; i++) {
|
||||
if (isMasked(vreg_file, 0, i, vmask))
|
||||
continue;
|
||||
|
||||
DT first = getVregData<DT>(vreg_file, rsrc0, i);
|
||||
DT second = getVregData<DT>(vreg_file, rsrc1, i);
|
||||
bool result = OP<DT, bool>::apply(first, second, 0);
|
||||
DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
|
||||
auto old_value = getVregData<uint8_t>(vreg_file, rdest, i / 8);
|
||||
if (result) {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value | (1 << (i % 8)));
|
||||
} else {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value & ~(1 << (i % 8)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2346,10 +2211,11 @@ void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0
|
|||
bool second = (secondMask >> (i % 8)) & 0x1;
|
||||
bool result = OP<uint8_t, uint8_t>::apply(first, second, 0) & 0x1;
|
||||
DP(4, "Compare mask bits " << (OP<uint8_t, uint8_t>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
|
||||
auto old_value = getVregData<uint8_t>(vreg_file, rdest, i / 8);
|
||||
if (result) {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value | (1 << (i % 8)));
|
||||
} else {
|
||||
getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
|
||||
setVregData<uint8_t>(vreg_file, rdest, i / 8, old_value & ~(1 << (i % 8)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2362,10 +2228,9 @@ void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t r
|
|||
// This instruction is always masked (vmask == 0), but encoded as unmasked (vmask == 1)
|
||||
if (isMasked(vreg_file, rsrc0, i, 0))
|
||||
continue;
|
||||
|
||||
DT value = getVregData<DT>(vreg_file, rsrc1, i);
|
||||
DP(4, "Compression - Moving value " << +value << " from position " << i << " to position " << currPos);
|
||||
getVregData<DT>(vreg_file, rdest, currPos) = value;
|
||||
setVregData<DT>(vreg_file, rdest, currPos, value);
|
||||
currPos++;
|
||||
}
|
||||
}
|
||||
|
@ -2390,4 +2255,5 @@ void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t r
|
|||
std::abort();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
|
@ -31,21 +31,6 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
|
|||
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
|
||||
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
|
||||
|
||||
# Platform parameters
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
|
||||
ifeq ($(XLEN),64)
|
||||
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
|
||||
else
|
||||
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
|
||||
endif
|
||||
endif
|
||||
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
|
||||
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
|
||||
endif
|
||||
|
||||
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
|
||||
|
||||
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
|
||||
|
|
|
@ -17,16 +17,16 @@ module vortex_afu_shim #(
|
|||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
|
||||
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
|
||||
) (
|
||||
// System signals
|
||||
input wire ap_clk,
|
||||
input wire ap_rst_n,
|
||||
|
||||
// AXI4 master interface
|
||||
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
|
||||
// AXI4-Lite slave interface
|
||||
input wire s_axi_ctrl_awvalid,
|
||||
|
@ -61,7 +61,7 @@ module vortex_afu_shim #(
|
|||
.clk (ap_clk),
|
||||
.reset (~ap_rst_n),
|
||||
|
||||
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
|
||||
|
||||
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
|
||||
.s_axi_ctrl_awready (s_axi_ctrl_awready),
|
||||
|
|
|
@ -37,8 +37,6 @@
|
|||
|
||||
#include <iostream>
|
||||
|
||||
#define PLATFORM_MEMORY_DATA_SIZE (PLATFORM_MEMORY_DATA_WIDTH/8)
|
||||
|
||||
#ifndef MEM_CLOCK_RATIO
|
||||
#define MEM_CLOCK_RATIO 1
|
||||
#endif
|
||||
|
@ -61,10 +59,10 @@
|
|||
|
||||
#define CPU_GPU_LATENCY 200
|
||||
|
||||
#if PLATFORM_MEMORY_DATA_WIDTH > 64
|
||||
typedef VlWide<(PLATFORM_MEMORY_DATA_WIDTH/32)> Vl_m_data_t;
|
||||
#if PLATFORM_MEMORY_DATA_SIZE > 8
|
||||
typedef VlWide<(PLATFORM_MEMORY_DATA_SIZE/4)> Vl_m_data_t;
|
||||
#else
|
||||
#if PLATFORM_MEMORY_DATA_WIDTH > 32
|
||||
#if PLATFORM_MEMORY_DATA_SIZE > 4
|
||||
typedef QData Vl_m_data_t;
|
||||
#else
|
||||
typedef IData Vl_m_data_t;
|
||||
|
@ -130,7 +128,7 @@ public:
|
|||
Impl()
|
||||
: device_(nullptr)
|
||||
, ram_(nullptr)
|
||||
, dram_sim_(MEM_CLOCK_RATIO)
|
||||
, dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO)
|
||||
, stop_(false)
|
||||
#ifdef VCD_OUTPUT
|
||||
, tfp_(nullptr)
|
||||
|
@ -142,7 +140,7 @@ public:
|
|||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
delete mem_alloc_[b];
|
||||
}
|
||||
if (ram_) {
|
||||
|
@ -178,16 +176,16 @@ public:
|
|||
#endif
|
||||
|
||||
// calculate memory bank size
|
||||
mem_bank_size_ = 1ull << PLATFORM_MEMORY_ADDR_WIDTH;
|
||||
mem_bank_size_ = (1ull << PLATFORM_MEMORY_ADDR_WIDTH) / PLATFORM_MEMORY_NUM_BANKS;
|
||||
|
||||
// allocate RAM
|
||||
ram_ = new RAM(0, RAM_PAGE_SIZE);
|
||||
|
||||
// initialize AXI memory interfaces
|
||||
MP_M_AXI_MEM(PLATFORM_MEMORY_BANKS);
|
||||
MP_M_AXI_MEM(PLATFORM_MEMORY_NUM_BANKS);
|
||||
|
||||
// initialize memory allocator
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
mem_alloc_[b] = new MemoryAllocator(0, mem_bank_size_, 4096, 64);
|
||||
}
|
||||
|
||||
|
@ -209,13 +207,13 @@ public:
|
|||
}
|
||||
|
||||
int mem_alloc(uint64_t size, uint32_t bank_id, uint64_t* addr) {
|
||||
if (bank_id >= PLATFORM_MEMORY_BANKS)
|
||||
if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
|
||||
return -1;
|
||||
return mem_alloc_[bank_id]->allocate(size, addr);
|
||||
}
|
||||
|
||||
int mem_free(uint32_t bank_id, uint64_t addr) {
|
||||
if (bank_id >= PLATFORM_MEMORY_BANKS)
|
||||
if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
|
||||
return -1;
|
||||
return mem_alloc_[bank_id]->release(addr);
|
||||
}
|
||||
|
@ -223,7 +221,7 @@ public:
|
|||
int mem_write(uint32_t bank_id, uint64_t addr, uint64_t size, const void* data) {
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
|
||||
if (bank_id >= PLATFORM_MEMORY_BANKS)
|
||||
if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
|
||||
return -1;
|
||||
uint64_t base_addr = bank_id * mem_bank_size_ + addr;
|
||||
ram_->write(data, base_addr, size);
|
||||
|
@ -238,7 +236,7 @@ public:
|
|||
int mem_read(uint32_t bank_id, uint64_t addr, uint64_t size, void* data) {
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
|
||||
if (bank_id >= PLATFORM_MEMORY_BANKS)
|
||||
if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
|
||||
return -1;
|
||||
uint64_t base_addr = bank_id * mem_bank_size_ + addr;
|
||||
ram_->read(data, base_addr, size);
|
||||
|
@ -321,7 +319,7 @@ private:
|
|||
reqs.clear();
|
||||
}
|
||||
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
std::queue<mem_req_t*> empty;
|
||||
std::swap(dram_queues_[b], empty);
|
||||
}
|
||||
|
@ -338,7 +336,7 @@ private:
|
|||
device_->ap_rst_n = 1;
|
||||
|
||||
// this AXI device is always ready to accept new requests
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
*m_axi_mem_[b].arready = 1;
|
||||
*m_axi_mem_[b].awready = 1;
|
||||
*m_axi_mem_[b].wready = 1;
|
||||
|
@ -358,19 +356,18 @@ private:
|
|||
|
||||
dram_sim_.tick();
|
||||
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
if (!dram_queues_[b].empty()) {
|
||||
auto mem_req = dram_queues_[b].front();
|
||||
if (dram_sim_.send_request(mem_req->write, mem_req->addr, b, [](void* arg) {
|
||||
dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) {
|
||||
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
|
||||
if (orig_req->ready) {
|
||||
delete orig_req;
|
||||
} else {
|
||||
orig_req->ready = true;
|
||||
}
|
||||
}, mem_req)) {
|
||||
dram_queues_[b].pop();
|
||||
}
|
||||
}, mem_req);
|
||||
dram_queues_[b].pop();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -411,7 +408,7 @@ private:
|
|||
}
|
||||
|
||||
void axi_mem_bus_reset() {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
// read request address
|
||||
*m_axi_mem_[b].arready = 0;
|
||||
|
||||
|
@ -435,14 +432,14 @@ private:
|
|||
|
||||
void axi_mem_bus_eval(bool clk) {
|
||||
if (!clk) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
m_axi_states_[b].read_rsp_ready = *m_axi_mem_[b].rready;
|
||||
m_axi_states_[b].write_rsp_ready = *m_axi_mem_[b].bready;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
|
||||
// handle read responses
|
||||
if (*m_axi_mem_[b].rvalid && m_axi_states_[b].read_rsp_ready) {
|
||||
*m_axi_mem_[b].rvalid = 0;
|
||||
|
@ -493,7 +490,7 @@ private:
|
|||
|
||||
/*printf("%0ld: [sim] axi-mem-read[%d]: addr=0x%lx, tag=0x%x, data=0x", timestamp, b, mem_req->addr, mem_req->tag);
|
||||
for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) {
|
||||
printf("%02x", mem_req->data[b]);
|
||||
printf("%02x", mem_req->data[i]);
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
|
@ -536,7 +533,7 @@ private:
|
|||
|
||||
/*printf("%0ld: [sim] axi-mem-write[%d]: addr=0x%lx, byteen=0x%lx, tag=0x%x, data=0x", timestamp, b, mem_req->addr, byteen, mem_req->tag);
|
||||
for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) {
|
||||
printf("%02x", m_axi_states_[b].write_req_data[i]]);
|
||||
printf("%02x", m_axi_states_[b].write_req_data[i]);
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
|
@ -607,15 +604,15 @@ private:
|
|||
|
||||
std::mutex mutex_;
|
||||
|
||||
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS];
|
||||
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS];
|
||||
|
||||
m_axi_mem_t m_axi_mem_[PLATFORM_MEMORY_BANKS];
|
||||
m_axi_mem_t m_axi_mem_[PLATFORM_MEMORY_NUM_BANKS];
|
||||
|
||||
MemoryAllocator* mem_alloc_[PLATFORM_MEMORY_BANKS];
|
||||
MemoryAllocator* mem_alloc_[PLATFORM_MEMORY_NUM_BANKS];
|
||||
|
||||
m_axi_state_t m_axi_states_[PLATFORM_MEMORY_BANKS];
|
||||
m_axi_state_t m_axi_states_[PLATFORM_MEMORY_NUM_BANKS];
|
||||
|
||||
std::queue<mem_req_t*> dram_queues_[PLATFORM_MEMORY_BANKS];
|
||||
std::queue<mem_req_t*> dram_queues_[PLATFORM_MEMORY_NUM_BANKS];
|
||||
|
||||
#ifdef VCD_OUTPUT
|
||||
VerilatedVcdC* tfp_;
|
||||
|
|
|
@ -1,14 +0,0 @@
|
|||
ROOT_DIR := $(realpath ../../..)
|
||||
include $(ROOT_DIR)/config.mk
|
||||
|
||||
PROJECT := matmul
|
||||
|
||||
SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)
|
||||
|
||||
SRCS := $(SRC_DIR)/main.cpp
|
||||
|
||||
VX_SRCS := $(SRC_DIR)/kernel.cpp
|
||||
|
||||
OPTS ?= -n128 -d1
|
||||
|
||||
include ../common.mk
|
|
@ -1,17 +0,0 @@
|
|||
#ifndef _COMMON_H_
|
||||
#define _COMMON_H_
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_tasks;
|
||||
uint32_t num_warps;
|
||||
uint32_t num_threads;
|
||||
uint32_t TC_per_warp;
|
||||
uint32_t matrix_size;
|
||||
uint32_t data_size;
|
||||
uint64_t tc_size;
|
||||
uint64_t src0_addr;
|
||||
uint64_t src1_addr;
|
||||
uint64_t dst_addr;
|
||||
} kernel_arg_t;
|
||||
|
||||
#endif
|
|
@ -1,127 +0,0 @@
|
|||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
uint32_t task_id = blockIdx.x;
|
||||
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
|
||||
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
|
||||
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
|
||||
uint64_t a_addr = reinterpret_cast<uint64_t>(src0_ptr);
|
||||
uint64_t b_addr = reinterpret_cast<uint64_t>(src1_ptr);
|
||||
uint64_t c_addr = reinterpret_cast<uint64_t>(dst_ptr);
|
||||
|
||||
uint32_t tc_size = arg->tc_size;
|
||||
uint32_t TC_per_warp = arg->TC_per_warp;
|
||||
unsigned num_threads = arg->num_threads;
|
||||
int num_warps = arg->num_warps;
|
||||
uint32_t matrix_size = arg->matrix_size;
|
||||
|
||||
int n_tiles = matrix_size/tc_size;
|
||||
int num_output_tiles = (matrix_size*matrix_size)/(tc_size*tc_size);
|
||||
|
||||
int num_tasks = arg->num_tasks;
|
||||
|
||||
//Assuming matrix size always > tensor core size
|
||||
int warps_actual;
|
||||
if (TC_per_warp > num_output_tiles)
|
||||
warps_actual = 1;
|
||||
else
|
||||
warps_actual = num_output_tiles/TC_per_warp;
|
||||
|
||||
int num_warps_actual = (warps_actual < num_warps)? warps_actual: num_warps;
|
||||
int num_threads_per_tc = (1> num_threads/TC_per_warp)? 1: num_threads/TC_per_warp;
|
||||
|
||||
int num_tasks_per_thread = (1> (num_tasks/(num_threads*num_warps_actual)))? 1: (num_tasks/(num_threads*num_warps_actual));
|
||||
int num_tasks_per_warp = (1 > num_tasks/num_warps_actual)? 1:num_tasks/num_warps_actual;
|
||||
int task_id_first_warp = task_id%num_tasks_per_warp;
|
||||
|
||||
//A&B
|
||||
int num_data_per_op_tile = tc_size*tc_size*n_tiles;
|
||||
int num_data_per_warp = num_data_per_op_tile*((1> (num_output_tiles/num_warps_actual))?1:(num_output_tiles/num_warps_actual));
|
||||
|
||||
int addr_shift;
|
||||
if (((tc_size*tc_size*n_tiles)/(num_threads)) > 1)
|
||||
addr_shift = (tc_size*tc_size*n_tiles)/(num_threads);
|
||||
else
|
||||
addr_shift = 1;
|
||||
//Offset for 1st warp
|
||||
int offset = ((task_id_first_warp/num_tasks_per_thread)*addr_shift) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile);
|
||||
offset = offset + (num_data_per_warp*(task_id/num_tasks_per_warp));
|
||||
|
||||
//C
|
||||
int num_data_per_op_tile_c = tc_size*tc_size;
|
||||
int num_data_per_warp_c = num_data_per_warp/n_tiles;
|
||||
|
||||
int addr_shift_c;
|
||||
if (((tc_size*tc_size)/(num_threads)) > 1)
|
||||
addr_shift_c = tc_size;
|
||||
else
|
||||
addr_shift_c = 1;
|
||||
//Offset for 1st warp
|
||||
int offset_c = ((task_id_first_warp/num_tasks_per_thread)*addr_shift_c) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile_c);
|
||||
offset_c = offset_c + (num_data_per_warp_c*(task_id/num_tasks_per_warp));
|
||||
|
||||
int thread_limit = (num_threads < tc_size*tc_size*n_tiles*TC_per_warp)? num_threads : tc_size*tc_size*n_tiles*TC_per_warp;
|
||||
int thread_limit_c = (num_threads<tc_size*tc_size)? num_threads:tc_size*tc_size;
|
||||
|
||||
//OLD TASK DISTRIBUTION // For 8x8 matrix, 2x2 tc_size, 1 tc_num, 4threads, 2warps => 64 tasks => 32 tasks/warp => 8 tasks/thread
|
||||
/*task0->thread0, warp0
|
||||
task1->thread0 , warp0
|
||||
task2->thread0 , warp0
|
||||
.
|
||||
task7->thread0
|
||||
task8->thread1
|
||||
task9->thread1
|
||||
.
|
||||
.
|
||||
------
|
||||
task32 -> thread0, warp1
|
||||
task33 -> thread1, warp1
|
||||
.
|
||||
*/
|
||||
|
||||
//NEW TASK DISTRIBUTION // For 8x8 matrix, 2x2 tc_size, 1 tc_num, 4threads, 2warps => 64 tasks => 32 tasks/warp => 8 tasks/thread
|
||||
/*task0->thread0, warp0
|
||||
task1->thread1 , warp0
|
||||
task2->thread2 , warp0
|
||||
task3->thread3 ,...
|
||||
task4->thread0
|
||||
task5->thread1
|
||||
.
|
||||
.
|
||||
------
|
||||
task32 -> thread0, warp1
|
||||
task33 -> thread1, warp1
|
||||
.
|
||||
.*/
|
||||
|
||||
//TODO :: change this for new task->thread distribution
|
||||
if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit)
|
||||
{
|
||||
uint64_t a_addr_base = a_addr + offset*arg->data_size;
|
||||
uint64_t b_addr_base = b_addr + offset*arg->data_size;
|
||||
uint64_t c_addr_base = c_addr + offset_c*arg->data_size;
|
||||
csr_write(VX_MAT_MUL_SIZE,n_tiles);
|
||||
csr_write(VX_TC_NUM,TC_per_warp);
|
||||
csr_write(VX_TC_SIZE,tc_size);
|
||||
|
||||
vx_matrix_load (0, a_addr_base);
|
||||
vx_matrix_load (1, b_addr_base);
|
||||
//In case of multiple threads - sync load
|
||||
vx_fence();
|
||||
|
||||
vx_matrix_mul(); //Assuming padding to ensure matrix size is a multiple of tc_size
|
||||
vx_fence();
|
||||
if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c)
|
||||
vx_matrix_store(c_addr_base);
|
||||
//In case of multiple threads - sync store
|
||||
vx_fence();
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
|
@ -1,348 +0,0 @@
|
|||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <vortex.h>
|
||||
#include <cmath>
|
||||
#include "common.h"
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
do { \
|
||||
int _ret = _expr; \
|
||||
if (0 == _ret) \
|
||||
break; \
|
||||
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
const char* kernel_file = "kernel.vxbin";
|
||||
uint32_t matrix_size = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
vx_buffer_h A_buffer = nullptr;
|
||||
vx_buffer_h B_buffer = nullptr;
|
||||
vx_buffer_h C_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
|
||||
std::vector<uint8_t> staging_buf;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Vortex Test." << std::endl;
|
||||
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv, uint32_t &data_size) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
matrix_size = atoi(optarg);
|
||||
break;
|
||||
case 'k':
|
||||
kernel_file = optarg;
|
||||
break;
|
||||
case 'd':
|
||||
data_size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(A_buffer);
|
||||
vx_mem_free(B_buffer);
|
||||
vx_mem_free(C_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TYPE>
|
||||
class mainVariables
|
||||
{
|
||||
public:
|
||||
// Constructor
|
||||
mainVariables(uint32_t bufSize, uint32_t dataSize, uint32_t matrixSize)
|
||||
: buf_size(bufSize), data_size(dataSize), matrix_size(matrixSize)
|
||||
{
|
||||
// Resize vectors to specified sizes
|
||||
src_A.resize(buf_size/data_size);
|
||||
src_B.resize(buf_size/data_size);
|
||||
refs.resize(buf_size/data_size);
|
||||
}
|
||||
|
||||
void init_inputs ()
|
||||
{
|
||||
std::cout << "inside init" << std::endl;
|
||||
for (uint32_t i = 0; i < matrix_size*matrix_size; ++i)
|
||||
{
|
||||
auto a = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
auto b = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
src_A[i] = static_cast<TYPE>(a * matrix_size);
|
||||
src_B[i] = static_cast<TYPE>(b * matrix_size);
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_cpu()
|
||||
{
|
||||
for (uint32_t row = 0; row < matrix_size; ++row)
|
||||
{
|
||||
for (uint32_t col = 0; col < matrix_size; ++col)
|
||||
{
|
||||
TYPE sum(0);
|
||||
for (uint32_t e = 0; e < matrix_size; ++e) {
|
||||
sum += src_A[row * matrix_size + e] * src_B[e * matrix_size + col];
|
||||
}
|
||||
refs[row * matrix_size + col] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Public variables
|
||||
std::vector<TYPE> src_A;
|
||||
std::vector<TYPE> src_B;
|
||||
std::vector<TYPE> refs;
|
||||
|
||||
std::vector<uint8_t> A_mat;
|
||||
std::vector<uint8_t> B_mat;
|
||||
|
||||
private:
|
||||
uint32_t buf_size;
|
||||
uint32_t data_size;
|
||||
uint32_t matrix_size;
|
||||
};
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
uint32_t data_size = 0;
|
||||
parse_args(argc, argv, data_size);
|
||||
if (matrix_size == 0) {
|
||||
matrix_size = 2;
|
||||
}
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
uint64_t tc_size, TC_per_warp;
|
||||
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
//Add assert/knob
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
|
||||
|
||||
std::cout << "Debug :: tc_size = " << tc_size << std::endl;
|
||||
std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;
|
||||
|
||||
int threads_per_tc;
|
||||
//TODO - can be changed
|
||||
//Number of output tiles * number of threads
|
||||
if (TC_per_warp > num_threads)
|
||||
threads_per_tc = 1;
|
||||
else
|
||||
threads_per_tc = num_threads/TC_per_warp;
|
||||
|
||||
uint32_t num_tasks = ((matrix_size*matrix_size)/(tc_size*tc_size))*threads_per_tc;
|
||||
|
||||
//size of each operand
|
||||
uint32_t buf_size = ((matrix_size*matrix_size)/(tc_size*tc_size))*(matrix_size/(tc_size))*(tc_size*tc_size)*data_size;
|
||||
|
||||
//256
|
||||
std::cout << "Debug :: buf_size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer));
|
||||
RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &B_buffer));
|
||||
RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &C_buffer));
|
||||
RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "A_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "B_addr=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "C_addr=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
mainVariables<int> variables (buf_size, data_size, matrix_size);
|
||||
variables.init_inputs();
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// generate source data
|
||||
//////////////////////////////////////////////////
|
||||
variables.matmul_cpu();
|
||||
|
||||
uint32_t tc_size_f = tc_size*tc_size;
|
||||
uint32_t n_tiles = matrix_size/tc_size;
|
||||
|
||||
variables.A_mat.resize(buf_size);
|
||||
variables.B_mat.resize(buf_size);
|
||||
|
||||
//Demand matrix creation for A / traverse through the rows
|
||||
for(uint32_t k=0; k<n_tiles; k++)
|
||||
{
|
||||
//traverse through output tiles in a row
|
||||
for(uint32_t i=0; i<n_tiles; i++)
|
||||
{
|
||||
//traverse through tiles for one output tile
|
||||
for(uint32_t j=0; j< n_tiles; j++)
|
||||
{
|
||||
for(int t=0; t < tc_size*tc_size; t++)
|
||||
{
|
||||
variables.A_mat[n_tiles*n_tiles*tc_size_f*k + n_tiles*tc_size_f*i+tc_size_f*j + t] = variables.src_A[k*tc_size*matrix_size+ tc_size*j +(t/tc_size)*matrix_size + t%tc_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Demand matrix creation for B / traverse through the rows
|
||||
for(uint32_t k=0; k<n_tiles; k++)
|
||||
{
|
||||
//traverse through output tiles in a row
|
||||
for(uint32_t i=0; i<n_tiles; i++)
|
||||
{
|
||||
//traverse through tiles for one output tile
|
||||
for(uint32_t j=0; j< n_tiles; j++)
|
||||
{
|
||||
for(int t=0; t < tc_size*tc_size; t++)
|
||||
{
|
||||
variables.B_mat[n_tiles*n_tiles*tc_size_f*k + n_tiles*tc_size_f*i+tc_size_f*j + t] = variables.src_B[i*tc_size+ tc_size*matrix_size*j +(t/tc_size)*matrix_size + t%tc_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// upload matrix A buffer
|
||||
{
|
||||
std::cout << "upload matrix A buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(A_buffer, (int8_t*)variables.A_mat.data(), 0, buf_size));
|
||||
}
|
||||
|
||||
// upload matrix B buffer
|
||||
{
|
||||
std::cout << "upload matrix B buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(B_buffer, (int8_t*)variables.B_mat.data(), 0, buf_size));
|
||||
}
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
//Prep kernel arguments
|
||||
//////////////////////////////////////////////////
|
||||
//1
|
||||
std::cout << "Debug :: num_tasks = " << num_tasks << std::endl;
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.num_warps = num_warps;
|
||||
kernel_arg.num_threads = num_threads;
|
||||
kernel_arg.TC_per_warp = TC_per_warp;
|
||||
//1
|
||||
kernel_arg.matrix_size = matrix_size;
|
||||
kernel_arg.data_size = data_size;
|
||||
kernel_arg.tc_size = tc_size;
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev((int8_t*)variables.B_mat.data(), C_buffer, 0, buf_size));
|
||||
|
||||
// verify result (TODO : needs to be fixed for for functional correctness)
|
||||
/*
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int8_t*)staging_buf.data();
|
||||
uint64_t tc_size = kernel_arg.tc_size;
|
||||
std::cout << "tc_size = " << tc_size << std::endl;
|
||||
int Result[matrix_size*matrix_size];
|
||||
int n_tiles = (matrix_size/tc_size);
|
||||
int tc_size_f = tc_size*tc_size;
|
||||
|
||||
//converting buf ptr (tile by tile) to CPU style linear (row by row)
|
||||
for(int k = 0; k < matrix_size/tc_size; k+= 1)
|
||||
{
|
||||
for(int j = 0; j < matrix_size; j+= tc_size)
|
||||
{
|
||||
for(int i =0; i < tc_size*tc_size; i++)
|
||||
{
|
||||
Result[ tc_size*matrix_size*k +j+ (i/tc_size)*matrix_size +i%(tc_size)] = buf_ptr[matrix_size*tc_size*k+tc_size*j+i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < matrix_size*matrix_size; ++i) {
|
||||
//int ref = i + i;
|
||||
int cur = Result[i];
|
||||
if (cur != refs[i]) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "CONDITIONALLY PASSED!" << std::endl;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,26 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# README:
|
||||
# This script launches a sweep of TC_SIZE, TC_NUM and MATRIX SIZES
|
||||
# default values of NUM_WARPS=32, NUM_THREADS=32, NUM_CORES=4, DATA_SIZE=1
|
||||
# Edit matrix_sizes, tcsizes & tcnums variables to vary the sweep limits
|
||||
|
||||
# Define arrays for tc_size,tc_num and matrix sizes
|
||||
matrix_sizes=(16 32 64 128 256 512)
|
||||
tcsizes=(8 16 32)
|
||||
tcnums=(4 8 16 32)
|
||||
|
||||
cd ../../../build/
|
||||
|
||||
# Loop through each combination of above configs
|
||||
for size in "${matrix_sizes[@]}"; do
|
||||
for tcsize in "${tcsizes[@]}"; do
|
||||
for tcnum in "${tcnums[@]}"; do
|
||||
mkdir -p sim_final/mat${size}
|
||||
log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
|
||||
cmd="CONFIGS=\"-DTC_NUM=${tcnum} -DTC_SIZE=${tcsize}\" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args=\"-n${size} -d1\" --rebuild=1 --perf=1 > ${log_name} 2>&1"
|
||||
echo $cmd
|
||||
eval $cmd
|
||||
done
|
||||
done
|
||||
done
|
Loading…
Add table
Add a link
Reference in a new issue