mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
platform-independent source tree refactoring
This commit is contained in:
parent
92fd9d16ac
commit
c613985de4
202 changed files with 9799 additions and 103942 deletions
6
Makefile
6
Makefile
|
@ -2,14 +2,14 @@ all:
|
|||
$(MAKE) -C third_party
|
||||
$(MAKE) -C hw
|
||||
$(MAKE) -C sim
|
||||
$(MAKE) -C driver
|
||||
$(MAKE) -C kernel
|
||||
$(MAKE) -C runtime
|
||||
$(MAKE) -C tests
|
||||
|
||||
clean:
|
||||
$(MAKE) -C hw clean
|
||||
$(MAKE) -C sim clean
|
||||
$(MAKE) -C driver clean
|
||||
$(MAKE) -C kernel clean
|
||||
$(MAKE) -C runtime clean
|
||||
$(MAKE) -C tests clean
|
||||
|
||||
|
@ -17,6 +17,6 @@ clean-all:
|
|||
$(MAKE) -C third_party clean
|
||||
$(MAKE) -C hw clean
|
||||
$(MAKE) -C sim clean
|
||||
$(MAKE) -C driver clean
|
||||
$(MAKE) -C kernel clean
|
||||
$(MAKE) -C runtime clean
|
||||
$(MAKE) -C tests clean-all
|
|
@ -91,19 +91,19 @@ done
|
|||
|
||||
case $DRIVER in
|
||||
rtlsim)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/rtlsim
|
||||
DRIVER_PATH=$VORTEX_HOME/runtime/rtlsim
|
||||
;;
|
||||
vlsim)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/vlsim
|
||||
DRIVER_PATH=$VORTEX_HOME/runtime/vlsim
|
||||
;;
|
||||
asesim)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/asesim
|
||||
DRIVER_PATH=$VORTEX_HOME/runtime/asesim
|
||||
;;
|
||||
fpga)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/fpga
|
||||
DRIVER_PATH=$VORTEX_HOME/runtime/fpga
|
||||
;;
|
||||
simx)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/simx
|
||||
DRIVER_PATH=$VORTEX_HOME/runtime/simx
|
||||
;;
|
||||
*)
|
||||
echo "invalid driver: $DRIVER"
|
||||
|
@ -149,7 +149,7 @@ status=0
|
|||
make -C hw config
|
||||
|
||||
# ensure the stub driver is present
|
||||
make -C $VORTEX_HOME/driver/stub
|
||||
make -C $VORTEX_HOME/runtime/stub
|
||||
|
||||
if [ $DEBUG -ne 0 ]
|
||||
then
|
||||
|
|
|
@ -9,17 +9,17 @@ make -s
|
|||
# clear POCL cache
|
||||
rm -rf ~/.cache/pocl
|
||||
|
||||
# rebuild runtime
|
||||
# rebuild runtime library
|
||||
make -C runtime clean
|
||||
make -C runtime
|
||||
|
||||
# rebuild drivers
|
||||
make -C driver clean
|
||||
make -C driver
|
||||
# rebuild kernel library
|
||||
make -C kernel clean
|
||||
make -C kernel
|
||||
|
||||
# rebuild runtime tests
|
||||
make -C tests/runtime clean
|
||||
make -C tests/runtime
|
||||
# rebuild kernel tests
|
||||
make -C tests/kernel clean
|
||||
make -C tests/kernel
|
||||
|
||||
# rebuild regression tests
|
||||
make -C tests/regression clean-all
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
# Dispatcher for the per-backend driver builds.
# The default goal builds stub, rtlsim, simx and vlsim; fpga and asesim are
# built only when named explicitly. `clean` visits all six backends.

all: stub rtlsim simx vlsim

# Every backend goal runs the default goal of the directory of the same name.
stub fpga asesim vlsim rtlsim simx:
	$(MAKE) -C $@

clean:
	$(MAKE) -C stub clean
	$(MAKE) -C fpga clean
	$(MAKE) -C asesim clean
	$(MAKE) -C vlsim clean
	$(MAKE) -C rtlsim clean
	$(MAKE) -C simx clean

.PHONY: all stub fpga asesim vlsim rtlsim simx clean
|
103
hw/afu/xrt/vortex_afu.v
Normal file
103
hw/afu/xrt/vortex_afu.v
Normal file
|
@ -0,0 +1,103 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// vortex_afu: thin top-level wrapper.
// Declares clock/reset, one set of m_axi_* ports and a busy flag, and wires
// every parameter and port one-to-one into a single Vortex_axi instance.
// No logic lives in this module.
module vortex_afu #(
    parameter AXI_DATA_WIDTH     = `VX_MEM_DATA_WIDTH,
    parameter AXI_ADDR_WIDTH     = `VX_MEM_ADDR_WIDTH,
    parameter AXI_TID_WIDTH      = 12,
    parameter AXI_STROBE_WIDTH   = `VX_MEM_BYTEEN_WIDTH,
    parameter AXI_DCR_ADDR_WIDTH = `VX_DCR_ADDR_WIDTH,
    parameter AXI_DCR_DATA_WIDTH = `VX_DCR_DATA_WIDTH
) (
    input  wire                          clk,
    input  wire                          reset,

    // m_axi_aw*: write-address signals
    output wire [AXI_TID_WIDTH-1:0]      m_axi_awid,
    output wire [AXI_ADDR_WIDTH-1:0]     m_axi_awaddr,
    output wire [7:0]                    m_axi_awlen,
    output wire [2:0]                    m_axi_awsize,
    output wire [1:0]                    m_axi_awburst,
    output wire                          m_axi_awlock,
    output wire [3:0]                    m_axi_awcache,
    output wire [2:0]                    m_axi_awprot,
    output wire [3:0]                    m_axi_awqos,
    output wire                          m_axi_awvalid,
    input  wire                          m_axi_awready,

    // m_axi_w*: write-data signals
    output wire [AXI_DATA_WIDTH-1:0]     m_axi_wdata,
    output wire [AXI_STROBE_WIDTH-1:0]   m_axi_wstrb,
    output wire                          m_axi_wlast,
    output wire                          m_axi_wvalid,
    input  wire                          m_axi_wready,

    // m_axi_b*: write-response signals
    input  wire [AXI_TID_WIDTH-1:0]      m_axi_bid,
    input  wire [1:0]                    m_axi_bresp,
    input  wire                          m_axi_bvalid,
    output wire                          m_axi_bready,

    // m_axi_ar*: read-address signals
    output wire [AXI_TID_WIDTH-1:0]      m_axi_arid,
    output wire [AXI_ADDR_WIDTH-1:0]     m_axi_araddr,
    output wire [7:0]                    m_axi_arlen,
    output wire [2:0]                    m_axi_arsize,
    output wire [1:0]                    m_axi_arburst,
    output wire                          m_axi_arlock,
    output wire [3:0]                    m_axi_arcache,
    output wire [2:0]                    m_axi_arprot,
    output wire [3:0]                    m_axi_arqos,
    output wire                          m_axi_arvalid,
    input  wire                          m_axi_arready,

    // m_axi_r*: read-data signals
    input  wire [AXI_TID_WIDTH-1:0]      m_axi_rid,
    input  wire [AXI_DATA_WIDTH-1:0]     m_axi_rdata,
    input  wire [1:0]                    m_axi_rresp,
    input  wire                          m_axi_rlast,
    input  wire                          m_axi_rvalid,
    output wire                          m_axi_rready,

    output wire                          busy
);

    // Single wrapped instance; all connections are by name and unmodified.
    Vortex_axi #(
        .AXI_DATA_WIDTH     (AXI_DATA_WIDTH),
        .AXI_ADDR_WIDTH     (AXI_ADDR_WIDTH),
        .AXI_TID_WIDTH      (AXI_TID_WIDTH),
        .AXI_STROBE_WIDTH   (AXI_STROBE_WIDTH),
        .AXI_DCR_ADDR_WIDTH (AXI_DCR_ADDR_WIDTH),
        .AXI_DCR_DATA_WIDTH (AXI_DCR_DATA_WIDTH)
    ) inst (
        .clk            (clk),
        .reset          (reset),

        .m_axi_awid     (m_axi_awid),
        .m_axi_awaddr   (m_axi_awaddr),
        .m_axi_awlen    (m_axi_awlen),
        .m_axi_awsize   (m_axi_awsize),
        .m_axi_awburst  (m_axi_awburst),
        .m_axi_awlock   (m_axi_awlock),
        .m_axi_awcache  (m_axi_awcache),
        .m_axi_awprot   (m_axi_awprot),
        .m_axi_awqos    (m_axi_awqos),
        .m_axi_awvalid  (m_axi_awvalid),
        .m_axi_awready  (m_axi_awready),

        .m_axi_wdata    (m_axi_wdata),
        .m_axi_wstrb    (m_axi_wstrb),
        .m_axi_wlast    (m_axi_wlast),
        .m_axi_wvalid   (m_axi_wvalid),
        .m_axi_wready   (m_axi_wready),

        .m_axi_bid      (m_axi_bid),
        .m_axi_bresp    (m_axi_bresp),
        .m_axi_bvalid   (m_axi_bvalid),
        .m_axi_bready   (m_axi_bready),

        .m_axi_arid     (m_axi_arid),
        .m_axi_araddr   (m_axi_araddr),
        .m_axi_arlen    (m_axi_arlen),
        .m_axi_arsize   (m_axi_arsize),
        .m_axi_arburst  (m_axi_arburst),
        .m_axi_arlock   (m_axi_arlock),
        .m_axi_arcache  (m_axi_arcache),
        .m_axi_arprot   (m_axi_arprot),
        .m_axi_arqos    (m_axi_arqos),
        .m_axi_arvalid  (m_axi_arvalid),
        .m_axi_arready  (m_axi_arready),

        .m_axi_rid      (m_axi_rid),
        .m_axi_rdata    (m_axi_rdata),
        .m_axi_rresp    (m_axi_rresp),
        .m_axi_rlast    (m_axi_rlast),
        .m_axi_rvalid   (m_axi_rvalid),
        .m_axi_rready   (m_axi_rready),

        .busy           (busy)
    );

endmodule
|
|
@ -2,6 +2,8 @@ DEVICE_FAMILY ?= arria10
|
|||
BUILD_DIR ?= build
|
||||
RTL_DIR = ../../rtl
|
||||
DPI_DIR = ../../dpi
|
||||
AFU_DIR = ../../afu/opae
|
||||
IP_DIR = ../ip/$(DEVICE_FAMILY)
|
||||
|
||||
ifeq ($(shell which qsub-synth),)
|
||||
RUN_SYNTH=$(OPAE_PLATFORM_ROOT)/bin/run.sh > build.log 2>&1 &
|
||||
|
@ -39,12 +41,12 @@ CONFIG16 := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE $(CONFIGS)
|
|||
CONFIG32 := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE $(CONFIGS)
|
||||
CONFIG64 := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE $(CONFIGS)
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fpu_unit -I$(RTL_DIR)/fpu_unit/altera/$(DEVICE_FAMILY)
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fpu_unit
|
||||
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
|
||||
RASTER_INCLUDE = -I$(RTL_DIR)/raster_unit
|
||||
ROP_INCLUDE = -I$(RTL_DIR)/rop_unit
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/afu $(FPU_INCLUDE)
|
||||
RTL_INCLUDE += $(TEX_INCLUDE) $(RASTER_INCLUDE) $(ROP_INCLUDE)
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(AFU_DIR) -I$(IP_DIR)
|
||||
RTL_INCLUDE += $(FPU_INCLUDE) $(TEX_INCLUDE) $(RASTER_INCLUDE) $(ROP_INCLUDE)
|
||||
|
||||
CFLAGS += $(RTL_INCLUDE)
|
||||
|
|
@ -7,7 +7,7 @@ BUILD_DIR=$1
|
|||
PROGRAM=$(basename "$2")
|
||||
PROGRAM_DIR=`dirname $2`
|
||||
|
||||
VORTEX_DRV_PATH=$SCRIPT_DIR/../../../driver
|
||||
VORTEX_RT_PATH=$SCRIPT_DIR/../../../runtime
|
||||
|
||||
# Export ASE_WORKDIR variable
|
||||
export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work
|
||||
|
@ -35,5 +35,5 @@ done
|
|||
# run application
|
||||
pushd $PROGRAM_DIR
|
||||
echo " [DBG] running ./$PROGRAM $*"
|
||||
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_DRV_PATH/asesim:$LD_LIBRARY_PATH ./$PROGRAM $*
|
||||
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/asesim:$LD_LIBRARY_PATH ./$PROGRAM $*
|
||||
popd
|
|
@ -6,15 +6,15 @@ THIRD_PARTY_DIR = ../../../../../third_party
|
|||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
|
||||
IP_DIR = ../../ip/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
|
||||
#IP_DIR = ../../ip/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
|
@ -6,14 +6,14 @@ THIRD_PARTY_DIR = ../../../../../third_party
|
|||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
|
||||
IP_DIR = ../../ip/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
|
||||
#IP_DIR = ../../ip/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(IP_DIR)
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
# Part, Family
|
|
@ -8,15 +8,15 @@ CONFIGS += -set "EXT_GFX_ENABLE"
|
|||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
|
||||
IP_DIR = ../../ip/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
|
||||
#IP_DIR = ../../ip/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
GFX_INCLUDE = $(RTL_DIR)/tex_unit;$(RTL_DIR)/raster_unit;$(RTL_DIR)/rop_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(GFX_INCLUDE)
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(GFX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
|
@ -6,15 +6,15 @@ THIRD_PARTY_DIR = ../../../../../third_party
|
|||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
|
||||
IP_DIR = ../../ip/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
|
||||
#IP_DIR = ../../ip/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
|
@ -2,15 +2,16 @@ PROJECT = vortex_afu
|
|||
TOP_LEVEL_ENTITY = vortex_afu
|
||||
SRC_FILE = vortex_afu.sv
|
||||
RTL_DIR = ../../../../rtl
|
||||
AFU_DIR = ../../../../afu/opae
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH=$(RTL_DIR)/fpu_unit/altera/arria10
|
||||
IP_DIR = ../../ip/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
|
||||
#IP_DIR = ../../ip/stratix10
|
||||
|
||||
CONFIGS += -set "NOPAE"
|
||||
CONFIGS += -set "EXT_GFX_ENABLE"
|
||||
|
@ -21,9 +22,9 @@ CONFIGS += -set "NUM_CORES=4"
|
|||
#CONFIGS += -set "SM_DISABLE"
|
||||
#CONFIGS += -set "RCACHE_DISABLE" -set "OCACHE_DISABLE" -set "TCACHE_DISABLE"
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit;$(RTL_DIR)/raster_unit;$(RTL_DIR)/rop_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(AFU_DIR);$(AFU_DIR)/ccip;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
|
@ -2,15 +2,16 @@ PROJECT = vortex_afu
|
|||
TOP_LEVEL_ENTITY = vortex_afu
|
||||
SRC_FILE = vortex_afu.sv
|
||||
RTL_DIR = ../../../../rtl
|
||||
AFU_DIR = ../../../../afu/opae
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
|
||||
IP_DIR = ../../ip/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
|
||||
#IP_DIR = ../../ip/stratix10
|
||||
|
||||
CONFIGS += -set "NOPAE"
|
||||
|
||||
|
@ -19,9 +20,9 @@ CONFIGS += -set "NUM_CORES=4"
|
|||
#CONFIGS += -set "L1_DISABLE"
|
||||
#CONFIGS += -set "SM_DISABLE"
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(AFU_DIR);$(AFU_DIR)/ccip;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
|
@ -6,15 +6,15 @@ THIRD_PARTY_DIR = ../../../../../third_party
|
|||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
|
||||
IP_DIR = ../../ip/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
|
||||
#IP_DIR = ../../ip/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
|
@ -6,11 +6,11 @@ THIRD_PARTY_DIR = ../../../../../third_party
|
|||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
|
||||
IP_DIR = ../../ip/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
|
||||
#IP_DIR = ../../ip/stratix10
|
||||
|
||||
CONFIGS += -set "EXT_GFX_ENABLE"
|
||||
|
||||
|
@ -26,9 +26,9 @@ CONFIGS += -set "EXT_GFX_ENABLE"
|
|||
|
||||
CONFIGS += -set "NUM_CORES=4"
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
GFX_INCLUDE = $(RTL_DIR)/tex_unit;$(RTL_DIR)/raster_unit;$(RTL_DIR)/rop_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(GFX_INCLUDE)
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(GFX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
|
@ -6,11 +6,11 @@ THIRD_PARTY_DIR = ../../../../../third_party
|
|||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
|
||||
IP_DIR = ../../ip/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
|
||||
#IP_DIR = ../../ip/stratix10
|
||||
|
||||
#CONFIGS += -set "L1_DISABLE"
|
||||
|
||||
|
@ -22,9 +22,9 @@ FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
|
|||
|
||||
CONFIGS += -set "NUM_CORES=4"
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
0
hw/syn/xilinx/ip/ultrascale/fmadd.v
Normal file
0
hw/syn/xilinx/ip/ultrascale/fmadd.v
Normal file
0
hw/syn/xilinx/ip/ultrascale/fsqrt.v
Normal file
0
hw/syn/xilinx/ip/ultrascale/fsqrt.v
Normal file
0
kernel/.gitignore
vendored
Normal file
0
kernel/.gitignore
vendored
Normal file
49
kernel/Makefile
Normal file
49
kernel/Makefile
Normal file
|
@ -0,0 +1,49 @@
|
|||
# Builds the static library libvortexrt.a from the sources under ./src with a
# RISC-V bare-metal GCC toolchain, plus a disassembly listing of the archive.
#
# XLEN (32 by default) selects both the default toolchain install path and
# the -march/-mabi pair. RISCV_TOOLCHAIN_PATH and RISCV_PREFIX can be
# overridden from the environment or the command line.
XLEN ?= 32

ifeq ($(XLEN),32)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
endif

RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-

CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objcopy

ifeq ($(XLEN),32)
CFLAGS += -march=rv32imf -mabi=ilp32f
else
CFLAGS += -march=rv64imfd -mabi=lp64d
endif

CFLAGS += -O3 -mcmodel=medany -fno-exceptions -fdata-sections -ffunction-sections
CFLAGS += -I./include -I../hw

PROJECT = libvortexrt

SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c

# Objects land in this directory and keep the source extension in their name
# (vx_print.S.o vs vx_print.c.o), so same-stem .S/.c pairs do not collide.
OBJS := $(addsuffix .o, $(notdir $(SRCS)))

# `all` and `clean` name no files; declaring them phony keeps them working
# even if a file with either name appears in this directory.
.PHONY: all clean

all: $(PROJECT).a $(PROJECT).dump

$(PROJECT).dump: $(PROJECT).a
	$(DP) -D $(PROJECT).a > $(PROJECT).dump

%.S.o: src/%.S
	$(CC) $(CFLAGS) -c $< -o $@

%.c.o: src/%.c
	$(CC) $(CFLAGS) -c $< -o $@

$(PROJECT).a: $(OBJS)
	$(AR) rcs $@ $^

# Not included by this Makefile; produced only on explicit `make .depend`.
.depend: $(SRCS)
	$(CC) $(CFLAGS) -MM $^ > .depend;

clean:
	rm -rf *.a *.o *.dump .depend
|
|
@ -1,41 +0,0 @@
|
|||
|
||||
# Builds the vx_vector_main test program (ELF, Intel-hex image and
# disassembly) against the runtime sources under $(LIB_PATH).

LIB_PATH = ../../runtime

# Install root and target triple of the RISC-V vector toolchain. The defaults
# point at the original author's install location; override them on the
# command line (make RISCV_TOOLCHAIN_PATH=/path/to/toolchain) elsewhere.
RISCV_TOOLCHAIN_PATH ?= /home/fares/dev/riscv-gnu-toolchain-vector/drops
RISCV_TARGET ?= riscv32-unknown-elf

COMP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_TARGET)-gcc

CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/startup/vx_link.ld -march=rv32imv -mabi=ilp32

DMP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_TARGET)-objdump
CPY = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_TARGET)-objcopy

# VX_STR = ../../startup/vx_start.S

NEWLIB = $(LIB_PATH)/newlib/newlib.c
VX_STR = $(LIB_PATH)/startup/vx_start.S
VX_INT = $(LIB_PATH)/intrinsics/vx_intrinsics.S
VX_IO = $(LIB_PATH)/io/vx_io.S $(LIB_PATH)/io/vx_io.c
VX_API = $(LIB_PATH)/vx_api/vx_api.c
VX_FIO = $(LIB_PATH)/fileio/fileio.S
VX_VEC = vx_vec.s
LIBS = $(RISCV_TOOLCHAIN_PATH)/$(RISCV_TARGET)/lib/libc.a $(RISCV_TOOLCHAIN_PATH)/$(RISCV_TARGET)/lib/libstdc++.a -static-libgcc -lgcc

VX_MAIN = vx_vector_main

# None of these goals creates a file with its own name, so they are declared
# phony: they always run, and a stray file called ELF/HEX/DUMP cannot mask them.
.PHONY: all DUMP HEX ELF run

all: HEX DUMP ELF

DUMP: ELF
	$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump

HEX: ELF
	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex

ELF:
	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf

# Runs the hex image produced by the HEX goal; stdout goes to emulator.debug.
run:
	../../simx/obj_dir/Vcache_simX -E -a rv32i --core $(VX_MAIN).hex -s -b 1> emulator.debug
|
||||
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
|
||||
|
||||
|
||||
# vx_vec_test -- store the constant 7 into the 32-bit word addressed by a0.
#   In:      a0 = address of the word to overwrite
#            (presumably the pointer argument of the C caller -- confirm)
#   Clobbers: a1
    .globl  vx_vec_test
    .type   vx_vec_test, @function
vx_vec_test:
    li      a1, 7                   # a1 = value to write
    sw      a1, 0(a0)               # word at [a0] = 7
    ret
|
||||
|
||||
|
||||
|
||||
|
||||
# slli a0, a0, 2
|
||||
# add a0, a0, a3
|
||||
# vmv.v.x vv0, a2
|
||||
# # vsplat4 vv0, a2
|
||||
# stripmine_loop:
|
||||
# vlb4 vv1, (a1)
|
||||
# vcmpez4 vp0, vv1
|
||||
# !vp0 vlw4 vv1, (a3)
|
||||
# !vp0 vlw4 vv2, (a4)
|
||||
# !vp0 vfma4 vv1, vv0, vv1, vv2
|
||||
# !vp0 vsw4 vv1, (a4)
|
||||
# addi a1, a1, 4
|
||||
# addi a3, a3, 16
|
||||
# addi a4, a4, 16
|
||||
# bleu a3, a0, stripmine_loop
|
||||
# handle edge cases
|
||||
# when (n % 4) != 0 ...
|
|
@ -1,32 +0,0 @@
|
|||
|
||||
#include "../../runtime/intrinsics/vx_intrinsics.h"
|
||||
#include "vx_vec.h"
|
||||
|
||||
int main()
{
    vx_tmc(1);

    /* One 4-byte heap cell, seeded with 5, is handed to vx_vec_test. Its
       value is printed before and after the call so that any store made by
       the routine is visible in the output. */
    int *cell = malloc(4);
    cell[0] = 5;
    printf("Value of a: %d\n", cell[0]);

    vx_vec_test(cell);

    printf("Value of a: %d\n", cell[0]);

    vx_tmc(0);
}
|
|
@ -1,91 +0,0 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "../../runtime/intrinsics/vx_intrinsics.h"
|
||||
#include "vx_vec.h"
|
||||
|
||||
int main()
{
    // NOTE(review): vx_tmc() comes from vx_intrinsics.h; it brackets the test
    // with 1 on entry and 0 on exit -- confirm its exact meaning there.
    vx_tmc(1);

    /*
     * Reference for the routine under test:
     *   vector-vector add routine of 32-bit integers
     *   void vvaddint32(size_t n, const int*x, const int*y, int*z)
     *   { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
     *
     *   a0 = n, a1 = x, a2 = y, a3 = z
     *   Non-vector instructions are indented
     */

#if 1
    int n = 5;
    int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
    int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
    int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};

    for(int i = 0; i < n; ++i)
    {
        a[i] = b[i] = c[i] = 1;
    }

    for(int i = 0; i < n; ++i) printf("%d, ", a[i]);
    printf("\n");
    // for(int i = 0; i < n; ++i) printf("%d, ", b[i]);
    // printf("\n");
    // for(int i = 0; i < n; ++i) printf("%d, ", c[i]);

    // d is passed to vx_vec_test as an array operand alongside the element
    // count n, so it must point at n ints of real storage before it is
    // written or read. calloc zero-fills the block; element 0 is then set to 1.
    int *d = (int*)calloc(n, sizeof(int));
    *d = 1;
    vx_vec_test(n, d, b, c);

    printf("(after: n = %d, %d)\n", n, *d);
    for(int i = 0; i < n; ++i) printf("%d, ", a[i]);
    // printf("\n");
    // for(int i = 0; i < n; ++i) printf("%d, ", b[i]);
    // printf("\n");
    // for(int i = 0; i < n; ++i) printf("%d, ", c[i]);

#endif
#if 0
    int * a = malloc(sizeof(int) * 10);
    for(int i = 0; i < 10; ++i) a[i] = 5;

    for(int i = 0; i < 10; ++i)
        printf("%d, ", a[i]);

    vx_vec_test(a);
    //vx_vec_test(2, a, a, a);

    printf("after--------\n");
    for(int i = 0; i < 10; ++i)
        printf("%d, ", a[i]);
#endif
#if 0
    int n = 5;
    int *a = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};
    int *b = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};
    int *c = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};

    for(int i = 0; i < n; ++i)
    {
        a[i] = 1;
        b[i] = 1;
        c[i] = 0;
    }

    printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);
    vx_vec_test(n, a, b, c);
    printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);

#endif

    // for (int i = 0; i < 4; i++)
    // {
    //     if (c[i] != (a[i] + b[i]))
    //     {
    //         printf("Fail\n");
    //         break;
    //     }
    // }

    vx_tmc(0);
}
|
|
@ -1,15 +0,0 @@
|
|||
|
||||
|
||||
// Declaration of the vector test routine, exported with C linkage so that it
// can be called from both C and C++ translation units.
#pragma once
|
||||
|
||||
|
||||
// Open the C-linkage region only when compiled as C++.
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// n is an element count and a, b, c are int arrays. Going by the trailing
// "vvaddint32" tag, presumably c[i] = a[i] + b[i] for i < n -- confirm
// against the assembly that implements vx_vec_test.
void vx_vec_test(int n, int* a, int* b, int* c); //vvaddint32
|
||||
|
||||
|
||||
|
||||
// Close the C-linkage region opened above.
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,23 +0,0 @@
|
|||
|
||||
|
||||
# vx_vec_test -- element-wise sum of two arrays of 32-bit integers.
# C equivalent:
#   void vvaddint32(size_t n, const int *x, const int *y, int *z)
#   { for (size_t i = 0; i < n; i++) z[i] = x[i] + y[i]; }
#
#   In:       a0 = n (elements still to process), a1 = x, a2 = y, a3 = z
#   Clobbers: a0-a3, t0, v0-v2
#
# Each pass handles as many elements as vsetvli grants (returned in t0),
# advances the three pointers by that many words and repeats until a0 is 0.
    .globl  vx_vec_test
    .type   vx_vec_test, @function
vx_vec_test:
.Lvvadd_pass:
    vsetvli t0, a0, e32             # t0 = element count for this pass (32-bit elements)
    vlw.v   v0, (a1)                # v0 = next slice of x
    sub     a0, a0, t0              # a0 = elements remaining after this pass
    slli    t0, t0, 2               # t0 = slice size in bytes (4 per element)
    add     a1, a1, t0              # x += slice
    vlw.v   v1, (a2)                # v1 = next slice of y
    add     a2, a2, t0              # y += slice
    vadd.vv v2, v0, v1              # v2 = v0 + v1
    vsw.v   v2, (a3)                # store slice of z
    add     a3, a3, t0              # z += slice
    bnez    a0, .Lvvadd_pass        # more elements left -> next pass
    ret
|
|
@ -1,27 +0,0 @@
|
|||
#include "../../runtime/intrinsics/vx_intrinsics.h"
|
||||
#include "vx_vec.h"
|
||||
|
||||
int main()
{
    vx_tmc(1);
    printf("----------------hello!!! \n");

    /* Three heap arrays of `len` ints, all filled with 1, are passed to
       vx_vec_test; afterwards the contents of the third array are printed. */
    int len = 8;
    int *lhs = (int*)malloc(sizeof(int) * len);
    int *rhs = (int*)malloc(sizeof(int) * len);
    int *out = (int*)malloc(sizeof(int) * len);

    printf("hello!!! \n");

    int idx = 0;
    while (idx < len)
    {
        out[idx] = 1;
        rhs[idx] = 1;
        lhs[idx] = 1;
        ++idx;
    }

    vx_vec_test(len, lhs, rhs, out);

    for (idx = 0; idx < len; ++idx)
    {
        printf("%d ", out[idx]);
    }

    vx_tmc(0);
}
|
|
@ -1,29 +0,0 @@
|
|||
#include "../../runtime/intrinsics/vx_intrinsics.h"
|
||||
#include "vx_vec.h"
|
||||
|
||||
int main()
{
    vx_tmc(1);

    printf("Hello\n");

    /* Three heap arrays of `len` ints, all filled with 1, are passed to
       vx_vec_test; afterwards every element of all three is printed. */
    int len = 64;
    int *lhs = (int*)malloc(sizeof(int) * len);
    int *rhs = (int*)malloc(sizeof(int) * len);
    int *out = (int*)malloc(sizeof(int) * len);

    int idx = 0;
    while (idx < len)
    {
        out[idx] = 1;
        rhs[idx] = 1;
        lhs[idx] = 1;
        ++idx;
    }

    vx_vec_test(len, lhs, rhs, out);

    for (idx = 0; idx < len; ++idx)
    {
        printf("a[%d]=%d, b[%d]=%d, c[%d]=%d\n", idx, lhs[idx], idx, rhs[idx], idx, out[idx]);
    }

    vx_tmc(0);
}
|
|
@ -1,39 +0,0 @@
|
|||
# Builds the vx_vec_benchmark program (ELF, Intel-hex image and disassembly)
# against the runtime sources under $(LIB_PATH), linking in one of the
# vector kernels listed in VX_VEC1..VX_VEC5.

LIB_PATH = ../../runtime

# Install root and target triple of the RISC-V vector toolchain. The defaults
# point at the original author's install location; override them on the
# command line (make RISCV_TOOLCHAIN_PATH=/path/to/toolchain) elsewhere.
RISCV_TOOLCHAIN_PATH ?= /home/priya/dev/riscv_vec/riscv-gnu
RISCV_TARGET ?= riscv32-unknown-elf

COMP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_TARGET)-gcc

CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/startup/vx_link.ld -march=rv32imv -mabi=ilp32

DMP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_TARGET)-objdump
CPY = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_TARGET)-objcopy

# VX_STR = ../../startup/vx_start.S

NEWLIB = $(LIB_PATH)/newlib/newlib.c
VX_STR = $(LIB_PATH)/startup/vx_start.S
VX_INT = $(LIB_PATH)/intrinsics/vx_intrinsics.S
VX_IO = $(LIB_PATH)/io/vx_io.S $(LIB_PATH)/io/vx_io.c
VX_API = $(LIB_PATH)/vx_api/vx_api.c
VX_FIO = $(LIB_PATH)/fileio/fileio.S
VX_VEC1 = vx_vec_vvaddint32.s
# float --> int
VX_VEC2 = vx_vec_saxpy.s
# float --> int
VX_VEC3 = vx_vec_sgemm_float.s
VX_VEC4 = vx_vec_vsadd.s
VX_VEC5 = vx_vec_memcpy.s
LIBS = $(RISCV_TOOLCHAIN_PATH)/$(RISCV_TARGET)/lib/libc.a $(RISCV_TOOLCHAIN_PATH)/$(RISCV_TARGET)/lib/libstdc++.a -static-libgcc -lgcc

VX_MAIN = vx_vec_benchmark

# None of these goals creates a file with its own name, so they are declared
# phony: they always run, and a stray file called ELF/HEX/DUMP cannot mask them.
.PHONY: all DUMP HEX ELF

all: HEX DUMP ELF

DUMP: ELF
	$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump

HEX: ELF
	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex

# Links the saxpy kernel (VX_VEC2). The commented commands below are the
# same link with a different kernel; swap one in to benchmark it instead.
ELF:
	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
#	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
#	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
#	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC5) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
|
@ -1,9 +0,0 @@
|
|||
1. add benchmarks under (Vortex/benchmarks/..)
|
||||
1.1 bfs --> blas spmv approach
|
||||
1.2 kmeans // stage 2
|
||||
1.3 saxpy --> sample
|
||||
1.4 sfilter // stage 2
|
||||
1.5 sgemm --> sample modify (float --> int)
|
||||
1.6 vecadd --> sample
|
||||
|
||||
|
|
@ -1,177 +0,0 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "../../runtime/intrinsics/vx_intrinsics.h"
|
||||
#include "vx_vec_benchmark.h"
|
||||
|
||||
/*
 * Benchmark driver for the hand-written vector kernels.
 *
 * Each kernel has its own "#if" section; exactly one section is expected to
 * be enabled at a time, matching the kernel object linked by the Makefile and
 * the prototype enabled in vx_vec_benchmark.h. Currently only the integer
 * saxpy section is active.
 *
 * Returns 0 when the enabled kernel produced the expected result and 1 on
 * the first mismatch or on allocation failure.
 */
int main()
{
	/* NOTE(review): presumably limits execution to one hardware thread;
	 * confirm against vx_intrinsics.h. */
	vx_tmc(1);

	int n = 5;
	int scalar = 10;

	int *a = (int*)malloc(sizeof(int) * n);
	int *b = (int*)malloc(sizeof(int) * n);
	int *c = (int*)malloc(sizeof(int) * n);

	if (a == NULL || b == NULL || c == NULL)
	{
		printf("\nmalloc failed! \n");
		free(a);
		free(b);
		free(c);
		return 1;
	}

	for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; }

#if 0
	//---------------------------------------------------------------
	/* vvaddint32
	 * # vector-vector add routine of 32-bit integers
	 * # void vvaddint32(size_t n, const int*x, const int*y, int*z)
	 * # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } } */
	printf("vvaddint...\na[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d ", a[i]);
	printf("\nb[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d ", b[i]);
	printf("\nc[%d] = a[%d] + b[%d]: ", n, n, n);
	for(int i = 0; i < n; ++i) printf("%d ", c[i]);

	vx_vec_vvaddint32(n, a, b, c);

	for(int i = 0; i < n; ++i)
	{
		if(c[i] != (a[i]+b[i]))
		{
			printf("\n<vddint32> failed at <index: %d>! \n", i);
			return 1;
		}
	}
	printf("\nPASSED.......................... <vddint32> \n");
#endif
#if 0
	//---------------------------------------------------------------
	/* # vector-scalar add
	   # for (i=0; i<N; i++) { C[i] = A[i] + B; } // 32-bit ints */
	for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1;}
	printf("vsadd...scalar:%d\na[%d]: ", scalar, n);
	for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
	printf("\nb: %d", scalar);

	vx_vec_vsadd(n, a, scalar);

	/* NOTE(review): the kernel is described as an add, yet the reference
	 * value below is a product (b[i] * scalar). Confirm which one the
	 * kernel really implements before re-enabling this section. */
	for(int i = 0; i < n; ++i)
	{
		if(a[i] != (b[i] * scalar))
		{
			printf("\n<vsadd> failed at <index: %d>! \n", i);
			return 1;
		}
	}
	printf("\nPASSED.......................... <vsadd> \n");

#endif
#if 0
	//---------------------------------------------------------------
	/* # memory copy
	   # void *memcpy(void* dest, const void* src, size_t n) */
	for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2;}
	printf("memcpy\na[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
	printf("\nb[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d \n", b[i]);

	/* The kernel counts bytes (8-bit elements), so pass the size of the
	 * whole array rather than the number of ints. */
	vx_vec_memcpy(a, b, (int)(sizeof(int) * n));

	for(int i = 0; i < n; ++i)
	{
		if(a[i] != b[i])
		{
			printf("\n<memcpy> failed at <index: %d>! \n", i);
			return 1;
		}
	}
	printf("\nPASSED.......................... <memcpy> \n");
#endif
#if 1
	//---------------------------------------------------------------
	/* # void saxpy(size_t n, const float a, const float *x, float *y)
	   # ==> convert to int!!
	   # void saxpy(size_t n, const int a, const int *x, int *y)
	   # {
	   #   size_t i;
	   #   for (i=0; i<n; i++) y[i] = a * x[i] + y[i];
	   # } */
	/* c keeps a copy of the initial b so the result can be checked after
	 * the kernel has overwritten b in place. */
	for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 2;}

	printf("saxpy\na[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
	printf("\nb[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d \n", b[i]);

	vx_vec_saxpy(n, scalar, a, b);

	printf("saxpy\na[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
	printf("\nb[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d \n", b[i]);

	for(int i = 0; i < n; ++i)
	{
		if(b[i] != ((a[i] * scalar) + c[i]))
		{
			printf("\n<saxpy> failed at <index: %d>! \n", i);
			return 1;
		}
	}
	printf("\nPASSED.......................... <saxpy> \n");
#endif
#if 0
	//---------------------------------------------------------------
	/* # void sgemm_nn(size_t n, size_t m, size_t k, const float*a, // m * k matrix
	   #               size_t lda, const float*b, // k * n matrix
	   #               size_t ldb, float*c, // m * n matrix
	   #               size_t ldc)
	   # c += a*b (alpha=1, no transpose on input matrices)
	   # matrices stored in C row-major order */

	int m = 8;
	int k = 8;
	n = 8;       /* reuse the outer n; declaring it again would not compile */
	/* Row-major leading dimension = number of columns in each matrix. */
	int lda = k;
	int ldb = n;
	int ldc = n;

	/* Allocate whole matrices: element count times element size. */
	int* a1 = (int*)malloc(sizeof(int) * m * k);
	int* b1 = (int*)malloc(sizeof(int) * k * n);
	int* c1 = (int*)malloc(sizeof(int) * m * n);

	for(int i = 0; i < (m * k); ++i) a1[i] = 1;
	for(int i = 0; i < (k * n); ++i) b1[i] = 1;
	for(int i = 0; i < (m * n); ++i) c1[i] = 1;

	printf("sgemm_nn\na[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d \n", a1[i]);
	printf("\nb[%d]: ", n);
	for(int i = 0; i < n; ++i) printf("%d \n", b1[i]);

	vx_vec_sgemm_nn(n, m, k, a1, lda, b1, ldb, c1, ldc);

	//for(int i = 0; i < n; ++i)
	//{
	//	if(b[i] != ((a[i] * scalar) + c[i]))
	//	{
	//		printf("\n<sgemm_nn> failed at <index: %d>! \n", i);
	//		return 1;
	//	}
	//}
	printf("\nNOT TESTED.......................... <sgemm_nn> \n");

	free(a1);
	free(b1);
	free(c1);
	//---------------------------------------------------------------
#endif

	free(a);
	free(b);
	free(c);

	vx_tmc(0);
	return 0;
}
|
File diff suppressed because it is too large
Load diff
Binary file not shown.
|
@ -1,16 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
|
||||
// Prototypes of the hand-written vector kernels driven by the benchmark.
// Wrapped in extern "C" so that C++ callers see unmangled symbol names.
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Disabled prototypes: the Makefile links a single kernel object at a time,
// so only the prototype of the kernel currently linked is left enabled.
//void vx_vec_vvaddint32(int n, int* a, int* b, int *c);
|
||||
//void vx_vec_vsadd(int n, int* a, int scalar);
|
||||
//void vx_vec_memcpy(int* a, int* b, int n);
|
||||
// Integer saxpy over n elements, in place on b: b[i] = scalar * a[i] + b[i].
void vx_vec_saxpy(int n, int scalar, int* a, int* b);
|
||||
//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
File diff suppressed because it is too large
Load diff
|
@ -1,17 +0,0 @@
|
|||
.type vx_vec_memcpy, @function
|
||||
.global vx_vec_memcpy
|
||||
# void *memcpy(void* dest, const void* src, size_t n)
|
||||
# a0=dest, a1=src, a2=n
|
||||
#
|
||||
vx_vec_memcpy:
|
||||
# memcpy
|
||||
mv a3, a0 # Copy destination
|
||||
vsetvli t0, a2, e8,m8 # Vectors of 8b
|
||||
loop:
|
||||
vlb.v v0, (a1) # Load bytes
|
||||
add a1, a1, t0 # Bump pointer
|
||||
sub a2, a2, t0 # Decrement count
|
||||
vsb.v v0, (a3) # Store bytes
|
||||
add a3, a3, t0 # Bump pointer
|
||||
bnez a2, loop # Any more?
|
||||
ret # Return
|
|
@ -1,62 +0,0 @@
|
|||
.type vx_vec_saxpy, @function
|
||||
.global vx_vec_saxpy
|
||||
# void
|
||||
# saxpy(size_t n, const float a, const float *x, float *y)
|
||||
# {
|
||||
# size_t i;
|
||||
# for (i=0; i<n; i++)
|
||||
# y[i] = a * x[i] + y[i];
|
||||
# }
|
||||
#
|
||||
# register arguments:
|
||||
# a0 n
|
||||
# fa0 a
|
||||
# a1 x
|
||||
# a2 y
|
||||
vx_vec_saxpy:
|
||||
vsetvli a4, a0, e32, m8
|
||||
saxpy:
|
||||
vlw.v v0, (a1)
|
||||
sub a0, a0, a4
|
||||
slli a4, a4, 2
|
||||
add a1, a1, a4
|
||||
vlw.v v8, (a2)
|
||||
vfmacc.vf v8, fa0, v0
|
||||
vsw.v v8, (a2)
|
||||
add a2, a2, a4
|
||||
bnez a0, saxpy
|
||||
ret
|
||||
#vx_vec_saxpy:
|
||||
# vsetvli a4, a0, e32, m8
|
||||
#saxpy:
|
||||
# vlw.v v0, (a1)
|
||||
# sub a0, a0, a4
|
||||
# slli a4, a4, 2
|
||||
# add a1, a1, a4
|
||||
# vlw.v v8, (a2)
|
||||
# vfmacc.vf v8, fa0, v0
|
||||
# vsw.v v8, (a2)
|
||||
# add a2, a2, a4
|
||||
# bnez a0, saxpy
|
||||
# ret
|
||||
|
||||
# a0 n, rs1 a, a2 x, a3 y
|
||||
|
||||
# a0 n, a1 a, a2 x, a3 y
|
||||
vx_vec_saxpy:
|
||||
vsetvli a4, a0, e32, m1
|
||||
saxpy:
|
||||
vlw.v v0, (a2)
|
||||
sub a0, a0, a4
|
||||
slli a4, a4, 2
|
||||
add a2, a2, a4
|
||||
vlw.v v1, (a3)
|
||||
vmul.vx v0, v0, a1
|
||||
# vmul.vv v0, v0, v1
|
||||
# li x1, 10
|
||||
# vmul.vx v0, v0, x1
|
||||
vadd.vv v1, v0, v1
|
||||
vsw.v v1, (a3)
|
||||
add a3, a3, a4
|
||||
bnez a0, saxpy
|
||||
ret
|
|
@ -1,28 +0,0 @@
|
|||
.type vx_vec_saxpy, @function
|
||||
.global vx_vec_saxpy_float
|
||||
# void
|
||||
# saxpy(size_t n, const float a, const float *x, float *y)
|
||||
# {
|
||||
# size_t i;
|
||||
# for (i=0; i<n; i++)
|
||||
# y[i] = a * x[i] + y[i];
|
||||
# }
|
||||
#
|
||||
# register arguments:
|
||||
# a0 n
|
||||
# fa0 a
|
||||
# a1 x
|
||||
# a2 y
|
||||
vx_vec_saxpy_float:
|
||||
vsetvli a4, a0, e32, m8
|
||||
saxpy:
|
||||
vlw.v v0, (a1)
|
||||
sub a0, a0, a4
|
||||
slli a4, a4, 2
|
||||
add a1, a1, a4
|
||||
vlw.v v8, (a2)
|
||||
vfmacc.vf v8, fa0, v0
|
||||
vsw.v v8, (a2)
|
||||
add a2, a2, a4
|
||||
bnez a0, saxpy
|
||||
ret
|
|
@ -1,220 +0,0 @@
|
|||
.type vx_vec_sgemm_nn, @function
|
||||
.global vx_vec_sgemm_nn
|
||||
# RV64IDV system
|
||||
#
|
||||
# void
|
||||
# sgemm_nn(size_t n,
|
||||
# size_t m,
|
||||
# size_t k,
|
||||
# const float*a, // m * k matrix
|
||||
# size_t lda,
|
||||
# const float*b, // k * n matrix
|
||||
# size_t ldb,
|
||||
# float*c, // m * n matrix
|
||||
# size_t ldc)
|
||||
#
|
||||
# c += a*b (alpha=1, no transpose on input matrices)
|
||||
# matrices stored in C row-major order
|
||||
|
||||
#define n a0
|
||||
#define m a1
|
||||
#define k a2
|
||||
#define ap a3
|
||||
#define astride a4
|
||||
#define bp a5
|
||||
#define bstride a6
|
||||
#define cp a7
|
||||
#define cstride t0
|
||||
#define kt t1
|
||||
#define nt t2
|
||||
#define bnp t3
|
||||
#define cnp t4
|
||||
#define akp t5
|
||||
#define bkp s0
|
||||
#define nvl s1
|
||||
#define ccp s2
|
||||
#define amp s3
|
||||
|
||||
# Use args as additional temporaries
|
||||
#define ft12 fa0
|
||||
#define ft13 fa1
|
||||
#define ft14 fa2
|
||||
#define ft15 fa3
|
||||
|
||||
# This version holds a 16*VLMAX block of C matrix in vector registers
|
||||
# in inner loop, but otherwise does not cache or TLB tiling.
|
||||
vx_vec_sgemm_nn:
|
||||
#sgemm_nn:
|
||||
addi sp, sp, -FRAMESIZE
|
||||
sd s0, OFFSET(sp)
|
||||
sd s1, OFFSET(sp)
|
||||
sd s2, OFFSET(sp)
|
||||
|
||||
# Check for zero size matrices
|
||||
beqz n, exit
|
||||
beqz m, exit
|
||||
beqz k, exit
|
||||
|
||||
# Convert elements strides to byte strides.
|
||||
ld cstride, OFFSET(sp) # Get arg from stack frame
|
||||
slli astride, astride, 2
|
||||
slli bstride, bstride, 2
|
||||
slli cstride, cstride, 2
|
||||
|
||||
slti t6, m, 16
|
||||
bnez t6, end_rows
|
||||
|
||||
c_row_loop: # Loop across rows of C blocks
|
||||
|
||||
mv nt, n # Initialize n counter for next row of C blocks
|
||||
|
||||
mv bnp, bp # Initialize B n-loop pointer to start
|
||||
mv cnp, cp # Initialize C n-loop pointer
|
||||
|
||||
c_col_loop: # Loop across one row of C blocks
|
||||
vsetvli nvl, nt, e32 # 32-bit vectors, LMUL=1
|
||||
|
||||
mv akp, ap # reset pointer into A to beginning
|
||||
mv bkp, bnp # step to next column in B matrix
|
||||
|
||||
# Initalize current C submatrix block from memory.
|
||||
vlw.v v0, (cnp); add ccp, cnp, cstride;
|
||||
vlw.v v1, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v2, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v3, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v4, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v5, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v6, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v7, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v8, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v9, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v10, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v11, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v12, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v13, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v14, (ccp); add ccp, ccp, cstride;
|
||||
vlw.v v15, (ccp)
|
||||
|
||||
|
||||
mv kt, k # Initialize inner loop counter
|
||||
|
||||
# Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
|
||||
# Software pipeline loads
|
||||
flw ft0, (akp); add amp, akp, astride;
|
||||
flw ft1, (amp); add amp, amp, astride;
|
||||
flw ft2, (amp); add amp, amp, astride;
|
||||
flw ft3, (amp); add amp, amp, astride;
|
||||
# Get vector from B matrix
|
||||
vlw.v v16, (bkp)
|
||||
|
||||
# Loop on inner dimension for current C block
|
||||
k_loop:
|
||||
vfmacc.vf v0, ft0, v16
|
||||
add bkp, bkp, bstride
|
||||
flw ft4, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v1, ft1, v16
|
||||
addi kt, kt, -1 # Decrement k counter
|
||||
flw ft5, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v2, ft2, v16
|
||||
flw ft6, (amp)
|
||||
add amp, amp, astride
|
||||
flw ft7, (amp)
|
||||
vfmacc.vf v3, ft3, v16
|
||||
add amp, amp, astride
|
||||
flw ft8, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v4, ft4, v16
|
||||
flw ft9, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v5, ft5, v16
|
||||
flw ft10, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v6, ft6, v16
|
||||
flw ft11, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v7, ft7, v16
|
||||
flw ft12, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v8, ft8, v16
|
||||
flw ft13, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v9, ft9, v16
|
||||
flw ft14, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v10, ft10, v16
|
||||
flw ft15, (amp)
|
||||
add amp, amp, astride
|
||||
addi akp, akp, 4 # Move to next column of a
|
||||
vfmacc.vf v11, ft11, v16
|
||||
beqz kt, 1f # Don't load past end of matrix
|
||||
flw ft0, (akp)
|
||||
add amp, akp, astride
|
||||
1: vfmacc.vf v12, ft12, v16
|
||||
beqz kt, 1f
|
||||
flw ft1, (amp)
|
||||
add amp, amp, astride
|
||||
1: vfmacc.vf v13, ft13, v16
|
||||
beqz kt, 1f
|
||||
flw ft2, (amp)
|
||||
add amp, amp, astride
|
||||
1: vfmacc.vf v14, ft14, v16
|
||||
beqz kt, 1f # Exit out of loop
|
||||
flw ft3, (amp)
|
||||
add amp, amp, astride
|
||||
vfmacc.vf v15, ft15, v16
|
||||
vlw.v v16, (bkp) # Get next vector from B matrix, overlap loads with jump stalls
|
||||
j k_loop
|
||||
|
||||
1: vfmacc.vf v15, ft15, v16
|
||||
|
||||
# Save C matrix block back to memory
|
||||
vsw.v v0, (cnp); add ccp, cnp, cstride;
|
||||
vsw.v v1, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v2, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v3, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v4, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v5, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v6, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v7, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v8, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v9, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v10, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v11, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v12, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v13, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v14, (ccp); add ccp, ccp, cstride;
|
||||
vsw.v v15, (ccp)
|
||||
|
||||
# Following tail instructions should be scheduled earlier in free slots during C block save.
|
||||
# Leaving here for clarity.
|
||||
|
||||
# Bump pointers for loop across blocks in one row
|
||||
slli t6, nvl, 2
|
||||
add cnp, cnp, t6 # Move C block pointer over
|
||||
add bnp, bnp, t6 # Move B block pointer over
|
||||
sub nt, nt, nvl # Decrement element count in n dimension
|
||||
bnez nt, c_col_loop # Any more to do?
|
||||
|
||||
# Move to next set of rows
|
||||
addi m, m, -16 # Did 16 rows above
|
||||
slli t6, astride, 4 # Multiply astride by 16
|
||||
add ap, ap, t6 # Move A matrix pointer down 16 rows
|
||||
slli t6, cstride, 4 # Multiply cstride by 16
|
||||
add cp, cp, t6 # Move C matrix pointer down 16 rows
|
||||
|
||||
slti t6, m, 16
|
||||
beqz t6, c_row_loop
|
||||
|
||||
# Handle end of matrix with fewer than 16 rows.
|
||||
# Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
|
||||
end_rows:
|
||||
# Not done.
|
||||
|
||||
exit:
|
||||
ld s0, OFFSET(sp)
|
||||
ld s1, OFFSET(sp)
|
||||
ld s2, OFFSET(sp)
|
||||
addi sp, sp, FRAMESIZE
|
||||
ret
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue