pltform independent source tree refactoring

This commit is contained in:
Blaise Tine 2022-07-22 00:39:14 -04:00
parent 92fd9d16ac
commit c613985de4
202 changed files with 9799 additions and 103942 deletions

View file

@ -2,14 +2,14 @@ all:
$(MAKE) -C third_party
$(MAKE) -C hw
$(MAKE) -C sim
$(MAKE) -C driver
$(MAKE) -C kernel
$(MAKE) -C runtime
$(MAKE) -C tests
clean:
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C driver clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean
@ -17,6 +17,6 @@ clean-all:
$(MAKE) -C third_party clean
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C driver clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean-all

View file

@ -91,19 +91,19 @@ done
case $DRIVER in
rtlsim)
DRIVER_PATH=$VORTEX_HOME/driver/rtlsim
DRIVER_PATH=$VORTEX_HOME/runtime/rtlsim
;;
vlsim)
DRIVER_PATH=$VORTEX_HOME/driver/vlsim
DRIVER_PATH=$VORTEX_HOME/runtime/vlsim
;;
asesim)
DRIVER_PATH=$VORTEX_HOME/driver/asesim
DRIVER_PATH=$VORTEX_HOME/runtime/asesim
;;
fpga)
DRIVER_PATH=$VORTEX_HOME/driver/fpga
DRIVER_PATH=$VORTEX_HOME/runtime/fpga
;;
simx)
DRIVER_PATH=$VORTEX_HOME/driver/simx
DRIVER_PATH=$VORTEX_HOME/runtime/simx
;;
*)
echo "invalid driver: $DRIVER"
@ -149,7 +149,7 @@ status=0
make -C hw config
# ensure the stub driver is present
make -C $VORTEX_HOME/driver/stub
make -C $VORTEX_HOME/runtime/stub
if [ $DEBUG -ne 0 ]
then

View file

@ -9,17 +9,17 @@ make -s
# clear POCL cache
rm -rf ~/.cache/pocl
# rebuild runtime
# rebuild runtime library
make -C runtime clean
make -C runtime
# rebuild drivers
make -C driver clean
make -C driver
# rebuild kernel library
make -C kernel clean
make -C kernel
# rebuild runtime tests
make -C tests/runtime clean
make -C tests/runtime
# rebuild kernel tests
make -C tests/kernel clean
make -C tests/kernel
# rebuild regression tests
make -C tests/regression clean-all

View file

@ -1,29 +0,0 @@
all: stub rtlsim simx vlsim
stub:
$(MAKE) -C stub
fpga:
$(MAKE) -C fpga
asesim:
$(MAKE) -C asesim
vlsim:
$(MAKE) -C vlsim
rtlsim:
$(MAKE) -C rtlsim
simx:
$(MAKE) -C simx
clean:
$(MAKE) clean -C stub
$(MAKE) clean -C fpga
$(MAKE) clean -C asesim
$(MAKE) clean -C vlsim
$(MAKE) clean -C rtlsim
$(MAKE) clean -C simx
.PHONY: all stub fpga asesim vlsim rtlsim simx clean

103
hw/afu/xrt/vortex_afu.v Normal file
View file

@ -0,0 +1,103 @@
`include "VX_define.vh"
module vortex_afu #(
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
parameter AXI_ADDR_WIDTH = `VX_MEM_ADDR_WIDTH,
parameter AXI_TID_WIDTH = 12,
parameter AXI_STROBE_WIDTH = `VX_MEM_BYTEEN_WIDTH,
parameter AXI_DCR_ADDR_WIDTH = `VX_DCR_ADDR_WIDTH,
parameter AXI_DCR_DATA_WIDTH = `VX_DCR_DATA_WIDTH
) (
input wire clk,
input wire reset,
output wire [AXI_TID_WIDTH - 1:0] m_axi_awid,
output wire [AXI_ADDR_WIDTH - 1:0] m_axi_awaddr,
output wire [7:0] m_axi_awlen,
output wire [2:0] m_axi_awsize,
output wire [1:0] m_axi_awburst,
output wire m_axi_awlock,
output wire [3:0] m_axi_awcache,
output wire [2:0] m_axi_awprot,
output wire [3:0] m_axi_awqos,
output wire m_axi_awvalid,
input wire m_axi_awready,
output wire [AXI_DATA_WIDTH - 1:0] m_axi_wdata,
output wire [AXI_STROBE_WIDTH - 1:0] m_axi_wstrb,
output wire m_axi_wlast,
output wire m_axi_wvalid,
input wire m_axi_wready,
input wire [AXI_TID_WIDTH - 1:0] m_axi_bid,
input wire [1:0] m_axi_bresp,
input wire m_axi_bvalid,
output wire m_axi_bready,
output wire [AXI_TID_WIDTH - 1:0] m_axi_arid,
output wire [AXI_ADDR_WIDTH - 1:0] m_axi_araddr,
output wire [7:0] m_axi_arlen,
output wire [2:0] m_axi_arsize,
output wire [1:0] m_axi_arburst,
output wire m_axi_arlock,
output wire [3:0] m_axi_arcache,
output wire [2:0] m_axi_arprot,
output wire [3:0] m_axi_arqos,
output wire m_axi_arvalid,
input wire m_axi_arready,
input wire [AXI_TID_WIDTH - 1:0] m_axi_rid,
input wire [AXI_DATA_WIDTH - 1:0] m_axi_rdata,
input wire [1:0] m_axi_rresp,
input wire m_axi_rlast,
input wire m_axi_rvalid,
output wire m_axi_rready,
output wire busy
);
Vortex_axi #(
.AXI_DATA_WIDTH (AXI_DATA_WIDTH),
.AXI_ADDR_WIDTH (AXI_ADDR_WIDTH),
.AXI_TID_WIDTH (AXI_TID_WIDTH),
.AXI_STROBE_WIDTH (AXI_STROBE_WIDTH),
.AXI_DCR_ADDR_WIDTH (AXI_DCR_ADDR_WIDTH),
.AXI_DCR_DATA_WIDTH (AXI_DCR_DATA_WIDTH)
) inst (
.clk(clk),
.reset(reset),
.m_axi_awid(m_axi_awid),
.m_axi_awaddr(m_axi_awaddr),
.m_axi_awlen(m_axi_awlen),
.m_axi_awsize(m_axi_awsize),
.m_axi_awburst(m_axi_awburst),
.m_axi_awlock(m_axi_awlock),
.m_axi_awcache(m_axi_awcache),
.m_axi_awprot(m_axi_awprot),
.m_axi_awqos(m_axi_awqos),
.m_axi_awvalid(m_axi_awvalid),
.m_axi_awready(m_axi_awready),
.m_axi_wdata(m_axi_wdata),
.m_axi_wstrb(m_axi_wstrb),
.m_axi_wlast(m_axi_wlast),
.m_axi_wvalid(m_axi_wvalid),
.m_axi_wready(m_axi_wready),
.m_axi_bid(m_axi_bid),
.m_axi_bresp(m_axi_bresp),
.m_axi_bvalid(m_axi_bvalid),
.m_axi_bready(m_axi_bready),
.m_axi_arid(m_axi_arid),
.m_axi_araddr(m_axi_araddr),
.m_axi_arlen(m_axi_arlen),
.m_axi_arsize(m_axi_arsize),
.m_axi_arburst(m_axi_arburst),
.m_axi_arlock(m_axi_arlock),
.m_axi_arcache(m_axi_arcache),
.m_axi_arprot(m_axi_arprot),
.m_axi_arqos(m_axi_arqos),
.m_axi_arvalid(m_axi_arvalid),
.m_axi_arready(m_axi_arready),
.m_axi_rid(m_axi_rid),
.m_axi_rdata(m_axi_rdata),
.m_axi_rresp(m_axi_rresp),
.m_axi_rlast(m_axi_rlast),
.m_axi_rvalid(m_axi_rvalid),
.m_axi_rready(m_axi_rready),
.busy(busy)
);
endmodule

View file

@ -2,6 +2,8 @@ DEVICE_FAMILY ?= arria10
BUILD_DIR ?= build
RTL_DIR = ../../rtl
DPI_DIR = ../../dpi
AFU_DIR = ../../afu/opae
IP_DIR = ../ip/$(DEVICE_FAMILY)
ifeq ($(shell which qsub-synth),)
RUN_SYNTH=$(OPAE_PLATFORM_ROOT)/bin/run.sh > build.log 2>&1 &
@ -39,12 +41,12 @@ CONFIG16 := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE $(CONFIGS)
CONFIG32 := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE $(CONFIGS)
CONFIG64 := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE $(CONFIGS)
FPU_INCLUDE = -I$(RTL_DIR)/fpu_unit -I$(RTL_DIR)/fpu_unit/altera/$(DEVICE_FAMILY)
FPU_INCLUDE = -I$(RTL_DIR)/fpu_unit
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RASTER_INCLUDE = -I$(RTL_DIR)/raster_unit
ROP_INCLUDE = -I$(RTL_DIR)/rop_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/afu $(FPU_INCLUDE)
RTL_INCLUDE += $(TEX_INCLUDE) $(RASTER_INCLUDE) $(ROP_INCLUDE)
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(AFU_DIR) -I$(IP_DIR)
RTL_INCLUDE += $(FPU_INCLUDE) $(TEX_INCLUDE) $(RASTER_INCLUDE) $(ROP_INCLUDE)
CFLAGS += $(RTL_INCLUDE)

View file

@ -7,7 +7,7 @@ BUILD_DIR=$1
PROGRAM=$(basename "$2")
PROGRAM_DIR=`dirname $2`
VORTEX_DRV_PATH=$SCRIPT_DIR/../../../driver
VORTEX_RT_PATH=$SCRIPT_DIR/../../../runtime
# Export ASE_WORKDIR variable
export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work
@ -35,5 +35,5 @@ done
# run application
pushd $PROGRAM_DIR
echo " [DBG] running ./$PROGRAM $*"
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_DRV_PATH/asesim:$LD_LIBRARY_PATH ./$PROGRAM $*
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/asesim:$LD_LIBRARY_PATH ./$PROGRAM $*
popd

View file

@ -6,15 +6,15 @@ THIRD_PARTY_DIR = ../../../../../third_party
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
IP_DIR = ../../ip/arria10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
#IP_DIR = ../../ip/stratix10
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -6,14 +6,14 @@ THIRD_PARTY_DIR = ../../../../../third_party
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
IP_DIR = ../../ip/arria10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
#IP_DIR = ../../ip/stratix10
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(IP_DIR)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
# Part, Family

View file

@ -8,15 +8,15 @@ CONFIGS += -set "EXT_GFX_ENABLE"
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
IP_DIR = ../../ip/arria10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
#IP_DIR = ../../ip/stratix10
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
GFX_INCLUDE = $(RTL_DIR)/tex_unit;$(RTL_DIR)/raster_unit;$(RTL_DIR)/rop_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(GFX_INCLUDE)
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(GFX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -6,15 +6,15 @@ THIRD_PARTY_DIR = ../../../../../third_party
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
IP_DIR = ../../ip/arria10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
#IP_DIR = ../../ip/stratix10
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -2,15 +2,16 @@ PROJECT = vortex_afu
TOP_LEVEL_ENTITY = vortex_afu
SRC_FILE = vortex_afu.sv
RTL_DIR = ../../../../rtl
AFU_DIR = ../../../../afu/opae
THIRD_PARTY_DIR = ../../../../../third_party
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH=$(RTL_DIR)/fpu_unit/altera/arria10
IP_DIR = ../../ip/arria10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
#IP_DIR = ../../ip/stratix10
CONFIGS += -set "NOPAE"
CONFIGS += -set "EXT_GFX_ENABLE"
@ -21,9 +22,9 @@ CONFIGS += -set "NUM_CORES=4"
#CONFIGS += -set "SM_DISABLE"
#CONFIGS += -set "RCACHE_DISABLE" -set "OCACHE_DISABLE" -set "TCACHE_DISABLE"
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
TEX_INCLUDE = $(RTL_DIR)/tex_unit;$(RTL_DIR)/raster_unit;$(RTL_DIR)/rop_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(AFU_DIR);$(AFU_DIR)/ccip;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -2,15 +2,16 @@ PROJECT = vortex_afu
TOP_LEVEL_ENTITY = vortex_afu
SRC_FILE = vortex_afu.sv
RTL_DIR = ../../../../rtl
AFU_DIR = ../../../../afu/opae
THIRD_PARTY_DIR = ../../../../../third_party
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
IP_DIR = ../../ip/arria10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
#IP_DIR = ../../ip/stratix10
CONFIGS += -set "NOPAE"
@ -19,9 +20,9 @@ CONFIGS += -set "NUM_CORES=4"
#CONFIGS += -set "L1_DISABLE"
#CONFIGS += -set "SM_DISABLE"
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(AFU_DIR);$(AFU_DIR)/ccip;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -6,15 +6,15 @@ THIRD_PARTY_DIR = ../../../../../third_party
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
IP_DIR = ../../ip/arria10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
#IP_DIR = ../../ip/stratix10
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -6,11 +6,11 @@ THIRD_PARTY_DIR = ../../../../../third_party
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
IP_DIR = ../../ip/arria10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
#IP_DIR = ../../ip/stratix10
CONFIGS += -set "EXT_GFX_ENABLE"
@ -26,9 +26,9 @@ CONFIGS += -set "EXT_GFX_ENABLE"
CONFIGS += -set "NUM_CORES=4"
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
GFX_INCLUDE = $(RTL_DIR)/tex_unit;$(RTL_DIR)/raster_unit;$(RTL_DIR)/rop_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(GFX_INCLUDE)
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(GFX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

@ -6,11 +6,11 @@ THIRD_PARTY_DIR = ../../../../../third_party
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
IP_DIR = ../../ip/arria10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
#IP_DIR = ../../ip/stratix10
#CONFIGS += -set "L1_DISABLE"
@ -22,9 +22,9 @@ FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
CONFIGS += -set "NUM_CORES=4"
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View file

View file

0
kernel/.gitignore vendored Normal file
View file

49
kernel/Makefile Normal file
View file

@ -0,0 +1,49 @@
XLEN ?= 32
ifeq ($(XLEN),32)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objcopy
ifeq ($(XLEN),32)
CFLAGS += -march=rv32imf -mabi=ilp32f
else
CFLAGS += -march=rv64imfd -mabi=lp64d
endif
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -fdata-sections -ffunction-sections
CFLAGS += -I./include -I../hw
PROJECT = libvortexrt
SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
all: $(PROJECT).a $(PROJECT).dump
$(PROJECT).dump: $(PROJECT).a
$(DP) -D $(PROJECT).a > $(PROJECT).dump
%.S.o: src/%.S
$(CC) $(CFLAGS) -c $< -o $@
%.c.o: src/%.c
$(CC) $(CFLAGS) -c $< -o $@
$(PROJECT).a: $(OBJS)
$(AR) rcs $@ $^
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
rm -rf *.a *.o *.dump .depend

View file

@ -1,41 +0,0 @@
LIB_PATH = ../../runtime
COMP = /home/fares/dev/riscv-gnu-toolchain-vector/drops/bin/riscv32-unknown-elf-gcc
CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/startup/vx_link.ld -march=rv32imv -mabi=ilp32
DMP = /home/fares/dev/riscv-gnu-toolchain-vector/drops/bin/riscv32-unknown-elf-objdump
CPY = /home/fares/dev/riscv-gnu-toolchain-vector/drops/bin/riscv32-unknown-elf-objcopy
# VX_STR = ../../startup/vx_start.S
NEWLIB = $(LIB_PATH)/newlib/newlib.c
VX_STR = $(LIB_PATH)/startup/vx_start.S
VX_INT = $(LIB_PATH)/intrinsics/vx_intrinsics.S
VX_IO = $(LIB_PATH)/io/vx_io.S $(LIB_PATH)/io/vx_io.c
VX_API = $(LIB_PATH)/vx_api/vx_api.c
VX_FIO = $(LIB_PATH)/fileio/fileio.S
VX_VEC = vx_vec.s
LIBS = /home/fares/dev/riscv-gnu-toolchain-vector/drops/riscv32-unknown-elf/lib/libc.a /home/fares/dev/riscv-gnu-toolchain-vector/drops/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
VX_MAIN = vx_vector_main
all: HEX DUMP ELF
DUMP: ELF
$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump
HEX: ELF
$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
ELF:
$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
run:
../../simx/obj_dir/Vcache_simX -E -a rv32i --core vx_vector_main.hex -s -b 1> emulator.debug

View file

@ -1,30 +0,0 @@
.type vx_vec_test, @function
.global vx_vec_test
vx_vec_test:
li a1, 7
sw a1, 0(a0)
ret
# slli a0, a0, 2
# add a0, a0, a3
# vmv.v.x vv0, a2
# # vsplat4 vv0, a2
# stripmine_loop:
# vlb4 vv1, (a1)
# vcmpez4 vp0, vv1
# !vp0 vlw4 vv1, (a3)
# !vp0 vlw4 vv2, (a4)
# !vp0 vfma4 vv1, vv0, vv1, vv2
# !vp0 vsw4 vv1, (a4)
# addi a1, a1, 4
# addi a3, a3, 16
# addi a4, a4, 16
# bleu a3, a0, stripmine_loop
# handle edge cases
# when (n % 4) != 0 ...

View file

@ -1,32 +0,0 @@
#include "../../runtime/intrinsics/vx_intrinsics.h"
#include "vx_vec.h"
int main()
{
vx_tmc(1);
// int * a = malloc(4);
// int * b = malloc(4);
// int * c = malloc(4);
int * a = malloc(4);
*a = 5;
printf("Value of a: %d\n", *a);
vx_vec_test(a);
printf("Value of a: %d\n", *a);
// for (int i = 0; i < 4; i++)
// {
// if (c[i] != (a[i] + b[i]))
// {
// printf("Fail\n");
// break;
// }
// }
vx_tmc(0);
}

View file

@ -1,91 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include "../../runtime/intrinsics/vx_intrinsics.h"
#include "vx_vec.h"
int main()
{
vx_tmc(1);
#if 0
# vector-vector add routine of 32-bit integers
# void vvaddint32(size_t n, const int*x, const int*y, int*z)
# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
#
# a0 = n, a1 = x, a2 = y, a3 = z
# Non-vector instructions are indented
#endif
#if 1
int n = 5;
int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
for(int i = 0; i < n; ++i)
{
a[i] = b[i] = c[i] = 1;
}
for(int i = 0; i < n; ++i) printf("%d, ", a[i]);
printf("\n");
// for(int i = 0; i < n; ++i) printf("%d, ", b[i]);
// printf("\n");
// for(int i = 0; i < n; ++i) printf("%d, ", c[i]);
int *d;
*d = 1;
vx_vec_test(n, d, b, c);
printf("(after: n = %d, %d)\n", n, *d);
for(int i = 0; i < n; ++i) printf("%d, ", a[i]);
// printf("\n");
// for(int i = 0; i < n; ++i) printf("%d, ", b[i]);
// printf("\n");
// for(int i = 0; i < n; ++i) printf("%d, ", c[i]);
#endif
#if 0
int * a = malloc(sizeof(int) * 10);
for(int i = 0; i < 10; ++i) a[i] = 5;
for(int i = 0; i < 10; ++i)
printf("%d, ", a[i]);
vx_vec_test(a);
//vx_vec_test(2, a, a, a);
printf("after--------\n");
for(int i = 0; i < 10; ++i)
printf("%d, ", a[i]);
#endif
#if 0
int n = 5;
int *a = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};
int *b = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};
int *c = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};
for(int i = 0; i < n; ++i)
{
a[i] = 1;
b[i] = 1;
c[i] = 0;
}
printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);
vx_vec_test(n, a, b, c);
printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);
#endif
// for (int i = 0; i < 4; i++)
// {
// if (c[i] != (a[i] + b[i]))
// {
// printf("Fail\n");
// break;
// }
// }
vx_tmc(0);
}

View file

@ -1,15 +0,0 @@
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
void vx_vec_test(int n, int* a, int* b, int* c); //vvaddint32
#ifdef __cplusplus
}
#endif

View file

@ -1,23 +0,0 @@
.type vx_vec_test, @function
.global vx_vec_test
vx_vec_test:
# vector-vector add routine of 32-bit integers
# void vvaddint32(size_t n, const int*x, const int*y, int*z)
# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
#
# a0 = n, a1 = x, a2 = y, a3 = z
# Non-vector instructions are indented
vsetvli t0, a0, e32 # Set vector length based on 32-bit vectors
vlw.v v0, (a1) # Get first vector
sub a0, a0, t0 # Decrement number done
slli t0, t0, 2 # Multiply number done by 4 bytes
add a1, a1, t0 # Bump pointer
vlw.v v1, (a2) # Get second vector
add a2, a2, t0 # Bump pointer
vadd.vv v2, v0, v1 # Sum vectors
vsw.v v2, (a3) # Store result
add a3, a3, t0 # Bump pointer
bnez a0, vx_vec_test # Loop back
ret # Finished

View file

@ -1,27 +0,0 @@
#include "../../runtime/intrinsics/vx_intrinsics.h"
#include "vx_vec.h"
int main()
{
vx_tmc(1);
printf("----------------hello!!! \n");
int n = 8;
int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
printf("hello!!! \n");
for(int i = 0; i < n; ++i)
{
a[i] = b[i] = c[i] = 1;
}
vx_vec_test(n, a, b, c);
for(int i = 0; i < n; ++i)
printf("%d ", c[i]);
vx_tmc(0);
}

View file

@ -1,29 +0,0 @@
#include "../../runtime/intrinsics/vx_intrinsics.h"
#include "vx_vec.h"
int main()
{
vx_tmc(1);
printf("Hello\n");
int n = 64;
int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
for(int i = 0; i < n; ++i)
{
a[i] = b[i] = c[i] = 1;
}
vx_vec_test(n, a, b, c);
for (int i = 0; i < n; ++i)
{
printf("a[%d]=%d, b[%d]=%d, c[%d]=%d\n", i, a[i], i, b[i], i, c[i]);
}
vx_tmc(0);
}

View file

@ -1,39 +0,0 @@
LIB_PATH = ../../runtime
COMP = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-gcc
CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/startup/vx_link.ld -march=rv32imv -mabi=ilp32
DMP = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-objdump
CPY = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-objcopy
# VX_STR = ../../startup/vx_start.S
NEWLIB = $(LIB_PATH)/newlib/newlib.c
VX_STR = $(LIB_PATH)/startup/vx_start.S
VX_INT = $(LIB_PATH)/intrinsics/vx_intrinsics.S
VX_IO = $(LIB_PATH)/io/vx_io.S $(LIB_PATH)/io/vx_io.c
VX_API = $(LIB_PATH)/vx_api/vx_api.c
VX_FIO = $(LIB_PATH)/fileio/fileio.S
VX_VEC1 = vx_vec_vvaddint32.s
VX_VEC2 = vx_vec_saxpy.s #float --> int
VX_VEC3 = vx_vec_sgemm_float.s #float --> int
VX_VEC4 = vx_vec_vsadd.s
VX_VEC5 = vx_vec_memcpy.s
LIBS = /home/priya/dev/riscv_vec/riscv-gnu/riscv32-unknown-elf/lib/libc.a /home/priya/dev/riscv_vec/riscv-gnu/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
VX_MAIN = vx_vec_benchmark
all: HEX DUMP ELF
DUMP: ELF
$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump
HEX: ELF
$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
ELF:
$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC5) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf~

View file

@ -1,9 +0,0 @@
1. add benchmarks under (Vortex/benchmarks/..)
1.1 bfs --> blas spmv approach
1.2 kmeans // stage 2
1.3 saxpy --> sample
1.4 sfilter // stage 2
1.5 sgemm --> sample modify (float --> int)
1.6 vecadd --> sample

View file

@ -1,177 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include "../../runtime/intrinsics/vx_intrinsics.h"
#include "vx_vec_benchmark.h"
int main()
{
vx_tmc(1);
int n = 5;
int scalar = 10;
int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; }
#if 0
//---------------------------------------------------------------
/* vvaddint32
* # vector-vector add routine of 32-bit integers
* # void vvaddint32(size_t n, const int*x, const int*y, int*z)
* # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } } */
printf("vvaddint...\na[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d ", a[i]);
printf("\nb[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d ", b[i]);
printf("\nc[%d] = a[%d] + b[%d]: ", n, n, n);
for(int i = 0; i < n; ++i) printf("%d ", c[i]);
vx_vec_vvaddint32(n, a, b, c);
for(int i = 0; i < n; ++i)
{
if(c[i] != (a[i]+b[i]))
{
printf("\n<vddint32> failed at <index: %d>! \n", i);
return 1;
}
}
printf("\nPASSED.......................... <vddint32> \n");
#endif
#if 0
//---------------------------------------------------------------
/* # vector-scalar add
# for (i=0; i<N; i++) { C[i] = A[i] + B; } // 32-bit ints */
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1;}
printf("vsadd...scalar:%d\na[%d]: ", scalar, n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb: %d", scalar);
vx_vec_vsadd(n, a, scalar);
for(int i = 0; i < n; ++i)
{
if(a[i] != (b[i] * scalar))
{
printf("\n<vsadd> failed at <index: %d>! \n", i);
return 1;
}
}
printf("\nPASSED.......................... <vsadd> \n");
#endif
#if 0
//---------------------------------------------------------------
/* # memory copy
# void *memcpy(void* dest, const void* src, size_t n) */
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2;}
printf("memcpy\na[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
vx_vec_memcpy(a, b, n);
for(int i = 0; i < n; ++i)
{
if(a[i] != b[i])
{
printf("\n<memcpy> failed at <index: %d>! \n", i);
<<<<<<< HEAD
return;
}
}
printf("\nPASSED.......................... <memcpy> \n");
=======
return 1;
}
}
printf("\nPASSED.......................... <memcpy> \n");
#endif
#if 1
//---------------------------------------------------------------
/* # void saxpy(size_t n, const float a, const float *x, float *y)
# ==> convert to int!!
# void saxpy(size_t n, const int a, const int *x, int *y)
# {
# size_t i;
# for (i=0; i<n; i++) y[i] = a * x[i] + y[i];
# } */
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 2;}
printf("saxpy\na[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
vx_vec_saxpy(n, scalar, a, b);
printf("saxpy\na[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
for(int i = 0; i < n; ++i)
{
if(b[i] != ((a[i] * scalar) + c[i]))
{
printf("\n<saxpy> failed at <index: %d>! \n", i);
return;
}
}
printf("\nPASSED.......................... <saxpy> \n");
return 1;
}
}
printf("\nPASSED.......................... <saxpy> \n");
#endif
#if 0
//---------------------------------------------------------------
/* # void sgemm_nn(size_t n, size_t m, size_t k, const float*a, // m * k matrix
# size_t lda, const float*b, // k * n matrix
# size_t ldb, float*c, // m * n matrix
# size_t ldc)
# c += a*b (alpha=1, no transpose on input matrices)
# matrices stored in C row-major order */
int m = 8;
int k = 8;
int n = 8
int lda = 4;
int ldb = 4;
int ldc = 4;
int* a1 = (int*)malloc(sizeof(m * k));
int* b1 = (int*)malloc(sizeof(k * n));
int* c1 = (int*)malloc(sizeof(m * n));
for(int i = 0; i < (m * k); ++i) a1[i] = 1;
for(int i = 0; i < (k * n); ++i) b1[i] = 1;
for(int i = 0; i < (m * n); ++i) c1[i] = 1;
printf("sgemm_nn\na[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", a1[i]);
printf("\nb[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", b1[i]);
vx_vec_sgemm_nn(n, m, k, a1, lda, b1, ldb, c1, ldc);
//for(int i = 0; i < n; ++i)
//{
// if(b[i] != ((a[i] * scalar) + c[i]))
// {
// printf("\n<sgemm_nn> failed at <index: %d>! \n", i);
// return;
// }
//}
printf("\nNOT TESTED.......................... <sgemm_nn> \n");
//---------------------------------------------------------------
#endif
vx_tmc(0);
return 0;
}

File diff suppressed because it is too large Load diff

View file

@ -1,16 +0,0 @@
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
//void vx_vec_vvaddint32(int n, int* a, int* b, int *c);
//void vx_vec_vsadd(int n, int* a, int scalar);
//void vx_vec_memcpy(int* a, int* b, int n);
void vx_vec_saxpy(int n, int scalar, int* a, int* b);
//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large Load diff

View file

@ -1,17 +0,0 @@
.type vx_vec_memcpy, @function
.global vx_vec_memcpy
# void *memcpy(void* dest, const void* src, size_t n)
# a0=dest, a1=src, a2=n
#
vx_vec_memcpy:
# memcpy
mv a3, a0 # Copy destination
vsetvli t0, a2, e8,m8 # Vectors of 8b
loop:
vlb.v v0, (a1) # Load bytes
add a1, a1, t0 # Bump pointer
sub a2, a2, t0 # Decrement count
vsb.v v0, (a3) # Store bytes
add a3, a3, t0 # Bump pointer
bnez a2, loop # Any more?
ret # Return

View file

@ -1,62 +0,0 @@
.type vx_vec_saxpy, @function
.global vx_vec_saxpy
# void
# saxpy(size_t n, const float a, const float *x, float *y)
# {
# size_t i;
# for (i=0; i<n; i++)
# y[i] = a * x[i] + y[i];
# }
#
# register arguments:
# a0 n
# fa0 a
# a1 x
# a2 y
vx_vec_saxpy:
vsetvli a4, a0, e32, m8
saxpy:
vlw.v v0, (a1)
sub a0, a0, a4
slli a4, a4, 2
add a1, a1, a4
vlw.v v8, (a2)
vfmacc.vf v8, fa0, v0
vsw.v v8, (a2)
add a2, a2, a4
bnez a0, saxpy
ret
#vx_vec_saxpy:
# vsetvli a4, a0, e32, m8
#saxpy:
# vlw.v v0, (a1)
# sub a0, a0, a4
# slli a4, a4, 2
# add a1, a1, a4
# vlw.v v8, (a2)
# vfmacc.vf v8, fa0, v0
# vsw.v v8, (a2)
# add a2, a2, a4
# bnez a0, saxpy
# ret
# a0 n, rs1 a, a2 x, a3 y
# a0 n, a1 a, a2 x, a3 y
vx_vec_saxpy:
vsetvli a4, a0, e32, m1
saxpy:
vlw.v v0, (a2)
sub a0, a0, a4
slli a4, a4, 2
add a2, a2, a4
vlw.v v1, (a3)
vmul.vx v0, v0, a1
# vmul.vv v0, v0, v1
# li x1, 10
# vmul.vx v0, v0, x1
vadd.vv v1, v0, v1
vsw.v v1, (a3)
add a3, a3, a4
bnez a0, saxpy
ret

View file

@ -1,28 +0,0 @@
.type vx_vec_saxpy, @function
.global vx_vec_saxpy_float
# void
# saxpy(size_t n, const float a, const float *x, float *y)
# {
# size_t i;
# for (i=0; i<n; i++)
# y[i] = a * x[i] + y[i];
# }
#
# register arguments:
# a0 n
# fa0 a
# a1 x
# a2 y
vx_vec_saxpy_float:
vsetvli a4, a0, e32, m8
saxpy:
vlw.v v0, (a1)
sub a0, a0, a4
slli a4, a4, 2
add a1, a1, a4
vlw.v v8, (a2)
vfmacc.vf v8, fa0, v0
vsw.v v8, (a2)
add a2, a2, a4
bnez a0, saxpy
ret

View file

@ -1,220 +0,0 @@
.type vx_vec_sgemm_nn, @function
.global vx_vec_sgemm_nn
# RV64IDV system
#
# void
# sgemm_nn(size_t n,
# size_t m,
# size_t k,
# const float*a, // m * k matrix
# size_t lda,
# const float*b, // k * n matrix
# size_t ldb,
# float*c, // m * n matrix
# size_t ldc)
#
# c += a*b (alpha=1, no transpose on input matrices)
# matrices stored in C row-major order
#define n a0
#define m a1
#define k a2
#define ap a3
#define astride a4
#define bp a5
#define bstride a6
#define cp a7
#define cstride t0
#define kt t1
#define nt t2
#define bnp t3
#define cnp t4
#define akp t5
#define bkp s0
#define nvl s1
#define ccp s2
#define amp s3
# Use args as additional temporaries
#define ft12 fa0
#define ft13 fa1
#define ft14 fa2
#define ft15 fa3
# This version holds a 16*VLMAX block of C matrix in vector registers
# in inner loop, but otherwise does not cache or TLB tiling.
vx_vec_sgemm_nn:
#sgemm_nn:
addi sp, sp, -FRAMESIZE
sd s0, OFFSET(sp)
sd s1, OFFSET(sp)
sd s2, OFFSET(sp)
# Check for zero size matrices
beqz n, exit
beqz m, exit
beqz k, exit
# Convert elements strides to byte strides.
ld cstride, OFFSET(sp) # Get arg from stack frame
slli astride, astride, 2
slli bstride, bstride, 2
slli cstride, cstride, 2
slti t6, m, 16
bnez t6, end_rows
c_row_loop: # Loop across rows of C blocks
mv nt, n # Initialize n counter for next row of C blocks
mv bnp, bp # Initialize B n-loop pointer to start
mv cnp, cp # Initialize C n-loop pointer
c_col_loop: # Loop across one row of C blocks
vsetvli nvl, nt, e32 # 32-bit vectors, LMUL=1
mv akp, ap # reset pointer into A to beginning
mv bkp, bnp # step to next column in B matrix
# Initalize current C submatrix block from memory.
vlw.v v0, (cnp); add ccp, cnp, cstride;
vlw.v v1, (ccp); add ccp, ccp, cstride;
vlw.v v2, (ccp); add ccp, ccp, cstride;
vlw.v v3, (ccp); add ccp, ccp, cstride;
vlw.v v4, (ccp); add ccp, ccp, cstride;
vlw.v v5, (ccp); add ccp, ccp, cstride;
vlw.v v6, (ccp); add ccp, ccp, cstride;
vlw.v v7, (ccp); add ccp, ccp, cstride;
vlw.v v8, (ccp); add ccp, ccp, cstride;
vlw.v v9, (ccp); add ccp, ccp, cstride;
vlw.v v10, (ccp); add ccp, ccp, cstride;
vlw.v v11, (ccp); add ccp, ccp, cstride;
vlw.v v12, (ccp); add ccp, ccp, cstride;
vlw.v v13, (ccp); add ccp, ccp, cstride;
vlw.v v14, (ccp); add ccp, ccp, cstride;
vlw.v v15, (ccp)
mv kt, k # Initialize inner loop counter
# Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
# Software pipeline loads
flw ft0, (akp); add amp, akp, astride;
flw ft1, (amp); add amp, amp, astride;
flw ft2, (amp); add amp, amp, astride;
flw ft3, (amp); add amp, amp, astride;
# Get vector from B matrix
vlw.v v16, (bkp)
# Loop on inner dimension for current C block
k_loop:
vfmacc.vf v0, ft0, v16
add bkp, bkp, bstride
flw ft4, (amp)
add amp, amp, astride
vfmacc.vf v1, ft1, v16
addi kt, kt, -1 # Decrement k counter
flw ft5, (amp)
add amp, amp, astride
vfmacc.vf v2, ft2, v16
flw ft6, (amp)
add amp, amp, astride
flw ft7, (amp)
vfmacc.vf v3, ft3, v16
add amp, amp, astride
flw ft8, (amp)
add amp, amp, astride
vfmacc.vf v4, ft4, v16
flw ft9, (amp)
add amp, amp, astride
vfmacc.vf v5, ft5, v16
flw ft10, (amp)
add amp, amp, astride
vfmacc.vf v6, ft6, v16
flw ft11, (amp)
add amp, amp, astride
vfmacc.vf v7, ft7, v16
flw ft12, (amp)
add amp, amp, astride
vfmacc.vf v8, ft8, v16
flw ft13, (amp)
add amp, amp, astride
vfmacc.vf v9, ft9, v16
flw ft14, (amp)
add amp, amp, astride
vfmacc.vf v10, ft10, v16
flw ft15, (amp)
add amp, amp, astride
addi akp, akp, 4 # Move to next column of a
vfmacc.vf v11, ft11, v16
beqz kt, 1f # Don't load past end of matrix
flw ft0, (akp)
add amp, akp, astride
1: vfmacc.vf v12, ft12, v16
beqz kt, 1f
flw ft1, (amp)
add amp, amp, astride
1: vfmacc.vf v13, ft13, v16
beqz kt, 1f
flw ft2, (amp)
add amp, amp, astride
1: vfmacc.vf v14, ft14, v16
beqz kt, 1f # Exit out of loop
flw ft3, (amp)
add amp, amp, astride
vfmacc.vf v15, ft15, v16
vlw.v v16, (bkp) # Get next vector from B matrix, overlap loads with jump stalls
j k_loop
1: vfmacc.vf v15, ft15, v16
# Save C matrix block back to memory
vsw.v v0, (cnp); add ccp, cnp, cstride;
vsw.v v1, (ccp); add ccp, ccp, cstride;
vsw.v v2, (ccp); add ccp, ccp, cstride;
vsw.v v3, (ccp); add ccp, ccp, cstride;
vsw.v v4, (ccp); add ccp, ccp, cstride;
vsw.v v5, (ccp); add ccp, ccp, cstride;
vsw.v v6, (ccp); add ccp, ccp, cstride;
vsw.v v7, (ccp); add ccp, ccp, cstride;
vsw.v v8, (ccp); add ccp, ccp, cstride;
vsw.v v9, (ccp); add ccp, ccp, cstride;
vsw.v v10, (ccp); add ccp, ccp, cstride;
vsw.v v11, (ccp); add ccp, ccp, cstride;
vsw.v v12, (ccp); add ccp, ccp, cstride;
vsw.v v13, (ccp); add ccp, ccp, cstride;
vsw.v v14, (ccp); add ccp, ccp, cstride;
vsw.v v15, (ccp)
# Following tail instructions should be scheduled earlier in free slots during C block save.
# Leaving here for clarity.
# Bump pointers for loop across blocks in one row
slli t6, nvl, 2
add cnp, cnp, t6 # Move C block pointer over
add bnp, bnp, t6 # Move B block pointer over
sub nt, nt, nvl # Decrement element count in n dimension
bnez nt, c_col_loop # Any more to do?
# Move to next set of rows
addi m, m, -16 # Did 16 rows above
slli t6, astride, 4 # Multiply astride by 16
add ap, ap, t6 # Move A matrix pointer down 16 rows
slli t6, cstride, 4 # Multiply cstride by 16
add cp, cp, t6 # Move C matrix pointer down 16 rows
slti t6, m, 16
beqz t6, c_row_loop
# Handle end of matrix with fewer than 16 rows.
# Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
end_rows:
# Not done.
exit:
ld s0, OFFSET(sp)
ld s1, OFFSET(sp)
ld s2, OFFSET(sp)
addi sp, sp, FRAMESIZE
ret

Some files were not shown because too many files have changed in this diff Show more