platform-independent source tree refactoring

2025-04-24 05:47:35 -04:00 · 2022-07-22 00:39:14 -04:00 · 2022-07-22 00:39:14 -04:00 · c613985de4
commit c613985de4
parent 92fd9d16ac
202 changed files with 9799 additions and 103942 deletions
--- a/6
+++ b/6
@ -2,14 +2,14 @@ all:
 	$(MAKE) -C third_party
 	$(MAKE) -C hw
 	$(MAKE) -C sim
-	$(MAKE) -C driver
+	$(MAKE) -C kernel
 	$(MAKE) -C runtime
 	$(MAKE) -C tests

 clean:
 	$(MAKE) -C hw clean
 	$(MAKE) -C sim clean
-	$(MAKE) -C driver clean
+	$(MAKE) -C kernel clean
 	$(MAKE) -C runtime clean
 	$(MAKE) -C tests clean

@ -17,6 +17,6 @@ clean-all:
 	$(MAKE) -C third_party clean
 	$(MAKE) -C hw clean
 	$(MAKE) -C sim clean
-	$(MAKE) -C driver clean
+	$(MAKE) -C kernel clean
 	$(MAKE) -C runtime clean
 	$(MAKE) -C tests clean-all
--- a/ci/blackbox.sh
+++ b/ci/blackbox.sh
@ -91,19 +91,19 @@ done

 case $DRIVER in
    rtlsim)
-        DRIVER_PATH=$VORTEX_HOME/driver/rtlsim
+        DRIVER_PATH=$VORTEX_HOME/runtime/rtlsim
        ;;
    vlsim)
-        DRIVER_PATH=$VORTEX_HOME/driver/vlsim
+        DRIVER_PATH=$VORTEX_HOME/runtime/vlsim
        ;;
    asesim)
-        DRIVER_PATH=$VORTEX_HOME/driver/asesim
+        DRIVER_PATH=$VORTEX_HOME/runtime/asesim
        ;;
    fpga)
-        DRIVER_PATH=$VORTEX_HOME/driver/fpga
+        DRIVER_PATH=$VORTEX_HOME/runtime/fpga
        ;; 
    simx)
-        DRIVER_PATH=$VORTEX_HOME/driver/simx
+        DRIVER_PATH=$VORTEX_HOME/runtime/simx
        ;;
    *)
        echo "invalid driver: $DRIVER"
@ -149,7 +149,7 @@ status=0
 make -C hw config

 # ensure the stub driver is present
-make -C $VORTEX_HOME/driver/stub
+make -C $VORTEX_HOME/runtime/stub

 if [ $DEBUG -ne 0 ]
 then    
--- a/ci/test_compiler.sh
+++ b/ci/test_compiler.sh
@ -9,17 +9,17 @@ make -s
 # clear POCL cache
 rm -rf ~/.cache/pocl

-# rebuild runtime
+# rebuild runtime library
 make -C runtime clean
 make -C runtime

-# rebuild drivers
-make -C driver clean
-make -C driver
+# rebuild kernel library
+make -C kernel clean
+make -C kernel

-# rebuild runtime tests
-make -C tests/runtime clean
-make -C tests/runtime
+# rebuild kernel tests
+make -C tests/kernel clean
+make -C tests/kernel

 # rebuild regression tests
 make -C tests/regression clean-all
--- a/driver/Makefile
+++ b/driver/Makefile
@ -1,29 +0,0 @@
-all: stub rtlsim simx vlsim
-
-stub:
-	$(MAKE) -C stub
-
-fpga:
-	$(MAKE) -C fpga
-
-asesim:
-	$(MAKE) -C asesim
-
-vlsim:
-	$(MAKE) -C vlsim
-
-rtlsim:
-	$(MAKE) -C rtlsim
-
-simx:
-	$(MAKE) -C simx
-
-clean:
-	$(MAKE) clean -C stub
-	$(MAKE) clean -C fpga
-	$(MAKE) clean -C asesim
-	$(MAKE) clean -C vlsim
-	$(MAKE) clean -C rtlsim
-	$(MAKE) clean -C simx
-
-.PHONY: all stub fpga asesim vlsim rtlsim simx clean
--- a/hw/afu/opae/VX_avs_adapter.sv
+++ b/hw/afu/opae/VX_avs_adapter.sv
--- a/hw/afu/opae/VX_mem_adapter.sv
+++ b/hw/afu/opae/VX_mem_adapter.sv
--- a/hw/afu/opae/ccip/ccip_if_pkg.sv
+++ b/hw/afu/opae/ccip/ccip_if_pkg.sv
--- a/hw/afu/opae/ccip/local_mem_cfg_pkg.sv
+++ b/hw/afu/opae/ccip/local_mem_cfg_pkg.sv
--- a/hw/afu/opae/ccip_interface_reg.sv
+++ b/hw/afu/opae/ccip_interface_reg.sv
--- a/hw/afu/opae/ccip_std_afu.sv
+++ b/hw/afu/opae/ccip_std_afu.sv
--- a/hw/afu/opae/vortex_afu.sv
+++ b/hw/afu/opae/vortex_afu.sv
--- a/hw/afu/opae/vortex_afu.vh
+++ b/hw/afu/opae/vortex_afu.vh
--- a/hw/afu/xrt/vortex_afu.v
+++ b/hw/afu/xrt/vortex_afu.v
@ -0,0 +1,109 @@
+`include "VX_define.vh"

+// Top-level AFU wrapper for the XRT flow.
+//
+// Every port of this module is wired one-to-one to the same-named port of a
+// single Vortex_axi instance, and every parameter is passed through unchanged.
+// The AXI_DCR_* widths are forwarded as parameters only; this wrapper declares
+// no DCR ports of its own.
+module vortex_afu #(
+    parameter AXI_DATA_WIDTH     = `VX_MEM_DATA_WIDTH,
+    parameter AXI_ADDR_WIDTH     = `VX_MEM_ADDR_WIDTH,
+    parameter AXI_TID_WIDTH      = 12,
+    parameter AXI_STROBE_WIDTH   = `VX_MEM_BYTEEN_WIDTH,
+    parameter AXI_DCR_ADDR_WIDTH = `VX_DCR_ADDR_WIDTH,
+    parameter AXI_DCR_DATA_WIDTH = `VX_DCR_DATA_WIDTH
+) (
+    input  wire                          clk,
+    input  wire                          reset,
+    output wire [AXI_TID_WIDTH-1:0]      m_axi_awid,
+    output wire [AXI_ADDR_WIDTH-1:0]     m_axi_awaddr,
+    output wire [7:0]                    m_axi_awlen,
+    output wire [2:0]                    m_axi_awsize,
+    output wire [1:0]                    m_axi_awburst,
+    output wire                          m_axi_awlock,
+    output wire [3:0]                    m_axi_awcache,
+    output wire [2:0]                    m_axi_awprot,
+    output wire [3:0]                    m_axi_awqos,
+    output wire                          m_axi_awvalid,
+    input  wire                          m_axi_awready,
+    output wire [AXI_DATA_WIDTH-1:0]     m_axi_wdata,
+    output wire [AXI_STROBE_WIDTH-1:0]   m_axi_wstrb,
+    output wire                          m_axi_wlast,
+    output wire                          m_axi_wvalid,
+    input  wire                          m_axi_wready,
+    input  wire [AXI_TID_WIDTH-1:0]      m_axi_bid,
+    input  wire [1:0]                    m_axi_bresp,
+    input  wire                          m_axi_bvalid,
+    output wire                          m_axi_bready,
+    output wire [AXI_TID_WIDTH-1:0]      m_axi_arid,
+    output wire [AXI_ADDR_WIDTH-1:0]     m_axi_araddr,
+    output wire [7:0]                    m_axi_arlen,
+    output wire [2:0]                    m_axi_arsize,
+    output wire [1:0]                    m_axi_arburst,
+    output wire                          m_axi_arlock,
+    output wire [3:0]                    m_axi_arcache,
+    output wire [2:0]                    m_axi_arprot,
+    output wire [3:0]                    m_axi_arqos,
+    output wire                          m_axi_arvalid,
+    input  wire                          m_axi_arready,
+    input  wire [AXI_TID_WIDTH-1:0]      m_axi_rid,
+    input  wire [AXI_DATA_WIDTH-1:0]     m_axi_rdata,
+    input  wire [1:0]                    m_axi_rresp,
+    input  wire                          m_axi_rlast,
+    input  wire                          m_axi_rvalid,
+    output wire                          m_axi_rready,
+    output wire                          busy
+);

+    // Pure pass-through: no logic is added between the wrapper pins and the core.
+    Vortex_axi #(
+        .AXI_DATA_WIDTH     (AXI_DATA_WIDTH),
+        .AXI_ADDR_WIDTH     (AXI_ADDR_WIDTH),
+        .AXI_TID_WIDTH      (AXI_TID_WIDTH),
+        .AXI_STROBE_WIDTH   (AXI_STROBE_WIDTH),
+        .AXI_DCR_ADDR_WIDTH (AXI_DCR_ADDR_WIDTH),
+        .AXI_DCR_DATA_WIDTH (AXI_DCR_DATA_WIDTH)
+    ) inst (
+        .clk           (clk),
+        .reset         (reset),
+        .m_axi_awid    (m_axi_awid),
+        .m_axi_awaddr  (m_axi_awaddr),
+        .m_axi_awlen   (m_axi_awlen),
+        .m_axi_awsize  (m_axi_awsize),
+        .m_axi_awburst (m_axi_awburst),
+        .m_axi_awlock  (m_axi_awlock),
+        .m_axi_awcache (m_axi_awcache),
+        .m_axi_awprot  (m_axi_awprot),
+        .m_axi_awqos   (m_axi_awqos),
+        .m_axi_awvalid (m_axi_awvalid),
+        .m_axi_awready (m_axi_awready),
+        .m_axi_wdata   (m_axi_wdata),
+        .m_axi_wstrb   (m_axi_wstrb),
+        .m_axi_wlast   (m_axi_wlast),
+        .m_axi_wvalid  (m_axi_wvalid),
+        .m_axi_wready  (m_axi_wready),
+        .m_axi_bid     (m_axi_bid),
+        .m_axi_bresp   (m_axi_bresp),
+        .m_axi_bvalid  (m_axi_bvalid),
+        .m_axi_bready  (m_axi_bready),
+        .m_axi_arid    (m_axi_arid),
+        .m_axi_araddr  (m_axi_araddr),
+        .m_axi_arlen   (m_axi_arlen),
+        .m_axi_arsize  (m_axi_arsize),
+        .m_axi_arburst (m_axi_arburst),
+        .m_axi_arlock  (m_axi_arlock),
+        .m_axi_arcache (m_axi_arcache),
+        .m_axi_arprot  (m_axi_arprot),
+        .m_axi_arqos   (m_axi_arqos),
+        .m_axi_arvalid (m_axi_arvalid),
+        .m_axi_arready (m_axi_arready),
+        .m_axi_rid     (m_axi_rid),
+        .m_axi_rdata   (m_axi_rdata),
+        .m_axi_rresp   (m_axi_rresp),
+        .m_axi_rlast   (m_axi_rlast),
+        .m_axi_rvalid  (m_axi_rvalid),
+        .m_axi_rready  (m_axi_rready),
+        .busy          (busy)
+    );

+endmodule
--- a/hw/rtl/fpu_unit/altera/arria10/acl_fdiv.sv
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_fdiv.sv
--- a/hw/rtl/fpu_unit/altera/arria10/acl_fdiv_memoryC0_uid112_invTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_fdiv_memoryC0_uid112_invTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/arria10/acl_fdiv_memoryC1_uid115_invTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_fdiv_memoryC1_uid115_invTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/arria10/acl_fdiv_memoryC2_uid118_invTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_fdiv_memoryC2_uid118_invTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/arria10/acl_fmadd.sv
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_fmadd.sv
--- a/hw/rtl/fpu_unit/altera/arria10/acl_fsqrt.sv
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_fsqrt.sv
--- a/hw/rtl/fpu_unit/altera/arria10/acl_fsqrt_memoryC0_uid62_sqrtTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_fsqrt_memoryC0_uid62_sqrtTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/arria10/acl_fsqrt_memoryC1_uid65_sqrtTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_fsqrt_memoryC1_uid65_sqrtTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/arria10/acl_fsqrt_memoryC2_uid68_sqrtTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_fsqrt_memoryC2_uid68_sqrtTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/arria10/acl_gen.log
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_gen.log
--- a/hw/rtl/fpu_unit/altera/arria10/acl_gen.sh
+++ b/hw/rtl/fpu_unit/altera/arria10/acl_gen.sh
--- a/hw/rtl/fpu_unit/altera/arria10/dspba_delay_ver.sv
+++ b/hw/rtl/fpu_unit/altera/arria10/dspba_delay_ver.sv
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_fdiv.sv
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_fdiv.sv
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_fdiv_memoryC0_uid146_invTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_fdiv_memoryC0_uid146_invTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_fdiv_memoryC1_uid149_invTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_fdiv_memoryC1_uid149_invTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_fdiv_memoryC2_uid152_invTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_fdiv_memoryC2_uid152_invTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_fmadd.sv
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_fmadd.sv
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_fsqrt.sv
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_fsqrt.sv
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_fsqrt_memoryC0_uid88_sqrtTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_fsqrt_memoryC0_uid88_sqrtTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_fsqrt_memoryC1_uid91_sqrtTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_fsqrt_memoryC1_uid91_sqrtTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_fsqrt_memoryC2_uid94_sqrtTables_lutmem.hex
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_fsqrt_memoryC2_uid94_sqrtTables_lutmem.hex
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_gen.log
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_gen.log
--- a/hw/rtl/fpu_unit/altera/stratix10/acl_gen.sh
+++ b/hw/rtl/fpu_unit/altera/stratix10/acl_gen.sh
--- a/hw/rtl/fpu_unit/altera/stratix10/dspba_delay_ver.sv
+++ b/hw/rtl/fpu_unit/altera/stratix10/dspba_delay_ver.sv
--- a/hw/syn/altera/opae/.gitignore
+++ b/hw/syn/altera/opae/.gitignore
--- a/hw/syn/altera/opae/Makefile
+++ b/hw/syn/altera/opae/Makefile
@ -2,6 +2,8 @@ DEVICE_FAMILY ?= arria10
 BUILD_DIR ?= build
 RTL_DIR = ../../rtl
 DPI_DIR = ../../dpi
+AFU_DIR = ../../afu/opae
+IP_DIR  = ../ip/$(DEVICE_FAMILY)

 ifeq ($(shell which qsub-synth),)
 	RUN_SYNTH=$(OPAE_PLATFORM_ROOT)/bin/run.sh > build.log 2>&1 &
@ -39,12 +41,12 @@ CONFIG16 := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE $(CONFIGS)
 CONFIG32 := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE $(CONFIGS)
 CONFIG64 := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE $(CONFIGS)

-FPU_INCLUDE = -I$(RTL_DIR)/fpu_unit -I$(RTL_DIR)/fpu_unit/altera/$(DEVICE_FAMILY)
+FPU_INCLUDE = -I$(RTL_DIR)/fpu_unit
 TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
 RASTER_INCLUDE = -I$(RTL_DIR)/raster_unit
 ROP_INCLUDE = -I$(RTL_DIR)/rop_unit
-RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/afu $(FPU_INCLUDE) 
-RTL_INCLUDE += $(TEX_INCLUDE) $(RASTER_INCLUDE) $(ROP_INCLUDE)
+RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(AFU_DIR) -I$(IP_DIR)
+RTL_INCLUDE += $(FPU_INCLUDE) $(TEX_INCLUDE) $(RASTER_INCLUDE) $(ROP_INCLUDE)

 CFLAGS += $(RTL_INCLUDE)

--- a/hw/syn/altera/opae/README
+++ b/hw/syn/altera/opae/README
--- a/hw/syn/altera/opae/fpga_prog.sh
+++ b/hw/syn/altera/opae/fpga_prog.sh
--- a/hw/syn/altera/opae/gen_sources.sh
+++ b/hw/syn/altera/opae/gen_sources.sh
--- a/hw/syn/altera/opae/run_ase.sh
+++ b/hw/syn/altera/opae/run_ase.sh
@ -7,7 +7,7 @@ BUILD_DIR=$1
 PROGRAM=$(basename "$2")
 PROGRAM_DIR=`dirname $2`

-VORTEX_DRV_PATH=$SCRIPT_DIR/../../../driver
+VORTEX_RT_PATH=$SCRIPT_DIR/../../../runtime

 # Export ASE_WORKDIR variable
 export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work
@ -35,5 +35,5 @@ done
 # run application
 pushd $PROGRAM_DIR
 echo "  [DBG]  running ./$PROGRAM $*"
-ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_DRV_PATH/asesim:$LD_LIBRARY_PATH ./$PROGRAM $*
+ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/asesim:$LD_LIBRARY_PATH ./$PROGRAM $*
 popd
--- a/hw/syn/altera/opae/setup.cfg
+++ b/hw/syn/altera/opae/setup.cfg
--- a/hw/syn/altera/opae/vortex_afu.json
+++ b/hw/syn/altera/opae/vortex_afu.json
--- a/hw/syn/altera/opae/vortex_afu.qsf
+++ b/hw/syn/altera/opae/vortex_afu.qsf
--- a/hw/syn/altera/quartus/.gitignore
+++ b/hw/syn/altera/quartus/.gitignore
--- a/hw/syn/altera/quartus/Makefile
+++ b/hw/syn/altera/quartus/Makefile
--- a/hw/syn/altera/quartus/cache/Makefile
+++ b/hw/syn/altera/quartus/cache/Makefile
--- a/hw/syn/altera/quartus/core/Makefile
+++ b/hw/syn/altera/quartus/core/Makefile
@ -6,15 +6,15 @@ THIRD_PARTY_DIR = ../../../../../third_party

 FAMILY = "Arria 10"
 DEVICE = 10AX115N3F40E2SG
-FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
+IP_DIR = ../../ip/arria10

 #FAMILY = "Stratix 10"
 #DEVICE = 1SX280HN2F43E2VG
-#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
+#IP_DIR = ../../ip/stratix10

-FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
+FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
 TEX_INCLUDE = $(RTL_DIR)/tex_unit
-RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
+RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)

 PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

--- a/hw/syn/altera/quartus/fpu_core/Makefile
+++ b/hw/syn/altera/quartus/fpu_core/Makefile
@ -6,14 +6,14 @@ THIRD_PARTY_DIR = ../../../../../third_party

 FAMILY = "Arria 10"
 DEVICE = 10AX115N3F40E2SG
-FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
+IP_DIR = ../../ip/arria10

 #FAMILY = "Stratix 10"
 #DEVICE = 1SX280HN2F43E2VG
-#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
+#IP_DIR = ../../ip/stratix10

-FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
-RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces
+FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
+RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(IP_DIR)
 PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

 # Part, Family
--- a/hw/syn/altera/quartus/project.sdc
+++ b/hw/syn/altera/quartus/project.sdc
--- a/hw/syn/altera/quartus/project.tcl
+++ b/hw/syn/altera/quartus/project.tcl
--- a/hw/syn/altera/quartus/smem/Makefile
+++ b/hw/syn/altera/quartus/smem/Makefile
--- a/hw/syn/altera/quartus/test/Makefile
+++ b/hw/syn/altera/quartus/test/Makefile
@ -8,15 +8,15 @@ CONFIGS += -set "EXT_GFX_ENABLE"

 FAMILY = "Arria 10"
 DEVICE = 10AX115N3F40E2SG
-FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
+IP_DIR = ../../ip/arria10

 #FAMILY = "Stratix 10"
 #DEVICE = 1SX280HN2F43E2VG
-#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
+#IP_DIR = ../../ip/stratix10

-FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
+FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
 GFX_INCLUDE = $(RTL_DIR)/tex_unit;$(RTL_DIR)/raster_unit;$(RTL_DIR)/rop_unit
-RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(GFX_INCLUDE)
+RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(GFX_INCLUDE)

 PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

--- a/hw/syn/altera/quartus/texunit/Makefile
+++ b/hw/syn/altera/quartus/texunit/Makefile
@ -6,15 +6,15 @@ THIRD_PARTY_DIR = ../../../../../third_party

 FAMILY = "Arria 10"
 DEVICE = 10AX115N3F40E2SG
-FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
+IP_DIR = ../../ip/arria10

 #FAMILY = "Stratix 10"
 #DEVICE = 1SX280HN2F43E2VG
-#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
+#IP_DIR = ../../ip/stratix10

-FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
+FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
 TEX_INCLUDE = $(RTL_DIR)/tex_unit
-RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
+RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)

 PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

--- a/hw/syn/altera/quartus/timing-html.tcl
+++ b/hw/syn/altera/quartus/timing-html.tcl
--- a/hw/syn/altera/quartus/top-gfx/Makefile
+++ b/hw/syn/altera/quartus/top-gfx/Makefile
@ -2,15 +2,16 @@ PROJECT = vortex_afu
 TOP_LEVEL_ENTITY = vortex_afu
 SRC_FILE = vortex_afu.sv
 RTL_DIR = ../../../../rtl
+AFU_DIR = ../../../../afu/opae
 THIRD_PARTY_DIR = ../../../../../third_party

 FAMILY = "Arria 10"
 DEVICE = 10AX115N3F40E2SG
-FPU_CORE_PATH=$(RTL_DIR)/fpu_unit/altera/arria10
+IP_DIR = ../../ip/arria10

 #FAMILY = "Stratix 10"
 #DEVICE = 1SX280HN2F43E2VG
-#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
+#IP_DIR = ../../ip/stratix10

 CONFIGS += -set "NOPAE"
 CONFIGS += -set "EXT_GFX_ENABLE"
@ -21,9 +22,9 @@ CONFIGS += -set "NUM_CORES=4"
 #CONFIGS += -set "SM_DISABLE"
 #CONFIGS += -set "RCACHE_DISABLE" -set "OCACHE_DISABLE" -set "TCACHE_DISABLE"

-FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
+FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
 TEX_INCLUDE = $(RTL_DIR)/tex_unit;$(RTL_DIR)/raster_unit;$(RTL_DIR)/rop_unit
-RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
+RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(AFU_DIR);$(AFU_DIR)/ccip;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)

 PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

--- a/hw/syn/altera/quartus/top/Makefile
+++ b/hw/syn/altera/quartus/top/Makefile
@ -2,15 +2,16 @@ PROJECT = vortex_afu
 TOP_LEVEL_ENTITY = vortex_afu
 SRC_FILE = vortex_afu.sv
 RTL_DIR = ../../../../rtl
+AFU_DIR = ../../../../afu/opae
 THIRD_PARTY_DIR = ../../../../../third_party

 FAMILY = "Arria 10"
 DEVICE = 10AX115N3F40E2SG
-FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
+IP_DIR = ../../ip/arria10

 #FAMILY = "Stratix 10"
 #DEVICE = 1SX280HN2F43E2VG
-#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
+#IP_DIR = ../../ip/stratix10

 CONFIGS += -set "NOPAE"

@ -19,9 +20,9 @@ CONFIGS += -set "NUM_CORES=4"
 #CONFIGS += -set "L1_DISABLE" 
 #CONFIGS += -set "SM_DISABLE"

-FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
+FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
 TEX_INCLUDE = $(RTL_DIR)/tex_unit
-RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
+RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(AFU_DIR);$(AFU_DIR)/ccip;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)

 PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

--- a/hw/syn/altera/quartus/unittest/Makefile
+++ b/hw/syn/altera/quartus/unittest/Makefile
@ -6,15 +6,15 @@ THIRD_PARTY_DIR = ../../../../../third_party

 FAMILY = "Arria 10"
 DEVICE = 10AX115N3F40E2SG
-FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
+IP_DIR = ../../ip/arria10

 #FAMILY = "Stratix 10"
 #DEVICE = 1SX280HN2F43E2VG
-#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10 
+#IP_DIR = ../../ip/stratix10

-FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
+FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
 TEX_INCLUDE = $(RTL_DIR)/tex_unit
-RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
+RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)

 PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

--- a/hw/syn/altera/quartus/vortex-gfx/Makefile
+++ b/hw/syn/altera/quartus/vortex-gfx/Makefile
@ -6,11 +6,11 @@ THIRD_PARTY_DIR = ../../../../../third_party

 FAMILY = "Arria 10"
 DEVICE = 10AX115N3F40E2SG
-FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
+IP_DIR = ../../ip/arria10

 #FAMILY = "Stratix 10"
 #DEVICE = 1SX280HN2F43E2VG
-#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
+#IP_DIR = ../../ip/stratix10

 CONFIGS += -set "EXT_GFX_ENABLE"

@ -26,9 +26,9 @@ CONFIGS += -set "EXT_GFX_ENABLE"

 CONFIGS +=  -set "NUM_CORES=4"

-FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
+FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
 GFX_INCLUDE = $(RTL_DIR)/tex_unit;$(RTL_DIR)/raster_unit;$(RTL_DIR)/rop_unit
-RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(GFX_INCLUDE)
+RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(GFX_INCLUDE)

 PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

--- a/hw/syn/altera/quartus/vortex/Makefile
+++ b/hw/syn/altera/quartus/vortex/Makefile
@ -6,11 +6,11 @@ THIRD_PARTY_DIR = ../../../../../third_party

 FAMILY = "Arria 10"
 DEVICE = 10AX115N3F40E2SG
-FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10
+IP_DIR = ../../ip/arria10

 #FAMILY = "Stratix 10"
 #DEVICE = 1SX280HN2F43E2VG
-#FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/stratix10
+#IP_DIR = ../../ip/stratix10

 #CONFIGS += -set "L1_DISABLE"

@ -22,9 +22,9 @@ FPU_CORE_PATH = $(RTL_DIR)/fpu_unit/altera/arria10

 CONFIGS += -set "NUM_CORES=4"

-FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
+FPU_INCLUDE = $(RTL_DIR)/fpu_unit;$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
 TEX_INCLUDE = $(RTL_DIR)/tex_unit
-RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
+RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(IP_DIR);$(FPU_INCLUDE);$(TEX_INCLUDE)

 PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

--- a/hw/syn/xilinx/ip/ultrascale/fdiv.v
+++ b/hw/syn/xilinx/ip/ultrascale/fdiv.v
--- a/hw/syn/xilinx/ip/ultrascale/fmadd.v
+++ b/hw/syn/xilinx/ip/ultrascale/fmadd.v
--- a/hw/syn/xilinx/ip/ultrascale/fsqrt.v
+++ b/hw/syn/xilinx/ip/ultrascale/fsqrt.v
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
--- a/kernel/Makefile
+++ b/kernel/Makefile
@ -0,0 +1,72 @@
+# Cross-compiles the sources under ./src with a RISC-V GCC toolchain into the
+# static archive $(PROJECT).a and writes a disassembly of it to $(PROJECT).dump.
+#
+# Tunables (override on the command line or in the environment):
+#   XLEN                  32 (default) selects -march=rv32imf -mabi=ilp32f;
+#                         any other value selects -march=rv64imfd -mabi=lp64d.
+#   RISCV_TOOLCHAIN_PATH  toolchain install root; its default depends on XLEN.
+#   RISCV_PREFIX          tool name prefix; default riscv$(XLEN)-unknown-elf-.

+XLEN ?= 32

+ifeq ($(XLEN),32)
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+else
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+endif

+RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-

+CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc
+AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc-ar
+DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objdump
+CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objcopy

+ifeq ($(XLEN),32)
+CFLAGS += -march=rv32imf -mabi=ilp32f
+else
+CFLAGS += -march=rv64imfd -mabi=lp64d
+endif

+CFLAGS += -O3 -mcmodel=medany -fno-exceptions -fdata-sections -ffunction-sections
+CFLAGS += -I./include -I../hw

+PROJECT = libvortexrt

+SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c

+# Objects are written to this directory and keep the full source file name
+# (foo.c -> foo.c.o, foo.S -> foo.S.o), so same-stem C and assembly sources
+# such as vx_print.c and vx_print.S map to distinct object files.
+OBJS := $(addsuffix .o, $(notdir $(SRCS)))

+# Delete a target whose recipe fails. Without this, a failing objdump would
+# leave a truncated $(PROJECT).dump behind that make then treats as up to date.
+.DELETE_ON_ERROR:

+# all and clean are commands, not files; a stray file with either name in this
+# directory must not stop them from running.
+.PHONY: all clean

+all: $(PROJECT).a $(PROJECT).dump

+$(PROJECT).dump: $(PROJECT).a
+	$(DP) -D $< > $@

+%.S.o: src/%.S
+	$(CC) $(CFLAGS) -c $< -o $@

+%.c.o: src/%.c
+	$(CC) $(CFLAGS) -c $< -o $@

+$(PROJECT).a: $(OBJS)
+	$(AR) rcs $@ $^

+# NOTE(review): nothing in this file includes .depend, so header changes do not
+# trigger rebuilds. gcc -MM presumably names its targets <stem>.o, which would
+# not match the <name>.c.o / <name>.S.o objects above - confirm before wiring it in.
+.depend: $(SRCS)
+	$(CC) $(CFLAGS) -MM $^ > .depend;

+clean:
+	rm -rf *.a *.o *.dump .depend
--- a/runtime/include/vx_intrinsics.h
+++ b/runtime/include/vx_intrinsics.h
--- a/runtime/include/vx_print.h
+++ b/runtime/include/vx_print.h
--- a/runtime/include/vx_spawn.h
+++ b/runtime/include/vx_spawn.h
--- a/runtime/linker/vx_link32.ld
+++ b/runtime/linker/vx_link32.ld
--- a/runtime/linker/vx_link64.ld
+++ b/runtime/linker/vx_link64.ld
--- a/runtime/src/tinyprintf.c
+++ b/runtime/src/tinyprintf.c
--- a/runtime/src/tinyprintf.h
+++ b/runtime/src/tinyprintf.h
--- a/runtime/src/vx_perf.c
+++ b/runtime/src/vx_perf.c
--- a/runtime/src/vx_print.S
+++ b/runtime/src/vx_print.S
--- a/runtime/src/vx_print.c
+++ b/runtime/src/vx_print.c
--- a/runtime/src/vx_spawn.S
+++ b/runtime/src/vx_spawn.S
--- a/runtime/src/vx_spawn.c
+++ b/runtime/src/vx_spawn.c
--- a/runtime/src/vx_start.S
+++ b/runtime/src/vx_start.S
--- a/runtime/src/vx_syscalls.c
+++ b/runtime/src/vx_syscalls.c
--- a/miscs/rvvector/basic/Makefile
+++ b/miscs/rvvector/basic/Makefile
@ -1,41 +0,0 @@
-
-LIB_PATH = ../../runtime
-
-
-COMP     = /home/fares/dev/riscv-gnu-toolchain-vector/drops/bin/riscv32-unknown-elf-gcc
-
-CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/startup/vx_link.ld -march=rv32imv -mabi=ilp32
-
-DMP  = /home/fares/dev/riscv-gnu-toolchain-vector/drops/bin/riscv32-unknown-elf-objdump
-CPY  = /home/fares/dev/riscv-gnu-toolchain-vector/drops/bin/riscv32-unknown-elf-objcopy
-
-# VX_STR  = ../../startup/vx_start.S
-
-
-
-NEWLIB  = $(LIB_PATH)/newlib/newlib.c
-VX_STR  = $(LIB_PATH)/startup/vx_start.S
-VX_INT  = $(LIB_PATH)/intrinsics/vx_intrinsics.S
-VX_IO   = $(LIB_PATH)/io/vx_io.S $(LIB_PATH)/io/vx_io.c
-VX_API  = $(LIB_PATH)/vx_api/vx_api.c
-VX_FIO  = $(LIB_PATH)/fileio/fileio.S
-VX_VEC  = vx_vec.s
-LIBS    = /home/fares/dev/riscv-gnu-toolchain-vector/drops/riscv32-unknown-elf/lib/libc.a /home/fares/dev/riscv-gnu-toolchain-vector/drops/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
-
-VX_MAIN = vx_vector_main
-
-all: HEX DUMP ELF
-
-DUMP: ELF
-	$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump
-
-HEX: ELF
-	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
-
-ELF: 
-	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude  -o $(VX_MAIN).elf
-
-run:
-	../../simx/obj_dir/Vcache_simX -E -a rv32i --core vx_vector_main.hex  -s -b 1> emulator.debug
-
-
--- a/miscs/rvvector/basic/_1_vx_vec.s
+++ b/miscs/rvvector/basic/_1_vx_vec.s
@ -1,30 +0,0 @@
-
-
-
-.type vx_vec_test, @function
-.global vx_vec_test
-vx_vec_test:
-	li a1, 7
-	sw a1, 0(a0)
-	ret
-
-
-
-
-# 	slli a0, a0, 2
-# 	add a0, a0, a3
-# 	vmv.v.x vv0, a2
-# 	# vsplat4 vv0, a2
-# stripmine_loop:
-# 	vlb4 vv1, (a1)
-# 	vcmpez4 vp0, vv1
-# 	!vp0 vlw4 vv1, (a3)
-# 	!vp0 vlw4 vv2, (a4)
-# 	!vp0 vfma4 vv1, vv0, vv1, vv2
-# 	!vp0 vsw4 vv1, (a4)
-# 	addi a1, a1, 4
-# 	addi a3, a3, 16
-# 	addi a4, a4, 16
-# 	bleu a3, a0, stripmine_loop
-	# handle edge cases
-	# when (n % 4) != 0 ...
--- a/miscs/rvvector/basic/_1_vx_vector_main.c
+++ b/miscs/rvvector/basic/_1_vx_vector_main.c
@ -1,32 +0,0 @@
-
-#include "../../runtime/intrinsics/vx_intrinsics.h"
-#include "vx_vec.h"
-
-int main()
-{
-	vx_tmc(1);
-	// int * a = malloc(4);
-	// int * b = malloc(4);
-	// int * c = malloc(4);
-
-
-	int * a = malloc(4);
-	*a = 5;
-	printf("Value of a: %d\n", *a);
-
-	vx_vec_test(a);
-
-	printf("Value of a: %d\n", *a);
-
-
-	// for (int i = 0; i < 4; i++)
-	// {
-	// 	if (c[i] != (a[i] + b[i]))
-	// 	{
-	// 		printf("Fail\n");
-	// 		break;
-	// 	}
-	// }
-
-	vx_tmc(0);
-}
--- a/miscs/rvvector/basic/__vx_vector_main.c
+++ b/miscs/rvvector/basic/__vx_vector_main.c
@ -1,91 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "../../runtime/intrinsics/vx_intrinsics.h"
-#include "vx_vec.h"
-
-int main()
-{
-	vx_tmc(1);
-#if 0
-    # vector-vector add routine of 32-bit integers
-    # void vvaddint32(size_t n, const int*x, const int*y, int*z)
-    # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
-    #
-    # a0 = n, a1 = x, a2 = y, a3 = z
-    # Non-vector instructions are indented
-#endif   
-#if 1      
-        int n = 5;
-        int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
-        int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
-        int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
-
-        for(int i = 0; i < n; ++i)
-        {
-           a[i] = b[i] = c[i] = 1;
-        }
-
-        for(int i = 0; i < n; ++i) printf("%d, ", a[i]);
-        printf("\n");
-//        for(int i = 0; i < n; ++i) printf("%d, ", b[i]);
-//        printf("\n");
-//        for(int i = 0; i < n; ++i) printf("%d, ", c[i]);
-
-        int *d;
-        *d = 1;
-	vx_vec_test(n, d, b, c);
-
-
-        printf("(after: n = %d, %d)\n", n, *d);
-        for(int i = 0; i < n; ++i) printf("%d, ", a[i]);
-//        printf("\n");
-//        for(int i = 0; i < n; ++i) printf("%d, ", b[i]);
-//        printf("\n");
-//        for(int i = 0; i < n; ++i) printf("%d, ", c[i]);
-
-#endif
-#if 0
-	int * a = malloc(sizeof(int) * 10);
-	for(int i = 0; i < 10; ++i) a[i] = 5;
-   
-       
-	for(int i = 0; i < 10; ++i)
-	    printf("%d, ", a[i]);
-
-	vx_vec_test(a);
-	//vx_vec_test(2, a, a, a);
-
-	printf("after--------\n");
-        for(int i = 0; i < 10; ++i) 
-            printf("%d, ", a[i]);
-#endif
-#if 0
-        int n = 5;
-        int *a = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};
-        int *b = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};
-        int *c = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1}; 
-        
-        for(int i = 0; i < n; ++i)
-        {
-            a[i] = 1; 
-            b[i] = 1;
-            c[i] = 0;
-        }
-
-        printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);
-        vx_vec_test(n, a, b, c);
-        printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);
-        
-#endif
-
-	// for (int i = 0; i < 4; i++)
-	// {
-	// 	if (c[i] != (a[i] + b[i]))
-	// 	{
-	// 		printf("Fail\n");
-	// 		break;
-	// 	}
-	// }
-
-	vx_tmc(0);
-}
--- a/miscs/rvvector/basic/vx_vec.h
+++ b/miscs/rvvector/basic/vx_vec.h
@ -1,15 +0,0 @@
-
-
-#pragma once
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Vector-vector add of 32-bit integers (the "vvaddint32" routine in vx_vec.s):
-// computes c[i] = a[i] + b[i] for every i in [0, n).
-void vx_vec_test(int n, int* a, int* b, int* c);
-
-
-#ifdef __cplusplus
-}
-#endif
--- a/miscs/rvvector/basic/vx_vec.s
+++ b/miscs/rvvector/basic/vx_vec.s
@ -1,23 +0,0 @@
-
-
-.type vx_vec_test, @function
-.global vx_vec_test
-vx_vec_test:
-# Vector-vector add routine of 32-bit integers.
-# void vvaddint32(size_t n, const int*x, const int*y, int*z)
-# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
-#
-# a0 = n, a1 = x, a2 = y, a3 = z
-# The arrays are processed in chunks: each pass re-executes vsetvli (the
-# loop branches back to the function entry), so t0 is recomputed from the
-# remaining count every time.
-# Non-vector instructions are indented
-    vsetvli t0, a0, e32 # t0 = number of 32-bit elements handled this pass
-    vlw.v v0, (a1)           # Get first vector (chunk of x)
-      sub a0, a0, t0         # Decrement remaining count by the number done
-      slli t0, t0, 2         # Convert number done into a byte offset (x4)
-      add a1, a1, t0         # Bump x pointer
-    vlw.v v1, (a2)           # Get second vector (chunk of y)
-      add a2, a2, t0         # Bump y pointer
-    vadd.vv v2, v0, v1        # Sum vectors
-    vsw.v v2, (a3)           # Store result into z
-      add a3, a3, t0         # Bump z pointer
-      bnez a0, vx_vec_test   # Loop back to vsetvli while elements remain
-    ret                    # Finished
--- a/miscs/rvvector/basic/vx_vec_main.c
+++ b/miscs/rvvector/basic/vx_vec_main.c
@ -1,27 +0,0 @@
-#include "../../runtime/intrinsics/vx_intrinsics.h"
-#include "vx_vec.h"
-
-// Smoke test for vx_vec_test: fills three n-element buffers with ones, runs
-// the vector add (c = a + b) and prints the contents of c.
-// Returns 0 on success, 1 if one of the buffers could not be allocated.
-int main()
-{
-    int status = 0;
-
-    vx_tmc(1);
-    printf("----------------hello!!! \n");
-
-    int n = 8;
-    int *a = (int*)malloc(sizeof(int) * n);
-    int *b = (int*)malloc(sizeof(int) * n);
-    int *c = (int*)malloc(sizeof(int) * n);
-
-    printf("hello!!! \n");
-
-    if (a && b && c)
-    {
-        for (int i = 0; i < n; ++i)
-        {
-            a[i] = b[i] = c[i] = 1;
-        }
-
-        vx_vec_test(n, a, b, c);
-
-        for (int i = 0; i < n; ++i)
-            printf("%d ", c[i]);
-    }
-    else
-    {
-        // Never hand a NULL buffer to the vector kernel.
-        printf("malloc failed\n");
-        status = 1;
-    }
-
-    // free(NULL) is a no-op, so this is safe on the failure path as well.
-    free(a);
-    free(b);
-    free(c);
-
-    vx_tmc(0);
-    return status;
-}
--- a/miscs/rvvector/basic/vx_vector_main.c
+++ b/miscs/rvvector/basic/vx_vector_main.c
@ -1,29 +0,0 @@
-#include "../../runtime/intrinsics/vx_intrinsics.h"
-#include "vx_vec.h"
-
-// Smoke test for vx_vec_test on 64 elements: fills a, b and c with ones,
-// runs the vector add (c = a + b) and prints all three arrays element by
-// element.
-// Returns 0 on success, 1 if one of the buffers could not be allocated.
-int main()
-{
-    int status = 0;
-
-    vx_tmc(1);
-
-    printf("Hello\n");
-
-    int n = 64;
-    int *a = (int*)malloc(sizeof(int) * n);
-    int *b = (int*)malloc(sizeof(int) * n);
-    int *c = (int*)malloc(sizeof(int) * n);
-
-    if (a && b && c)
-    {
-        for (int i = 0; i < n; ++i)
-        {
-            a[i] = b[i] = c[i] = 1;
-        }
-
-        vx_vec_test(n, a, b, c);
-
-        for (int i = 0; i < n; ++i)
-        {
-            printf("a[%d]=%d, b[%d]=%d, c[%d]=%d\n", i, a[i], i, b[i], i, c[i]);
-        }
-    }
-    else
-    {
-        // Never hand a NULL buffer to the vector kernel.
-        printf("malloc failed\n");
-        status = 1;
-    }
-
-    // free(NULL) is a no-op, so this is safe on the failure path as well.
-    free(a);
-    free(b);
-    free(c);
-
-    vx_tmc(0);
-    return status;
-}
--- a/miscs/rvvector/benchmark_temp/Makefile
+++ b/miscs/rvvector/benchmark_temp/Makefile
@ -1,39 +0,0 @@
-# Builds the RISC-V vector benchmark: an ELF image, an Intel-HEX copy of it
-# and a full disassembly.
-#
-# The toolchain prefix defaults to the original developer's install location
-# and can be overridden without editing this file, e.g.
-#   make RISCV_TOOLCHAIN=/opt/riscv
-RISCV_TOOLCHAIN ?= /home/priya/dev/riscv_vec/riscv-gnu
-
-# Remove a half-written output (e.g. the redirected .dump) when a recipe fails.
-.DELETE_ON_ERROR:
-
-LIB_PATH = ../../runtime
-
-COMP     = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-gcc
-
-CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/startup/vx_link.ld -march=rv32imv -mabi=ilp32
-
-DMP  = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-objdump
-CPY  = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-objcopy
-
-NEWLIB  = $(LIB_PATH)/newlib/newlib.c
-VX_STR  = $(LIB_PATH)/startup/vx_start.S
-VX_INT  = $(LIB_PATH)/intrinsics/vx_intrinsics.S
-VX_IO   = $(LIB_PATH)/io/vx_io.S $(LIB_PATH)/io/vx_io.c
-VX_API  = $(LIB_PATH)/vx_api/vx_api.c
-VX_FIO  = $(LIB_PATH)/fileio/fileio.S
-
-# Available vector kernels. Comments are kept on their own lines so that no
-# trailing blanks end up inside the variable values.
-VX_VEC1  = vx_vec_vvaddint32.s
-# float --> int
-VX_VEC2  = vx_vec_saxpy.s
-# float --> int
-VX_VEC3  = vx_vec_sgemm_float.s
-VX_VEC4  = vx_vec_vsadd.s
-VX_VEC5  = vx_vec_memcpy.s
-
-# Kernel linked into the benchmark. Defaults to saxpy (as before); pick
-# another one with e.g. `make VX_VEC='$$(VX_VEC4)'` or `make VX_VEC=vx_vec_vsadd.s`.
-VX_VEC  ?= $(VX_VEC2)
-
-LIBS    = $(RISCV_TOOLCHAIN)/riscv32-unknown-elf/lib/libc.a $(RISCV_TOOLCHAIN)/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
-
-VX_MAIN = vx_vec_benchmark
-
-# Everything that is passed to the compiler, in the original link order.
-SRCS    = $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c
-
-# The historical target names are commands, not files.
-.PHONY: all ELF HEX DUMP clean
-
-all: HEX DUMP ELF
-
-ELF: $(VX_MAIN).elf
-
-HEX: $(VX_MAIN).hex
-
-DUMP: $(VX_MAIN).dump
-
-$(VX_MAIN).elf: $(SRCS) $(VX_MAIN).h
-	$(COMP) $(CC_FLAGS) $(SRCS) $(LIBS) -Iinclude  -o $@
-
-$(VX_MAIN).hex: $(VX_MAIN).elf
-	$(CPY) -O ihex $< $@
-
-$(VX_MAIN).dump: $(VX_MAIN).elf
-	$(DMP) -D $< > $@
-
-# Removes only the files this Makefile creates.
-clean:
-	$(RM) $(VX_MAIN).elf $(VX_MAIN).hex $(VX_MAIN).dump
--- a/miscs/rvvector/benchmark_temp/TO_DO_LIST
+++ b/miscs/rvvector/benchmark_temp/TO_DO_LIST
@ -1,9 +0,0 @@
-1. add benchmarks under (Vortex/benchmarks/..)
-  1.1 bfs     --> blas spmv approach
-  1.2 kmeans  // stage 2
-  1.3 saxpy   --> sample
-  1.4 sfilter // stage 2
-  1.5 sgemm   --> sample modify (float --> int)
-  1.6 vecadd  --> sample
-  
-
--- a/miscs/rvvector/benchmark_temp/vx_vec_benchmark.c
+++ b/miscs/rvvector/benchmark_temp/vx_vec_benchmark.c
@ -1,177 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "../../runtime/intrinsics/vx_intrinsics.h"
-#include "vx_vec_benchmark.h"
-
-// Benchmark driver for the RVV assembly kernels.
-//
-// Only the saxpy section is compiled in (`#if 1`). The other sections stay
-// under `#if 0`: vx_vec_benchmark.h declares vx_vec_saxpy only and the
-// Makefile links a single kernel source, so enabling another section also
-// requires enabling its prototype and linking its .s file.
-//
-// Returns 0 when the enabled check passes and 1 on the first mismatch or
-// when a buffer cannot be allocated.
-int main()
-{
-    vx_tmc(1);
-
-    int n = 5;
-    int scalar = 10;
-
-    int *a = (int*)malloc(sizeof(int) * n);
-    int *b = (int*)malloc(sizeof(int) * n);
-    int *c = (int*)malloc(sizeof(int) * n);
-
-    if (!a || !b || !c)
-    {
-        printf("\nmalloc failed\n");
-        free(a);
-        free(b);
-        free(c);
-        return 1;
-    }
-
-    for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; }
-
-#if 0
-//---------------------------------------------------------------
-/* vvaddint32
- * # vector-vector add routine of 32-bit integers
- * # void vvaddint32(size_t n, const int*x, const int*y, int*z)
- * # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } } */
-    printf("vvaddint...\na[%d]: ", n);
-    for(int i = 0; i < n; ++i) printf("%d ", a[i]);
-    printf("\nb[%d]: ", n);
-    for(int i = 0; i < n; ++i) printf("%d ", b[i]);
-    printf("\nc[%d] = a[%d] + b[%d]: ", n, n, n);
-    for(int i = 0; i < n; ++i) printf("%d ", c[i]);
-
-    vx_vec_vvaddint32(n, a, b, c);
-
-    for(int i = 0; i < n; ++i)
-    {
-        if(c[i] != (a[i]+b[i]))
-        {
-           printf("\n<vddint32> failed at <index: %d>! \n", i);
-           return 1;
-        }
-    }
-    printf("\nPASSED.......................... <vddint32> \n");
-#endif
-#if 0
-//---------------------------------------------------------------
-/* #  vector-scalar add
-   # for (i=0; i<N; i++) { C[i] = A[i] + B; } // 32-bit ints */
-    for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1;}
-    printf("vsadd...scalar:%d\na[%d]: ", scalar, n);
-    for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
-    printf("\nb: %d", scalar);
-
-    vx_vec_vsadd(n, a, scalar);
-
-    // b[] still holds the pre-call contents of a[], and the kernel is
-    // documented above as an ADD, so the expected value is b[i] + scalar
-    // (the previous check multiplied instead).
-    // NOTE(review): assumes vx_vec_vsadd updates a[] in place -- confirm
-    // against vx_vec_vsadd.s.
-    for(int i = 0; i < n; ++i)
-    {
-        if(a[i] != (b[i] + scalar))
-        {
-           printf("\n<vsadd> failed at <index: %d>! \n", i);
-           return 1;
-        }
-    }
-    printf("\nPASSED.......................... <vsadd> \n");
-
-#endif
-#if 0
-//---------------------------------------------------------------
-/*  # memory copy
-    # void *memcpy(void* dest, const void* src, size_t n) */
-    for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2;}
-    printf("memcpy\na[%d]: ", n);
-    for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
-    printf("\nb[%d]: ", n);
-    for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
-
-    // vx_vec_memcpy.s copies with 8-bit elements, so its third argument is
-    // a byte count, not an element count.
-    vx_vec_memcpy(a, b, n * (int)sizeof(int));
-
-    for(int i = 0; i < n; ++i)
-    {
-        if(a[i] != b[i])
-        {
-           printf("\n<memcpy> failed at <index: %d>! \n", i);
-           return 1;
-        }
-    }
-    printf("\nPASSED.......................... <memcpy> \n");
-#endif
-#if 1
-//---------------------------------------------------------------
-/* # void saxpy(size_t n, const float a, const float *x, float *y)
-   # ==> convert to int!!
-   # void saxpy(size_t n, const int a, const int *x, int *y)
-   # {
-   #   size_t i;
-   #   for (i=0; i<n; i++) y[i] = a * x[i] + y[i];
-   # } */
-    // c[] keeps the pre-call contents of b[] so the result can be verified.
-    for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 2;}
-
-    printf("saxpy\na[%d]: ", n);
-    for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
-    printf("\nb[%d]: ", n);
-    for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
-
-    vx_vec_saxpy(n, scalar, a, b);
-
-    printf("saxpy\na[%d]: ", n);
-    for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
-    printf("\nb[%d]: ", n);
-    for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
-
-    for(int i = 0; i < n; ++i)
-    {
-        if(b[i] != ((a[i] * scalar) + c[i]))
-        {
-           printf("\n<saxpy> failed at <index: %d>! \n", i);
-           return 1;
-        }
-    }
-    printf("\nPASSED.......................... <saxpy> \n");
-#endif
-#if 0
-//---------------------------------------------------------------
-/* # void sgemm_nn(size_t n, size_t m, size_t k, const float*a,   // m * k matrix
-#          size_t lda, const float*b,   // k * n matrix
-#          size_t ldb, float*c,         // m * n matrix
-#          size_t ldc)
-#  c += a*b (alpha=1, no transpose on input matrices)
-#  matrices stored in C row-major order */
-
-    int m = 8;
-    int k = 8;
-    int cols = 8;   // the "n" of sgemm_nn; `n` itself is already declared above
-    // NOTE(review): leading dimensions of 4 are smaller than the 8-element
-    // rows allocated below -- confirm the intended values before enabling.
-    int lda = 4;
-    int ldb = 4;
-    int ldc = 4;
-
-    // Element counts must be scaled by sizeof(int); sizeof(m * k) is just
-    // the size of one int.
-    int* a1 = (int*)malloc(sizeof(int) * m * k);
-    int* b1 = (int*)malloc(sizeof(int) * k * cols);
-    int* c1 = (int*)malloc(sizeof(int) * m * cols);
-
-    if (!a1 || !b1 || !c1)
-    {
-        printf("\nmalloc failed\n");
-        return 1;
-    }
-
-    for(int i = 0; i < (m * k); ++i) a1[i] = 1;
-    for(int i = 0; i < (k * cols); ++i) b1[i] = 1;
-    for(int i = 0; i < (m * cols); ++i) c1[i] = 1;
-
-    printf("sgemm_nn\na[%d]: ", cols);
-    for(int i = 0; i < cols; ++i) printf("%d \n", a1[i]);
-    printf("\nb[%d]: ", cols);
-    for(int i = 0; i < cols; ++i) printf("%d \n", b1[i]);
-
-    vx_vec_sgemm_nn(cols, m, k, a1, lda, b1, ldb, c1, ldc);
-
-    // No result check yet.
-    printf("\nNOT TESTED.......................... <sgemm_nn> \n");
-
-    free(a1);
-    free(b1);
-    free(c1);
-//---------------------------------------------------------------
-#endif
-
-    free(a);
-    free(b);
-    free(c);
-
-    vx_tmc(0);
-    return 0;
-}
--- a/miscs/rvvector/benchmark_temp/vx_vec_benchmark.dump
+++ b/miscs/rvvector/benchmark_temp/vx_vec_benchmark.dump
--- a/miscs/rvvector/benchmark_temp/vx_vec_benchmark.elf
+++ b/miscs/rvvector/benchmark_temp/vx_vec_benchmark.elf
--- a/miscs/rvvector/benchmark_temp/vx_vec_benchmark.h
+++ b/miscs/rvvector/benchmark_temp/vx_vec_benchmark.h
@ -1,16 +0,0 @@
-#pragma once
-
-// Prototypes for the RVV assembly kernels exercised by vx_vec_benchmark.c.
-// Only vx_vec_saxpy is currently declared; the remaining prototypes are
-// kept commented out.
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//void vx_vec_vvaddint32(int n, int* a, int* b, int *c);
-//void vx_vec_vsadd(int n, int* a, int scalar);
-//void vx_vec_memcpy(int* a, int* b, int n);
-// Integer saxpy: b[i] = scalar * a[i] + b[i] for i in [0, n)
-// (see the integer routine in vx_vec_saxpy.s).
-void vx_vec_saxpy(int n, int scalar, int* a, int* b);
-//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
-
-#ifdef __cplusplus
-}
-#endif                   
--- a/miscs/rvvector/benchmark_temp/vx_vec_benchmark.hex
+++ b/miscs/rvvector/benchmark_temp/vx_vec_benchmark.hex
--- a/miscs/rvvector/benchmark_temp/vx_vec_memcpy.s
+++ b/miscs/rvvector/benchmark_temp/vx_vec_memcpy.s
@ -1,17 +0,0 @@
-.type vx_vec_memcpy, @function
-.global vx_vec_memcpy
-# void *memcpy(void* dest, const void* src, size_t n)
-# a0=dest, a1=src, a2=n (number of BYTES to copy)
-#
-# Copies n bytes from src to dest in chunks. a0 is never modified, so it
-# still holds dest when the routine returns.
-#
-# vsetvli must be re-executed on every pass: t0 then never exceeds the
-# bytes that remain, so the last (partial) chunk does not over-copy and
-# a2 reaches exactly zero. With vsetvli outside the loop, any n larger
-# than the first chunk size would run past the buffers and could skip
-# over zero without terminating.
-vx_vec_memcpy:
-  mv a3, a0 # Copy destination
-vx_vec_memcpy_loop:
-  vsetvli t0, a2, e8,m8  # Vectors of 8b; t0 = bytes handled this pass
-  vlb.v v0, (a1)                # Load bytes
-    add a1, a1, t0              # Bump pointer
-    sub a2, a2, t0              # Decrement count
-  vsb.v v0, (a3)                # Store bytes
-    add a3, a3, t0              # Bump pointer
-    bnez a2, vx_vec_memcpy_loop # Any more?
-    ret                         # Return
--- a/miscs/rvvector/benchmark_temp/vx_vec_saxpy.s
+++ b/miscs/rvvector/benchmark_temp/vx_vec_saxpy.s
@ -1,62 +0,0 @@
-.type vx_vec_saxpy, @function
-.global vx_vec_saxpy
-# Integer saxpy (converted from the single-precision original):
-# void
-# vx_vec_saxpy(int n, int a, const int *x, int *y)
-# {
-#   for (i=0; i<n; i++)
-#     y[i] = a * x[i] + y[i];
-# }
-#
-# register arguments:
-#     a0      n
-#     a1      a
-#     a2      x
-#     a3      y
-#
-# This file previously also contained a floating-point routine under the
-# same vx_vec_saxpy / saxpy labels, which made both symbols multiply
-# defined. Only the integer routine, which matches the prototype in
-# vx_vec_benchmark.h, is kept here; the floating-point code is the routine
-# in vx_vec_saxpy_float.s.
-#
-# The loop branches back to vsetvli on purpose: a4 is turned into a byte
-# offset by the slli below, so it has to be recomputed as an element count
-# (bounded by the elements that remain) at the start of every pass.
-vx_vec_saxpy:
-saxpy:
-    vsetvli a4, a0, e32, m1   # a4 = 32-bit elements handled this pass
-    vlw.v v0, (a2)            # Load chunk of x
-    sub a0, a0, a4            # Decrement remaining count
-    slli a4, a4, 2            # Elements done -> bytes done
-    add a2, a2, a4            # Bump x pointer
-    vlw.v v1, (a3)            # Load chunk of y
-    vmul.vx v0, v0, a1        # v0 = a * x
-    vadd.vv v1, v0, v1        # v1 = a * x + y
-    vsw.v v1, (a3)            # Store back into y
-    add a3, a3, a4            # Bump y pointer
-    bnez a0, saxpy            # Any more?
-    ret
--- a/miscs/rvvector/benchmark_temp/vx_vec_saxpy_float.s
+++ b/miscs/rvvector/benchmark_temp/vx_vec_saxpy_float.s
@ -1,28 +0,0 @@
-.type vx_vec_saxpy_float, @function
-.global vx_vec_saxpy_float
-# void
-# saxpy(size_t n, const float a, const float *x, float *y)
-# {
-#   size_t i;
-#   for (i=0; i<n; i++)
-#     y[i] = a * x[i] + y[i];
-# }
-#
-# register arguments:
-#     a0      n
-#     fa0     a
-#     a1      x
-#     a2      y
-#
-# The .type directive now names the symbol that is actually exported.
-# The loop branches back to vsetvli on purpose: a4 is turned into a byte
-# offset by the slli below, so it has to be recomputed as an element count
-# (bounded by the elements that remain) at the start of every pass.
-vx_vec_saxpy_float:
-saxpy:
-    vsetvli a4, a0, e32, m8   # a4 = 32-bit elements handled this pass
-    vlw.v v0, (a1)            # Load chunk of x
-    sub a0, a0, a4            # Decrement remaining count
-    slli a4, a4, 2            # Elements done -> bytes done
-    add a1, a1, a4            # Bump x pointer
-    vlw.v v8, (a2)            # Load chunk of y
-    vfmacc.vf v8, fa0, v0     # y += a * x
-    vsw.v v8, (a2)            # Store back into y
-    add a2, a2, a4            # Bump y pointer
-    bnez a0, saxpy            # Any more?
-    ret
--- a/miscs/rvvector/benchmark_temp/vx_vec_sgemm.s
+++ b/miscs/rvvector/benchmark_temp/vx_vec_sgemm.s
@ -1,220 +0,0 @@
-.type vx_vec_sgemm_nn, @function
-.global vx_vec_sgemm_nn
-# RV64IDV system
-# NOTE(review): the benchmark Makefile builds with -march=rv32imv, which
-# does not match the RV64 assumption stated above -- confirm the target.
-#
-# void
-# sgemm_nn(size_t n,
-#          size_t m,
-#          size_t k,
-#          const float*a,   // m * k matrix
-#          size_t lda,
-#          const float*b,   // k * n matrix
-#          size_t ldb,
-#          float*c,         // m * n matrix
-#          size_t ldc)
-#
-#  c += a*b (alpha=1, no transpose on input matrices)
-#  matrices stored in C row-major order
-
-# Symbolic names for the argument and scratch registers used below.
-# NOTE(review): these #define lines only take effect if the file is run
-# through the C preprocessor; presumably that requires a .S (capital)
-# extension with GCC, whereas this file is named .s -- confirm.
-#define n a0
-#define m a1
-#define k a2
-#define ap a3
-#define astride a4
-#define bp a5
-#define bstride a6
-#define cp a7
-#define cstride t0
-#define kt t1
-#define nt t2
-#define bnp t3
-#define cnp t4
-#define akp t5
-#define bkp s0
-#define nvl s1
-#define ccp s2
-#define amp s3
-
-# Use args as additional temporaries
-#define ft12 fa0
-#define ft13 fa1
-#define ft14 fa2
-#define ft15 fa3
-
-# This version holds a 16*VLMAX block of C matrix in vector registers
-# in inner loop, but otherwise does not cache or TLB tiling.
-#
-# Structure: c_row_loop walks blocks of 16 rows of C, c_col_loop walks the
-# columns of one such block in vsetvli-sized chunks, and k_loop accumulates
-# over the inner dimension into v0..v15 (one vector register per C row).
-# NOTE(review): FRAMESIZE and OFFSET are not defined anywhere in this file,
-# and every save/restore below uses the same OFFSET placeholder -- they
-# look like template values that still need real numbers.
-# NOTE(review): sd/ld are 64-bit accesses, consistent with the "RV64IDV"
-# header but presumably not with a 32-bit build -- confirm.
-vx_vec_sgemm_nn:
-#sgemm_nn:
-    addi sp, sp, -FRAMESIZE
-    sd s0, OFFSET(sp)
-    sd s1, OFFSET(sp)
-    sd s2, OFFSET(sp)
-
-    # Check for zero size matrices
-    beqz n, exit
-    beqz m, exit
-    beqz k, exit
-
-    # Convert elements strides to byte strides.
-    ld cstride, OFFSET(sp)   # Get arg from stack frame
-    slli astride, astride, 2
-    slli bstride, bstride, 2
-    slli cstride, cstride, 2
-
-    # Fewer than 16 rows: skip the blocked loop entirely (see end_rows).
-    slti t6, m, 16
-    bnez t6, end_rows
-
-c_row_loop: # Loop across rows of C blocks
-
-    mv nt, n  # Initialize n counter for next row of C blocks
-
-    mv bnp, bp # Initialize B n-loop pointer to start
-    mv cnp, cp # Initialize C n-loop pointer
-
-c_col_loop: # Loop across one row of C blocks
-    vsetvli nvl, nt, e32  # 32-bit vectors, LMUL=1
-
-    mv akp, ap   # reset pointer into A to beginning
-    mv bkp, bnp # step to next column in B matrix
-
-    # Initialize current C submatrix block from memory.
-    vlw.v  v0, (cnp); add ccp, cnp, cstride;
-    vlw.v  v1, (ccp); add ccp, ccp, cstride;
-    vlw.v  v2, (ccp); add ccp, ccp, cstride;
-    vlw.v  v3, (ccp); add ccp, ccp, cstride;
-    vlw.v  v4, (ccp); add ccp, ccp, cstride;
-    vlw.v  v5, (ccp); add ccp, ccp, cstride;
-    vlw.v  v6, (ccp); add ccp, ccp, cstride;
-    vlw.v  v7, (ccp); add ccp, ccp, cstride;
-    vlw.v  v8, (ccp); add ccp, ccp, cstride;
-    vlw.v  v9, (ccp); add ccp, ccp, cstride;
-    vlw.v v10, (ccp); add ccp, ccp, cstride;
-    vlw.v v11, (ccp); add ccp, ccp, cstride;
-    vlw.v v12, (ccp); add ccp, ccp, cstride;
-    vlw.v v13, (ccp); add ccp, ccp, cstride;
-    vlw.v v14, (ccp); add ccp, ccp, cstride;
-    vlw.v v15, (ccp)
-
-
-    mv kt, k # Initialize inner loop counter
-
-    # Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
-    # Software pipeline loads
-    flw ft0, (akp); add amp, akp, astride;
-    flw ft1, (amp); add amp, amp, astride;
-    flw ft2, (amp); add amp, amp, astride;
-    flw ft3, (amp); add amp, amp, astride;
-    # Get vector from B matrix
-    vlw.v v16, (bkp)
-
-    # Loop on inner dimension for current C block
- k_loop:
-    vfmacc.vf v0, ft0, v16
-    add bkp, bkp, bstride
-    flw ft4, (amp)
-    add amp, amp, astride
-    vfmacc.vf v1, ft1, v16
-    addi kt, kt, -1    # Decrement k counter
-    flw ft5, (amp)
-    add amp, amp, astride
-    vfmacc.vf v2, ft2, v16
-    flw ft6, (amp)
-    add amp, amp, astride
-    flw ft7, (amp)
-    vfmacc.vf v3, ft3, v16
-    add amp, amp, astride
-    flw ft8, (amp)
-    add amp, amp, astride
-    vfmacc.vf v4, ft4, v16
-    flw ft9, (amp)
-    add amp, amp, astride
-    vfmacc.vf v5, ft5, v16
-    flw ft10, (amp)
-    add amp, amp, astride
-    vfmacc.vf v6, ft6, v16
-    flw ft11, (amp)
-    add amp, amp, astride
-    vfmacc.vf v7, ft7, v16
-    flw ft12, (amp)
-    add amp, amp, astride
-    vfmacc.vf v8, ft8, v16
-    flw ft13, (amp)
-    add amp, amp, astride
-    vfmacc.vf v9, ft9, v16
-    flw ft14, (amp)
-    add amp, amp, astride
-    vfmacc.vf v10, ft10, v16
-    flw ft15, (amp)
-    add amp, amp, astride
-    addi akp, akp, 4            # Move to next column of a
-    vfmacc.vf v11, ft11, v16
-    # Each "1f" below jumps forward to the next "1:" label, skipping the
-    # preload of the next column's scalars once kt has reached zero.
-    beqz kt, 1f                 # Don't load past end of matrix
-    flw ft0, (akp)
-    add amp, akp, astride
-1:  vfmacc.vf v12, ft12, v16
-    beqz kt, 1f
-    flw ft1, (amp)
-    add amp, amp, astride
-1:  vfmacc.vf v13, ft13, v16
-    beqz kt, 1f
-    flw ft2, (amp)
-    add amp, amp, astride
-1:  vfmacc.vf v14, ft14, v16
-    beqz kt, 1f                 # Exit out of loop
-    flw ft3, (amp)
-    add amp, amp, astride
-    vfmacc.vf v15, ft15, v16
-    vlw.v v16, (bkp)            # Get next vector from B matrix, overlap loads with jump stalls
-    j k_loop
-
-    # Loop exit: finish the last row's accumulate that the loop body would
-    # otherwise have done just before jumping back.
-1:  vfmacc.vf v15, ft15, v16
-    
-    # Save C matrix block back to memory
-    vsw.v  v0, (cnp); add ccp, cnp, cstride;
-    vsw.v  v1, (ccp); add ccp, ccp, cstride;
-    vsw.v  v2, (ccp); add ccp, ccp, cstride;
-    vsw.v  v3, (ccp); add ccp, ccp, cstride;
-    vsw.v  v4, (ccp); add ccp, ccp, cstride;
-    vsw.v  v5, (ccp); add ccp, ccp, cstride;
-    vsw.v  v6, (ccp); add ccp, ccp, cstride;
-    vsw.v  v7, (ccp); add ccp, ccp, cstride;
-    vsw.v  v8, (ccp); add ccp, ccp, cstride;
-    vsw.v  v9, (ccp); add ccp, ccp, cstride;
-    vsw.v v10, (ccp); add ccp, ccp, cstride;
-    vsw.v v11, (ccp); add ccp, ccp, cstride;
-    vsw.v v12, (ccp); add ccp, ccp, cstride;
-    vsw.v v13, (ccp); add ccp, ccp, cstride;
-    vsw.v v14, (ccp); add ccp, ccp, cstride;
-    vsw.v v15, (ccp)
-
-    # Following tail instructions should be scheduled earlier in free slots during C block save.
-    # Leaving here for clarity.
-
-    # Bump pointers for loop across blocks in one row
-    slli t6, nvl, 2
-    add cnp, cnp, t6                         # Move C block pointer over
-    add bnp, bnp, t6                         # Move B block pointer over
-    sub nt, nt, nvl                          # Decrement element count in n dimension
-    bnez nt, c_col_loop                      # Any more to do?
-
-    # Move to next set of rows
-    addi m, m, -16  # Did 16 rows above
-    slli t6, astride, 4  # Multiply astride by 16
-    add ap, ap, t6         # Move A matrix pointer down 16 rows
-    slli t6, cstride, 4  # Multiply cstride by 16
-    add cp, cp, t6         # Move C matrix pointer down 16 rows
-    
-    slti t6, m, 16
-    beqz t6, c_row_loop
-
-    # Handle end of matrix with fewer than 16 rows.
-    # Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
-end_rows:
-    # Not done: the remaining m < 16 rows are left unprocessed, so a call
-    # with fewer than 16 rows falls straight through to exit.
-
-exit:
-    ld s0, OFFSET(sp)
-    ld s1, OFFSET(sp)
-    ld s2, OFFSET(sp)
-    addi sp, sp, FRAMESIZE
-    ret
--- a/Show more
+++ b/Show more