Adds the riscv vector extension into simx

2025-04-23 21:39:10 -04:00 · 2024-11-26 18:41:01 -08:00 · 2024-11-26 18:41:01 -08:00 · 1e4583ac17
commit 1e4583ac17
parent 8230b37411
22 changed files with 5717 additions and 105 deletions
--- a/ci/regression.sh.in
+++ b/ci/regression.sh.in
@ -386,10 +386,20 @@ synthesis()
    echo "synthesis tests done!"
 }

+# Builds simx, then runs the riscv-vector-tests runner script against it.
+# @TOOLDIR@ and @XLEN@ are placeholders filled in when the script is
+# generated from this .in template; VLEN is fixed at 256 here.
+vector()
+{
+    echo "begin vector tests..."
+
+    make -C sim/simx
+    # Quote the substituted values so a tool path containing spaces stays a single word.
+    TOOLDIR="@TOOLDIR@" XLEN="@XLEN@" VLEN=256 REG_TESTS=1 ./tests/riscv/riscv-vector-tests/run-test.sh
+
+    echo "vector tests done!"
+}
+
 show_usage()
 {
    echo "Vortex Regression Test"
-    echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--all] [--h|--help]"
+    echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--all] [--h|--help]"
 }

 declare -a tests=()
@ -439,6 +449,9 @@ while [ "$1" != "" ]; do
        --synthesis )
                tests+=("synthesis")
                ;;
+        --vector )
+                tests+=("vector")
+                ;;
        --all )
                tests=()
                tests+=("unittest")
@ -454,6 +467,7 @@ while [ "$1" != "" ]; do
                tests+=("scope")
                tests+=("stress")
                tests+=("synthesis")
+                tests+=("vector")
                ;;
        -h | --help )
                show_usage
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -87,6 +87,10 @@
 `endif
 `endif

+// Default VLEN used when the build does not define one. NOTE(review): presumably the vector register width in bits - confirm.
+`ifndef VLEN
+`define VLEN 256
+`endif
+
 `ifndef NUM_CLUSTERS
 `define NUM_CLUSTERS 1
 `endif
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@ -188,6 +188,19 @@
 `define VX_CSR_MIMPID                   12'hF13
 `define VX_CSR_MHARTID                  12'hF14

+// Vector CSRs: vstart/vxsat/vxrm/vcsr sit at 0x008-0x00F, vl/vtype/vlenb at 0xC20-0xC22.
+// NOTE(review): 0xC00-0xC02 are also the standard cycle/time/instret counter addresses - confirm the V-prefixed names are meant to alias them.
+`define VX_CSR_VSTART                   12'h008
+`define VX_CSR_VXSAT                    12'h009
+`define VX_CSR_VXRM                     12'h00A
+`define VX_CSR_VCSR                     12'h00F
+`define VX_CSR_VL                       12'hC20
+`define VX_CSR_VTYPE                    12'hC21
+`define VX_CSR_VLENB                    12'hC22
+`define VX_CSR_VCYCLE                   12'hC00
+`define VX_CSR_VTIME                    12'hC01
+`define VX_CSR_VINSTRET                 12'hC02
+
 // GPGU CSRs

 `define VX_CSR_THREAD_ID                12'hCC0
--- a/perf/cache/cache_perf.log
+++ b/perf/cache/cache_perf.log
@ -1,3 +1,3 @@
 CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2   -DPERF_ENABLE -DICACHE_NUM_WAYS=1
 running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 make -C ./ci/../driver/rtlsim
-verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit -I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2   -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2   -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so
+verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit -I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2   -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/softfloat_ext.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2   -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so
--- a/sim/common/rvfloats.cpp
+++ b/sim/common/rvfloats.cpp
@ -12,6 +12,7 @@
 // limitations under the License.

 #include "rvfloats.h"
+#include "softfloat_ext.h"
 #include <stdio.h>

 extern "C" {
@ -158,6 +159,34 @@ uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
  return from_float64_t(r);
 }

+// Single-precision 7-bit reciprocal estimate of the bit pattern `a`.
+// `frm` selects the rounding mode; when `fflags` is non-null it receives
+// softfloat_exceptionFlags as left by the operation. Returns the result bits.
+uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
+  // f32_recip7 only ORs bits into softfloat_exceptionFlags, so flags left by an
+  // earlier operation would otherwise leak into *fflags. Use rv_init() like the
+  // neighbouring rv_* wrappers instead of setting the rounding mode alone.
+  // NOTE(review): assumes rv_init(frm) clears the flags and sets the rounding mode - confirm.
+  rv_init(frm);
+  auto r = f32_recip7(to_float32_t(a));
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
+  return from_float32_t(r);
+}
+
+// Double-precision 7-bit reciprocal estimate of the bit pattern `a`.
+// `frm` selects the rounding mode; when `fflags` is non-null it receives
+// softfloat_exceptionFlags as left by the operation. Returns the result bits.
+uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
+  // f64_recip7 only ORs bits into softfloat_exceptionFlags, so flags left by an
+  // earlier operation would otherwise leak into *fflags. Use rv_init() like the
+  // neighbouring rv_* wrappers instead of setting the rounding mode alone.
+  // NOTE(review): assumes rv_init(frm) clears the flags and sets the rounding mode - confirm.
+  rv_init(frm);
+  auto r = f64_recip7(to_float64_t(a));
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
+  return from_float64_t(r);
+}
+
+// Single-precision 7-bit reciprocal square-root estimate of the bit pattern `a`.
+// When `fflags` is non-null it receives softfloat_exceptionFlags as left by
+// the operation. Returns the result bits.
+uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
+  // f32_rsqrte7 only ORs bits into softfloat_exceptionFlags, so flags left by an
+  // earlier operation would otherwise leak into *fflags. Use rv_init() like the
+  // neighbouring rv_* wrappers instead of setting the rounding mode alone.
+  // NOTE(review): assumes rv_init(frm) clears the flags and sets the rounding mode - confirm.
+  rv_init(frm);
+  auto r = f32_rsqrte7(to_float32_t(a));
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
+  return from_float32_t(r);
+}
+
+// Double-precision 7-bit reciprocal square-root estimate of the bit pattern `a`.
+// When `fflags` is non-null it receives softfloat_exceptionFlags as left by
+// the operation. Returns the result bits.
+uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
+  // f64_rsqrte7 only ORs bits into softfloat_exceptionFlags, so flags left by an
+  // earlier operation would otherwise leak into *fflags. Use rv_init() like the
+  // neighbouring rv_* wrappers instead of setting the rounding mode alone.
+  // NOTE(review): assumes rv_init(frm) clears the flags and sets the rounding mode - confirm.
+  rv_init(frm);
+  auto r = f64_rsqrte7(to_float64_t(a));
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
+  return from_float64_t(r);
+}
+
 uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
  rv_init(frm);
  auto r = f32_sqrt(to_float32_t(a));
@ -486,6 +515,11 @@ uint64_t rv_fsgnjx_d(uint64_t a, uint64_t b) {
  return r;
 }

+// Converts the double-precision bit pattern `a` to single precision through
+// rv_dtof(), after running rv_init() with the caller-supplied `frm`.
+uint32_t rv_dtof_r(uint64_t a, uint32_t frm) {
+  rv_init(frm);
+  const uint32_t narrowed = rv_dtof(a);
+  return narrowed;
+}
+
 uint32_t rv_dtof(uint64_t a) {
  auto r = f64_to_f32(to_float64_t(a));
  return from_float32_t(r);
--- a/sim/common/rvfloats.h
+++ b/sim/common/rvfloats.h
@ -28,6 +28,8 @@ uint32_t rv_fnmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t*
 uint32_t rv_fnmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags);
 uint32_t rv_fdiv_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags);
 uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags);
+uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags);
+uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags);

 uint32_t rv_ftoi_s(uint32_t a, uint32_t frm, uint32_t* fflags);
 uint32_t rv_ftou_s(uint32_t a, uint32_t frm, uint32_t* fflags);
@ -58,6 +60,8 @@ uint64_t rv_fsub_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags);
 uint64_t rv_fmul_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags);
 uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags);
 uint64_t rv_fsqrt_d(uint64_t a, uint32_t frm, uint32_t* fflags);
+uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags);
+uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags);

 uint64_t rv_fmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags);
 uint64_t rv_fmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags);
@ -85,6 +89,7 @@ uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags);
 uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags);

 uint32_t rv_dtof(uint64_t a);
+uint32_t rv_dtof_r(uint64_t a, uint32_t frm);
 uint64_t rv_ftod(uint32_t a);

 #ifdef __cplusplus
--- a/sim/common/softfloat_ext.cpp
+++ b/sim/common/softfloat_ext.cpp
@ -0,0 +1,486 @@
+/*============================================================================
+
+This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3e, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of
+California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#include <assert.h>
+#include <stdbool.h>
+#include <internals.h>
+#include <../RISCV/specialize.h>
+#include <softfloat.h>
+#include "softfloat_ext.h"
+
+// Classifies the half-precision value `a` and returns its category as a bit mask:
+//   0x001 -inf   0x002 -normal   0x004 -subnormal   0x008 -0
+//   0x010 +0     0x020 +subnormal   0x040 +normal   0x080 +inf
+//   0x100 signaling NaN   0x200 quiet NaN
+uint_fast16_t f16_classify( float16_t a )
+{
+    union ui16_f16 bits;
+    bits.f = a;
+    const uint_fast16_t raw = bits.ui;
+
+    const bool negative   = signF16UI( raw );
+    const bool expAllOnes = expF16UI( raw ) == 0x1F;
+    const bool expZero    = expF16UI( raw ) == 0;
+    const bool mantZero   = fracF16UI( raw ) == 0;
+
+    uint_fast16_t mask = 0;
+    if ( expAllOnes && mantZero )  { mask |= negative ? 0x001 : 0x080; }
+    if ( !expAllOnes && !expZero ) { mask |= negative ? 0x002 : 0x040; }
+    if ( expZero && !mantZero )    { mask |= negative ? 0x004 : 0x020; }
+    if ( expZero && mantZero )     { mask |= negative ? 0x008 : 0x010; }
+    if ( isNaNF16UI( raw ) )       { mask |= softfloat_isSigNaNF16UI( raw ) ? 0x100 : 0x200; }
+    return mask;
+}
+
+// Classifies the single-precision value `a` and returns its category as a bit mask:
+//   0x001 -inf   0x002 -normal   0x004 -subnormal   0x008 -0
+//   0x010 +0     0x020 +subnormal   0x040 +normal   0x080 +inf
+//   0x100 signaling NaN   0x200 quiet NaN
+uint_fast16_t f32_classify( float32_t a )
+{
+    union ui32_f32 bits;
+    bits.f = a;
+    const uint_fast32_t raw = bits.ui;
+
+    const bool negative   = signF32UI( raw );
+    const bool expAllOnes = expF32UI( raw ) == 0xFF;
+    const bool expZero    = expF32UI( raw ) == 0;
+    const bool mantZero   = fracF32UI( raw ) == 0;
+
+    uint_fast16_t mask = 0;
+    if ( expAllOnes && mantZero )  { mask |= negative ? 0x001 : 0x080; }
+    if ( !expAllOnes && !expZero ) { mask |= negative ? 0x002 : 0x040; }
+    if ( expZero && !mantZero )    { mask |= negative ? 0x004 : 0x020; }
+    if ( expZero && mantZero )     { mask |= negative ? 0x008 : 0x010; }
+    if ( isNaNF32UI( raw ) )       { mask |= softfloat_isSigNaNF32UI( raw ) ? 0x100 : 0x200; }
+    return mask;
+}
+
+// Classifies the double-precision value `a` and returns its category as a bit mask:
+//   0x001 -inf   0x002 -normal   0x004 -subnormal   0x008 -0
+//   0x010 +0     0x020 +subnormal   0x040 +normal   0x080 +inf
+//   0x100 signaling NaN   0x200 quiet NaN
+uint_fast16_t f64_classify( float64_t a )
+{
+    union ui64_f64 bits;
+    bits.f = a;
+    const uint_fast64_t raw = bits.ui;
+
+    const bool negative   = signF64UI( raw );
+    const bool expAllOnes = expF64UI( raw ) == 0x7FF;
+    const bool expZero    = expF64UI( raw ) == 0;
+    const bool mantZero   = fracF64UI( raw ) == 0;
+
+    uint_fast16_t mask = 0;
+    if ( expAllOnes && mantZero )  { mask |= negative ? 0x001 : 0x080; }
+    if ( !expAllOnes && !expZero ) { mask |= negative ? 0x002 : 0x040; }
+    if ( expZero && !mantZero )    { mask |= negative ? 0x004 : 0x020; }
+    if ( expZero && mantZero )     { mask |= negative ? 0x008 : 0x010; }
+    if ( isNaNF64UI( raw ) )       { mask |= softfloat_isSigNaNF64UI( raw ) ? 0x100 : 0x200; }
+    return mask;
+}
+
+static inline uint64_t extract64(uint64_t val, int pos, int len) // Returns the `len`-bit field of `val` that starts at bit `pos`, right-aligned.
+{
+  assert(pos >= 0 && len > 0 && len <= 64 - pos); // the field must be non-empty and lie within bits 0..63
+  return (val >> pos) & (~UINT64_C(0) >> (64 - len)); // all-ones >> (64 - len) keeps the low `len` bits; the shift count stays in 0..63
+}
+
+static inline uint64_t make_mask64(int pos, int len) // Returns a mask of `len` consecutive one bits whose lowest set bit is bit `pos`.
+{
+    assert(pos >= 0 && len > 0 && pos < 64 && len <= 64); // note: pos + len <= 64 is not required
+    return (UINT64_MAX >> (64 - len)) << pos; // mask bits shifted past bit 63 are dropped
+}
+
+// Table-driven 7-bit reciprocal square-root estimate on a raw bit pattern with `e` exponent bits and `s` significand bits; `sub` marks a subnormal input. The result is 64 bits wide: the user needs to truncate the output to the required length.
+static inline uint64_t rsqrte7(uint64_t val, int e, int s, bool sub) {
+  uint64_t exp = extract64(val, s, e); // exponent field
+  uint64_t sig = extract64(val, 0, s); // significand field
+  uint64_t sign = extract64(val, s + e, 1); // sign bit
+  const int p = 7; // width in bits of each table entry
+
+  static const uint8_t table[] = { // 128 entries, selected by idx below
+      52, 51, 50, 48, 47, 46, 44, 43,
+      42, 41, 40, 39, 38, 36, 35, 34,
+      33, 32, 31, 30, 30, 29, 28, 27,
+      26, 25, 24, 23, 23, 22, 21, 20,
+      19, 19, 18, 17, 16, 16, 15, 14,
+      14, 13, 12, 12, 11, 10, 10, 9,
+      9, 8, 7, 7, 6, 6, 5, 4,
+      4, 3, 3, 2, 2, 1, 1, 0,
+      127, 125, 123, 121, 119, 118, 116, 114,
+      113, 111, 109, 108, 106, 105, 103, 102,
+      100, 99, 97, 96, 95, 93, 92, 91,
+      90, 88, 87, 86, 85, 84, 83, 82,
+      80, 79, 78, 77, 76, 75, 74, 73,
+      72, 71, 70, 70, 69, 68, 67, 66,
+      65, 64, 63, 63, 62, 61, 60, 59,
+      59, 58, 57, 56, 56, 55, 54, 53};
+
+  if (sub) { // normalize: shift until the top significand bit is set
+      while (extract64(sig, s - 1, 1) == 0) // never terminates if sig is 0, so callers must pass sub only for a non-zero significand
+          exp--, sig <<= 1; // exp is unsigned and is allowed to wrap below zero here
+
+      sig = (sig << 1) & make_mask64(0 ,s); // drop the leading one, keep s bits
+  }
+
+  int idx = ((exp & 1) << (p-1)) | (sig >> (s-p+1)); // exponent LSB followed by the top 6 significand bits
+  uint64_t out_sig = (uint64_t)(table[idx]) << (s-p); // table value goes into the top 7 significand bits
+  uint64_t out_exp = (3 * make_mask64(0, e - 1) + ~exp) / 2; // (3 * (2^(e-1) - 1) - 1 - exp) / 2, evaluated modulo 2^64
+
+  return (sign << (s+e)) | (out_exp << s) | out_sig; // reassemble sign | exponent | significand
+}
+
+// Half-precision reciprocal square-root estimate built on rsqrte7().
+// Special inputs are resolved from the f16_classify() category:
+//   -inf, -normal, -subnormal, signaling NaN -> default NaN, invalid flag raised
+//   quiet NaN -> default NaN
+//   -0 / +0   -> 0xfc00 / 0x7c00, softfloat_flag_infinite raised
+//   +inf      -> 0x0
+// Every other input goes through rsqrte7(), with `sub` set for category 0x020.
+float16_t f16_rsqrte7(float16_t in)
+{
+    union ui16_f16 bits;
+    bits.f = in;
+
+    const unsigned int cls = f16_classify(in);
+    const bool invalidInput =
+        cls == 0x001 || cls == 0x002 || cls == 0x004 || cls == 0x100;
+
+    if (invalidInput || cls == 0x200) {
+        if (invalidInput) {
+            softfloat_exceptionFlags |= softfloat_flag_invalid;
+        }
+        bits.ui = defaultNaNF16UI;
+    } else if (cls == 0x008) {
+        bits.ui = 0xfc00;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x010) {
+        bits.ui = 0x7c00;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x080) {
+        bits.ui = 0x0;
+    } else {
+        const bool subnormal = (cls == 0x020);
+        bits.ui = rsqrte7(bits.ui, 5, 10, subnormal);
+    }
+
+    return bits.f;
+}
+
+// Single-precision reciprocal square-root estimate built on rsqrte7().
+// Special inputs are resolved from the f32_classify() category:
+//   -inf, -normal, -subnormal, signaling NaN -> default NaN, invalid flag raised
+//   quiet NaN -> default NaN
+//   -0 / +0   -> 0xff800000 / 0x7f800000, softfloat_flag_infinite raised
+//   +inf      -> 0x0
+// Every other input goes through rsqrte7(), with `sub` set for category 0x020.
+float32_t f32_rsqrte7(float32_t in)
+{
+    union ui32_f32 bits;
+    bits.f = in;
+
+    const unsigned int cls = f32_classify(in);
+    const bool invalidInput =
+        cls == 0x001 || cls == 0x002 || cls == 0x004 || cls == 0x100;
+
+    if (invalidInput || cls == 0x200) {
+        if (invalidInput) {
+            softfloat_exceptionFlags |= softfloat_flag_invalid;
+        }
+        bits.ui = defaultNaNF32UI;
+    } else if (cls == 0x008) {
+        bits.ui = 0xff800000;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x010) {
+        bits.ui = 0x7f800000;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x080) {
+        bits.ui = 0x0;
+    } else {
+        const bool subnormal = (cls == 0x020);
+        bits.ui = rsqrte7(bits.ui, 8, 23, subnormal);
+    }
+
+    return bits.f;
+}
+
+// Double-precision reciprocal square-root estimate built on rsqrte7().
+// Special inputs are resolved from the f64_classify() category:
+//   -inf, -normal, -subnormal, signaling NaN -> default NaN, invalid flag raised
+//   quiet NaN -> default NaN
+//   -0 / +0   -> 0xfff0000000000000 / 0x7ff0000000000000, softfloat_flag_infinite raised
+//   +inf      -> 0x0
+// Every other input goes through rsqrte7(), with `sub` set for category 0x020.
+float64_t f64_rsqrte7(float64_t in)
+{
+    union ui64_f64 bits;
+    bits.f = in;
+
+    const unsigned int cls = f64_classify(in);
+    const bool invalidInput =
+        cls == 0x001 || cls == 0x002 || cls == 0x004 || cls == 0x100;
+
+    if (invalidInput || cls == 0x200) {
+        if (invalidInput) {
+            softfloat_exceptionFlags |= softfloat_flag_invalid;
+        }
+        bits.ui = defaultNaNF64UI;
+    } else if (cls == 0x008) {
+        bits.ui = 0xfff0000000000000ul;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x010) {
+        bits.ui = 0x7ff0000000000000ul;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x080) {
+        bits.ui = 0x0;
+    } else {
+        const bool subnormal = (cls == 0x020);
+        bits.ui = rsqrte7(bits.ui, 11, 52, subnormal);
+    }
+
+    return bits.f;
+}
+
+// Table-driven 7-bit reciprocal estimate on a raw bit pattern with `e` exponent bits and `s` significand bits; `rm` is the rounding mode, `sub` marks a subnormal input, and *round_abnormal is set to true when the early-return path below is taken. The result is 64 bits wide: the user needs to truncate the output to the required length.
+static inline uint64_t recip7(uint64_t val, int e, int s, int rm, bool sub,
+                              bool *round_abnormal)
+{
+    uint64_t exp = extract64(val, s, e); // exponent field
+    uint64_t sig = extract64(val, 0, s); // significand field
+    uint64_t sign = extract64(val, s + e, 1); // sign bit
+    const int p = 7; // width in bits of each table entry
+
+    static const uint8_t table[] = { // 128 entries, selected by idx below
+        127, 125, 123, 121, 119, 117, 116, 114,
+        112, 110, 109, 107, 105, 104, 102, 100,
+        99, 97, 96, 94, 93, 91, 90, 88,
+        87, 85, 84, 83, 81, 80, 79, 77,
+        76, 75, 74, 72, 71, 70, 69, 68,
+        66, 65, 64, 63, 62, 61, 60, 59,
+        58, 57, 56, 55, 54, 53, 52, 51,
+        50, 49, 48, 47, 46, 45, 44, 43,
+        42, 41, 40, 40, 39, 38, 37, 36,
+        35, 35, 34, 33, 32, 31, 31, 30,
+        29, 28, 28, 27, 26, 25, 25, 24,
+        23, 23, 22, 21, 21, 20, 19, 19,
+        18, 17, 17, 16, 15, 15, 14, 14,
+        13, 12, 12, 11, 11, 10, 9, 9,
+        8, 8, 7, 7, 6, 5, 5, 4,
+        4, 3, 3, 2, 2, 1, 1, 0};
+
+    if (sub) { // normalize: shift until the top significand bit is set
+        while (extract64(sig, s - 1, 1) == 0) // never terminates if sig is 0, so callers must pass sub only for a non-zero significand
+            exp--, sig <<= 1; // exp is unsigned and is allowed to wrap below zero here
+
+        sig = (sig << 1) & make_mask64(0 ,s); // drop the leading one, keep s bits
+
+        if (exp != 0 && exp != UINT64_MAX) { // starting from an exponent field of 0, this means two or more normalization shifts were needed
+            *round_abnormal = true; // callers raise inexact and overflow when this is set
+            if (rm == 1 || // NOTE(review): rm 1/2/3 presumably are the RISC-V RTZ/RDN/RUP encodings - confirm
+                (rm == 2 && !sign) ||
+                (rm == 3 && sign))
+                return ((sign << (s+e)) | make_mask64(s, e)) - 1; // one below the all-ones-exponent encoding: largest finite magnitude, input sign
+            else
+                return (sign << (s+e)) | make_mask64(s, e); // exponent all ones, zero significand, input sign
+        }
+    }
+
+    int idx = sig >> (s-p); // top 7 significand bits
+    uint64_t out_sig = (uint64_t)(table[idx]) << (s-p); // table value goes into the top 7 significand bits
+    uint64_t out_exp = 2 * make_mask64(0, e - 1) + ~exp; // 2 * (2^(e-1) - 1) - 1 - exp, evaluated modulo 2^64
+    if (out_exp == 0 || out_exp == UINT64_MAX) { // result exponent of 0 or -1: emit a denormalized significand
+        out_sig = (out_sig >> 1) | make_mask64(s - 1, 1); // shift right and insert the explicit leading one at bit s-1
+        if (out_exp == UINT64_MAX) {
+            out_sig >>= 1; // one more shift for the -1 case
+            out_exp = 0;
+        }
+    }
+
+    return (sign << (s+e)) | (out_exp << s) | out_sig; // reassemble sign | exponent | significand
+}
+
+// Half-precision reciprocal estimate built on recip7().
+// Special inputs are resolved from the f16_classify() category:
+//   -inf / +inf -> 0x8000 / 0x0
+//   -0 / +0     -> 0xfc00 / 0x7c00, softfloat_flag_infinite raised
+//   signaling NaN -> default NaN, invalid flag raised; quiet NaN -> default NaN
+// Every other input goes through recip7() using softfloat_roundingMode, with
+// `sub` set for categories 0x004 and 0x020; when recip7() reports
+// round_abnormal, the inexact and overflow flags are raised.
+float16_t f16_recip7(float16_t in)
+{
+    union ui16_f16 bits;
+    bits.f = in;
+
+    const unsigned int cls = f16_classify(in);
+
+    if (cls == 0x001) {
+        bits.ui = 0x8000;
+    } else if (cls == 0x080) {
+        bits.ui = 0x0;
+    } else if (cls == 0x008) {
+        bits.ui = 0xfc00;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x010) {
+        bits.ui = 0x7c00;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x100 || cls == 0x200) {
+        if (cls == 0x100) {
+            softfloat_exceptionFlags |= softfloat_flag_invalid;
+        }
+        bits.ui = defaultNaNF16UI;
+    } else {
+        const bool subnormal = (cls == 0x004 || cls == 0x020);
+        bool round_abnormal = false;
+        bits.ui = recip7(bits.ui, 5, 10,
+                         softfloat_roundingMode, subnormal, &round_abnormal);
+        if (round_abnormal) {
+            softfloat_exceptionFlags |= softfloat_flag_inexact |
+                                        softfloat_flag_overflow;
+        }
+    }
+
+    return bits.f;
+}
+
+// Single-precision reciprocal estimate built on recip7().
+// Special inputs are resolved from the f32_classify() category:
+//   -inf / +inf -> 0x80000000 / 0x0
+//   -0 / +0     -> 0xff800000 / 0x7f800000, softfloat_flag_infinite raised
+//   signaling NaN -> default NaN, invalid flag raised; quiet NaN -> default NaN
+// Every other input goes through recip7() using softfloat_roundingMode, with
+// `sub` set for categories 0x004 and 0x020; when recip7() reports
+// round_abnormal, the inexact and overflow flags are raised.
+float32_t f32_recip7(float32_t in)
+{
+    union ui32_f32 bits;
+    bits.f = in;
+
+    const unsigned int cls = f32_classify(in);
+
+    if (cls == 0x001) {
+        bits.ui = 0x80000000;
+    } else if (cls == 0x080) {
+        bits.ui = 0x0;
+    } else if (cls == 0x008) {
+        bits.ui = 0xff800000;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x010) {
+        bits.ui = 0x7f800000;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x100 || cls == 0x200) {
+        if (cls == 0x100) {
+            softfloat_exceptionFlags |= softfloat_flag_invalid;
+        }
+        bits.ui = defaultNaNF32UI;
+    } else {
+        const bool subnormal = (cls == 0x004 || cls == 0x020);
+        bool round_abnormal = false;
+        bits.ui = recip7(bits.ui, 8, 23,
+                         softfloat_roundingMode, subnormal, &round_abnormal);
+        if (round_abnormal) {
+            softfloat_exceptionFlags |= softfloat_flag_inexact |
+                                        softfloat_flag_overflow;
+        }
+    }
+
+    return bits.f;
+}
+
+// Double-precision reciprocal estimate built on recip7().
+// Special inputs are resolved from the f64_classify() category:
+//   -inf / +inf -> 0x8000000000000000 / 0x0
+//   -0 / +0     -> 0xfff0000000000000 / 0x7ff0000000000000, softfloat_flag_infinite raised
+//   signaling NaN -> default NaN, invalid flag raised; quiet NaN -> default NaN
+// Every other input goes through recip7() using softfloat_roundingMode, with
+// `sub` set for categories 0x004 and 0x020; when recip7() reports
+// round_abnormal, the inexact and overflow flags are raised.
+float64_t f64_recip7(float64_t in)
+{
+    union ui64_f64 bits;
+    bits.f = in;
+
+    const unsigned int cls = f64_classify(in);
+
+    if (cls == 0x001) {
+        bits.ui = 0x8000000000000000;
+    } else if (cls == 0x080) {
+        bits.ui = 0x0;
+    } else if (cls == 0x008) {
+        bits.ui = 0xfff0000000000000;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x010) {
+        bits.ui = 0x7ff0000000000000;
+        softfloat_exceptionFlags |= softfloat_flag_infinite;
+    } else if (cls == 0x100 || cls == 0x200) {
+        if (cls == 0x100) {
+            softfloat_exceptionFlags |= softfloat_flag_invalid;
+        }
+        bits.ui = defaultNaNF64UI;
+    } else {
+        const bool subnormal = (cls == 0x004 || cls == 0x020);
+        bool round_abnormal = false;
+        bits.ui = recip7(bits.ui, 11, 52,
+                         softfloat_roundingMode, subnormal, &round_abnormal);
+        if (round_abnormal) {
+            softfloat_exceptionFlags |= softfloat_flag_inexact |
+                                        softfloat_flag_overflow;
+        }
+    }
+
+    return bits.f;
+}
--- a/sim/common/softfloat_ext.h
+++ b/sim/common/softfloat_ext.h
@ -0,0 +1,14 @@
+// SoftFloat add-ons declared for the simulator: value classification plus the
+// 7-bit reciprocal (recip7) and reciprocal square-root (rsqrte7) estimates,
+// each for half, single and double precision.
+#ifndef SOFTFLOAT_EXT_H
+#define SOFTFLOAT_EXT_H
+
+#include <stdint.h>
+#include <softfloat_types.h>
+
+// Each fNN_classify returns a category bit mask; fNN_rsqrte7 / fNN_recip7
+// return the estimate and report exceptions through softfloat_exceptionFlags.
+uint_fast16_t f16_classify( float16_t );
+float16_t f16_rsqrte7( float16_t );
+float16_t f16_recip7( float16_t );
+
+uint_fast16_t f32_classify( float32_t );
+float32_t f32_rsqrte7( float32_t );
+float32_t f32_recip7( float32_t );
+
+uint_fast16_t f64_classify( float64_t );
+float64_t f64_rsqrte7( float64_t );
+float64_t f64_recip7( float64_t );
+
+#endif // SOFTFLOAT_EXT_H
--- a/sim/opaesim/Makefile
+++ b/sim/opaesim/Makefile
@ -51,7 +51,7 @@ endif

 DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)

-SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
+SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
 SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
 SRCS += $(SRC_DIR)/fpga.cpp $(SRC_DIR)/opae_sim.cpp

--- a/sim/rtlsim/Makefile
+++ b/sim/rtlsim/Makefile
@ -35,7 +35,7 @@ ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
 endif
 RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)

-SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
+SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
 SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
 SRCS += $(SRC_DIR)/processor.cpp

--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@ -17,8 +17,8 @@ CXXFLAGS += $(CONFIGS)
 LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulator -lramulator

-SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
-SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp
+# Shared sources from COMMON_DIR; each file is listed exactly once (softfloat_ext.cpp was previously repeated).
+SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
+SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/execute_vector.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp

 # Debugging
 ifdef DEBUG
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@ -29,6 +29,7 @@ private:
  uint16_t num_cores_;
  uint16_t num_clusters_;
  uint16_t socket_size_;
+  uint16_t vsize_;
  uint16_t num_barriers_;
  uint64_t local_mem_base_;

@ -39,6 +40,7 @@ public:
    , num_cores_(num_cores)
    , num_clusters_(NUM_CLUSTERS)
    , socket_size_(SOCKET_SIZE)
+    , vsize_(VLEN / 8)
    , num_barriers_(NUM_BARRIERS)
    , local_mem_base_(LMEM_BASE_ADDR)
  {}
@ -71,6 +73,10 @@ public:
    return socket_size_;
  }

+  // Accessor for vsize_, which the constructor initializes to VLEN / 8.
+  uint16_t vsize() const { return this->vsize_; }
+
 };

 }
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@ -47,6 +47,7 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
  {Opcode::FMSUB,   InstType::R4},
  {Opcode::FMNMADD, InstType::R4},
  {Opcode::FMNMSUB, InstType::R4},
+  {Opcode::VSET,    InstType::V},
  {Opcode::EXT1,    InstType::R},
  {Opcode::EXT2,    InstType::R4},
  {Opcode::R_W,     InstType::R},
@ -54,33 +55,6 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
  {Opcode::TCU,     InstType::I},
 };

-enum Constants {
-  width_opcode= 7,
-  width_reg   = 5,
-  width_func2 = 2,
-  width_func3 = 3,
-  width_func7 = 7,
-  width_i_imm = 12,
-  width_j_imm = 20,
-
-  shift_opcode= 0,
-  shift_rd    = width_opcode,
-  shift_func3 = shift_rd + width_reg,
-  shift_rs1   = shift_func3 + width_func3,
-  shift_rs2   = shift_rs1 + width_reg,
-  shift_func2 = shift_rs2 + width_reg,
-  shift_func7 = shift_rs2 + width_reg,
-  shift_rs3   = shift_func7 + width_func2,
-
-  mask_opcode = (1 << width_opcode) - 1,
-  mask_reg    = (1 << width_reg)   - 1,
-  mask_func2  = (1 << width_func2) - 1,
-  mask_func3  = (1 << width_func3) - 1,
-  mask_func7  = (1 << width_func7) - 1,
-  mask_i_imm  = (1 << width_i_imm) - 1,
-  mask_j_imm  = (1 << width_j_imm) - 1,
-};
-
 static const char* op_string(const Instr &instr) {
  auto opcode = instr.getOpcode();
  auto func2  = instr.getFunc2();
@ -230,10 +204,14 @@ static const char* op_string(const Instr &instr) {
  case Opcode::FENCE: return "FENCE";
  case Opcode::FL:
    switch (func3) {
-    case 0x1: return "VL";
    case 0x2: return "FLW";
    case 0x3: return "FLD";
+    case 0x0: return "VL8";
+    case 0x5: return "VL16";
+    case 0x6: return "VL32";
+    case 0x7: return "VL64";
    default:
+      std::cout << "Could not decode float/vector load with func3: " << func3 << std::endl;
      std::abort();
    }
  case Opcode::FS:
@ -241,7 +219,12 @@ static const char* op_string(const Instr &instr) {
    case 0x1: return "VS";
    case 0x2: return "FSW";
    case 0x3: return "FSD";
+    case 0x0: return "VS8";
+    case 0x5: return "VS16";
+    case 0x6: return "VS32";
+    case 0x7: return "VS64";
    default:
+      std::cout << "Could not decode float/vector store with func3: " << func3 << std::endl;
      std::abort();
    }
  case Opcode::AMO: {
@ -390,6 +373,7 @@ static const char* op_string(const Instr &instr) {
  case Opcode::FMSUB:   return func2 ? "FMSUB.D" : "FMSUB.S";
  case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
  case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
+  case Opcode::VSET:    return "VSET";
  case Opcode::EXT1:
    switch (func7) {
    case 0:
@ -421,6 +405,39 @@ static const char* op_string(const Instr &instr) {
  }
 }

+// Appends the vector-specific fields of `instr` to `os` as ", name:value" pairs.
+// A field is written only when its set_* bit is present in instr.getVUseMask().
+inline void vec_log(std::ostream &os, const Instr &instr) {
+  const auto used = instr.getVUseMask();
+
+  if (used & set_func3)    { os << ", func3:" << instr.getFunc3(); }
+  if (used & set_func6)    { os << ", func6:" << instr.getFunc6(); }
+  if (used & set_imm)      { os << ", imm:" << instr.getImm(); }
+  if (used & set_vlswidth) { os << ", width:" << instr.getVlsWidth(); }
+  if (used & set_vmop)     { os << ", mop:" << instr.getVmop(); }
+  if (used & set_vumop)    { os << ", umop:" << instr.getVumop(); }
+  if (used & set_vnf)      { os << ", nf:" << instr.getVnf(); }
+  if (used & set_vmask)    { os << ", vmask:" << instr.getVmask(); }
+  if (used & set_vs3)      { os << ", vs3:" << instr.getVs3(); }
+  if (used & set_zimm)     { os << ", zimm:" << ((instr.hasZimm()) ? "true" : "false"); }
+  if (used & set_vlmul)    { os << ", lmul:" << instr.getVlmul(); }
+  if (used & set_vsew)     { os << ", sew:" << instr.getVsew(); }
+  if (used & set_vta)      { os << ", ta:" << instr.getVta(); }
+  if (used & set_vma)      { os << ", ma:" << instr.getVma(); }
+  if (used & set_vediv)    { os << ", ediv:" << instr.getVediv(); }
+}
+
 namespace vortex {
 std::ostream &operator<<(std::ostream &os, const Instr &instr) {
  os << op_string(instr);
@ -441,6 +458,13 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
    if (sep++ != 0) { os << ", "; } else { os << " "; }
    os << "0x" << std::hex << instr.getImm() << std::dec;
  }
+  if (instr.getOpcode() == Opcode::SYS && instr.getFunc3() >= 5) {
+    // CSRs with immediate values: print the value held in source slot 0 in hex.
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    // std::hex is sticky on the stream, so switch back to decimal afterwards
+    // (as the immediate print just above does); otherwise every later number
+    // written to `os` would come out in hexadecimal.
+    os << "0x" << std::hex << instr.getRSrc(0) << std::dec;
+  }
+  // Log vector-specific vtype and vreg info
+  if (instr.isVec()) vec_log(os, instr);
  return os;
 }
 }
@ -452,6 +476,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {

  auto func2 = (code >> shift_func2) & mask_func2;
  auto func3 = (code >> shift_func3) & mask_func3;
+  auto func6 = (code >> shift_func6) & mask_func6;
  auto func7 = (code >> shift_func7) & mask_func7;

  auto rd  = (code >> shift_rd)  & mask_reg;
@ -466,6 +491,12 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
  }

  auto iType = op_it->second;
+  if (op == Opcode::FL || op == Opcode::FS) {
+    if (func3 != 0x2 && func3 != 0x3) {
+      iType = InstType::V;
+    }
+  }
+
  switch (iType) {
  case InstType::R:
    switch (op) {
@ -659,7 +690,104 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
    auto imm = (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
    instr->setImm(sext(imm, width_j_imm+1));
  } break;
+    
+  case InstType::V:
+    instr->setVec(true);
+    switch (op) {
+    case Opcode::VSET: {
+      instr->setDestReg(rd, RegType::Integer);
+      instr->setFunc3(func3);
+      switch (func3) {
+        case 7: {
+          if (code >> (shift_vset - 1) == 0b10) { // vsetvl
+            instr->addSrcReg(rs1, RegType::Integer);
+            instr->addSrcReg(rs2, RegType::Integer);
+          } else {
+            auto zimm = (code >> shift_rs2) & mask_v_zimm;
+            instr->setZimm(true);
+            instr->setVlmul(zimm & mask_v_lmul);
+            instr->setVsew((zimm >> shift_v_sew) & mask_v_sew);
+            instr->setVta((zimm >> shift_v_ta) & mask_v_ta);
+            instr->setVma((zimm >> shift_v_ma) & mask_v_ma);
+            if ((code >> shift_vset)) { // vsetivli
+              instr->setImm(rs1);
+            } else { // vsetvli
+              instr->addSrcReg(rs1, RegType::Integer);
+            }
+          }
+        } break;
+        case 3: { // Vector - immediate arithmetic instructions
+          instr->setDestReg(rd, RegType::Vector);
+          instr->addSrcReg(rs2, RegType::Vector);
+          instr->setImm(rs1);
+          instr->setVmask((code >> shift_func7) & 0x1);
+          instr->setFunc6(func6);
+        } break;
+        default: { // Vector - vector/scalar arithmetic instructions
+          if (func3 == 1 && func6 == 16) {
+            instr->setDestReg(rd, RegType::Float);
+          } else if (func3 == 2 && func6 == 16) {
+            instr->setDestReg(rd, RegType::Integer);
+          } else {
+            instr->setDestReg(rd, RegType::Vector);
+          }
+          instr->addSrcReg(rs1, RegType::Vector);
+          instr->addSrcReg(rs2, RegType::Vector);
+          instr->setVmask((code >> shift_func7) & 0x1);
+          instr->setFunc6(func6);
+        }
+      }
+    } break;

+    case Opcode::FL:
+      instr->addSrcReg(rs1, RegType::Integer);
+      instr->setVmop((code >> shift_vmop) & 0b11);
+      switch (instr->getVmop()) {
+        case 0b00:
+          instr->setVumop(rs2);
+          break;
+        case 0b10:
+          instr->addSrcReg(rs2, RegType::Integer);
+          break;
+        case 0b01:
+        case 0b11:
+          instr->addSrcReg(rs2, RegType::Vector);
+          break;
+      }
+      instr->setVsew(func3 & 0x3);
+      instr->setDestReg(rd, RegType::Vector);
+      instr->setVlsWidth(func3);
+      instr->setVmask((code >> shift_func7) & 0x1);
+      instr->setVnf((code >> shift_vnf) & mask_func3);
+      break;
+
+    case Opcode::FS:
+      instr->addSrcReg(rs1, RegType::Integer);
+      instr->setVmop((code >> shift_vmop) & 0b11);
+      switch (instr->getVmop()) {
+        case 0b00:
+          instr->setVumop(rs2);
+          break;
+        case 0b10:
+          instr->addSrcReg(rs2, RegType::Integer);
+          break;
+        case 0b01:
+        case 0b11:
+          instr->addSrcReg(rs2, RegType::Vector);
+          break;
+      }
+      instr->setVsew(func3 & 0x3);
+      instr->addSrcReg(rd, RegType::Vector);
+      instr->setVlsWidth(func3);
+      instr->setVmask((code >> shift_func7) & 0x1);
+      instr->setVmop((code >> shift_vmop) & 0b11);
+      instr->setVnf((code >> shift_vnf) & mask_func3);
+      break;
+
+    default:
+      std::abort();
+    }
+    break;
  case InstType::R4:
    instr->setDestReg(rd, RegType::Float);
    instr->addSrcReg(rs1, RegType::Float);
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@ -33,6 +33,7 @@ using namespace vortex;
 Emulator::warp_t::warp_t(const Arch& arch)
  : ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
  , freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
+  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(arch.vsize()))
  , uuid(0)
 {}

@ -64,6 +65,26 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
    #endif
    }
  }
+
+  // Reset every byte of the vector register file: zero when NDEBUG is not
+  // defined, a std::rand() value otherwise. One pass is enough; the loop
+  // was previously duplicated.
+  for (auto& reg_file : this->vreg_file) {
+    for (auto& reg : reg_file) {
+    #ifndef NDEBUG
+      reg = 0;
+    #else
+      reg = std::rand();
+    #endif
+    }
+  }
 }

 ///////////////////////////////////////////////////////////////////////////////
@ -79,7 +100,12 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
    // considered to be big enough to hold input tiles for one output tile.
    // In future versions, scratchpad size should be fixed to an appropriate value.
    , scratchpad(std::vector<Word>(32 * 32 * 32768))
+    , csrs_(arch.num_warps())
 {
+  for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
+    csrs_.at(i).resize(arch.num_threads());
+  }
+
  this->clear();
 }

@ -463,6 +489,32 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
  case VX_CSR_FFLAGS:     return warps_.at(wid).fcsr & 0x1F;
  case VX_CSR_FRM:        return (warps_.at(wid).fcsr >> 5);
  case VX_CSR_FCSR:       return warps_.at(wid).fcsr;
+
+  // Vector CSRs
+  case VX_CSR_VSTART:
+    return csrs_.at(wid).at(tid)[VX_CSR_VSTART];
+  case VX_CSR_VXSAT:
+    return csrs_.at(wid).at(tid)[VX_CSR_VXSAT];
+  case VX_CSR_VXRM:
+    return csrs_.at(wid).at(tid)[VX_CSR_VXRM];
+  case VX_CSR_VCSR: {
+    Word vxsat = csrs_.at(wid).at(tid)[VX_CSR_VXSAT];
+    Word vxrm = csrs_.at(wid).at(tid)[VX_CSR_VXRM];
+    return (vxrm << 1) | vxsat;
+  }
+  case VX_CSR_VL:
+    return csrs_.at(wid).at(tid)[VX_CSR_VL];
+  case VX_CSR_VTYPE:
+    return csrs_.at(wid).at(tid)[VX_CSR_VTYPE];
+  case VX_CSR_VLENB:
+    return VLEN / 8;
+  case VX_CSR_VCYCLE:
+    return csrs_.at(wid).at(tid)[VX_CSR_VCYCLE];
+  case VX_CSR_VTIME:
+    return csrs_.at(wid).at(tid)[VX_CSR_VTIME];
+  case VX_CSR_VINSTRET:
+    return csrs_.at(wid).at(tid)[VX_CSR_VINSTRET];
+
  case VX_CSR_MHARTID:    return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
  case VX_CSR_THREAD_ID:  return tid;
  case VX_CSR_WARP_ID:    return wid;
@ -578,6 +630,29 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
  case VX_CSR_MSCRATCH:
    csr_mscratch_ = value;
    break;
+
+  // Vector CSRs
+  case VX_CSR_VSTART:
+    csrs_.at(wid).at(tid)[VX_CSR_VSTART] = value;
+    break;
+  case VX_CSR_VXSAT:
+    csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1;
+    break;
+  case VX_CSR_VXRM:
+    csrs_.at(wid).at(tid)[VX_CSR_VXRM] = value & 0b11;
+    break;
+  case VX_CSR_VCSR:
+    // vcsr packs vxsat in bit 0 and vxrm in bits 2:1 (see get_csr).
+    csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1;
+    csrs_.at(wid).at(tid)[VX_CSR_VXRM] = (value >> 1) & 0b11;
+    break;
+  case VX_CSR_VL: // read only, written by vset(i)vl(i)
+    csrs_.at(wid).at(tid)[VX_CSR_VL] = value;
+    break;
+  case VX_CSR_VTYPE: // read only, written by vset(i)vl(i)
+    csrs_.at(wid).at(tid)[VX_CSR_VTYPE] = value;
+    break;
+  case VX_CSR_VLENB: // read only, set to VLEN / 8
+    // Writes are ignored. The break keeps control from falling through
+    // into the VX_CSR_SATP case below.
+    break;
+
  case VX_CSR_SATP:
  #ifdef VM_ENABLE
    // warps_.at(wid).fcsr = (warps_.at(wid).fcsr & ~0x1F) | (value & 0x1F);
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@ -28,6 +28,76 @@ class Core;
 class Instr;
 class instr_trace_t;

+// Instruction-encoding constants: field widths in bits (width_*), bit
+// offsets (shift_*) and extraction masks (mask_*). The decoder extracts a
+// field as `(code >> shift_x) & mask_x`.
+enum Constants {
+  // Field widths, in bits.
+  width_opcode= 7,
+  width_reg   = 5,
+  width_func2 = 2,
+  width_func3 = 3,
+  width_func6 = 6,
+  width_func7 = 7,
+  width_mop   = 3,
+  width_vmask = 1,
+  width_i_imm = 12,
+  width_j_imm = 20,
+  width_v_zimm = 11,
+  width_v_ma = 1,
+  width_v_ta = 1,
+  width_v_sew = 3,
+  width_v_lmul = 3,
+  width_aq    = 1,
+  width_rl    = 1,
+
+  // Bit offsets within the instruction word, accumulated from the widths above.
+  shift_opcode= 0,
+  shift_rd    = width_opcode,
+  shift_func3 = shift_rd + width_reg,
+  shift_rs1   = shift_func3 + width_func3,
+  shift_rs2   = shift_rs1 + width_reg,
+  shift_func2 = shift_rs2 + width_reg,
+  shift_func7 = shift_rs2 + width_reg,
+  shift_rs3   = shift_func7 + width_func2,
+  shift_vmop  = shift_func7 + width_vmask,
+  shift_vnf   = shift_vmop + width_mop,
+  shift_func6 = shift_func7 + width_vmask,
+  shift_vset  = shift_func7 + width_func6,
+  // Offsets within the vset zimm value (not the instruction word); the
+  // decoder applies them to the extracted zimm.
+  shift_v_sew = width_v_lmul,
+  shift_v_ta  = shift_v_sew + width_v_sew,
+  shift_v_ma  = shift_v_ta + width_v_ta,
+
+  // Masks with the low `width` bits set.
+  mask_opcode = (1 << width_opcode) - 1,
+  mask_reg    = (1 << width_reg)   - 1,
+  mask_func2  = (1 << width_func2) - 1,
+  mask_func3  = (1 << width_func3) - 1,
+  mask_func6  = (1 << width_func6) - 1,
+  mask_func7  = (1 << width_func7) - 1,
+  mask_i_imm  = (1 << width_i_imm) - 1,
+  mask_j_imm  = (1 << width_j_imm) - 1,
+  mask_v_zimm = (1 << width_v_zimm) - 1,
+  mask_v_ma   = (1 << width_v_ma) - 1,
+  mask_v_ta   = (1 << width_v_ta) - 1,
+  mask_v_sew  = (1 << width_v_sew) - 1,
+  mask_v_lmul  = (1 << width_v_lmul) - 1,
+};
+
+// Vector type state kept per warp (see warp_t::vtype).
+// NOTE(review): the field names presumably mirror the fields of the RISC-V
+// `vtype` CSR (vill, vma, vta, vsew, vlmul) - confirm against the code that
+// fills this struct.
+struct vtype {
+  uint32_t vill;
+  uint32_t vma;
+  uint32_t vta;
+  uint32_t vsew;
+  uint32_t vlmul;
+};
+
+// Overlapping views of a single register value. The execute code reads
+// source operands and writes destination values through whichever member
+// matches the instruction's operand type (e.g. `.i`, `.u64`).
+// Word, WordI and WordF are declared elsewhere in the project.
+union reg_data_t {
+  Word     u;
+  WordI    i;
+  WordF    f;
+  float    f32;
+  double   f64;
+  uint32_t u32;
+  uint64_t u64;
+  int32_t  i32;
+  int64_t  i64;
+};
+
 class Emulator {
 public:
  Emulator(const Arch &arch,
@ -61,6 +131,10 @@ public:
  Word get_tc_size();
  Word get_tc_num();
  
+  void dcache_read(void* data, uint64_t addr, uint32_t size);
+
+  void dcache_write(const void* data, uint64_t addr, uint32_t size);
+
 private:

  struct ipdom_entry_t {
@ -85,9 +159,14 @@ private:
    ThreadMask                        tmask;
    std::vector<std::vector<Word>>    ireg_file;
    std::vector<std::vector<uint64_t>>freg_file;
+    std::vector<std::vector<Byte>>    vreg_file;
    std::stack<ipdom_entry_t>         ipdom_stack;
    Byte                              fcsr;
    uint32_t                          uuid;
+
+    struct vtype vtype;
+    uint32_t vl;
+    Word VLMAX;
  };

  struct wspawn_t {
@ -100,12 +179,14 @@ private:

  void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace);

+  void executeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);
+
+  void loadVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
+
+  void storeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
+
  void icache_read(void* data, uint64_t addr, uint32_t size);

-  void dcache_read(void* data, uint64_t addr, uint32_t size);
-
-  void dcache_write(const void* data, uint64_t addr, uint32_t size);
-
  void dcache_amo_reserve(uint64_t addr);

  bool dcache_amo_check(uint64_t addr);
@ -142,6 +223,7 @@ private:
  uint32_t mat_size;
  uint32_t tc_size;
  uint32_t tc_num;
+  std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
 };

 }
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@ -25,22 +25,11 @@
 #include "emulator.h"
 #include "instr.h"
 #include "core.h"
+#include "processor_impl.h"
 #include "VX_types.h"

 using namespace vortex;

-union reg_data_t {
-  Word     u;
-  WordI    i;
-  WordF    f;
-  float    f32;
-  double   f64;
-  uint32_t u32;
-  uint64_t u64;
-  int32_t  i32;
-  int64_t  i64;
-};
-
 inline uint64_t nan_box(uint32_t value) {
  return value | 0xffffffff00000000;
 }
@ -128,6 +117,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
        }
        DPN(2, "}" << std::endl);
        break;
+      case RegType::Vector:
+        break;
      default:
        break;
      }
@ -678,41 +669,47 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
    trace->src_regs[0] = {RegType::Integer, rsrc0};
    auto trace_data = std::make_shared<LsuTraceData>(num_threads);
    trace->data = trace_data;
-    uint32_t data_bytes = 1 << (func3 & 0x3);
-    uint32_t data_width = 8 * data_bytes;
-    for (uint32_t t = thread_start; t < num_threads; ++t) {
-      if (!warp.tmask.test(t))
-        continue;
-      uint64_t mem_addr = rsdata[t][0].i + immsrc;
-      uint64_t read_data = 0;
-      this->dcache_read(&read_data, mem_addr, data_bytes);
-      trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
-      switch (func3) {
-      case 0: // RV32I: LB
-      case 1: // RV32I: LH
-        rddata[t].i = sext((Word)read_data, data_width);
-        break;
-      case 2:
-        if (opcode == Opcode::L) {
-          // RV32I: LW
+    if ((opcode == Opcode::L )
+     || (opcode == Opcode::FL && func3 == 2)
+     || (opcode == Opcode::FL && func3 == 3)) {
+      uint32_t data_bytes = 1 << (func3 & 0x3);
+      uint32_t data_width = 8 * data_bytes;
+      for (uint32_t t = thread_start; t < num_threads; ++t) {
+        if (!warp.tmask.test(t))
+          continue;
+        uint64_t mem_addr = rsdata[t][0].i + immsrc;         
+        uint64_t read_data = 0;
+        this->dcache_read(&read_data, mem_addr, data_bytes);
+        trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
+        switch (func3) {
+        case 0: // RV32I: LB
+        case 1: // RV32I: LH
          rddata[t].i = sext((Word)read_data, data_width);
-        } else {
-          // RV32F: FLW
-          rddata[t].u64 = nan_box((uint32_t)read_data);
+          break;
+        case 2:
+          if (opcode == Opcode::L) {
+            // RV32I: LW
+            rddata[t].i = sext((Word)read_data, data_width);
+          } else {
+            // RV32F: FLW
+            rddata[t].u64 = nan_box((uint32_t)read_data);
+          }
+          break;
+        case 3: // RV64I: LD
+                // RV32D: FLD
+        case 4: // RV32I: LBU
+        case 5: // RV32I: LHU
+        case 6: // RV64I: LWU
+          rddata[t].u64 = read_data;
+          break;
+        default:
+          std::abort();      
        }
-        break;
-      case 3: // RV64I: LD
-              // RV32D: FLD
-      case 4: // RV32I: LBU
-      case 5: // RV32I: LHU
-      case 6: // RV64I: LWU
-        rddata[t].u64 = read_data;
-        break;
-      default:
-        std::abort();
      }
+      rd_write = true;
+    } else {
+      loadVector(instr, wid, rsdata);
    }
-    rd_write = true;
    break;
  }
  case Opcode::S:
@ -724,23 +721,29 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
    trace->src_regs[1] = {data_type, rsrc1};
    auto trace_data = std::make_shared<LsuTraceData>(num_threads);
    trace->data = trace_data;
-    uint32_t data_bytes = 1 << (func3 & 0x3);
-    for (uint32_t t = thread_start; t < num_threads; ++t) {
-      if (!warp.tmask.test(t))
-        continue;
-      uint64_t mem_addr = rsdata[t][0].i + immsrc;
-      uint64_t write_data = rsdata[t][1].u64;
-      trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
-      switch (func3) {
-      case 0:
-      case 1:
-      case 2:
-      case 3:
-        this->dcache_write(&write_data, mem_addr, data_bytes);
-        break;
-      default:
-        std::abort();
+    if ((opcode == Opcode::S)
+     || (opcode == Opcode::FS && func3 == 2)
+     || (opcode == Opcode::FS && func3 == 3)) {
+      uint32_t data_bytes = 1 << (func3 & 0x3);
+      for (uint32_t t = thread_start; t < num_threads; ++t) {
+        if (!warp.tmask.test(t))
+          continue;
+        uint64_t mem_addr = rsdata[t][0].i + immsrc;
+        uint64_t write_data = rsdata[t][1].u64;
+        trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
+        switch (func3) {
+        case 0:
+        case 1:
+        case 2:
+        case 3:
+          this->dcache_write(&write_data, mem_addr, data_bytes);  
+          break;
+        default:
+          std::abort();
+        }
      }
+    } else {
+      storeVector(instr, wid, rsdata);
    }
    break;
  }
@ -925,7 +928,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
    for (uint32_t t = thread_start; t < num_threads; ++t) {
      if (!warp.tmask.test(t))
        continue;
-      uint32_t frm = this->get_fpu_rm(func3, t, wid);
+      uint32_t frm = (func3 == 0x7) ? this->get_csr(VX_CSR_FRM, t, wid) : func3;
      uint32_t fflags = 0;
      switch (func7) {
      case 0x00: { // RV32F: FADD.S
@ -1240,7 +1243,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
        break;
      }
      }
-      this->update_fcrs(fflags, t, wid);
+      if (fflags) {
+        this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid);
+        this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid);
+      }
    }
    rd_write = true;
    break;
@ -1294,7 +1300,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
      default:
        break;
      }
-      this->update_fcrs(fflags, t, wid);
+      if (fflags) {
+        this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid);
+        this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid);
+      }
    }
    rd_write = true;
    break;
@ -1586,6 +1595,13 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
        std::abort();
    }
  } break;
+  case Opcode::VSET: {
+    auto func6 = instr.getFunc6();
+    if ((func3 == 0x7) || (func3 == 0x2 && func6 == 16) || (func3 == 0x1 && func6 == 16)) {
+      rd_write = true;
+    }
+    executeVector(instr, wid, rsdata, rddata);
+  } break;
  default:
    std::abort();
  }
@ -1629,6 +1645,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
      trace->dst_reg = {type, rdest};
      break;
    default:
+      std::cout << "Unrecognized register write back type: " << type << std::endl;
      std::abort();
      break;
    }
--- a/sim/simx/execute_vector.cpp
+++ b/sim/simx/execute_vector.cpp
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@ -42,6 +42,8 @@ enum class Opcode {
  // RV64 Standard Extension
  R_W       = 0x3b,
  I_W       = 0x1b,
+  // Vector Extension  
+  VSET      = 0x57,
  // Custom Extensions
  EXT1      = 0x0b,
  EXT2      = 0x2b,
@ -56,9 +58,28 @@ enum class InstType {
  B, 
  U, 
  J,
+  V,
  R4
 };

+// Bit flags recording which optional Instr fields have been set.
+// Each Instr setter ORs the matching flag into the instruction's use-mask
+// (returned by getVUseMask()), which the logging code tests to decide which
+// fields to print.
+enum set_vuse_mask {
+  set_func3 = (1 << 0),
+  set_func6 = (1 << 1),
+  set_imm = (1 << 2),
+  set_vlswidth = (1 << 3),
+  set_vmop = (1 << 4),
+  set_vumop = (1 << 5),
+  set_vnf = (1 << 6),
+  set_vmask = (1 << 7),
+  set_vs3 = (1 << 8),
+  set_zimm = (1 << 9),
+  set_vlmul = (1 << 10),
+  set_vsew = (1 << 11),
+  set_vta = (1 << 12),
+  set_vma = (1 << 13),
+  set_vediv = (1 << 14)
+};
+
 class Instr {
 public:
  Instr() 
@ -70,7 +91,22 @@ public:
    , rdest_(0)
    , func2_(0)
    , func3_(0)
-    , func7_(0) {
+    , func6_(0)
+    , func7_(0)
+    , vmask_(0)
+    , vlsWidth_(0)
+    , vMop_(0)
+    , vUmop_(0)
+    , vNf_(0)
+    , vs3_(0)
+    , has_zimm_(false)
+    , vlmul_(0)
+    , vsew_(0)
+    , vta_(0)
+    , vma_(0)
+    , vediv_(0)
+    , _vusemask(0)
+    , _is_vec(false)   {
    for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
       rsrc_type_[i] = RegType::None;
       rsrc_[i] = 0;
@ -93,13 +129,28 @@ public:
    num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1); 
  }
  void setFunc2(uint32_t func2) { func2_ = func2; }
-  void setFunc3(uint32_t func3) { func3_ = func3; }
+  void setFunc3(uint32_t func3) { func3_ = func3; _vusemask |= set_func3; }
+  void setFunc6(uint32_t func6) { func6_ = func6; _vusemask |= set_func6; }
  void setFunc7(uint32_t func7) { func7_ = func7; }
-  void setImm(uint32_t imm) { has_imm_ = true; imm_ = imm; }
+  void setImm(uint32_t imm) { has_imm_ = true; imm_ = imm; _vusemask |= set_imm; }
+  void setVlsWidth(uint32_t width) { vlsWidth_ = width; _vusemask |= set_vlswidth; }
+  void setVmop(uint32_t mop) { vMop_ = mop; _vusemask |= set_vmop; }
+  void setVumop(uint32_t umop) { vUmop_ = umop; _vusemask |= set_vumop; }
+  void setVnf(uint32_t nf) { vNf_ = nf; _vusemask |= set_vnf; }
+  void setVmask(uint32_t mask) { vmask_ = mask; _vusemask |= set_vmask; }
+  void setVs3(uint32_t vs) { vs3_ = vs; _vusemask |= set_vs3; }
+  void setZimm(bool has_zimm) { has_zimm_ = has_zimm; _vusemask |= set_zimm; }
+  void setVlmul(uint32_t lmul) { vlmul_ = lmul; _vusemask |= set_vlmul; }
+  void setVsew(uint32_t sew) { vsew_ = sew; _vusemask |= set_vsew; }
+  void setVta(uint32_t vta) { vta_ = vta; _vusemask |= set_vta; }
+  void setVma(uint32_t vma) { vma_ = vma; _vusemask |= set_vma; }
+  void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; _vusemask |= set_vediv; }
+  void setVec(bool is_vec) { _is_vec = is_vec; }

  Opcode   getOpcode() const { return opcode_; }
  uint32_t getFunc2() const { return func2_; }
  uint32_t getFunc3() const { return func3_; }
+  uint32_t getFunc6() const { return func6_; }
  uint32_t getFunc7() const { return func7_; }
  uint32_t getNRSrc() const { return num_rsrcs_; }
  uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
@ -108,6 +159,21 @@ public:
  RegType  getRDType() const { return rdest_type_; }  
  bool     hasImm() const { return has_imm_; }
  uint32_t getImm() const { return imm_; }
+  uint32_t getVlsWidth() const { return vlsWidth_; }
+  uint32_t getVmop() const { return vMop_; }
+  uint32_t getVumop() const { return vUmop_; }
+  uint32_t getVnf() const { return vNf_; }
+  uint32_t getVmask() const { return vmask_; }
+  uint32_t getVs3() const { return vs3_; }
+  bool     hasZimm() const { return has_zimm_; }
+  uint32_t getVlmul() const { return vlmul_; }
+  uint32_t getVsew() const { return 1 << (3 + vsew_); }
+  uint32_t getVsewO() const { return vsew_; }
+  uint32_t getVta() const { return vta_; }
+  uint32_t getVma() const { return vma_; }
+  uint32_t getVediv() const { return vediv_; }
+  uint32_t getVUseMask() const { return _vusemask; }
+  bool     isVec() const { return _is_vec; }

 private:

@ -125,8 +191,25 @@ private:
  uint32_t rdest_;
  uint32_t func2_;
  uint32_t func3_;
+  uint32_t func6_;
  uint32_t func7_;

+  // Vector
+  uint32_t vmask_;
+  uint32_t vlsWidth_;
+  uint32_t vMop_;
+  uint32_t vUmop_;
+  uint32_t vNf_;
+  uint32_t vs3_;
+  bool     has_zimm_;
+  uint32_t vlmul_;
+  uint32_t vsew_;
+  uint32_t vta_;
+  uint32_t vma_;
+  uint32_t vediv_;
+  uint32_t _vusemask;
+  bool     _is_vec;
+
  friend std::ostream &operator<<(std::ostream &, const Instr&);
 };

--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@ -84,7 +84,8 @@ enum class RegType {
  None,
  Integer,
  Float,
-  Count
+  Count,
+  Vector
 };

 inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
@ -92,6 +93,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
  case RegType::None: break;
  case RegType::Integer: os << "x"; break;
  case RegType::Float:   os << "f"; break;
+  case RegType::Vector:  os << "v"; break;
  default: assert(false);
  }
  return os;
--- a/sim/xrtsim/Makefile
+++ b/sim/xrtsim/Makefile
@ -51,7 +51,7 @@ endif

 DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)

-SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
+SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
 SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
 SRCS += $(SRC_DIR)/xrt.cpp $(SRC_DIR)/xrt_sim.cpp

--- a/tests/riscv/riscv-vector-tests/README
+++ b/tests/riscv/riscv-vector-tests/README
@ -0,0 +1,39 @@
+## Running the testcases
+
+```
+XLEN=32 ./run-test.sh testcase1 testcase2
+XLEN=64 ./run-test.sh testcase1 testcase2
+
+# or to run all default testcases
+XLEN=32 ./run-test.sh
+XLEN=64 ./run-test.sh
+```
+
+## Adding a new testcase
+
+The source code for the vector extension can be found in `sim/simx/execute_vector.cpp`.
+If you add support for a new vector instruction, please go to `run-test.sh` and add it to the default testcases.
+This will ensure your instruction is included in the regression test suite.
+
+## Updating the testcase binaries
+
+As `riscv-vector-tests` is still under development,
+we should periodically recompile the testcases and update the binaries.
+
+To update the test case binaries run:
+
+```
+XLEN=32 make -C ../../../third_party/ riscv-vector-tests
+XLEN=64 make -C ../../../third_party/ riscv-vector-tests
+```
+This requires Spike and Go to be installed on your machine.
+
+Then run the testcases that you want to update - this will automatically copy them e.g.:
+```
+XLEN=64 ./run-test.sh testcase1 testcase2
+```
+
+Finally use git to add the updated testcases to your commit (-f required due to .gitignore):
+```
+git add -f testcase1 testcase2
+```
--- a/tests/riscv/riscv-vector-tests/run-test.sh.in
+++ b/tests/riscv/riscv-vector-tests/run-test.sh.in
@ -0,0 +1,117 @@
+#!/bin/bash
+# Runs RISC-V vector test binaries on simx and reports how many passed.
+#
+# Usage: run-test.sh [testcase-prefix ...]   (no arguments: default list)
+#
+# Environment:
+#   VLEN                  selects the testcases/v<VLEN>x<XLEN> directory (default 256)
+#   XLEN                  selects the directory and toolchain prefix (default 32)
+#   TOOLDIR               used to build the default RISCV_TOOLCHAIN_PATH
+#   RISCV_TOOLCHAIN_PATH  toolchain root providing bin/riscv<XLEN>-unknown-elf-*
+#   REG_TESTS             when 1, print each log and delete the per-test files
+VLEN=${VLEN:-256}
+XLEN=${XLEN:-32}
+
+RISCV_TOOLCHAIN_PATH=${RISCV_TOOLCHAIN_PATH:-$TOOLDIR"/riscv"$XLEN"-gnu-toolchain"}
+
+# Directory containing this script, and the caller's directory to return to.
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+RESTORE_PREV_DIR=$(pwd)
+
+# Location and base name of the split testcase archive fetched by vector_tests.
+VECTOR_TESTS_REPOSITORY=https://github.com/MichaelJSr/testcases/raw/main
+VECTOR_TESTS_BASE_NAME=vector-tests.tar.bz2
+
+# Downloads the split testcase archive (parts "aa" through "al") into the
+# current directory, reassembles and extracts it, then removes the archive
+# files again.
+# Globals: VECTOR_TESTS_REPOSITORY, VECTOR_TESTS_BASE_NAME (read)
+# Returns: 0 on success, 1 if a part cannot be downloaded or the archive
+#          cannot be reassembled/extracted.
+vector_tests()
+{
+    local part status=0
+    for part in {a..l}
+    do
+        if ! wget "$VECTOR_TESTS_REPOSITORY/$VECTOR_TESTS_BASE_NAME.parta$part"; then
+            echo "failed to download $VECTOR_TESTS_BASE_NAME.parta$part" >&2
+            status=1
+            break
+        fi
+    done
+    # Only assemble the archive when every part arrived; a partial set would
+    # produce a truncated tarball.
+    if [ "$status" -eq 0 ]; then
+        if ! cat "$VECTOR_TESTS_BASE_NAME".part* > "$VECTOR_TESTS_BASE_NAME" \
+            || ! tar -xvf "$VECTOR_TESTS_BASE_NAME"; then
+            status=1
+        fi
+    fi
+    rm -f -- "$VECTOR_TESTS_BASE_NAME"*
+    return "$status"
+}
+
+# get selected testcases from command line or run default testcases
+if [ "$#" -eq 0 ];
+then
+  # write out test case name explicitly if there are collisions with other test names
+  # (an array literal may span lines, so no line continuations are needed)
+  testcases=(vset vmv vslide vmerge vrgather
+             vlm.v vsm.v
+             vle8 vle16 vle32
+             vse8 vse16 vse32
+             vlseg vlsseg vluxseg vloxseg
+             vsseg vssseg vsuxseg vsoxseg
+             vlse8 vlse16 vlse32
+             vsse8 vsse16 vsse32
+             vloxei vluxei vsoxei vsuxei
+             vl1r vl2r vl4r vl8r
+             vs1r vs2r vs4r vs8r
+             vadd vsub vmin vmax vand vor vxor
+             vmseq vmsne vmslt vmsle vmsgt
+             vsll vsrl vsra vssr
+             vaadd vasub
+             vfmin vfmax vfcvt vfsqrt vfrsqrt7 vfrec7 vfclass vfmv vfslide vfmerge
+             vfadd vfredusum vfsub vfredosum vfredmin vfredmax vfsgnj vmf vfdiv vfrdiv vfmul vfrsub
+             vfmacc vfnmacc vfmsac vfnmsac vfmadd vfnmadd vfmsub vfnmsub
+             vredsum vredand vredor vredxor vredmin vredmax
+             vwred
+             vmand vmor vmxor vmnand vmnor vmxnor
+             vdiv vrem vmul vsmul
+             vmadd vnmsub vmacc vnmsac
+             vwadd vwsub vwmul vwmacc
+             vrsub vcompress vnclip vssub vsadd vnsra vnsrl
+             vadc vmadc vsbc vmsbc
+             vsext vzext
+             vid)
+  if [ "$XLEN" -eq 64 ]; then
+    testcases+=(vle64 vse64 vlse64 vsse64 vfwcvt vfncvt
+                vfwadd vfwsub vfwmul vfwred vfwmacc vfwnmacc vfwmsac vfwnmsac)
+  fi
+else
+  # keep the arguments as an array so every name stays a separate element
+  testcases=("$@")
+fi
+
+cd "$SCRIPT_DIR" || exit 1
+
+# Fallback #2: If testcases directory exists, we will use existing testcases
+if [ ! -d "$SCRIPT_DIR/testcases" ]; then
+  mkdir testcases || exit 1
+  cd testcases || exit 1
+  # Fallback #3: Otherwise, download testcases
+  if ! vector_tests; then
+    echo "failed to fetch the vector testcases" >&2
+    # drop the directory created above so the next run retries the download
+    cd "$RESTORE_PREV_DIR"
+    rm -rf -- "${SCRIPT_DIR:?}/testcases"
+    exit 1
+  fi
+fi
+
+# The cleanup below deletes files by glob, so never run it from the wrong directory
+cd "$SCRIPT_DIR/testcases/v${VLEN}x${XLEN}" || exit 1
+
+# Fallback #1: Copy locally generated testcases (assuming they exist)
+rm -f -- *".ddr4.log"
+for testcase in "${testcases[@]}"; do
+  rm -f -- "$testcase"*.elf "$testcase"*.bin "$testcase"*.dump "$testcase"*.log
+  cp -f "$SCRIPT_DIR"/../../../third_party/riscv-vector-tests/out/v"$VLEN"x"$XLEN"machine/bin/stage2/"$testcase"* .
+done
+
+passed=0
+failed=0
+selected=0
+
+# count all available testcases, exclude *.elf, *.bin, *.dump, *.log to prevent double counting
+all=0
+for entry in *; do
+  [ -e "$entry" ] || continue
+  case "$entry" in
+    *.elf|*.bin|*.dump|*.log) ;;
+    *) all=$((all + 1)) ;;
+  esac
+done
+
+for testcase in "${testcases[@]}"; do
+  for f in "$testcase"*; do
+    if [ ! -e "$f" ]; then
+      # unmatched glob: there is nothing to run for this name, so report a
+      # failure instead of creating dangling "<pattern>.elf" links
+      echo "$f FAILED (no matching testcase file)"
+      failed=$((failed + 1))
+      selected=$((selected + 1))
+      continue
+    fi
+    ln -sf "$f" "$f.elf"
+    "$RISCV_TOOLCHAIN_PATH"/bin/riscv"$XLEN"-unknown-elf-objdump -D "$f.elf" > "$f.dump"
+    "$RISCV_TOOLCHAIN_PATH"/bin/riscv"$XLEN"-unknown-elf-objcopy -O binary "$f.elf" "$f.bin"
+    "$SCRIPT_DIR"/../../../sim/simx/simx -c 1 "$f.bin" &> "$f.log"
+    simx_status=$?
+    # this script treats simx exit status 13 as a pass
+    if [ "$simx_status" -eq 13 ]; then
+      echo "$f PASSED"
+      passed=$((passed + 1))
+    else
+      echo "$f FAILED"
+      failed=$((failed + 1))
+    fi
+    # REG_TESTS=1 informs the script to delete the previous binary after each vector test to save disk space
+    # Otherwise, the vector regression tests would run out of disk space eventually
+    # (REG_TESTS defaults to 0 so an unset variable does not break the test)
+    if [ "${REG_TESTS:-0}" -eq 1 ]; then
+      cat "$f.log"
+      rm -f -- "$f".* "$f"
+    fi
+    selected=$((selected + 1))
+  done
+done
+cd "$RESTORE_PREV_DIR"
+echo "Passed $passed out of $selected selected vector tests."
+echo "Total available vector tests: $all"
+# exit statuses wrap modulo 256; clamp so that e.g. 256 failures cannot be
+# reported as success
+exit $(( failed > 255 ? 255 : failed ))