vector ISA updates

2025-04-23 21:39:10 -04:00 · 2024-12-05 14:43:51 -08:00 · 2024-12-05 14:43:51 -08:00 · 6b23d290c3
commit 6b23d290c3
parent 5d91fe58ad
13 changed files with 858 additions and 859 deletions
--- a/ci/regression.sh.in
+++ b/ci/regression.sh.in
@ -394,7 +394,7 @@ vector()
 {
    echo "begin vector tests..."

-    make -C sim/simx
+    make -C sim/simx clean && CONFIGS="-DEXT_V_ENABLE" make -C sim/simx
    TOOLDIR=@TOOLDIR@ XLEN=@XLEN@ VLEN=256 REG_TESTS=1 ./tests/riscv/riscv-vector-tests/run-test.sh

    echo "vector tests done!"
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -830,6 +830,12 @@
    `define EXT_M_ENABLED   0
 `endif

+`ifdef EXT_V_ENABLE
+    `define EXT_V_ENABLED   1
+`else
+    `define EXT_V_ENABLED   0
+`endif
+
 `ifdef EXT_ZICOND_ENABLE
    `define EXT_ZICOND_ENABLED 1
 `else
@ -846,7 +852,7 @@
 `define ISA_STD_N           13
 `define ISA_STD_Q           16
 `define ISA_STD_S           18
-`define ISA_STD_U           20
+`define ISA_STD_V           21

 `define ISA_EXT_ICACHE      0
 `define ISA_EXT_DCACHE      1
@ -883,7 +889,7 @@
                | (0 << 18) /* S - Supervisor mode implemented */ \
                | (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
                | (1 << 20) /* U - User mode implemented */ \
-                | (0 << 21) /* V - Tentatively reserved for Vector extension */ \
+                | (`EXT_V_ENABLED << 21) /* V - Tentatively reserved for Vector extension */ \
                | (0 << 22) /* W - Reserved */ \
                | (1 << 23) /* X - Non-standard extensions present */ \
                | (0 << 24) /* Y - Reserved */ \
--- a/sim/common/rvfloats.cpp
+++ b/sim/common/rvfloats.cpp
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -12,11 +12,11 @@
 // limitations under the License.

 #include "rvfloats.h"
-#include "softfloat_ext.h"
 #include <stdio.h>

 extern "C" {
 #include <softfloat.h>
+#include "softfloat_ext.h"
 #include <internals.h>
 #include <../RISCV/specialize.h>
 }
@ -344,7 +344,7 @@ bool rv_fle_d(uint64_t a, uint64_t b, uint32_t* fflags) {
 bool rv_feq_s(uint32_t a, uint32_t b, uint32_t* fflags) {
  rv_init(0);
  auto r = f32_eq(to_float32_t(a), to_float32_t(b));
-  if (fflags) { *fflags = softfloat_exceptionFlags; }  
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

@ -355,11 +355,11 @@ bool rv_feq_d(uint64_t a, uint64_t b, uint32_t* fflags) {
  return r;
 }

-uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) {  
+uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) {
  uint32_t r;
  rv_init(0);
  if (isNaNF32UI(a) && isNaNF32UI(b)) {
-    r = defaultNaNF32UI;   
+    r = defaultNaNF32UI;
  } else {
    auto fa = to_float32_t(a);
    auto fb = to_float32_t(b);
@ -374,11 +374,11 @@ uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) {
  return r;
 }

-uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) {  
+uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) {
  uint64_t r;
  rv_init(0);
  if (isNaNF64UI(a) && isNaNF64UI(b)) {
-    r = defaultNaNF64UI;   
+    r = defaultNaNF64UI;
  } else {
    auto fa = to_float64_t(a);
    auto fb = to_float64_t(b);
@ -397,7 +397,7 @@ uint32_t rv_fmax_s(uint32_t a, uint32_t b, uint32_t* fflags) {
  uint32_t r;
  rv_init(0);
  if (isNaNF32UI(a) && isNaNF32UI(b)) {
-    r = defaultNaNF32UI;   
+    r = defaultNaNF32UI;
  } else {
    auto fa = to_float32_t(a);
    auto fb = to_float32_t(b);
@ -416,7 +416,7 @@ uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags) {
  uint64_t r;
  rv_init(0);
  if (isNaNF64UI(a) && isNaNF64UI(b)) {
-    r = defaultNaNF64UI;   
+    r = defaultNaNF64UI;
  } else {
    auto fa = to_float64_t(a);
    auto fb = to_float64_t(b);
@ -449,8 +449,8 @@ uint32_t rv_fclss_s(uint32_t a) {
      ( !sign && subnormOrZero && !fracZero )  << 5 |
      ( !sign && subnormOrZero && fracZero )   << 4 |
      ( isNaN &&  isSNaN )                     << 8 |
-      ( isNaN && !isSNaN )                     << 9;  
-  
+      ( isNaN && !isSNaN )                     << 9;
+
  return r;
 }

@ -472,8 +472,8 @@ uint32_t rv_fclss_d(uint64_t a) {
      ( !sign && subnormOrZero && !fracZero )  << 5 |
      ( !sign && subnormOrZero && fracZero )   << 4 |
      ( isNaN &&  isSNaN )                     << 8 |
-      ( isNaN && !isSNaN )                     << 9;  
-  
+      ( isNaN && !isSNaN )                     << 9;
+
  return r;
 }

@ -483,7 +483,7 @@ uint32_t rv_fsgnj_s(uint32_t a, uint32_t b) {
  return r;
 }

-uint64_t rv_fsgnj_d(uint64_t a, uint64_t b) {  
+uint64_t rv_fsgnj_d(uint64_t a, uint64_t b) {
  auto sign = b & F64_SIGN;
  auto r = sign | (a & ~F64_SIGN);
  return r;
@ -495,7 +495,7 @@ uint32_t rv_fsgnjn_s(uint32_t a, uint32_t b) {
  return r;
 }

-uint64_t rv_fsgnjn_d(uint64_t a, uint64_t b) {  
+uint64_t rv_fsgnjn_d(uint64_t a, uint64_t b) {
  auto sign = ~b & F64_SIGN;
  auto r = sign | (a & ~F64_SIGN);
  return r;
@ -508,7 +508,7 @@ uint32_t rv_fsgnjx_s(uint32_t a, uint32_t b) {
  return r;
 }

-uint64_t rv_fsgnjx_d(uint64_t a, uint64_t b) {  
+uint64_t rv_fsgnjx_d(uint64_t a, uint64_t b) {
  auto sign1 = a & F64_SIGN;
  auto sign2 = b & F64_SIGN;
  auto r = (sign1 ^ sign2) | (a & ~F64_SIGN);
--- a/sim/common/softfloat_ext.cpp
+++ b/sim/common/softfloat_ext.cpp
@ -33,110 +33,103 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 =============================================================================*/

-#include <assert.h>
-#include <stdbool.h>
-#include <internals.h>
-#include <../RISCV/specialize.h>
-#include <softfloat.h>
 #include "softfloat_ext.h"
+#include <../RISCV/specialize.h>
+#include <assert.h>
+#include <internals.h>
+#include <softfloat.h>
+#include <stdbool.h>

-uint_fast16_t f16_classify( float16_t a )
-{
-    union ui16_f16 uA;
-    uint_fast16_t uiA;
+#ifdef __cplusplus
+extern "C" {
+#endif

-    uA.f = a;
-    uiA = uA.ui;
+uint_fast16_t f16_classify(float16_t a) {
+  union ui16_f16 uA;
+  uint_fast16_t uiA;

-    uint_fast16_t infOrNaN = expF16UI( uiA ) == 0x1F;
-    uint_fast16_t subnormalOrZero = expF16UI( uiA ) == 0;
-    bool sign = signF16UI( uiA );
-    bool fracZero = fracF16UI( uiA ) == 0;
-    bool isNaN = isNaNF16UI( uiA );
-    bool isSNaN = softfloat_isSigNaNF16UI( uiA );
+  uA.f = a;
+  uiA = uA.ui;

-    return
-        (  sign && infOrNaN && fracZero )          << 0 |
-        (  sign && !infOrNaN && !subnormalOrZero ) << 1 |
-        (  sign && subnormalOrZero && !fracZero )  << 2 |
-        (  sign && subnormalOrZero && fracZero )   << 3 |
-        ( !sign && infOrNaN && fracZero )          << 7 |
-        ( !sign && !infOrNaN && !subnormalOrZero ) << 6 |
-        ( !sign && subnormalOrZero && !fracZero )  << 5 |
-        ( !sign && subnormalOrZero && fracZero )   << 4 |
-        ( isNaN &&  isSNaN )                       << 8 |
-        ( isNaN && !isSNaN )                       << 9;
+  uint_fast16_t infOrNaN = expF16UI(uiA) == 0x1F;
+  uint_fast16_t subnormalOrZero = expF16UI(uiA) == 0;
+  bool sign = signF16UI(uiA);
+  bool fracZero = fracF16UI(uiA) == 0;
+  bool isNaN = isNaNF16UI(uiA);
+  bool isSNaN = softfloat_isSigNaNF16UI(uiA);
+
+  return (sign && infOrNaN && fracZero) << 0 |
+         (sign && !infOrNaN && !subnormalOrZero) << 1 |
+         (sign && subnormalOrZero && !fracZero) << 2 |
+         (sign && subnormalOrZero && fracZero) << 3 |
+         (!sign && infOrNaN && fracZero) << 7 |
+         (!sign && !infOrNaN && !subnormalOrZero) << 6 |
+         (!sign && subnormalOrZero && !fracZero) << 5 |
+         (!sign && subnormalOrZero && fracZero) << 4 | (isNaN && isSNaN) << 8 |
+         (isNaN && !isSNaN) << 9;
 }

-uint_fast16_t f32_classify( float32_t a )
-{
-    union ui32_f32 uA;
-    uint_fast32_t uiA;
+uint_fast16_t f32_classify(float32_t a) {
+  union ui32_f32 uA;
+  uint_fast32_t uiA;

-    uA.f = a;
-    uiA = uA.ui;
+  uA.f = a;
+  uiA = uA.ui;

-    uint_fast16_t infOrNaN = expF32UI( uiA ) == 0xFF;
-    uint_fast16_t subnormalOrZero = expF32UI( uiA ) == 0;
-    bool sign = signF32UI( uiA );
-    bool fracZero = fracF32UI( uiA ) == 0;
-    bool isNaN = isNaNF32UI( uiA );
-    bool isSNaN = softfloat_isSigNaNF32UI( uiA );
+  uint_fast16_t infOrNaN = expF32UI(uiA) == 0xFF;
+  uint_fast16_t subnormalOrZero = expF32UI(uiA) == 0;
+  bool sign = signF32UI(uiA);
+  bool fracZero = fracF32UI(uiA) == 0;
+  bool isNaN = isNaNF32UI(uiA);
+  bool isSNaN = softfloat_isSigNaNF32UI(uiA);

-    return
-        (  sign && infOrNaN && fracZero )          << 0 |
-        (  sign && !infOrNaN && !subnormalOrZero ) << 1 |
-        (  sign && subnormalOrZero && !fracZero )  << 2 |
-        (  sign && subnormalOrZero && fracZero )   << 3 |
-        ( !sign && infOrNaN && fracZero )          << 7 |
-        ( !sign && !infOrNaN && !subnormalOrZero ) << 6 |
-        ( !sign && subnormalOrZero && !fracZero )  << 5 |
-        ( !sign && subnormalOrZero && fracZero )   << 4 |
-        ( isNaN &&  isSNaN )                       << 8 |
-        ( isNaN && !isSNaN )                       << 9;
+  return (sign && infOrNaN && fracZero) << 0 |
+         (sign && !infOrNaN && !subnormalOrZero) << 1 |
+         (sign && subnormalOrZero && !fracZero) << 2 |
+         (sign && subnormalOrZero && fracZero) << 3 |
+         (!sign && infOrNaN && fracZero) << 7 |
+         (!sign && !infOrNaN && !subnormalOrZero) << 6 |
+         (!sign && subnormalOrZero && !fracZero) << 5 |
+         (!sign && subnormalOrZero && fracZero) << 4 | (isNaN && isSNaN) << 8 |
+         (isNaN && !isSNaN) << 9;
 }

-uint_fast16_t f64_classify( float64_t a )
-{
-    union ui64_f64 uA;
-    uint_fast64_t uiA;
+uint_fast16_t f64_classify(float64_t a) {
+  union ui64_f64 uA;
+  uint_fast64_t uiA;

-    uA.f = a;
-    uiA = uA.ui;
+  uA.f = a;
+  uiA = uA.ui;

-    uint_fast16_t infOrNaN = expF64UI( uiA ) == 0x7FF;
-    uint_fast16_t subnormalOrZero = expF64UI( uiA ) == 0;
-    bool sign = signF64UI( uiA );
-    bool fracZero = fracF64UI( uiA ) == 0;
-    bool isNaN = isNaNF64UI( uiA );
-    bool isSNaN = softfloat_isSigNaNF64UI( uiA );
+  uint_fast16_t infOrNaN = expF64UI(uiA) == 0x7FF;
+  uint_fast16_t subnormalOrZero = expF64UI(uiA) == 0;
+  bool sign = signF64UI(uiA);
+  bool fracZero = fracF64UI(uiA) == 0;
+  bool isNaN = isNaNF64UI(uiA);
+  bool isSNaN = softfloat_isSigNaNF64UI(uiA);

-    return
-        (  sign && infOrNaN && fracZero )          << 0 |
-        (  sign && !infOrNaN && !subnormalOrZero ) << 1 |
-        (  sign && subnormalOrZero && !fracZero )  << 2 |
-        (  sign && subnormalOrZero && fracZero )   << 3 |
-        ( !sign && infOrNaN && fracZero )          << 7 |
-        ( !sign && !infOrNaN && !subnormalOrZero ) << 6 |
-        ( !sign && subnormalOrZero && !fracZero )  << 5 |
-        ( !sign && subnormalOrZero && fracZero )   << 4 |
-        ( isNaN &&  isSNaN )                       << 8 |
-        ( isNaN && !isSNaN )                       << 9;
+  return (sign && infOrNaN && fracZero) << 0 |
+         (sign && !infOrNaN && !subnormalOrZero) << 1 |
+         (sign && subnormalOrZero && !fracZero) << 2 |
+         (sign && subnormalOrZero && fracZero) << 3 |
+         (!sign && infOrNaN && fracZero) << 7 |
+         (!sign && !infOrNaN && !subnormalOrZero) << 6 |
+         (!sign && subnormalOrZero && !fracZero) << 5 |
+         (!sign && subnormalOrZero && fracZero) << 4 | (isNaN && isSNaN) << 8 |
+         (isNaN && !isSNaN) << 9;
 }

-static inline uint64_t extract64(uint64_t val, int pos, int len)
-{
+static inline uint64_t extract64(uint64_t val, int pos, int len) {
  assert(pos >= 0 && len > 0 && len <= 64 - pos);
  return (val >> pos) & (~UINT64_C(0) >> (64 - len));
 }

-static inline uint64_t make_mask64(int pos, int len)
-{
-    assert(pos >= 0 && len > 0 && pos < 64 && len <= 64);
-    return (UINT64_MAX >> (64 - len)) << pos;
+static inline uint64_t make_mask64(int pos, int len) {
+  assert(pos >= 0 && len > 0 && pos < 64 && len <= 64);
+  return (UINT64_MAX >> (64 - len)) << pos;
 }

-//user needs to truncate output to required length
+// user needs to truncate output to required length
 static inline uint64_t rsqrte7(uint64_t val, int e, int s, bool sub) {
  uint64_t exp = extract64(val, s, e);
  uint64_t sig = extract64(val, 0, s);
@ -144,343 +137,320 @@ static inline uint64_t rsqrte7(uint64_t val, int e, int s, bool sub) {
  const int p = 7;

  static const uint8_t table[] = {
-      52, 51, 50, 48, 47, 46, 44, 43,
-      42, 41, 40, 39, 38, 36, 35, 34,
-      33, 32, 31, 30, 30, 29, 28, 27,
-      26, 25, 24, 23, 23, 22, 21, 20,
-      19, 19, 18, 17, 16, 16, 15, 14,
-      14, 13, 12, 12, 11, 10, 10, 9,
-      9, 8, 7, 7, 6, 6, 5, 4,
-      4, 3, 3, 2, 2, 1, 1, 0,
-      127, 125, 123, 121, 119, 118, 116, 114,
-      113, 111, 109, 108, 106, 105, 103, 102,
-      100, 99, 97, 96, 95, 93, 92, 91,
-      90, 88, 87, 86, 85, 84, 83, 82,
-      80, 79, 78, 77, 76, 75, 74, 73,
-      72, 71, 70, 70, 69, 68, 67, 66,
-      65, 64, 63, 63, 62, 61, 60, 59,
-      59, 58, 57, 56, 56, 55, 54, 53};
+      52,  51,  50,  48,  47,  46,  44,  43,  42,  41,  40,  39,  38,  36,  35,
+      34,  33,  32,  31,  30,  30,  29,  28,  27,  26,  25,  24,  23,  23,  22,
+      21,  20,  19,  19,  18,  17,  16,  16,  15,  14,  14,  13,  12,  12,  11,
+      10,  10,  9,   9,   8,   7,   7,   6,   6,   5,   4,   4,   3,   3,   2,
+      2,   1,   1,   0,   127, 125, 123, 121, 119, 118, 116, 114, 113, 111, 109,
+      108, 106, 105, 103, 102, 100, 99,  97,  96,  95,  93,  92,  91,  90,  88,
+      87,  86,  85,  84,  83,  82,  80,  79,  78,  77,  76,  75,  74,  73,  72,
+      71,  70,  70,  69,  68,  67,  66,  65,  64,  63,  63,  62,  61,  60,  59,
+      59,  58,  57,  56,  56,  55,  54,  53};

  if (sub) {
-      while (extract64(sig, s - 1, 1) == 0)
-          exp--, sig <<= 1;
+    while (extract64(sig, s - 1, 1) == 0)
+      exp--, sig <<= 1;

-      sig = (sig << 1) & make_mask64(0 ,s);
+    sig = (sig << 1) & make_mask64(0, s);
  }

-  int idx = ((exp & 1) << (p-1)) | (sig >> (s-p+1));
-  uint64_t out_sig = (uint64_t)(table[idx]) << (s-p);
+  int idx = ((exp & 1) << (p - 1)) | (sig >> (s - p + 1));
+  uint64_t out_sig = (uint64_t)(table[idx]) << (s - p);
  uint64_t out_exp = (3 * make_mask64(0, e - 1) + ~exp) / 2;

-  return (sign << (s+e)) | (out_exp << s) | out_sig;
+  return (sign << (s + e)) | (out_exp << s) | out_sig;
 }

-float16_t f16_rsqrte7(float16_t in)
-{
-    union ui16_f16 uA;
+float16_t f16_rsqrte7(float16_t in) {
+  union ui16_f16 uA;

-    uA.f = in;
-    unsigned int ret = f16_classify(in);
-    bool sub = false;
-    switch(ret) {
-    case 0x001: // -inf
-    case 0x002: // -normal
-    case 0x004: // -subnormal
-    case 0x100: // sNaN
-        softfloat_exceptionFlags |= softfloat_flag_invalid;
-        [[fallthrough]];
-    case 0x200: //qNaN
-        uA.ui = defaultNaNF16UI;
-        break;
-    case 0x008: // -0
-        uA.ui = 0xfc00;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x010: // +0
-        uA.ui = 0x7c00;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x080: //+inf
-        uA.ui = 0x0;
-        break;
-    case 0x020: //+ sub
-        sub = true;
-        [[fallthrough]];
-    default: // +num
-        uA.ui = rsqrte7(uA.ui, 5, 10, sub);
-        break;
-    }
+  uA.f = in;
+  unsigned int ret = f16_classify(in);
+  bool sub = false;
+  switch (ret) {
+  case 0x001: // -inf
+  case 0x002: // -normal
+  case 0x004: // -subnormal
+  case 0x100: // sNaN
+    softfloat_exceptionFlags |= softfloat_flag_invalid;
+    [[fallthrough]];
+  case 0x200: // qNaN
+    uA.ui = defaultNaNF16UI;
+    break;
+  case 0x008: // -0
+    uA.ui = 0xfc00;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x010: // +0
+    uA.ui = 0x7c00;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x080: //+inf
+    uA.ui = 0x0;
+    break;
+  case 0x020: //+ sub
+    sub = true;
+    [[fallthrough]];
+  default: // +num
+    uA.ui = rsqrte7(uA.ui, 5, 10, sub);
+    break;
+  }

-    return uA.f;
+  return uA.f;
 }

-float32_t f32_rsqrte7(float32_t in)
-{
-    union ui32_f32 uA;
+float32_t f32_rsqrte7(float32_t in) {
+  union ui32_f32 uA;

-    uA.f = in;
-    unsigned int ret = f32_classify(in);
-    bool sub = false;
-    switch(ret) {
-    case 0x001: // -inf
-    case 0x002: // -normal
-    case 0x004: // -subnormal
-    case 0x100: // sNaN
-        softfloat_exceptionFlags |= softfloat_flag_invalid;
-        [[fallthrough]];
-    case 0x200: //qNaN
-        uA.ui = defaultNaNF32UI;
-        break;
-    case 0x008: // -0
-        uA.ui = 0xff800000;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x010: // +0
-        uA.ui = 0x7f800000;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x080: //+inf
-        uA.ui = 0x0;
-        break;
-    case 0x020: //+ sub
-        sub = true;
-        [[fallthrough]];
-    default: // +num
-        uA.ui = rsqrte7(uA.ui, 8, 23, sub);
-        break;
-    }
+  uA.f = in;
+  unsigned int ret = f32_classify(in);
+  bool sub = false;
+  switch (ret) {
+  case 0x001: // -inf
+  case 0x002: // -normal
+  case 0x004: // -subnormal
+  case 0x100: // sNaN
+    softfloat_exceptionFlags |= softfloat_flag_invalid;
+    [[fallthrough]];
+  case 0x200: // qNaN
+    uA.ui = defaultNaNF32UI;
+    break;
+  case 0x008: // -0
+    uA.ui = 0xff800000;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x010: // +0
+    uA.ui = 0x7f800000;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x080: //+inf
+    uA.ui = 0x0;
+    break;
+  case 0x020: //+ sub
+    sub = true;
+    [[fallthrough]];
+  default: // +num
+    uA.ui = rsqrte7(uA.ui, 8, 23, sub);
+    break;
+  }

-    return uA.f;
+  return uA.f;
 }

-float64_t f64_rsqrte7(float64_t in)
-{
-    union ui64_f64 uA;
+float64_t f64_rsqrte7(float64_t in) {
+  union ui64_f64 uA;

-    uA.f = in;
-    unsigned int ret = f64_classify(in);
-    bool sub = false;
-    switch(ret) {
-    case 0x001: // -inf
-    case 0x002: // -normal
-    case 0x004: // -subnormal
-    case 0x100: // sNaN
-        softfloat_exceptionFlags |= softfloat_flag_invalid;
-        [[fallthrough]];
-    case 0x200: //qNaN
-        uA.ui = defaultNaNF64UI;
-        break;
-    case 0x008: // -0
-        uA.ui = 0xfff0000000000000ul;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x010: // +0
-        uA.ui = 0x7ff0000000000000ul;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x080: //+inf
-        uA.ui = 0x0;
-        break;
-    case 0x020: //+ sub
-        sub = true;
-        [[fallthrough]];
-    default: // +num
-        uA.ui = rsqrte7(uA.ui, 11, 52, sub);
-        break;
-    }
+  uA.f = in;
+  unsigned int ret = f64_classify(in);
+  bool sub = false;
+  switch (ret) {
+  case 0x001: // -inf
+  case 0x002: // -normal
+  case 0x004: // -subnormal
+  case 0x100: // sNaN
+    softfloat_exceptionFlags |= softfloat_flag_invalid;
+    [[fallthrough]];
+  case 0x200: // qNaN
+    uA.ui = defaultNaNF64UI;
+    break;
+  case 0x008: // -0
+    uA.ui = 0xfff0000000000000ul;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x010: // +0
+    uA.ui = 0x7ff0000000000000ul;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x080: //+inf
+    uA.ui = 0x0;
+    break;
+  case 0x020: //+ sub
+    sub = true;
+    [[fallthrough]];
+  default: // +num
+    uA.ui = rsqrte7(uA.ui, 11, 52, sub);
+    break;
+  }

-    return uA.f;
+  return uA.f;
 }

-//user needs to truncate output to required length
+// user needs to truncate output to required length
 static inline uint64_t recip7(uint64_t val, int e, int s, int rm, bool sub,
-                              bool *round_abnormal)
-{
-    uint64_t exp = extract64(val, s, e);
-    uint64_t sig = extract64(val, 0, s);
-    uint64_t sign = extract64(val, s + e, 1);
-    const int p = 7;
+                              bool *round_abnormal) {
+  uint64_t exp = extract64(val, s, e);
+  uint64_t sig = extract64(val, 0, s);
+  uint64_t sign = extract64(val, s + e, 1);
+  const int p = 7;

-    static const uint8_t table[] = {
-        127, 125, 123, 121, 119, 117, 116, 114,
-        112, 110, 109, 107, 105, 104, 102, 100,
-        99, 97, 96, 94, 93, 91, 90, 88,
-        87, 85, 84, 83, 81, 80, 79, 77,
-        76, 75, 74, 72, 71, 70, 69, 68,
-        66, 65, 64, 63, 62, 61, 60, 59,
-        58, 57, 56, 55, 54, 53, 52, 51,
-        50, 49, 48, 47, 46, 45, 44, 43,
-        42, 41, 40, 40, 39, 38, 37, 36,
-        35, 35, 34, 33, 32, 31, 31, 30,
-        29, 28, 28, 27, 26, 25, 25, 24,
-        23, 23, 22, 21, 21, 20, 19, 19,
-        18, 17, 17, 16, 15, 15, 14, 14,
-        13, 12, 12, 11, 11, 10, 9, 9,
-        8, 8, 7, 7, 6, 5, 5, 4,
-        4, 3, 3, 2, 2, 1, 1, 0};
+  static const uint8_t table[] = {
+      127, 125, 123, 121, 119, 117, 116, 114, 112, 110, 109, 107, 105, 104, 102,
+      100, 99,  97,  96,  94,  93,  91,  90,  88,  87,  85,  84,  83,  81,  80,
+      79,  77,  76,  75,  74,  72,  71,  70,  69,  68,  66,  65,  64,  63,  62,
+      61,  60,  59,  58,  57,  56,  55,  54,  53,  52,  51,  50,  49,  48,  47,
+      46,  45,  44,  43,  42,  41,  40,  40,  39,  38,  37,  36,  35,  35,  34,
+      33,  32,  31,  31,  30,  29,  28,  28,  27,  26,  25,  25,  24,  23,  23,
+      22,  21,  21,  20,  19,  19,  18,  17,  17,  16,  15,  15,  14,  14,  13,
+      12,  12,  11,  11,  10,  9,   9,   8,   8,   7,   7,   6,   5,   5,   4,
+      4,   3,   3,   2,   2,   1,   1,   0};

-    if (sub) {
-        while (extract64(sig, s - 1, 1) == 0)
-            exp--, sig <<= 1;
+  if (sub) {
+    while (extract64(sig, s - 1, 1) == 0)
+      exp--, sig <<= 1;

-        sig = (sig << 1) & make_mask64(0 ,s);
+    sig = (sig << 1) & make_mask64(0, s);

-        if (exp != 0 && exp != UINT64_MAX) {
-            *round_abnormal = true;
-            if (rm == 1 ||
-                (rm == 2 && !sign) ||
-                (rm == 3 && sign))
-                return ((sign << (s+e)) | make_mask64(s, e)) - 1;
-            else
-                return (sign << (s+e)) | make_mask64(s, e);
-        }
+    if (exp != 0 && exp != UINT64_MAX) {
+      *round_abnormal = true;
+      if (rm == 1 || (rm == 2 && !sign) || (rm == 3 && sign))
+        return ((sign << (s + e)) | make_mask64(s, e)) - 1;
+      else
+        return (sign << (s + e)) | make_mask64(s, e);
    }
+  }

-    int idx = sig >> (s-p);
-    uint64_t out_sig = (uint64_t)(table[idx]) << (s-p);
-    uint64_t out_exp = 2 * make_mask64(0, e - 1) + ~exp;
-    if (out_exp == 0 || out_exp == UINT64_MAX) {
-        out_sig = (out_sig >> 1) | make_mask64(s - 1, 1);
-        if (out_exp == UINT64_MAX) {
-            out_sig >>= 1;
-            out_exp = 0;
-        }
+  int idx = sig >> (s - p);
+  uint64_t out_sig = (uint64_t)(table[idx]) << (s - p);
+  uint64_t out_exp = 2 * make_mask64(0, e - 1) + ~exp;
+  if (out_exp == 0 || out_exp == UINT64_MAX) {
+    out_sig = (out_sig >> 1) | make_mask64(s - 1, 1);
+    if (out_exp == UINT64_MAX) {
+      out_sig >>= 1;
+      out_exp = 0;
    }
+  }

-    return (sign << (s+e)) | (out_exp << s) | out_sig;
+  return (sign << (s + e)) | (out_exp << s) | out_sig;
 }

-float16_t f16_recip7(float16_t in)
-{
-    union ui16_f16 uA;
+float16_t f16_recip7(float16_t in) {
+  union ui16_f16 uA;

-    uA.f = in;
-    unsigned int ret = f16_classify(in);
-    bool sub = false;
-    bool round_abnormal = false;
-    switch(ret) {
-    case 0x001: // -inf
-        uA.ui = 0x8000;
-        break;
-    case 0x080: //+inf
-        uA.ui = 0x0;
-        break;
-    case 0x008: // -0
-        uA.ui = 0xfc00;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x010: // +0
-        uA.ui = 0x7c00;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x100: // sNaN
-        softfloat_exceptionFlags |= softfloat_flag_invalid;
-        [[fallthrough]];
-    case 0x200: //qNaN
-        uA.ui = defaultNaNF16UI;
-        break;
-    case 0x004: // -subnormal
-    case 0x020: //+ sub
-        sub = true;
-        [[fallthrough]];
-    default: // +- normal
-        uA.ui = recip7(uA.ui, 5, 10,
-                       softfloat_roundingMode, sub, &round_abnormal);
-        if (round_abnormal)
-            softfloat_exceptionFlags |= softfloat_flag_inexact |
-                                        softfloat_flag_overflow;
-        break;
-    }
+  uA.f = in;
+  unsigned int ret = f16_classify(in);
+  bool sub = false;
+  bool round_abnormal = false;
+  switch (ret) {
+  case 0x001: // -inf
+    uA.ui = 0x8000;
+    break;
+  case 0x080: //+inf
+    uA.ui = 0x0;
+    break;
+  case 0x008: // -0
+    uA.ui = 0xfc00;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x010: // +0
+    uA.ui = 0x7c00;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x100: // sNaN
+    softfloat_exceptionFlags |= softfloat_flag_invalid;
+    [[fallthrough]];
+  case 0x200: // qNaN
+    uA.ui = defaultNaNF16UI;
+    break;
+  case 0x004: // -subnormal
+  case 0x020: //+ sub
+    sub = true;
+    [[fallthrough]];
+  default: // +- normal
+    uA.ui = recip7(uA.ui, 5, 10, softfloat_roundingMode, sub, &round_abnormal);
+    if (round_abnormal)
+      softfloat_exceptionFlags |=
+          softfloat_flag_inexact | softfloat_flag_overflow;
+    break;
+  }

-    return uA.f;
+  return uA.f;
 }

-float32_t f32_recip7(float32_t in)
-{
-    union ui32_f32 uA;
+float32_t f32_recip7(float32_t in) {
+  union ui32_f32 uA;

-    uA.f = in;
-    unsigned int ret = f32_classify(in);
-    bool sub = false;
-    bool round_abnormal = false;
-    switch(ret) {
-    case 0x001: // -inf
-        uA.ui = 0x80000000;
-        break;
-    case 0x080: //+inf
-        uA.ui = 0x0;
-        break;
-    case 0x008: // -0
-        uA.ui = 0xff800000;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x010: // +0
-        uA.ui = 0x7f800000;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x100: // sNaN
-        softfloat_exceptionFlags |= softfloat_flag_invalid;
-        [[fallthrough]];
-    case 0x200: //qNaN
-        uA.ui = defaultNaNF32UI;
-        break;
-    case 0x004: // -subnormal
-    case 0x020: //+ sub
-        sub = true;
-        [[fallthrough]];
-    default: // +- normal
-        uA.ui = recip7(uA.ui, 8, 23,
-                       softfloat_roundingMode, sub, &round_abnormal);
-        if (round_abnormal)
-          softfloat_exceptionFlags |= softfloat_flag_inexact |
-                                      softfloat_flag_overflow;
-        break;
-    }
+  uA.f = in;
+  unsigned int ret = f32_classify(in);
+  bool sub = false;
+  bool round_abnormal = false;
+  switch (ret) {
+  case 0x001: // -inf
+    uA.ui = 0x80000000;
+    break;
+  case 0x080: //+inf
+    uA.ui = 0x0;
+    break;
+  case 0x008: // -0
+    uA.ui = 0xff800000;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x010: // +0
+    uA.ui = 0x7f800000;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x100: // sNaN
+    softfloat_exceptionFlags |= softfloat_flag_invalid;
+    [[fallthrough]];
+  case 0x200: // qNaN
+    uA.ui = defaultNaNF32UI;
+    break;
+  case 0x004: // -subnormal
+  case 0x020: //+ sub
+    sub = true;
+    [[fallthrough]];
+  default: // +- normal
+    uA.ui = recip7(uA.ui, 8, 23, softfloat_roundingMode, sub, &round_abnormal);
+    if (round_abnormal)
+      softfloat_exceptionFlags |=
+          softfloat_flag_inexact | softfloat_flag_overflow;
+    break;
+  }

-    return uA.f;
+  return uA.f;
 }

-float64_t f64_recip7(float64_t in)
-{
-    union ui64_f64 uA;
+float64_t f64_recip7(float64_t in) {
+  union ui64_f64 uA;

-    uA.f = in;
-    unsigned int ret = f64_classify(in);
-    bool sub = false;
-    bool round_abnormal = false;
-    switch(ret) {
-    case 0x001: // -inf
-        uA.ui = 0x8000000000000000;
-        break;
-    case 0x080: //+inf
-        uA.ui = 0x0;
-        break;
-    case 0x008: // -0
-        uA.ui = 0xfff0000000000000;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x010: // +0
-        uA.ui = 0x7ff0000000000000;
-        softfloat_exceptionFlags |= softfloat_flag_infinite;
-        break;
-    case 0x100: // sNaN
-        softfloat_exceptionFlags |= softfloat_flag_invalid;
-        [[fallthrough]];
-    case 0x200: //qNaN
-        uA.ui = defaultNaNF64UI;
-        break;
-    case 0x004: // -subnormal
-    case 0x020: //+ sub
-        sub = true;
-        [[fallthrough]];
-    default: // +- normal
-        uA.ui = recip7(uA.ui, 11, 52,
-                       softfloat_roundingMode, sub, &round_abnormal);
-        if (round_abnormal)
-            softfloat_exceptionFlags |= softfloat_flag_inexact |
-                                        softfloat_flag_overflow;
-        break;
-    }
+  uA.f = in;
+  unsigned int ret = f64_classify(in);
+  bool sub = false;
+  bool round_abnormal = false;
+  switch (ret) {
+  case 0x001: // -inf
+    uA.ui = 0x8000000000000000;
+    break;
+  case 0x080: //+inf
+    uA.ui = 0x0;
+    break;
+  case 0x008: // -0
+    uA.ui = 0xfff0000000000000;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x010: // +0
+    uA.ui = 0x7ff0000000000000;
+    softfloat_exceptionFlags |= softfloat_flag_infinite;
+    break;
+  case 0x100: // sNaN
+    softfloat_exceptionFlags |= softfloat_flag_invalid;
+    [[fallthrough]];
+  case 0x200: // qNaN
+    uA.ui = defaultNaNF64UI;
+    break;
+  case 0x004: // -subnormal
+  case 0x020: //+ sub
+    sub = true;
+    [[fallthrough]];
+  default: // +- normal
+    uA.ui = recip7(uA.ui, 11, 52, softfloat_roundingMode, sub, &round_abnormal);
+    if (round_abnormal)
+      softfloat_exceptionFlags |=
+          softfloat_flag_inexact | softfloat_flag_overflow;
+    break;
+  }

-    return uA.f;
-}
+  return uA.f;
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/sim/common/softfloat_ext.h
+++ b/sim/common/softfloat_ext.h
@ -1,14 +1,22 @@
 #include <stdint.h>
 #include <softfloat_types.h>

-uint_fast16_t f16_classify( float16_t );
-float16_t f16_rsqrte7( float16_t );
-float16_t f16_recip7( float16_t );
+#ifdef __cplusplus
+extern "C" {
+#endif

-uint_fast16_t f32_classify( float32_t );
-float32_t f32_rsqrte7( float32_t );
-float32_t f32_recip7( float32_t );
+uint_fast16_t f16_classify(float16_t);
+float16_t f16_rsqrte7(float16_t);
+float16_t f16_recip7(float16_t);

-uint_fast16_t f64_classify( float64_t );
-float64_t f64_rsqrte7( float64_t );
-float64_t f64_recip7( float64_t );
+uint_fast16_t f32_classify(float32_t);
+float32_t f32_rsqrte7(float32_t);
+float32_t f32_recip7(float32_t);
+
+uint_fast16_t f64_classify(float64_t);
+float64_t f64_rsqrte7(float64_t);
+float64_t f64_recip7(float64_t);
+
+#ifdef __cplusplus
+}
+#endif
--- a/sim/common/util.cpp
+++ b/sim/common/util.cpp
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,10 +16,15 @

 // return file extension
 const char* fileExtension(const char* filepath) {
-    const char *ext = strrchr(filepath, '.');
-    if (ext == NULL || ext == filepath) 
-      return "";
-    return ext + 1;
+  const char *ext = strrchr(filepath, '.');
+  // no dot at all, or the dot is the first character of the path
+  if (ext == nullptr || ext == filepath)
+    return "";
+  // a dot inside a directory component ("../app", "out.d/app") or one that
+  // starts the file name ("dir/.hidden") does not introduce an extension
+  if (strchr(ext, '/') != nullptr || ext[-1] == '/')
+    return "";
+  return ext + 1;
 }

 void* aligned_malloc(size_t size, size_t alignment) {
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@ -18,7 +18,12 @@ LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulator -lramulator

 SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
-SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/execute_vector.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp
+SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp
+
+# Add V extension sources
+ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
+  SRCS += $(SRC_DIR)/execute_v.cpp
+endif

 # Debugging
 ifdef DEBUG
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@ -390,7 +390,7 @@ static const char* op_string(const Instr &instr) {
    default:
      std::abort();
    }
-  
+
  case Opcode::TCU:
    switch(func3)
    {
@ -405,36 +405,31 @@ static const char* op_string(const Instr &instr) {
  }
 }

-inline void vec_log(std::ostream &os, const Instr &instr) {
-  if (instr.getVUseMask() & set_func3)
-    os << ", func3:" << instr.getFunc3();
-  if (instr.getVUseMask() & set_func6)
-    os << ", func6:" << instr.getFunc6();
-  if (instr.getVUseMask() & set_imm)
-    os << ", imm:" << instr.getImm();
-  if (instr.getVUseMask() & set_vlswidth)
+inline void print_vec_attr(std::ostream &os, const Instr &instr) {
+  uint32_t mask = instr.getVattrMask();
+  if (mask & vattr_vlswidth)
    os << ", width:" << instr.getVlsWidth();
-  if (instr.getVUseMask() & set_vmop)
+  if (mask & vattr_vmop)
    os << ", mop:" << instr.getVmop();
-  if (instr.getVUseMask() & set_vumop)
+  if (mask & vattr_vumop)
    os << ", umop:" << instr.getVumop();
-  if (instr.getVUseMask() & set_vnf)
+  if (mask & vattr_vnf)
    os << ", nf:" << instr.getVnf();
-  if (instr.getVUseMask() & set_vmask)
+  if (mask & vattr_vmask)
    os << ", vmask:" << instr.getVmask();
-  if (instr.getVUseMask() & set_vs3)
+  if (mask & vattr_vs3)
    os << ", vs3:" << instr.getVs3();
-  if (instr.getVUseMask() & set_zimm)
+  if (mask & vattr_zimm)
    os << ", zimm:" << ((instr.hasZimm()) ? "true" : "false");
-  if (instr.getVUseMask() & set_vlmul)
+  if (mask & vattr_vlmul)
    os << ", lmul:" << instr.getVlmul();
-  if (instr.getVUseMask() & set_vsew)
+  if (mask & vattr_vsew)
    os << ", sew:" << instr.getVsew();
-  if (instr.getVUseMask() & set_vta)
+  if (mask & vattr_vta)
    os << ", ta:" << instr.getVta();
-  if (instr.getVUseMask() & set_vma)
+  if (mask & vattr_vma)
    os << ", ma:" << instr.getVma();
-  if (instr.getVUseMask() & set_vediv)
+  if (mask & vattr_vediv)
    os << ", ediv:" << instr.getVediv();
 }

@ -463,8 +458,10 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
    if (sep++ != 0) { os << ", "; } else { os << " "; }
    os << "0x" << std::hex << instr.getRSrc(0);
  }
-  // Log vector-specific vtype and vreg info
-  if (instr.isVec()) vec_log(os, instr);
+  // Log vector-specific attributes
+  if (instr.getVattrMask() != 0) {
+    print_vec_attr(os, instr);
+  }
  return os;
 }
 }
@ -478,6 +475,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
  auto func3 = (code >> shift_func3) & mask_func3;
  auto func6 = (code >> shift_func6) & mask_func6;
  auto func7 = (code >> shift_func7) & mask_func7;
+  __unused(func6);

  auto rd  = (code >> shift_rd)  & mask_reg;
  auto rs1 = (code >> shift_rs1) & mask_reg;
@ -690,9 +688,18 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
    auto imm = (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
    instr->setImm(sext(imm, width_j_imm+1));
  } break;
-    
+
+  case InstType::R4: {
+    instr->setDestReg(rd, RegType::Float);
+    instr->addSrcReg(rs1, RegType::Float);
+    instr->addSrcReg(rs2, RegType::Float);
+    instr->addSrcReg(rs3, RegType::Float);
+    instr->setFunc2(func2);
+    instr->setFunc3(func3);
+  } break;
+
+#ifdef EXT_V_ENABLE
  case InstType::V:
-    instr->setVec(true);
    switch (op) {
    case Opcode::VSET: {
      instr->setDestReg(rd, RegType::Integer);
@ -738,7 +745,6 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
        }
      }
    } break;
-
    case Opcode::FL:
      instr->addSrcReg(rs1, RegType::Integer);
      instr->setVmop((code >> shift_vmop) & 0b11);
@ -788,14 +794,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
      std::abort();
    }
    break;
-  case InstType::R4:
-    instr->setDestReg(rd, RegType::Float);
-    instr->addSrcReg(rs1, RegType::Float);
-    instr->addSrcReg(rs2, RegType::Float);
-    instr->addSrcReg(rs3, RegType::Float);
-    instr->setFunc2(func2);
-    instr->setFunc3(func3);
-    break;
+#endif

  default:
    std::abort();
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@ -43,7 +43,9 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
  this->uuid = 0;
  this->fcsr = 0;

-  std::srand(50);
+  this->vtype = {0, 0, 0, 0, 0};
+  this->vl = 0;
+  this->VLMAX = 0;

  for (auto& reg_file : this->ireg_file) {
    for (auto& reg : reg_file) {
@ -102,6 +104,8 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
    , scratchpad(std::vector<Word>(32 * 32 * 32768))
    , csrs_(arch.num_warps())
 {
+  std::srand(50);
+
  for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
    csrs_.at(i).resize(arch.num_threads());
  }
@ -142,8 +146,7 @@ void Emulator::clear() {
  warps_[0].tmask.set(0);
  wspawn_.valid = false;

-  for (auto& reg : scratchpad) 
-  {
+  for (auto& reg : scratchpad) {
    reg = 0;
  }
 }
@ -190,6 +193,7 @@ instr_trace_t* Emulator::step() {
  assert(warp.tmask.any());

 #ifndef NDEBUG
+  // generate unique universal instruction ID
  uint32_t instr_uuid = warp.uuid++;
  uint32_t g_wid = core_->id() * arch_.num_warps() + scheduled_warp;
  uint64_t uuid = (uint64_t(g_wid) << 32) | instr_uuid;
@ -305,27 +309,26 @@ bool Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
 #ifdef VM_ENABLE
 void Emulator::icache_read(void *data, uint64_t addr, uint32_t size) {
-  DP(3, "*** icache_read 0x" << std::hex << addr << ", size = 0x "  << size);
-
-  try  
+  DP(3, "*** icache_read 0x" << std::hex << addr << ", size = 0x" << size);
+  try
   {
     mmu_.read(data, addr, size, ACCESS_TYPE::FETCH);
   }
-  catch (Page_Fault_Exception& page_fault)  
+  catch (Page_Fault_Exception& page_fault)
   {
     std::cout<<page_fault.what()<<std::endl;
     throw;
-  }  
+  }
 }
 #else
 void Emulator::icache_read(void *data, uint64_t addr, uint32_t size) {
-    mmu_.read(data, addr, size, 0);
+  mmu_.read(data, addr, size, 0);
 }
 #endif

 #ifdef VM_ENABLE
 void Emulator::set_satp(uint64_t satp) {
  DPH(3, "set satp 0x" << std::hex << satp << " in emulator module\n");
-  set_csr(VX_CSR_SATP,satp,0,0); 
+  set_csr(VX_CSR_SATP,satp,0,0);
 }
 #endif

@ -337,11 +340,11 @@ void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) {
  if (type == AddrType::Shared) {
    core_->local_mem()->read(data, addr, size);
  } else {
-    try  
+    try
    {
      mmu_.read(data, addr, size, ACCESS_TYPE::LOAD);
    }
-    catch (Page_Fault_Exception& page_fault)  
+    catch (Page_Fault_Exception& page_fault)
    {
      std::cout<<page_fault.what()<<std::endl;
      throw;
@ -373,16 +376,16 @@ void Emulator::dcache_write(const void* data, uint64_t addr, uint32_t size) {
    if (type == AddrType::Shared) {
      core_->local_mem()->write(data, addr, size);
    } else {
-      try  
+      try
      {
        // mmu_.write(data, addr, size, 0);
        mmu_.write(data, addr, size, ACCESS_TYPE::STORE);
      }
-      catch (Page_Fault_Exception& page_fault)  
+      catch (Page_Fault_Exception& page_fault)
      {
        std::cout<<page_fault.what()<<std::endl;
        throw;
-      }  
+      }
    }
  }
  DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
@ -450,18 +453,15 @@ void Emulator::cout_flush() {
    case (addr + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((value >> 32) & 0xFFFFFFFF)
 #endif

-Word Emulator::get_tiles()
-{
+Word Emulator::get_tiles() {
  return mat_size;
 }

-Word Emulator::get_tc_size()
-{
+Word Emulator::get_tc_size() {
  return tc_size;
 }

-Word Emulator::get_tc_num()
-{
+Word Emulator::get_tc_num() {
  return tc_num;
 }

@ -680,7 +680,7 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
  case VX_TC_SIZE:
    tc_size = value;
    break;
-  
+
  default: {
      std::cout << "Error: invalid CSR write addr=0x" << std::hex << addr << ", value=0x" << value << std::dec << std::endl;
      std::abort();
@ -688,8 +688,6 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
  }
 }

-
-
 uint32_t Emulator::get_fpu_rm(uint32_t func3, uint32_t tid, uint32_t wid) {
  return (func3 == 0x7) ? this->get_csr(VX_CSR_FRM, tid, wid) : func3;
 }
@ -711,4 +709,4 @@ void Emulator::trigger_ecall() {
 }
 void Emulator::trigger_ebreak() {
  active_warps_.reset();
-}
+}
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@ -28,76 +28,6 @@ class Core;
 class Instr;
 class instr_trace_t;

-enum Constants {
-  width_opcode= 7,
-  width_reg   = 5,
-  width_func2 = 2,
-  width_func3 = 3,
-  width_func6 = 6,
-  width_func7 = 7,
-  width_mop   = 3,
-  width_vmask = 1,
-  width_i_imm = 12,
-  width_j_imm = 20,
-  width_v_zimm = 11,
-  width_v_ma = 1,
-  width_v_ta = 1,
-  width_v_sew = 3,
-  width_v_lmul = 3,
-  width_aq    = 1,
-  width_rl    = 1,
-
-  shift_opcode= 0,
-  shift_rd    = width_opcode,
-  shift_func3 = shift_rd + width_reg,
-  shift_rs1   = shift_func3 + width_func3,
-  shift_rs2   = shift_rs1 + width_reg,
-  shift_func2 = shift_rs2 + width_reg,
-  shift_func7 = shift_rs2 + width_reg,
-  shift_rs3   = shift_func7 + width_func2,
-  shift_vmop  = shift_func7 + width_vmask,
-  shift_vnf   = shift_vmop + width_mop,
-  shift_func6 = shift_func7 + width_vmask,
-  shift_vset  = shift_func7 + width_func6,
-  shift_v_sew = width_v_lmul,
-  shift_v_ta  = shift_v_sew + width_v_sew,
-  shift_v_ma  = shift_v_ta + width_v_ta,
-
-  mask_opcode = (1 << width_opcode) - 1,
-  mask_reg    = (1 << width_reg)   - 1,
-  mask_func2  = (1 << width_func2) - 1,
-  mask_func3  = (1 << width_func3) - 1,
-  mask_func6  = (1 << width_func6) - 1,
-  mask_func7  = (1 << width_func7) - 1,
-  mask_i_imm  = (1 << width_i_imm) - 1,
-  mask_j_imm  = (1 << width_j_imm) - 1,
-  mask_v_zimm = (1 << width_v_zimm) - 1,
-  mask_v_ma   = (1 << width_v_ma) - 1,
-  mask_v_ta   = (1 << width_v_ta) - 1,
-  mask_v_sew  = (1 << width_v_sew) - 1,
-  mask_v_lmul  = (1 << width_v_lmul) - 1,
-};
-
-struct vtype {
-  uint32_t vill;
-  uint32_t vma;
-  uint32_t vta;
-  uint32_t vsew;
-  uint32_t vlmul;
-};
-
-union reg_data_t {
-  Word     u;
-  WordI    i;
-  WordF    f;
-  float    f32;
-  double   f64;
-  uint32_t u32;
-  uint64_t u64;
-  int32_t  i32;
-  int64_t  i64;
-};
-
 class Emulator {
 public:
  Emulator(const Arch &arch,
@ -126,11 +56,11 @@ public:
  bool wspawn(uint32_t num_warps, Word nextPC);

  int get_exitcode() const;
-  
+
  Word get_tiles();
  Word get_tc_size();
  Word get_tc_num();
-  
+
  void dcache_read(void* data, uint64_t addr, uint32_t size);

  void dcache_write(const void* data, uint64_t addr, uint32_t size);
@ -151,6 +81,26 @@ private:
    bool        fallthrough;
  };

+  struct vtype_t {
+    uint32_t vill;
+    uint32_t vma;
+    uint32_t vta;
+    uint32_t vsew;
+    uint32_t vlmul;
+  };
+
+  union reg_data_t {
+    Word     u;
+    WordI    i;
+    WordF    f;
+    float    f32;
+    double   f64;
+    uint32_t u32;
+    uint64_t u64;
+    int32_t  i32;
+    int64_t  i64;
+  };
+
  struct warp_t {
    warp_t(const Arch& arch);
    void clear(uint64_t startup_addr);
@ -162,11 +112,10 @@ private:
    std::vector<std::vector<Byte>>    vreg_file;
    std::stack<ipdom_entry_t>         ipdom_stack;
    Byte                              fcsr;
+    vtype_t                           vtype;
+    uint32_t                          vl;
+    Word                              VLMAX;
    uint32_t                          uuid;
-
-    struct vtype vtype;
-    uint32_t vl;
-    Word VLMAX;
  };

  struct wspawn_t {
@ -179,11 +128,11 @@ private:

  void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace);

-  void executeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);
-
+#ifdef EXT_V_ENABLE
  void loadVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
-
  void storeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
+  void executeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);
+#endif

  void icache_read(void* data, uint64_t addr, uint32_t size);

@ -203,9 +152,10 @@ private:

  void update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid);

-  void trigger_ecall(); // Re-added for riscv-vector test functionality
-
-  void trigger_ebreak(); // Re-added for riscv-vector test functionality
+  // temporarily added for riscv-vector tests
+  // TODO: remove once ecall/ebreak are supported
+  void trigger_ecall();
+  void trigger_ebreak();

  const Arch& arch_;
  const DCRS& dcrs_;
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@ -677,7 +677,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
      for (uint32_t t = thread_start; t < num_threads; ++t) {
        if (!warp.tmask.test(t))
          continue;
-        uint64_t mem_addr = rsdata[t][0].i + immsrc;         
+        uint64_t mem_addr = rsdata[t][0].i + immsrc;
        uint64_t read_data = 0;
        this->dcache_read(&read_data, mem_addr, data_bytes);
        trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
@ -703,12 +703,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
          rddata[t].u64 = read_data;
          break;
        default:
-          std::abort();      
+          std::abort();
        }
      }
      rd_write = true;
    } else {
-      loadVector(instr, wid, rsdata);
+    #ifdef EXT_V_ENABLE
+      this->loadVector(instr, wid, rsdata);
+    #endif
    }
    break;
  }
@ -736,14 +738,16 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
        case 1:
        case 2:
        case 3:
-          this->dcache_write(&write_data, mem_addr, data_bytes);  
+          this->dcache_write(&write_data, mem_addr, data_bytes);
          break;
        default:
          std::abort();
        }
      }
    } else {
-      storeVector(instr, wid, rsdata);
+    #ifdef EXT_V_ENABLE
+      this->storeVector(instr, wid, rsdata);
+    #endif
    }
    break;
  }
@ -1595,6 +1599,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
        std::abort();
    }
  } break;
+#ifdef EXT_V_ENABLE
  case Opcode::VSET: {
    auto func6 = instr.getFunc6();
    if ((func3 == 0x7) || (func3 == 0x2 && func6 == 16) || (func3 == 0x1 && func6 == 16)) {
@ -1602,6 +1607,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
    }
    executeVector(instr, wid, rsdata, rddata);
  } break;
+#endif
  default:
    std::abort();
  }
--- a/sim/simx/execute_vector.cpp
+++ b/sim/simx/execute_vector.cpp
@ -1132,7 +1132,7 @@ bool isMasked(std::vector<std::vector<Byte>> &vreg_file, uint32_t maskVreg, uint
  auto& mask = vreg_file.at(maskVreg);
  uint8_t emask = *(uint8_t *)(mask.data() + byteI / 8);
  uint8_t value = (emask >> (byteI % 8)) & 0x1;
-  DP(1, "Masking enabled: " << +!vmask << " mask element: " << +value);
+  DP(4, "Masking enabled: " << +!vmask << " mask element: " << +value);
  return !vmask && value == 0;
 }

@ -1164,14 +1164,14 @@ void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emula
  }
  for (uint32_t i = 0; i < vl * nfields; i++) {
    if (isMasked(vreg_file, 0, i / nfields, vmask)) continue;
-    
+
    uint32_t nfields_strided = strided ? nfields : 1;
    Word mem_addr = ((rsdata[0][0].i) & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
    Word mem_data = 0;
    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
-    DP(1, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    DP(4, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
-    DP(1, "Previous data: " << +result);
+    DP(4, "Previous data: " << +result);
    result = (DT) mem_data;
  }
 }
@ -1225,13 +1225,13 @@ void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulat
        std::cout << "Unsupported iSew: " << iSew << std::endl;
        std::abort();
    }
-    
+
    Word mem_addr = ((rsdata[0][0].i) & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT);
    Word mem_data = 0;
    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
-    DP(1, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    DP(4, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
-    DP(1, "Previous data: " << +result);
+    DP(4, "Previous data: " << +result);
    result = (DT) mem_data;
  }
 }
@ -1256,104 +1256,6 @@ void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulat
  }
 }

-void Emulator::loadVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
-  auto &warp = warps_.at(wid);
-  auto vmask  = instr.getVmask();
-  auto rdest  = instr.getRDest();
-  auto mop = instr.getVmop();
-  switch (mop) {
-    case 0b00: { // unit-stride
-      auto lumop  = instr.getVumop();
-      switch (lumop) {
-        case 0b10000: // vle8ff.v, vle16ff.v, vle32ff.v, vle64ff.v - we do not support exceptions -> treat like regular unit stride
-                       // vlseg2e8ff.v, vlseg2e16ff.v, vlseg2e32ff.v, vlseg2e64ff.v
-                       // vlseg3e8ff.v, vlseg3e16ff.v, vlseg3e32ff.v, vlseg3e64ff.v
-                       // vlseg4e8ff.v, vlseg4e16ff.v, vlseg4e32ff.v, vlseg4e64ff.v
-                       // vlseg5e8ff.v, vlseg5e16ff.v, vlseg5e32ff.v, vlseg5e64ff.v
-                       // vlseg6e8ff.v, vlseg6e16ff.v, vlseg6e32ff.v, vlseg6e64ff.v
-                       // vlseg7e8ff.v, vlseg7e16ff.v, vlseg7e32ff.v, vlseg7e64ff.v
-                       // vlseg8e8ff.v, vlseg8e16ff.v, vlseg8e32ff.v, vlseg8e64ff.v
-        case 0b0000: { // vle8.v, vle16.v, vle32.v, vle64.v
-                       // vlseg2e8.v, vlseg2e16.v, vlseg2e32.v, vlseg2e64.v
-                       // vlseg3e8.v, vlseg3e16.v, vlseg3e32.v, vlseg3e64.v
-                       // vlseg4e8.v, vlseg4e16.v, vlseg4e32.v, vlseg4e64.v
-                       // vlseg5e8.v, vlseg5e16.v, vlseg5e32.v, vlseg5e64.v
-                       // vlseg6e8.v, vlseg6e16.v, vlseg6e32.v, vlseg6e64.v
-                       // vlseg7e8.v, vlseg7e16.v, vlseg7e32.v, vlseg7e64.v
-                       // vlseg8e8.v, vlseg8e16.v, vlseg8e32.v, vlseg8e64.v
-          WordI stride = warp.vtype.vsew / 8;
-          uint32_t nfields = instr.getVnf() + 1;
-          vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask);
-          break;
-        }
-        case 0b1000: { // vl1r.v, vl2r.v, vl4r.v, vl8r.v
-          uint32_t nreg = instr.getVnf() + 1;
-          if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) {
-            std::cout << "Whole vector register load - reserved value for nreg: " << nreg << std::endl;
-            std::abort();
-          }
-          DP(1, "Whole vector register load with nreg: " << nreg);
-          uint32_t vl = nreg * VLEN / instr.getVsew();
-          WordI stride = instr.getVsew() / 8;
-          vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, instr.getVsew(), vl, false, stride, 1, 0, vmask);
-          break;
-        }
-        case 0b1011: { // vlm.v
-          if (warp.vtype.vsew != 8) {
-            std::cout << "vlm.v only supports EEW=8, but EEW was: " << warp.vtype.vsew << std::endl;
-            std::abort();
-          }
-          WordI stride = warp.vtype.vsew / 8;
-          vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true);
-          break;
-        }
-        default:
-          std::cout << "Load vector - unsupported lumop: " << lumop << std::endl;
-          std::abort();
-      }
-      break;
-    }
-    case 0b10: { // strided: vlse8.v, vlse16.v, vlse32.v, vlse64.v
-                 // vlsseg2e8.v, vlsseg2e16.v, vlsseg2e32.v, vlsseg2e64.v
-                 // vlsseg3e8.v, vlsseg3e16.v, vlsseg3e32.v, vlsseg3e64.v
-                 // vlsseg4e8.v, vlsseg4e16.v, vlsseg4e32.v, vlsseg4e64.v
-                 // vlsseg5e8.v, vlsseg5e16.v, vlsseg5e32.v, vlsseg5e64.v
-                 // vlsseg6e8.v, vlsseg6e16.v, vlsseg6e32.v, vlsseg6e64.v
-                 // vlsseg7e8.v, vlsseg7e16.v, vlsseg7e32.v, vlsseg7e64.v
-                 // vlsseg8e8.v, vlsseg8e16.v, vlsseg8e32.v, vlsseg8e64.v
-      auto rsrc1  = instr.getRSrc(1);
-      auto rdest  = instr.getRDest();
-      WordI stride = warp.ireg_file.at(0).at(rsrc1);
-      uint32_t nfields = instr.getVnf() + 1;
-      vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask);
-      break;
-    }
-    case 0b01: // indexed - unordered, vluxei8.v, vluxei16.v, vluxei32.v, vluxei64.v
-               // vluxseg2e8.v, vluxseg2e16.v, vluxseg2e32.v, vluxseg2e64.v
-               // vluxseg3e8.v, vluxseg3e16.v, vluxseg3e32.v, vluxseg3e64.v
-               // vluxseg4e8.v, vluxseg4e16.v, vluxseg4e32.v, vluxseg4e64.v
-               // vluxseg5e8.v, vluxseg5e16.v, vluxseg5e32.v, vluxseg5e64.v
-               // vluxseg6e8.v, vluxseg6e16.v, vluxseg6e32.v, vluxseg6e64.v
-               // vluxseg7e8.v, vluxseg7e16.v, vluxseg7e32.v, vluxseg7e64.v
-               // vluxseg8e8.v, vluxseg8e16.v, vluxseg8e32.v, vluxseg8e64.v
-    case 0b11: { // indexed - ordered, vloxei8.v, vloxei16.v, vloxei32.v, vloxei64.v
-                 // vloxseg2e8.v, vloxseg2e16.v, vloxseg2e32.v, vloxseg2e64.v
-                 // vloxseg3e8.v, vloxseg3e16.v, vloxseg3e32.v, vloxseg3e64.v
-                 // vloxseg4e8.v, vloxseg4e16.v, vloxseg4e32.v, vloxseg4e64.v
-                 // vloxseg5e8.v, vloxseg5e16.v, vloxseg5e32.v, vloxseg5e64.v
-                 // vloxseg6e8.v, vloxseg6e16.v, vloxseg6e32.v, vloxseg6e64.v
-                 // vloxseg7e8.v, vloxseg7e16.v, vloxseg7e32.v, vloxseg7e64.v
-                 // vloxseg8e8.v, vloxseg8e16.v, vloxseg8e32.v, vloxseg8e64.v
-      uint32_t nfields = instr.getVnf() + 1;
-      vector_op_vv_load(warp.vreg_file, this, rsdata, instr.getRSrc(1), rdest, warp.vtype.vsew, instr.getVsew(), warp.vl, nfields, warp.vtype.vlmul, vmask);
-      break;
-    }
-    default:
-      std::cout << "Load vector - unsupported mop: " << mop << std::endl;
-      std::abort();
-  }
-}
-
 template <typename DT>
 void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, std::vector<reg_data_t[3]> &rsdata, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
  uint32_t vsew = sizeof(DT) * 8;
@ -1364,7 +1266,7 @@ void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emul
    uint32_t nfields_strided = strided ? nfields : 1;
    Word mem_addr = rsdata[0][0].i + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
    Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
-    DP(1, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    DP(4, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
  }
 }
@ -1417,7 +1319,7 @@ void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emula

    Word mem_addr = rsdata[0][0].i + offset + (i % nfields) * sizeof(DT);
    Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
-    DP(1, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    DP(4, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
  }
 }
@ -1442,97 +1344,16 @@ void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emula
  }
 }

-void Emulator::storeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
-  auto &warp = warps_.at(wid);
-  auto vmask  = instr.getVmask();
-  auto mop = instr.getVmop();
-  switch (mop) {
-    case 0b00: { // unit-stride
-      auto vs3  = instr.getRSrc(1);
-      auto sumop  = instr.getVumop();
-      WordI stride = warp.vtype.vsew / 8;
-      switch (sumop) {
-        case 0b0000: { // vse8.v, vse16.v, vse32.v, vse64.v
-          uint32_t nfields = instr.getVnf() + 1;
-          vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask);
-          break;
-        }
-        case 0b1000: { // vs1r.v, vs2r.v, vs4r.v, vs8r.v
-          uint32_t nreg = instr.getVnf() + 1;
-          if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) {
-            std::cout << "Whole vector register store - reserved value for nreg: " << nreg << std::endl;
-            std::abort();
-          }
-          DP(1, "Whole vector register store with nreg: " << nreg);
-          uint32_t vl = nreg * VLEN / 8;
-          vector_op_vix_store<uint8_t>(warp.vreg_file, this, rsdata, vs3, vl, false, stride, 1, 0, vmask);
-          break;
-        }
-        case 0b1011: { // vsm.v
-          if (warp.vtype.vsew != 8) {
-            std::cout << "vsm.v only supports EEW=8, but EEW was: " << warp.vtype.vsew << std::endl;
-            std::abort();
-          }
-          vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true);
-          break;
-        }
-        default:
-          std::cout << "Store vector - unsupported sumop: " << sumop << std::endl;
-          std::abort();
-      }
-      break;
-    }
-    case 0b10: { // strided: vsse8.v, vsse16.v, vsse32.v, vsse64.v
-                 // vssseg2e8.v, vssseg2e16.v, vssseg2e32.v, vssseg2e64.v
-                 // vssseg3e8.v, vssseg3e16.v, vssseg3e32.v, vssseg3e64.v
-                 // vssseg4e8.v, vssseg4e16.v, vssseg4e32.v, vssseg4e64.v
-                 // vssseg5e8.v, vssseg5e16.v, vssseg5e32.v, vssseg5e64.v
-                 // vssseg6e8.v, vssseg6e16.v, vssseg6e32.v, vssseg6e64.v
-                 // vssseg7e8.v, vssseg7e16.v, vssseg7e32.v, vssseg7e64.v
-                 // vssseg8e8.v, vssseg8e16.v, vssseg8e32.v, vssseg8e64.v
-      auto rsrc1  = instr.getRSrc(1);
-      auto vs3  = instr.getRSrc(2);
-      WordI stride = warp.ireg_file.at(0).at(rsrc1);
-      uint32_t nfields = instr.getVnf() + 1;
-      vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask);
-      break;
-    }
-    case 0b01: // indexed - unordered, vsuxei8.v, vsuxei16.v, vsuxei32.v, vsuxei64.v
-               // vsuxseg2ei8.v, vsuxseg2ei16.v, vsuxseg2ei32.v, vsuxseg2ei64.v
-               // vsuxseg3ei8.v, vsuxseg3ei16.v, vsuxseg3ei32.v, vsuxseg3ei64.v
-               // vsuxseg4ei8.v, vsuxseg4ei16.v, vsuxseg4ei32.v, vsuxseg4ei64.v
-               // vsuxseg5ei8.v, vsuxseg5ei16.v, vsuxseg5ei32.v, vsuxseg5ei64.v
-               // vsuxseg6ei8.v, vsuxseg6ei16.v, vsuxseg6ei32.v, vsuxseg6ei64.v
-               // vsuxseg7ei8.v, vsuxseg7ei16.v, vsuxseg7ei32.v, vsuxseg7ei64.v
-               // vsuxseg8ei8.v, vsuxseg8ei16.v, vsuxseg8ei32.v, vsuxseg8ei64.v
-    case 0b11: { // indexed - ordered, vsoxei8.v, vsoxei16.v, vsoxei32.v, vsoxei64.v
-                 // vsoxseg2ei8.v, vsoxseg2ei16.v, vsoxseg2ei32.v, vsoxseg2ei64.v
-                 // vsoxseg3ei8.v, vsoxseg3ei16.v, vsoxseg3ei32.v, vsoxseg3ei64.v
-                 // vsoxseg4ei8.v, vsoxseg4ei16.v, vsoxseg4ei32.v, vsoxseg4ei64.v
-                 // vsoxseg5ei8.v, vsoxseg5ei16.v, vsoxseg5ei32.v, vsoxseg5ei64.v
-                 // vsoxseg6ei8.v, vsoxseg6ei16.v, vsoxseg6ei32.v, vsoxseg6ei64.v
-                 // vsoxseg7ei8.v, vsoxseg7ei16.v, vsoxseg7ei32.v, vsoxseg7ei64.v
-                 // vsoxseg8ei8.v, vsoxseg8ei16.v, vsoxseg8ei32.v, vsoxseg8ei64.v
-      uint32_t nfields = instr.getVnf() + 1;
-      vector_op_vv_store(warp.vreg_file, this, rsdata, instr.getRSrc(1), instr.getRSrc(2), warp.vtype.vsew, instr.getVsew(), warp.vl, nfields, warp.vtype.vlmul, vmask);
-      break;
-    }
-    default:
-      std::cout << "Store vector - unsupported mop: " << mop << std::endl;
-      std::abort();      
-  }
-}
-
 template <template <typename DT1, typename DT2> class OP, typename DT>
 void vector_op_vix(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask)
 {
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask)) continue;
-    
+
    DT second = getVregData<DT>(vreg_file, rsrc0, i);
    DT third = getVregData<DT>(vreg_file, rdest, i);
    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(1, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    getVregData<DT>(vreg_file, rdest, i) = result;
  }
 }
@ -1557,11 +1378,11 @@ void vector_op_vix(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_
 template <template <typename DT1, typename DT2> class OP, typename DT>
 void vector_op_vix_carry(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl)
 {
-  for (uint32_t i = 0; i < vl; i++) {    
+  for (uint32_t i = 0; i < vl; i++) {
    DT second = getVregData<DT>(vreg_file, rsrc0, i);
    bool third = !isMasked(vreg_file, 0, i, false);
    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(1, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    getVregData<DT>(vreg_file, rdest, i) = result;
  }
 }
@ -1586,11 +1407,11 @@ void vector_op_vix_carry(Word src1, std::vector<std::vector<Byte>> &vreg_file, u
 template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
 void vector_op_vix_carry_out(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask)
 {
-  for (uint32_t i = 0; i < vl; i++) {    
+  for (uint32_t i = 0; i < vl; i++) {
    DT second = getVregData<DT>(vreg_file, rsrc0, i);
    bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
    bool result = OP<DT, DTR>::apply(first, second, third);
-    DP(1, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    if (result) {
      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
    } else {
@ -1621,7 +1442,7 @@ void vector_op_vix_merge(DT first, std::vector<std::vector<Byte>> &vreg_file, ui
 {
  for (uint32_t i = 0; i < vl; i++) {
    DT result = isMasked(vreg_file, 0, i, vmask) ? getVregData<DT>(vreg_file, rsrc0, i) : first;
-    DP(1, "Merge - Choosing result: " << +result);
+    DP(4, "Merge - Choosing result: " << +result);
    getVregData<DT>(vreg_file, rdest, i) = result;
  }
 }
@ -1673,7 +1494,7 @@ void vector_op_vix_w(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32
    DT second = getVregData<DT>(vreg_file, rsrc0, i);
    DTR third = getVregData<DTR>(vreg_file, rdest, i);
    DTR result = OP<DT, DTR>::apply(first, second, third);
-    DP(1, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    getVregData<DTR>(vreg_file, rdest, i) = result;
  }
 }
@ -1716,7 +1537,7 @@ void vector_op_vix_n(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32

    DT second = getVregData<DT>(vreg_file, rsrc0, i);
    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(1, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
+    DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
    getVregData<DTR>(vreg_file, rdest, i) = result;
  }
 }
@ -1744,7 +1565,7 @@ void vector_op_vix_sat(DTR first, std::vector<std::vector<Byte>> &vreg_file, uin

    DT second = getVregData<DTR>(vreg_file, rsrc0, i);
    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(1, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
+    DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
    getVregData<DTR>(vreg_file, rdest, i) = result;
  }
 }
@ -1854,7 +1675,7 @@ void vector_op_vix_mask(DT first, std::vector<std::vector<Byte>> &vreg_file, uin

    DT second = getVregData<DT>(vreg_file, rsrc0, i);
    bool result = OP<DT, bool>::apply(first, second, 0);
-    DP(1, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
+    DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
    if (result) {
      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
    } else {
@ -1889,7 +1710,7 @@ void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file,
  // If scalar set is set this means we have a v(f)slide1up or v(f)slide1down instruction,
  // so first is our scalar value and we need to overwrite it with 1 for later computations
  if (scalar && vl && !isMasked(vreg_file, 0, scalarPos, vmask)) {
-    DP(1, "Slide - Moving scalar value " << +first << " to position " << +scalarPos);
+    DP(4, "Slide - Moving scalar value " << +first << " to position " << +scalarPos);
    getVregData<DT>(vreg_file, rdest, scalarPos) = first;
  }
  first = scalar ? 1 : first;
@ -1899,7 +1720,7 @@ void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file,

    __uint128_t iSrc = slideDown ? (__uint128_t)i + (__uint128_t)first : (__uint128_t)i - (__uint128_t)first; // prevent overflows/underflows
    DT value = (!slideDown || iSrc < VLMAX) ? getVregData<DT>(vreg_file, rsrc0, iSrc) : 0;
-    DP(1, "Slide - Moving value " << +value << " from position " << (uint64_t)iSrc << " to position " << +i);
+    DP(4, "Slide - Moving value " << +value << " from position " << (uint64_t)iSrc << " to position " << +i);
    getVregData<DT>(vreg_file, rdest, i) = value;
  }
 }
@ -1928,7 +1749,7 @@ void vector_op_vix_gather(Word first, std::vector<std::vector<Byte>> &vreg_file,
    if (isMasked(vreg_file, 0, i, vmask)) continue;

    DT value = first < VLMAX ? getVregData<DT>(vreg_file, rsrc0, first) : 0;
-    DP(1, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
+    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
    getVregData<DT>(vreg_file, rdest, i) = value;
  }
 }
@ -1960,7 +1781,7 @@ void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uin
    DT second = getVregData<DT>(vreg_file, rsrc1, i);
    DT third = getVregData<DT>(vreg_file, rdest, i);
    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(1, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    getVregData<DT>(vreg_file, rdest, i) = result;
  }
 }
@ -1990,7 +1811,7 @@ void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc
    DT second = getVregData<DT>(vreg_file, rsrc1, i);
    bool third = !isMasked(vreg_file, 0, i, false);
    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(1, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    getVregData<DT>(vreg_file, rdest, i) = result;
  }
 }
@ -2020,7 +1841,7 @@ void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t
    DT second = getVregData<DT>(vreg_file, rsrc1, i);
    bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
    bool result = OP<DT, DTR>::apply(first, second, third);
-    DP(1, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    if (result) {
      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
    } else {
@ -2052,7 +1873,7 @@ void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc
  for (uint32_t i = 0; i < vl; i++) {
    uint32_t rsrc = isMasked(vreg_file, 0, i, vmask) ? rsrc1 : rsrc0;
    DT result = getVregData<DT>(vreg_file, rsrc, i);
-    DP(1, "Merge - Choosing result: " << +result);
+    DP(4, "Merge - Choosing result: " << +result);
    getVregData<DT>(vreg_file, rdest, i) = result;
  }
 }
@ -2082,7 +1903,7 @@ void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsr

    uint32_t first = ei16 ? getVregData<uint16_t>(vreg_file, rsrc0, i) : getVregData<DT>(vreg_file, rsrc0, i);
    DT value = first < VLMAX ? getVregData<DT>(vreg_file, rsrc1, first) : 0;
-    DP(1, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
+    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
    getVregData<DT>(vreg_file, rdest, i) = value;
  }
 }
@ -2114,7 +1935,7 @@ void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, u
    DT second = getVregData<DT>(vreg_file, rsrc1, i);
    DTR third = getVregData<DTR>(vreg_file, rdest, i);
    DTR result = OP<DT, DTR>::apply(first, second, third);
-    DP(1, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    getVregData<DTR>(vreg_file, rdest, i) = result;
  }
 }
@ -2144,7 +1965,7 @@ void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
    DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
    DTR third = getVregData<DTR>(vreg_file, rdest, i);
    DTR result = OP<DTR, DTR>::apply(first, second, third);
-    DP(1, "Widening wv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, "Widening wv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    getVregData<DTR>(vreg_file, rdest, i) = result;
  }
 }
@ -2174,7 +1995,7 @@ void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
    DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
    DTR third = getVregData<DTR>(vreg_file, rdest, i);
    DTR result = OP<DTR, DTR>::apply(rv_ftod(first), second, third);
-    DP(1, "Widening wfv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DP(4, "Widening wfv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    getVregData<DTR>(vreg_file, rdest, i) = result;
  }
 }
@ -2199,7 +2020,7 @@ void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, u
    DTR first = getVregData<DTR>(vreg_file, rsrc0, i);
    DT second = getVregData<DT>(vreg_file, rsrc1, i);
    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(1, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
+    DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
    getVregData<DTR>(vreg_file, rdest, i) = result;
  }
 }
@ -2228,7 +2049,7 @@ void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
    DT first = getVregData<DTR>(vreg_file, rsrc0, i);
    DT second = getVregData<DTR>(vreg_file, rsrc1, i);
    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(1, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
+    DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
    getVregData<DTR>(vreg_file, rdest, i) = result;
  }
 }
@ -2280,9 +2101,9 @@ void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
    DT first = getVregData<DT>(vreg_file, rdest, 0);
    DT second = getVregData<DT>(vreg_file, rsrc1, i);
    DT result = OP<DT, DT>::apply(first, second, 0);
-    DP(1, "Reduction " << (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
+    DP(4, "Reduction " << (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
    getVregData<DT>(vreg_file, rdest, 0) = result;
-  } 
+  }
 }

 template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
@ -2316,9 +2137,9 @@ void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc
    DT second = getVregData<DT>(vreg_file, rsrc1, i);
    DTR second_w = std::is_signed<DT>() ? sext((DTR) second, sizeof(DT) * 8) : zext((DTR) second, sizeof(DT) * 8);
    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
-    DP(1, "Widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
+    DP(4, "Widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
    getVregData<DTR>(vreg_file, rdest, 0) = result;
-  } 
+  }
 }

 template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
@ -2350,9 +2171,9 @@ void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsr
    DT second = getVregData<DT>(vreg_file, rsrc1, i);
    DTR second_w = rv_ftod(second);
    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
-    DP(1, "Float widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
+    DP(4, "Float widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
    getVregData<DTR>(vreg_file, rdest, 0) = result;
-  } 
+  }
 }

 template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
@ -2372,9 +2193,9 @@ void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, ui
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask)) continue;

-    DP(1, "Element Index = " << +i);
+    DP(4, "Element Index = " << +i);
    getVregData<DT>(vreg_file, rdest, i) = i;
-  } 
+  }
 }

 void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask)
@ -2402,7 +2223,7 @@ void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0
    DT first = getVregData<DT>(vreg_file, rsrc0, i);
    DT second = getVregData<DT>(vreg_file, rsrc1, i);
    bool result = OP<DT, bool>::apply(first, second, 0);
-    DP(1, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
+    DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
    if (result) {
      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
    } else {
@ -2437,7 +2258,7 @@ void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0
    uint8_t secondMask = getVregData<uint8_t>(vreg_file, rsrc1, i / 8);
    bool second = (secondMask >> (i % 8)) & 0x1;
    bool result = OP<uint8_t, uint8_t>::apply(first, second, 0) & 0x1;
-    DP(1, "Compare mask bits " << (OP<uint8_t, uint8_t>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
+    DP(4, "Compare mask bits " << (OP<uint8_t, uint8_t>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
    if (result) {
      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
    } else {
@ -2456,7 +2277,7 @@ void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t r
    if (isMasked(vreg_file, rsrc0, i, 0)) continue;

    DT value = getVregData<DT>(vreg_file, rsrc1, i);
-    DP(1, "Compression - Moving value " << +value << " from position " << i << " to position " << currPos);
+    DP(4, "Compression - Moving value " << +value << " from position " << i << " to position " << currPos);
    getVregData<DT>(vreg_file, rdest, currPos) = value;
    currPos++;
  }
@ -2479,6 +2300,185 @@ void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t r
  }
 }

+void Emulator::loadVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+  auto &warp = warps_.at(wid);
+  auto vmask  = instr.getVmask();
+  auto rdest  = instr.getRDest();
+  auto mop = instr.getVmop();
+  switch (mop) {
+    case 0b00: { // unit-stride: the stride handed to the load helper is the element width in bytes; lumop selects the variant
+      auto lumop  = instr.getVumop();
+      switch (lumop) {
+        case 0b10000: // vle8ff.v, vle16ff.v, vle32ff.v, vle64ff.v - we do not support exceptions -> treat like regular unit stride
+                       // vlseg2e8ff.v, vlseg2e16ff.v, vlseg2e32ff.v, vlseg2e64ff.v
+                       // vlseg3e8ff.v, vlseg3e16ff.v, vlseg3e32ff.v, vlseg3e64ff.v
+                       // vlseg4e8ff.v, vlseg4e16ff.v, vlseg4e32ff.v, vlseg4e64ff.v
+                       // vlseg5e8ff.v, vlseg5e16ff.v, vlseg5e32ff.v, vlseg5e64ff.v
+                       // vlseg6e8ff.v, vlseg6e16ff.v, vlseg6e32ff.v, vlseg6e64ff.v
+                       // vlseg7e8ff.v, vlseg7e16ff.v, vlseg7e32ff.v, vlseg7e64ff.v
+                       // vlseg8e8ff.v, vlseg8e16ff.v, vlseg8e32ff.v, vlseg8e64ff.v
+        case 0b0000: { // vle8.v, vle16.v, vle32.v, vle64.v
+                       // vlseg2e8.v, vlseg2e16.v, vlseg2e32.v, vlseg2e64.v
+                       // vlseg3e8.v, vlseg3e16.v, vlseg3e32.v, vlseg3e64.v
+                       // vlseg4e8.v, vlseg4e16.v, vlseg4e32.v, vlseg4e64.v
+                       // vlseg5e8.v, vlseg5e16.v, vlseg5e32.v, vlseg5e64.v
+                       // vlseg6e8.v, vlseg6e16.v, vlseg6e32.v, vlseg6e64.v
+                       // vlseg7e8.v, vlseg7e16.v, vlseg7e32.v, vlseg7e64.v
+                       // vlseg8e8.v, vlseg8e16.v, vlseg8e32.v, vlseg8e64.v
+          WordI stride = warp.vtype.vsew / 8;
+          uint32_t nfields = instr.getVnf() + 1;
+          vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask);
+          break;
+        }
+        case 0b1000: { // vl1r.v, vl2r.v, vl4r.v, vl8r.v - whole-register load: nreg = nf + 1 must be 1, 2, 4 or 8; the element count is derived from VLEN and instr.getVsew() instead of warp.vl
+          uint32_t nreg = instr.getVnf() + 1;
+          if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) {
+            std::cout << "Whole vector register load - reserved value for nreg: " << nreg << std::endl;
+            std::abort();
+          }
+          DP(4, "Whole vector register load with nreg: " << nreg);
+          uint32_t vl = nreg * VLEN / instr.getVsew();
+          WordI stride = instr.getVsew() / 8;
+          vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, instr.getVsew(), vl, false, stride, 1, 0, vmask);
+          break;
+        }
+        case 0b1011: { // vlm.v - loads (vl + 7) / 8 elements with the mask argument forced to true; NOTE(review): the V spec gives vlm.v a fixed EEW of 8 regardless of vtype.vsew, so aborting on vsew != 8 looks stricter than required - confirm
+          if (warp.vtype.vsew != 8) {
+            std::cout << "vlm.v only supports EEW=8, but EEW was: " << warp.vtype.vsew << std::endl;
+            std::abort();
+          }
+          WordI stride = warp.vtype.vsew / 8;
+          vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true);
+          break;
+        }
+        default:
+          std::cout << "Load vector - unsupported lumop: " << lumop << std::endl;
+          std::abort();
+      }
+      break;
+    }
+    case 0b10: { // strided (stride is read from ireg_file.at(0) at source register 1 - presumably thread 0's copy, confirm; the local rdest below redeclares the outer one with the same value): vlse8.v, vlse16.v, vlse32.v, vlse64.v
+                 // vlsseg2e8.v, vlsseg2e16.v, vlsseg2e32.v, vlsseg2e64.v
+                 // vlsseg3e8.v, vlsseg3e16.v, vlsseg3e32.v, vlsseg3e64.v
+                 // vlsseg4e8.v, vlsseg4e16.v, vlsseg4e32.v, vlsseg4e64.v
+                 // vlsseg5e8.v, vlsseg5e16.v, vlsseg5e32.v, vlsseg5e64.v
+                 // vlsseg6e8.v, vlsseg6e16.v, vlsseg6e32.v, vlsseg6e64.v
+                 // vlsseg7e8.v, vlsseg7e16.v, vlsseg7e32.v, vlsseg7e64.v
+                 // vlsseg8e8.v, vlsseg8e16.v, vlsseg8e32.v, vlsseg8e64.v
+      auto rsrc1  = instr.getRSrc(1);
+      auto rdest  = instr.getRDest();
+      WordI stride = warp.ireg_file.at(0).at(rsrc1);
+      uint32_t nfields = instr.getVnf() + 1;
+      vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask);
+      break;
+    }
+    case 0b01: // indexed - unordered (falls through: handled exactly like the ordered form), vluxei8.v, vluxei16.v, vluxei32.v, vluxei64.v
+               // vluxseg2ei8.v, vluxseg2ei16.v, vluxseg2ei32.v, vluxseg2ei64.v
+               // vluxseg3ei8.v, vluxseg3ei16.v, vluxseg3ei32.v, vluxseg3ei64.v
+               // vluxseg4ei8.v, vluxseg4ei16.v, vluxseg4ei32.v, vluxseg4ei64.v
+               // vluxseg5ei8.v, vluxseg5ei16.v, vluxseg5ei32.v, vluxseg5ei64.v
+               // vluxseg6ei8.v, vluxseg6ei16.v, vluxseg6ei32.v, vluxseg6ei64.v
+               // vluxseg7ei8.v, vluxseg7ei16.v, vluxseg7ei32.v, vluxseg7ei64.v
+               // vluxseg8ei8.v, vluxseg8ei16.v, vluxseg8ei32.v, vluxseg8ei64.v
+    case 0b11: { // indexed - ordered, vloxei8.v, vloxei16.v, vloxei32.v, vloxei64.v
+                 // vloxseg2ei8.v, vloxseg2ei16.v, vloxseg2ei32.v, vloxseg2ei64.v
+                 // vloxseg3ei8.v, vloxseg3ei16.v, vloxseg3ei32.v, vloxseg3ei64.v
+                 // vloxseg4ei8.v, vloxseg4ei16.v, vloxseg4ei32.v, vloxseg4ei64.v
+                 // vloxseg5ei8.v, vloxseg5ei16.v, vloxseg5ei32.v, vloxseg5ei64.v
+                 // vloxseg6ei8.v, vloxseg6ei16.v, vloxseg6ei32.v, vloxseg6ei64.v
+                 // vloxseg7ei8.v, vloxseg7ei16.v, vloxseg7ei32.v, vloxseg7ei64.v
+                 // vloxseg8ei8.v, vloxseg8ei16.v, vloxseg8ei32.v, vloxseg8ei64.v
+      uint32_t nfields = instr.getVnf() + 1;
+      vector_op_vv_load(warp.vreg_file, this, rsdata, instr.getRSrc(1), rdest, warp.vtype.vsew, instr.getVsew(), warp.vl, nfields, warp.vtype.vlmul, vmask);
+      break;
+    }
+    default:
+      std::cout << "Load vector - unsupported mop: " << mop << std::endl;
+      std::abort();
+  }
+}
+
+void Emulator::storeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+  auto &warp = warps_.at(wid);
+  auto vmask  = instr.getVmask();
+  auto mop = instr.getVmop();
+  switch (mop) {
+    case 0b00: { // unit-stride: stride is vtype.vsew / 8 bytes for every sumop below; the data register vs3 is read from source slot 1 here
+      auto vs3  = instr.getRSrc(1);
+      auto sumop  = instr.getVumop();
+      WordI stride = warp.vtype.vsew / 8;
+      switch (sumop) {
+        case 0b0000: { // vse8.v, vse16.v, vse32.v, vse64.v - nfields = nf + 1 is forwarded to the store helper (presumably this also covers the vsseg<nf>e<eew>.v segment forms - confirm)
+          uint32_t nfields = instr.getVnf() + 1;
+          vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask);
+          break;
+        }
+        case 0b1000: { // vs1r.v, vs2r.v, vs4r.v, vs8r.v - whole-register store: nreg = nf + 1 must be 1, 2, 4 or 8; stored as nreg * VLEN / 8 uint8_t elements, ignoring warp.vl
+          uint32_t nreg = instr.getVnf() + 1;
+          if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) {
+            std::cout << "Whole vector register store - reserved value for nreg: " << nreg << std::endl;
+            std::abort();
+          }
+          DP(4, "Whole vector register store with nreg: " << nreg);
+          uint32_t vl = nreg * VLEN / 8;
+          vector_op_vix_store<uint8_t>(warp.vreg_file, this, rsdata, vs3, vl, false, stride, 1, 0, vmask);
+          break;
+        }
+        case 0b1011: { // vsm.v - stores (vl + 7) / 8 elements with the mask argument forced to true; NOTE(review): the V spec gives vsm.v a fixed EEW of 8 regardless of vtype.vsew, so aborting on vsew != 8 looks stricter than required - confirm
+          if (warp.vtype.vsew != 8) {
+            std::cout << "vsm.v only supports EEW=8, but EEW was: " << warp.vtype.vsew << std::endl;
+            std::abort();
+          }
+          vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true);
+          break;
+        }
+        default:
+          std::cout << "Store vector - unsupported sumop: " << sumop << std::endl;
+          std::abort();
+      }
+      break;
+    }
+    case 0b10: { // strided (stride is read from ireg_file.at(0) at source register 1 - presumably thread 0's copy, confirm; vs3 is source slot 2 here, unlike slot 1 in the unit-stride case): vsse8.v, vsse16.v, vsse32.v, vsse64.v
+                 // vssseg2e8.v, vssseg2e16.v, vssseg2e32.v, vssseg2e64.v
+                 // vssseg3e8.v, vssseg3e16.v, vssseg3e32.v, vssseg3e64.v
+                 // vssseg4e8.v, vssseg4e16.v, vssseg4e32.v, vssseg4e64.v
+                 // vssseg5e8.v, vssseg5e16.v, vssseg5e32.v, vssseg5e64.v
+                 // vssseg6e8.v, vssseg6e16.v, vssseg6e32.v, vssseg6e64.v
+                 // vssseg7e8.v, vssseg7e16.v, vssseg7e32.v, vssseg7e64.v
+                 // vssseg8e8.v, vssseg8e16.v, vssseg8e32.v, vssseg8e64.v
+      auto rsrc1  = instr.getRSrc(1);
+      auto vs3  = instr.getRSrc(2);
+      WordI stride = warp.ireg_file.at(0).at(rsrc1);
+      uint32_t nfields = instr.getVnf() + 1;
+      vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask);
+      break;
+    }
+    case 0b01: // indexed - unordered (falls through: handled exactly like the ordered form), vsuxei8.v, vsuxei16.v, vsuxei32.v, vsuxei64.v
+               // vsuxseg2ei8.v, vsuxseg2ei16.v, vsuxseg2ei32.v, vsuxseg2ei64.v
+               // vsuxseg3ei8.v, vsuxseg3ei16.v, vsuxseg3ei32.v, vsuxseg3ei64.v
+               // vsuxseg4ei8.v, vsuxseg4ei16.v, vsuxseg4ei32.v, vsuxseg4ei64.v
+               // vsuxseg5ei8.v, vsuxseg5ei16.v, vsuxseg5ei32.v, vsuxseg5ei64.v
+               // vsuxseg6ei8.v, vsuxseg6ei16.v, vsuxseg6ei32.v, vsuxseg6ei64.v
+               // vsuxseg7ei8.v, vsuxseg7ei16.v, vsuxseg7ei32.v, vsuxseg7ei64.v
+               // vsuxseg8ei8.v, vsuxseg8ei16.v, vsuxseg8ei32.v, vsuxseg8ei64.v
+    case 0b11: { // indexed - ordered (source slot 1 and source slot 2 are both forwarded to the indexed store helper), vsoxei8.v, vsoxei16.v, vsoxei32.v, vsoxei64.v
+                 // vsoxseg2ei8.v, vsoxseg2ei16.v, vsoxseg2ei32.v, vsoxseg2ei64.v
+                 // vsoxseg3ei8.v, vsoxseg3ei16.v, vsoxseg3ei32.v, vsoxseg3ei64.v
+                 // vsoxseg4ei8.v, vsoxseg4ei16.v, vsoxseg4ei32.v, vsoxseg4ei64.v
+                 // vsoxseg5ei8.v, vsoxseg5ei16.v, vsoxseg5ei32.v, vsoxseg5ei64.v
+                 // vsoxseg6ei8.v, vsoxseg6ei16.v, vsoxseg6ei32.v, vsoxseg6ei64.v
+                 // vsoxseg7ei8.v, vsoxseg7ei16.v, vsoxseg7ei32.v, vsoxseg7ei64.v
+                 // vsoxseg8ei8.v, vsoxseg8ei16.v, vsoxseg8ei32.v, vsoxseg8ei64.v
+      uint32_t nfields = instr.getVnf() + 1;
+      vector_op_vv_store(warp.vreg_file, this, rsdata, instr.getRSrc(1), instr.getRSrc(2), warp.vtype.vsew, instr.getVsew(), warp.vl, nfields, warp.vtype.vlmul, vmask);
+      break;
+    }
+    default:
+      std::cout << "Store vector - unsupported mop: " << mop << std::endl;
+      std::abort();
+  }
+}
+
 void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata) {
  auto &warp = warps_.at(wid);
  auto func3  = instr.getFunc3();
@ -2491,10 +2491,10 @@ void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_d
  auto uimmsrc = (Word)instr.getImm();
  auto vmask  = instr.getVmask();
  auto num_threads = arch_.num_threads();
-  
+
    switch (func3) {
    case 0: { // vector - vector
-        switch (func6) { 
+        switch (func6) {
          case 0: { // vadd.vv
            for (uint32_t t = 0; t < num_threads; ++t) {
              if (!warp.tmask.test(t)) continue;
@ -2769,7 +2769,7 @@ void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_d
          default:
            std::cout << "Unrecognised vector - vector instruction func3: " << func3 << " func6: " << func6 << std::endl;
            std::abort();
-        } 
+        }
      } break;
    case 1: { // float vector - vector
        switch (func6) {
@ -2839,7 +2839,7 @@ void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_d
              if (!warp.tmask.test(t)) continue;
              auto &dest = rddata[t].u64;
              vector_op_scalar(dest, warp.vreg_file, rsrc0, rsrc1, warp.vtype.vsew);
-              DP(1, "Moved " << +dest << " from: " << +rsrc1 << " to: " << +rdest);
+              DP(4, "Moved " << +dest << " from: " << +rsrc1 << " to: " << +rdest);
            }
          } break;
          case 18: {
@ -3107,7 +3107,7 @@ void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_d
            if (!warp.tmask.test(t)) continue;
            auto &dest = rddata[t].i;
            vector_op_scalar(dest, warp.vreg_file, rsrc0, rsrc1, warp.vtype.vsew);
-            DP(1, "Moved " << +dest << " from: " << +rsrc1 << " to: " << +rdest);
+            DP(4, "Moved " << +dest << " from: " << +rsrc1 << " to: " << +rdest);
          }
        } break;
        case 18: { // vzext.vf8, vsext.vf8, vzext.vf4, vsext.vf4, vzext.vf2, vsext.vf2
@ -4438,7 +4438,7 @@ void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_d
      uint32_t vsew = instr.getVsew();
      uint32_t vlmul = instr.getVlmul();

-      if(!instr.hasZimm()){ // vsetvl
+      if (!instr.hasZimm()) { // vsetvl
        uint32_t zimm = rsdata[0][1].u;
        vlmul = zimm & mask_v_lmul;
        vsewO = (zimm >> shift_v_sew) & mask_v_sew;
@ -4459,7 +4459,7 @@ void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_d
        s0 = rsdata[0][0].u;
      }

-      DP(1, "Vset(i)vl(i) - vill: " << +warp.vtype.vill << " vma: " << vma << " vta: " << vta << " lmul: " << vlmul << " sew: " << vsew << " s0: " << s0 << " VLMAX: " << warp.VLMAX);
+      DP(4, "Vset(i)vl(i) - vill: " << +warp.vtype.vill << " vma: " << vma << " vta: " << vta << " lmul: " << vlmul << " sew: " << vsew << " s0: " << s0 << " VLMAX: " << warp.VLMAX);
      warp.vl = std::min(s0, warp.VLMAX);

      if (warp.vtype.vill) {
@ -4490,4 +4490,4 @@ void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_d
      std::cout << "Unrecognised vector instruction func3: " << func3 << " func6: " << func6 << std::endl;
      std::abort();
    }
-}
+}
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,8 +17,8 @@

 namespace vortex {

-enum class Opcode {   
-  NONE      = 0,    
+enum class Opcode {
+  NONE      = 0,
  R         = 0x33,
  L         = 0x3,
  I         = 0x13,
@ -38,11 +38,11 @@ enum class Opcode {
  FMADD     = 0x43,
  FMSUB     = 0x47,
  FMNMSUB   = 0x4b,
-  FMNMADD   = 0x4f,  
+  FMNMADD   = 0x4f,
  // RV64 Standard Extension
  R_W       = 0x3b,
  I_W       = 0x1b,
-  // Vector Extension  
+  // Vector Extension
  VSET      = 0x57,
  // Custom Extensions
  EXT1      = 0x0b,
@ -52,37 +52,84 @@ enum class Opcode {
 };

 enum class InstType {
-  R, 
-  I, 
-  S, 
-  B, 
-  U, 
+  R,
+  I,
+  S,
+  B,
+  U,
  J,
  V,
  R4
 };

-enum set_vuse_mask {
-  set_func3 = (1 << 0),
-  set_func6 = (1 << 1),
-  set_imm = (1 << 2),
-  set_vlswidth = (1 << 3),
-  set_vmop = (1 << 4),
-  set_vumop = (1 << 5),
-  set_vnf = (1 << 6),
-  set_vmask = (1 << 7),
-  set_vs3 = (1 << 8),
-  set_zimm = (1 << 9),
-  set_vlmul = (1 << 10),
-  set_vsew = (1 << 11),
-  set_vta = (1 << 12),
-  set_vma = (1 << 13),
-  set_vediv = (1 << 14)
+enum DecodeConstants {
+  width_opcode= 7,
+  width_reg   = 5,
+  width_func2 = 2,
+  width_func3 = 3,
+  width_func6 = 6,
+  width_func7 = 7,
+  width_mop   = 3,
+  width_vmask = 1,
+  width_i_imm = 12,
+  width_j_imm = 20,
+  width_v_zimm = 11,
+  width_v_ma = 1,
+  width_v_ta = 1,
+  width_v_sew = 3,
+  width_v_lmul = 3,
+  width_aq    = 1,
+  width_rl    = 1,
+
+  shift_opcode= 0,
+  shift_rd    = width_opcode,
+  shift_func3 = shift_rd + width_reg,
+  shift_rs1   = shift_func3 + width_func3,
+  shift_rs2   = shift_rs1 + width_reg,
+  shift_func2 = shift_rs2 + width_reg,
+  shift_func7 = shift_rs2 + width_reg,
+  shift_rs3   = shift_func7 + width_func2,
+  shift_vmop  = shift_func7 + width_vmask,
+  shift_vnf   = shift_vmop + width_mop,
+  shift_func6 = shift_func7 + width_vmask,
+  shift_vset  = shift_func7 + width_func6,
+  shift_v_sew = width_v_lmul,
+  shift_v_ta  = shift_v_sew + width_v_sew,
+  shift_v_ma  = shift_v_ta + width_v_ta,
+
+  mask_opcode = (1 << width_opcode) - 1,
+  mask_reg    = (1 << width_reg)   - 1,
+  mask_func2  = (1 << width_func2) - 1,
+  mask_func3  = (1 << width_func3) - 1,
+  mask_func6  = (1 << width_func6) - 1,
+  mask_func7  = (1 << width_func7) - 1,
+  mask_i_imm  = (1 << width_i_imm) - 1,
+  mask_j_imm  = (1 << width_j_imm) - 1,
+  mask_v_zimm = (1 << width_v_zimm) - 1,
+  mask_v_ma   = (1 << width_v_ma) - 1,
+  mask_v_ta   = (1 << width_v_ta) - 1,
+  mask_v_sew  = (1 << width_v_sew) - 1,
+  mask_v_lmul = (1 << width_v_lmul) - 1,
+};
+
+enum VectorAttrMask {
+  vattr_vlswidth = (1 << 3),
+  vattr_vmop     = (1 << 4),
+  vattr_vumop    = (1 << 5),
+  vattr_vnf      = (1 << 6),
+  vattr_vmask    = (1 << 7),
+  vattr_vs3      = (1 << 8),
+  vattr_zimm     = (1 << 9),
+  vattr_vlmul    = (1 << 10),
+  vattr_vsew     = (1 << 11),
+  vattr_vta      = (1 << 12),
+  vattr_vma      = (1 << 13),
+  vattr_vediv    = (1 << 14)
 };

 class Instr {
 public:
-  Instr() 
+  Instr()
    : opcode_(Opcode::NONE)
    , num_rsrcs_(0)
    , has_imm_(false)
@ -105,60 +152,72 @@ public:
    , vta_(0)
    , vma_(0)
    , vediv_(0)
-    , _vusemask(0)
-    , _is_vec(false)   {
+    , vattr_mask_(0) {
    for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
       rsrc_type_[i] = RegType::None;
       rsrc_[i] = 0;
    }
  }

-  void setOpcode(Opcode opcode)  { opcode_ = opcode; }
-  void setDestReg(uint32_t destReg, RegType type) { 
-    rdest_type_ = type; 
-    rdest_ = destReg; 
+  void setOpcode(Opcode opcode) {  // records the decoded opcode; no other state is touched
+    opcode_ = opcode;
  }
-  void addSrcReg(uint32_t srcReg, RegType type) { 
-    rsrc_type_[num_rsrcs_] = type; 
-    rsrc_[num_rsrcs_] = srcReg; 
+
+  void setDestReg(uint32_t destReg, RegType type) {  // sets the destination register index together with its register type
+    rdest_type_ = type;
+    rdest_ = destReg;
+  }
+
+  void addSrcReg(uint32_t srcReg, RegType type) {  // appends a source register in the next free slot and bumps the count
+    rsrc_type_[num_rsrcs_] = type;  // NOTE(review): no check that num_rsrcs_ < MAX_REG_SOURCES; caller must not add more than the arrays hold
+    rsrc_[num_rsrcs_] = srcReg;
    ++num_rsrcs_;
  }
-  void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) { 
-    rsrc_type_[index] = type; 
-    rsrc_[index] = srcReg; 
-    num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1); 
+
+  void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) {  // writes source slot `index` directly instead of appending
+    rsrc_type_[index] = type;  // NOTE(review): index is not range-checked against MAX_REG_SOURCES
+    rsrc_[index] = srcReg;
+    num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1);  // count only grows: it becomes at least index+1, never shrinks
  }
+
+  void setImm(uint32_t imm) { has_imm_ = true; imm_ = imm; }  // stores the immediate and marks it present (hasImm() then returns true)
+
  void setFunc2(uint32_t func2) { func2_ = func2; }
-  void setFunc3(uint32_t func3) { func3_ = func3; _vusemask |= set_func3; }
-  void setFunc6(uint32_t func6) { func6_ = func6; _vusemask |= set_func6; }
+  void setFunc3(uint32_t func3) { func3_ = func3; }  // plain store; does not modify vattr_mask_
+  void setFunc6(uint32_t func6) { func6_ = func6; }  // plain store; does not modify vattr_mask_
  void setFunc7(uint32_t func7) { func7_ = func7; }
-  void setImm(uint32_t imm) { has_imm_ = true; imm_ = imm; _vusemask |= set_imm; }
-  void setVlsWidth(uint32_t width) { vlsWidth_ = width; _vusemask |= set_vlswidth; }
-  void setVmop(uint32_t mop) { vMop_ = mop; _vusemask |= set_vmop; }
-  void setVumop(uint32_t umop) { vUmop_ = umop; _vusemask |= set_vumop; }
-  void setVnf(uint32_t nf) { vNf_ = nf; _vusemask |= set_vnf; }
-  void setVmask(uint32_t mask) { vmask_ = mask; _vusemask |= set_vmask; }
-  void setVs3(uint32_t vs) { vs3_ = vs; _vusemask |= set_vs3; }
-  void setZimm(bool has_zimm) { has_zimm_ = has_zimm; _vusemask |= set_zimm; }
-  void setVlmul(uint32_t lmul) { vlmul_ = lmul; _vusemask |= set_vlmul; }
-  void setVsew(uint32_t sew) { vsew_ = sew; _vusemask |= set_vsew; }
-  void setVta(uint32_t vta) { vta_ = vta; _vusemask |= set_vta; }
-  void setVma(uint32_t vma) { vma_ = vma; _vusemask |= set_vma; }
-  void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; _vusemask |= set_vediv; }
-  void setVec(bool is_vec) { _is_vec = is_vec; }
+
+  // Attributes for Vector instructions: each setter stores its value and sets the matching vattr_* bit in vattr_mask_
+  void setVlsWidth(uint32_t width) { vlsWidth_ = width; vattr_mask_ |= vattr_vlswidth; }
+  void setVmop(uint32_t mop) { vMop_ = mop; vattr_mask_ |= vattr_vmop; }
+  void setVumop(uint32_t umop) { vUmop_ = umop; vattr_mask_ |= vattr_vumop; }
+  void setVnf(uint32_t nf) { vNf_ = nf; vattr_mask_ |= vattr_vnf; }
+  void setVmask(uint32_t mask) { vmask_ = mask; vattr_mask_ |= vattr_vmask; }
+  void setVs3(uint32_t vs) { vs3_ = vs; vattr_mask_ |= vattr_vs3; }
+  void setZimm(bool has_zimm) { has_zimm_ = has_zimm; vattr_mask_ |= vattr_zimm; }  // the mask bit is set even when has_zimm is false
+  void setVlmul(uint32_t lmul) { vlmul_ = lmul; vattr_mask_ |= vattr_vlmul; }
+  void setVsew(uint32_t sew) { vsew_ = sew; vattr_mask_ |= vattr_vsew; }
+  void setVta(uint32_t vta) { vta_ = vta; vattr_mask_ |= vattr_vta; }
+  void setVma(uint32_t vma) { vma_ = vma; vattr_mask_ |= vattr_vma; }
+  void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; vattr_mask_ |= vattr_vediv; }  // stores 2^ediv, not ediv itself; NOTE(review): the shift is on a signed int, so ediv must stay below 31

  Opcode   getOpcode() const { return opcode_; }
+
+  uint32_t getNRSrc() const { return num_rsrcs_; }  // number of source register slots in use
+  uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }  // NOTE(review): i is not range-checked
+  RegType  getRSType(uint32_t i) const { return rsrc_type_[i]; }  // NOTE(review): i is not range-checked
+
+  uint32_t getRDest() const { return rdest_; }  // destination register index as passed to setDestReg
+  RegType  getRDType() const { return rdest_type_; }  // destination register type as passed to setDestReg
+
+  bool     hasImm() const { return has_imm_; }  // true once setImm has been called
+  uint32_t getImm() const { return imm_; }  // value last passed to setImm
+
  uint32_t getFunc2() const { return func2_; }
  uint32_t getFunc3() const { return func3_; }
  uint32_t getFunc6() const { return func6_; }
  uint32_t getFunc7() const { return func7_; }
-  uint32_t getNRSrc() const { return num_rsrcs_; }
-  uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
-  RegType  getRSType(uint32_t i) const { return rsrc_type_[i]; }
-  uint32_t getRDest() const { return rdest_; }  
-  RegType  getRDType() const { return rdest_type_; }  
-  bool     hasImm() const { return has_imm_; }
-  uint32_t getImm() const { return imm_; }
+
  uint32_t getVlsWidth() const { return vlsWidth_; }
  uint32_t getVmop() const { return vMop_; }
  uint32_t getVumop() const { return vUmop_; }
@ -172,8 +231,7 @@ public:
  uint32_t getVta() const { return vta_; }
  uint32_t getVma() const { return vma_; }
  uint32_t getVediv() const { return vediv_; }
-  uint32_t getVUseMask() const { return _vusemask; }
-  bool     isVec() const { return _is_vec; }
+  uint32_t getVattrMask() const { return vattr_mask_; }  // OR of the vattr_* bits for every vector attribute setter called so far

 private:

@ -187,7 +245,7 @@ private:
  RegType rdest_type_;
  uint32_t imm_;
  RegType rsrc_type_[MAX_REG_SOURCES];
-  uint32_t rsrc_[MAX_REG_SOURCES];  
+  uint32_t rsrc_[MAX_REG_SOURCES];
  uint32_t rdest_;
  uint32_t func2_;
  uint32_t func3_;
@ -207,8 +265,7 @@ private:
  uint32_t vta_;
  uint32_t vma_;
  uint32_t vediv_;
-  uint32_t _vusemask;
-  bool     _is_vec;
+  uint32_t vattr_mask_;  // bitwise OR of VectorAttrMask flags; starts at 0 and is only ever OR'ed into by the vector attribute setters

  friend std::ostream &operator<<(std::ostream &, const Instr&);
 };