Adds the riscv vector extension into simx

This commit is contained in:
MichaelJSr 2024-11-26 18:41:01 -08:00
parent 8230b37411
commit 1e4583ac17
22 changed files with 5717 additions and 105 deletions

View file

@ -386,10 +386,20 @@ synthesis()
echo "synthesis tests done!"
}
# Runs the RISC-V vector regression: (re)builds the simx simulator, then
# launches the riscv-vector-tests runner with VLEN fixed at 256 and
# REG_TESTS enabled. @TOOLDIR@ / @XLEN@ look like template placeholders that
# are presumably substituted when this script is generated -- confirm.
vector()
{
    printf '%s\n' "begin vector tests..."

    # Build the simulator the vector tests execute on.
    make -C sim/simx

    # Environment is passed per-invocation so it does not leak into later tests.
    TOOLDIR=@TOOLDIR@ XLEN=@XLEN@ VLEN=256 REG_TESTS=1 \
        ./tests/riscv/riscv-vector-tests/run-test.sh

    printf '%s\n' "vector tests done!"
}
# Prints the banner and the command-line synopsis of the regression script.
# Only the current synopsis (the one that lists --vector) is printed; the
# older line without --vector was stale and produced a duplicate "Usage:".
show_usage()
{
    echo "Vortex Regression Test"
    echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--all] [--h|--help]"
}
declare -a tests=()
@ -439,6 +449,9 @@ while [ "$1" != "" ]; do
--synthesis )
tests+=("synthesis")
;;
--vector )
tests+=("vector")
;;
--all )
tests=()
tests+=("unittest")
@ -454,6 +467,7 @@ while [ "$1" != "" ]; do
tests+=("scope")
tests+=("stress")
tests+=("synthesis")
tests+=("vector")
;;
-h | --help )
show_usage

View file

@ -87,6 +87,10 @@
`endif
`endif
`ifndef VLEN
`define VLEN 256
`endif
`ifndef NUM_CLUSTERS
`define NUM_CLUSTERS 1
`endif

View file

@ -188,6 +188,19 @@
`define VX_CSR_MIMPID 12'hF13
`define VX_CSR_MHARTID 12'hF14
// Vector CSRs
`define VX_CSR_VSTART 12'h008
`define VX_CSR_VXSAT 12'h009
`define VX_CSR_VXRM 12'h00A
`define VX_CSR_VCSR 12'h00F
`define VX_CSR_VL 12'hC20
`define VX_CSR_VTYPE 12'hC21
`define VX_CSR_VLENB 12'hC22
`define VX_CSR_VCYCLE 12'hC00
`define VX_CSR_VTIME 12'hC01
`define VX_CSR_VINSTRET 12'hC02
// GPGPU CSRs
`define VX_CSR_THREAD_ID 12'hCC0

View file

@ -1,3 +1,3 @@
CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1
running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 make -C ./ci/../driver/rtlsim
verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit -I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so
verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit -I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/softfloat_ext.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so

View file

@ -12,6 +12,7 @@
// limitations under the License.
#include "rvfloats.h"
#include "softfloat_ext.h"
#include <stdio.h>
extern "C" {
@ -158,6 +159,34 @@ uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
return from_float64_t(r);
}
uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
auto r = f32_recip7(to_float32_t(a));
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
auto r = f64_recip7(to_float64_t(a));
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
auto r = f32_rsqrte7(to_float32_t(a));
if (fflags) { *fflags =softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
auto r = f64_rsqrte7(to_float64_t(a));
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
rv_init(frm);
auto r = f32_sqrt(to_float32_t(a));
@ -486,6 +515,11 @@ uint64_t rv_fsgnjx_d(uint64_t a, uint64_t b) {
return r;
}
// Variant of rv_dtof() that first runs the rv_init() prologue with the
// caller-supplied rounding mode, then narrows the binary64 bit pattern `a`
// to a binary32 bit pattern.
uint32_t rv_dtof_r(uint64_t a, uint32_t frm) {
  rv_init(frm);
  const uint32_t narrowed = rv_dtof(a);
  return narrowed;
}
uint32_t rv_dtof(uint64_t a) {
auto r = f64_to_f32(to_float64_t(a));
return from_float32_t(r);

View file

@ -28,6 +28,8 @@ uint32_t rv_fnmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t*
uint32_t rv_fnmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags);
uint32_t rv_fdiv_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags);
uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags);
uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags);
uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags);
uint32_t rv_ftoi_s(uint32_t a, uint32_t frm, uint32_t* fflags);
uint32_t rv_ftou_s(uint32_t a, uint32_t frm, uint32_t* fflags);
@ -58,6 +60,8 @@ uint64_t rv_fsub_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags);
uint64_t rv_fmul_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags);
uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags);
uint64_t rv_fsqrt_d(uint64_t a, uint32_t frm, uint32_t* fflags);
uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags);
uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags);
uint64_t rv_fmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags);
uint64_t rv_fmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags);
@ -85,6 +89,7 @@ uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags);
uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags);
uint32_t rv_dtof(uint64_t a);
uint32_t rv_dtof_r(uint64_t a, uint32_t frm);
uint64_t rv_ftod(uint32_t a);
#ifdef __cplusplus

View file

@ -0,0 +1,486 @@
/*============================================================================
This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
Package, Release 3e, by John R. Hauser.
Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of
California. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions, and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions, and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the University nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
=============================================================================*/
#include <assert.h>
#include <stdbool.h>
#include <internals.h>
#include <../RISCV/specialize.h>
#include <softfloat.h>
#include "softfloat_ext.h"
// Classifies a half-precision value and returns a bit mask:
//   bit 0: -inf        bit 1: -normal     bit 2: -subnormal  bit 3: -0
//   bit 4: +0          bit 5: +subnormal  bit 6: +normal     bit 7: +inf
//   bit 8: signaling NaN                  bit 9: quiet NaN
uint_fast16_t f16_classify( float16_t a )
{
    union ui16_f16 raw;
    raw.f = a;
    const uint_fast16_t bits = raw.ui;

    const bool negative = signF16UI( bits );
    const bool expAllOnes = expF16UI( bits ) == 0x1F;
    const bool expZero = expF16UI( bits ) == 0;
    const bool fracZero = fracF16UI( bits ) == 0;
    const bool isNaN = isNaNF16UI( bits );
    const bool isSNaN = softfloat_isSigNaNF16UI( bits );

    uint_fast16_t cls = 0;

    // The exponent field selects the family; the two tests on it are
    // mutually exclusive, so at most one of bits 0..7 is produced.
    if ( expAllOnes ) {
        if ( fracZero ) {
            cls |= negative ? 0x001 : 0x080;    // infinity
        }
    } else if ( expZero ) {
        if ( fracZero ) {
            cls |= negative ? 0x008 : 0x010;    // zero
        } else {
            cls |= negative ? 0x004 : 0x020;    // subnormal
        }
    } else {
        cls |= negative ? 0x002 : 0x040;        // normal
    }

    // NaN bits are derived independently from the SoftFloat NaN predicates.
    if ( isNaN ) {
        cls |= isSNaN ? 0x100 : 0x200;
    }
    return cls;
}
// Classifies a single-precision value and returns a bit mask:
//   bit 0: -inf        bit 1: -normal     bit 2: -subnormal  bit 3: -0
//   bit 4: +0          bit 5: +subnormal  bit 6: +normal     bit 7: +inf
//   bit 8: signaling NaN                  bit 9: quiet NaN
uint_fast16_t f32_classify( float32_t a )
{
    union ui32_f32 raw;
    raw.f = a;
    const uint_fast32_t bits = raw.ui;

    const bool negative = signF32UI( bits );
    const bool expAllOnes = expF32UI( bits ) == 0xFF;
    const bool expZero = expF32UI( bits ) == 0;
    const bool fracZero = fracF32UI( bits ) == 0;
    const bool isNaN = isNaNF32UI( bits );
    const bool isSNaN = softfloat_isSigNaNF32UI( bits );

    uint_fast16_t cls = 0;

    // The exponent field selects the family; the two tests on it are
    // mutually exclusive, so at most one of bits 0..7 is produced.
    if ( expAllOnes ) {
        if ( fracZero ) {
            cls |= negative ? 0x001 : 0x080;    // infinity
        }
    } else if ( expZero ) {
        if ( fracZero ) {
            cls |= negative ? 0x008 : 0x010;    // zero
        } else {
            cls |= negative ? 0x004 : 0x020;    // subnormal
        }
    } else {
        cls |= negative ? 0x002 : 0x040;        // normal
    }

    // NaN bits are derived independently from the SoftFloat NaN predicates.
    if ( isNaN ) {
        cls |= isSNaN ? 0x100 : 0x200;
    }
    return cls;
}
// Classifies a double-precision value and returns a bit mask:
//   bit 0: -inf        bit 1: -normal     bit 2: -subnormal  bit 3: -0
//   bit 4: +0          bit 5: +subnormal  bit 6: +normal     bit 7: +inf
//   bit 8: signaling NaN                  bit 9: quiet NaN
uint_fast16_t f64_classify( float64_t a )
{
    union ui64_f64 raw;
    raw.f = a;
    const uint_fast64_t bits = raw.ui;

    const bool negative = signF64UI( bits );
    const bool expAllOnes = expF64UI( bits ) == 0x7FF;
    const bool expZero = expF64UI( bits ) == 0;
    const bool fracZero = fracF64UI( bits ) == 0;
    const bool isNaN = isNaNF64UI( bits );
    const bool isSNaN = softfloat_isSigNaNF64UI( bits );

    uint_fast16_t cls = 0;

    // The exponent field selects the family; the two tests on it are
    // mutually exclusive, so at most one of bits 0..7 is produced.
    if ( expAllOnes ) {
        if ( fracZero ) {
            cls |= negative ? 0x001 : 0x080;    // infinity
        }
    } else if ( expZero ) {
        if ( fracZero ) {
            cls |= negative ? 0x008 : 0x010;    // zero
        } else {
            cls |= negative ? 0x004 : 0x020;    // subnormal
        }
    } else {
        cls |= negative ? 0x002 : 0x040;        // normal
    }

    // NaN bits are derived independently from the SoftFloat NaN predicates.
    if ( isNaN ) {
        cls |= isSNaN ? 0x100 : 0x200;
    }
    return cls;
}
// Returns the `len`-bit field of `val` whose least-significant bit is at
// position `pos`, right-aligned in the result.
// Requires pos >= 0, len > 0 and pos + len <= 64 (checked by assert).
static inline uint64_t extract64(uint64_t val, int pos, int len)
{
    assert(pos >= 0 && len > 0 && len <= 64 - pos);
    // `len` low bits set; len >= 1 keeps the shift count below 64.
    const uint64_t field_mask = ~UINT64_C(0) >> (64 - len);
    const uint64_t shifted = val >> pos;
    return shifted & field_mask;
}
// Builds a 64-bit mask with `len` consecutive one bits starting at bit
// `pos`; bits that would land above bit 63 are shifted out.
// Requires 0 <= pos < 64 and 0 < len <= 64 (checked by assert).
static inline uint64_t make_mask64(int pos, int len)
{
    assert(pos >= 0 && len > 0 && pos < 64 && len <= 64);
    const uint64_t ones = UINT64_MAX >> (64 - len);
    return ones << pos;
}
// Table-driven reciprocal square-root estimate on a raw floating-point bit
// pattern.
//   val - bit pattern: sign at bit s+e, exponent in bits [s, s+e),
//         significand in bits [0, s)
//   e   - exponent field width in bits
//   s   - significand field width in bits
//   sub - true when `val` is subnormal; it is normalized before the lookup
// Only positive finite non-zero inputs are passed in by the fXX_rsqrte7
// wrappers in this file; the special cases are handled there.
// The result is assembled in 64 bits: the caller needs to truncate the
// output to the required length.
static inline uint64_t rsqrte7(uint64_t val, int e, int s, bool sub) {
uint64_t exp = extract64(val, s, e);
uint64_t sig = extract64(val, 0, s);
uint64_t sign = extract64(val, s + e, 1);
// Index width of the lookup table (2^7 = 128 entries).
const int p = 7;
// Output significand estimates, indexed by {exp[0], top 6 significand bits}.
static const uint8_t table[] = {
52, 51, 50, 48, 47, 46, 44, 43,
42, 41, 40, 39, 38, 36, 35, 34,
33, 32, 31, 30, 30, 29, 28, 27,
26, 25, 24, 23, 23, 22, 21, 20,
19, 19, 18, 17, 16, 16, 15, 14,
14, 13, 12, 12, 11, 10, 10, 9,
9, 8, 7, 7, 6, 6, 5, 4,
4, 3, 3, 2, 2, 1, 1, 0,
127, 125, 123, 121, 119, 118, 116, 114,
113, 111, 109, 108, 106, 105, 103, 102,
100, 99, 97, 96, 95, 93, 92, 91,
90, 88, 87, 86, 85, 84, 83, 82,
80, 79, 78, 77, 76, 75, 74, 73,
72, 71, 70, 70, 69, 68, 67, 66,
65, 64, 63, 63, 62, 61, 60, 59,
59, 58, 57, 56, 56, 55, 54, 53};
if (sub) {
// Normalize: shift left until the top significand bit is set, decrementing
// the exponent for each step. `exp` is unsigned, so it wraps below zero;
// the arithmetic further down works modulo 2^64.
while (extract64(sig, s - 1, 1) == 0)
exp--, sig <<= 1;
// One more shift drops the now-explicit leading one; keep only s bits.
sig = (sig << 1) & make_mask64(0 ,s);
}
// 7-bit table index: exponent LSB in bit 6, top 6 significand bits below it.
int idx = ((exp & 1) << (p-1)) | (sig >> (s-p+1));
// Place the 7-bit estimate in the top bits of the significand field.
uint64_t out_sig = (uint64_t)(table[idx]) << (s-p);
// make_mask64(0, e - 1) is 2^(e-1) - 1 and ~exp is -exp - 1 (mod 2^64),
// so this evaluates (3 * (2^(e-1) - 1) - 1 - exp) / 2.
uint64_t out_exp = (3 * make_mask64(0, e - 1) + ~exp) / 2;
return (sign << (s+e)) | (out_exp << s) | out_sig;
}
// Half-precision reciprocal square-root estimate.
// Dispatches on f16_classify(): negative non-zero inputs and signaling NaNs
// raise the invalid flag and yield the default NaN, quiet NaNs yield the
// default NaN, zeros yield a like-signed infinity and raise the infinite
// flag, +inf yields +0, and every other input goes through rsqrte7().
float16_t f16_rsqrte7(float16_t in)
{
    union ui16_f16 bits;
    bits.f = in;
    const unsigned int cls = f16_classify(in);

    if (cls == 0x200) {
        // quiet NaN: no flag raised
        bits.ui = defaultNaNF16UI;
    } else if (cls == 0x001 || cls == 0x002 || cls == 0x004 || cls == 0x100) {
        // -inf, -normal, -subnormal, signaling NaN
        softfloat_exceptionFlags |= softfloat_flag_invalid;
        bits.ui = defaultNaNF16UI;
    } else if (cls == 0x008) {
        // -0 -> -inf
        bits.ui = 0xfc00;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x010) {
        // +0 -> +inf
        bits.ui = 0x7c00;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x080) {
        // +inf -> +0
        bits.ui = 0x0;
    } else {
        // Remaining inputs; 0x020 marks a positive subnormal.
        const bool subnormal = (cls == 0x020);
        bits.ui = rsqrte7(bits.ui, 5, 10, subnormal);
    }

    return bits.f;
}
// Single-precision reciprocal square-root estimate.
// Dispatches on f32_classify(): negative non-zero inputs and signaling NaNs
// raise the invalid flag and yield the default NaN, quiet NaNs yield the
// default NaN, zeros yield a like-signed infinity and raise the infinite
// flag, +inf yields +0, and every other input goes through rsqrte7().
float32_t f32_rsqrte7(float32_t in)
{
    union ui32_f32 bits;
    bits.f = in;
    const unsigned int cls = f32_classify(in);

    if (cls == 0x200) {
        // quiet NaN: no flag raised
        bits.ui = defaultNaNF32UI;
    } else if (cls == 0x001 || cls == 0x002 || cls == 0x004 || cls == 0x100) {
        // -inf, -normal, -subnormal, signaling NaN
        softfloat_exceptionFlags |= softfloat_flag_invalid;
        bits.ui = defaultNaNF32UI;
    } else if (cls == 0x008) {
        // -0 -> -inf
        bits.ui = 0xff800000;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x010) {
        // +0 -> +inf
        bits.ui = 0x7f800000;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x080) {
        // +inf -> +0
        bits.ui = 0x0;
    } else {
        // Remaining inputs; 0x020 marks a positive subnormal.
        const bool subnormal = (cls == 0x020);
        bits.ui = rsqrte7(bits.ui, 8, 23, subnormal);
    }

    return bits.f;
}
// Double-precision reciprocal square-root estimate.
// Dispatches on f64_classify(): negative non-zero inputs and signaling NaNs
// raise the invalid flag and yield the default NaN, quiet NaNs yield the
// default NaN, zeros yield a like-signed infinity and raise the infinite
// flag, +inf yields +0, and every other input goes through rsqrte7().
float64_t f64_rsqrte7(float64_t in)
{
    union ui64_f64 bits;
    bits.f = in;
    const unsigned int cls = f64_classify(in);

    if (cls == 0x200) {
        // quiet NaN: no flag raised
        bits.ui = defaultNaNF64UI;
    } else if (cls == 0x001 || cls == 0x002 || cls == 0x004 || cls == 0x100) {
        // -inf, -normal, -subnormal, signaling NaN
        softfloat_exceptionFlags |= softfloat_flag_invalid;
        bits.ui = defaultNaNF64UI;
    } else if (cls == 0x008) {
        // -0 -> -inf
        bits.ui = 0xfff0000000000000ul;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x010) {
        // +0 -> +inf
        bits.ui = 0x7ff0000000000000ul;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x080) {
        // +inf -> +0
        bits.ui = 0x0;
    } else {
        // Remaining inputs; 0x020 marks a positive subnormal.
        const bool subnormal = (cls == 0x020);
        bits.ui = rsqrte7(bits.ui, 11, 52, subnormal);
    }

    return bits.f;
}
// Table-driven reciprocal estimate on a raw floating-point bit pattern.
//   val            - bit pattern: sign at bit s+e, exponent in bits
//                    [s, s+e), significand in bits [0, s)
//   e              - exponent field width in bits
//   s              - significand field width in bits
//   rm             - rounding mode; only consulted on the overflow path below
//   sub            - true when `val` is subnormal
//   round_abnormal - set to true when the overflow path is taken (never
//                    cleared here, so the caller must initialize it)
// Zeros, infinities and NaNs are handled by the fXX_recip7 wrappers in this
// file before this helper is reached.
// The result is assembled in 64 bits: the caller needs to truncate the
// output to the required length.
static inline uint64_t recip7(uint64_t val, int e, int s, int rm, bool sub,
bool *round_abnormal)
{
uint64_t exp = extract64(val, s, e);
uint64_t sig = extract64(val, 0, s);
uint64_t sign = extract64(val, s + e, 1);
// Index width of the lookup table (2^7 = 128 entries).
const int p = 7;
// Output significand estimates, indexed by the top 7 significand bits.
static const uint8_t table[] = {
127, 125, 123, 121, 119, 117, 116, 114,
112, 110, 109, 107, 105, 104, 102, 100,
99, 97, 96, 94, 93, 91, 90, 88,
87, 85, 84, 83, 81, 80, 79, 77,
76, 75, 74, 72, 71, 70, 69, 68,
66, 65, 64, 63, 62, 61, 60, 59,
58, 57, 56, 55, 54, 53, 52, 51,
50, 49, 48, 47, 46, 45, 44, 43,
42, 41, 40, 40, 39, 38, 37, 36,
35, 35, 34, 33, 32, 31, 31, 30,
29, 28, 28, 27, 26, 25, 25, 24,
23, 23, 22, 21, 21, 20, 19, 19,
18, 17, 17, 16, 15, 15, 14, 14,
13, 12, 12, 11, 11, 10, 9, 9,
8, 8, 7, 7, 6, 5, 5, 4,
4, 3, 3, 2, 2, 1, 1, 0};
if (sub) {
// Normalize: shift left until the top significand bit is set, decrementing
// the exponent for each step. `exp` is unsigned, so going below zero wraps
// (one step below zero is UINT64_MAX).
while (extract64(sig, s - 1, 1) == 0)
exp--, sig <<= 1;
// One more shift drops the now-explicit leading one; keep only s bits.
sig = (sig << 1) & make_mask64(0 ,s);
// A normalized exponent other than 0 or -1 means more than one leading
// zero was shifted out; the estimate is reported as an overflow.
if (exp != 0 && exp != UINT64_MAX) {
*round_abnormal = true;
// For rm 1, rm 2 with a positive input, or rm 3 with a negative input,
// return the pattern just below infinity (largest finite magnitude);
// otherwise return infinity (all-ones exponent, zero significand).
// NOTE(review): rm values presumably follow the RISC-V frm encoding
// (1 = RTZ, 2 = RDN, 3 = RUP) -- confirm.
if (rm == 1 ||
(rm == 2 && !sign) ||
(rm == 3 && sign))
return ((sign << (s+e)) | make_mask64(s, e)) - 1;
else
return (sign << (s+e)) | make_mask64(s, e);
}
}
// 7-bit table index: the top 7 bits of the significand.
int idx = sig >> (s-p);
// Place the 7-bit estimate in the top bits of the significand field.
uint64_t out_sig = (uint64_t)(table[idx]) << (s-p);
// make_mask64(0, e - 1) is 2^(e-1) - 1 and ~exp is -exp - 1 (mod 2^64),
// so this evaluates 2 * (2^(e-1) - 1) - 1 - exp.
uint64_t out_exp = 2 * make_mask64(0, e - 1) + ~exp;
// An output exponent of 0 or -1 cannot be encoded as a normal number: emit
// a subnormal by shifting the significand right with the leading one made
// explicit (one extra shift, and exponent forced to 0, for the -1 case).
if (out_exp == 0 || out_exp == UINT64_MAX) {
out_sig = (out_sig >> 1) | make_mask64(s - 1, 1);
if (out_exp == UINT64_MAX) {
out_sig >>= 1;
out_exp = 0;
}
}
return (sign << (s+e)) | (out_exp << s) | out_sig;
}
// Half-precision reciprocal estimate.
// Dispatches on f16_classify(): infinities yield a like-signed zero, zeros
// yield a like-signed infinity and raise the infinite flag, signaling NaNs
// raise the invalid flag, any NaN yields the default NaN, and every other
// input goes through recip7() using the current softfloat_roundingMode.
float16_t f16_recip7(float16_t in)
{
    union ui16_f16 bits;
    bits.f = in;
    const unsigned int cls = f16_classify(in);

    if (cls == 0x001) {
        // -inf -> -0
        bits.ui = 0x8000;
    } else if (cls == 0x080) {
        // +inf -> +0
        bits.ui = 0x0;
    } else if (cls == 0x008) {
        // -0 -> -inf
        bits.ui = 0xfc00;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x010) {
        // +0 -> +inf
        bits.ui = 0x7c00;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x100) {
        // signaling NaN
        softfloat_exceptionFlags |= softfloat_flag_invalid;
        bits.ui = defaultNaNF16UI;
    } else if (cls == 0x200) {
        // quiet NaN
        bits.ui = defaultNaNF16UI;
    } else {
        // Remaining inputs; 0x004 / 0x020 mark -/+ subnormals.
        const bool subnormal = (cls == 0x004 || cls == 0x020);
        bool round_abnormal = false;
        bits.ui = recip7(bits.ui, 5, 10,
                         softfloat_roundingMode, subnormal, &round_abnormal);
        if (round_abnormal) {
            softfloat_exceptionFlags |= softfloat_flag_inexact |
                                        softfloat_flag_overflow;
        }
    }

    return bits.f;
}
// Single-precision reciprocal estimate.
// Dispatches on f32_classify(): infinities yield a like-signed zero, zeros
// yield a like-signed infinity and raise the infinite flag, signaling NaNs
// raise the invalid flag, any NaN yields the default NaN, and every other
// input goes through recip7() using the current softfloat_roundingMode.
float32_t f32_recip7(float32_t in)
{
    union ui32_f32 bits;
    bits.f = in;
    const unsigned int cls = f32_classify(in);

    if (cls == 0x001) {
        // -inf -> -0
        bits.ui = 0x80000000;
    } else if (cls == 0x080) {
        // +inf -> +0
        bits.ui = 0x0;
    } else if (cls == 0x008) {
        // -0 -> -inf
        bits.ui = 0xff800000;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x010) {
        // +0 -> +inf
        bits.ui = 0x7f800000;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x100) {
        // signaling NaN
        softfloat_exceptionFlags |= softfloat_flag_invalid;
        bits.ui = defaultNaNF32UI;
    } else if (cls == 0x200) {
        // quiet NaN
        bits.ui = defaultNaNF32UI;
    } else {
        // Remaining inputs; 0x004 / 0x020 mark -/+ subnormals.
        const bool subnormal = (cls == 0x004 || cls == 0x020);
        bool round_abnormal = false;
        bits.ui = recip7(bits.ui, 8, 23,
                         softfloat_roundingMode, subnormal, &round_abnormal);
        if (round_abnormal) {
            softfloat_exceptionFlags |= softfloat_flag_inexact |
                                        softfloat_flag_overflow;
        }
    }

    return bits.f;
}
// Double-precision reciprocal estimate.
// Dispatches on f64_classify(): infinities yield a like-signed zero, zeros
// yield a like-signed infinity and raise the infinite flag, signaling NaNs
// raise the invalid flag, any NaN yields the default NaN, and every other
// input goes through recip7() using the current softfloat_roundingMode.
float64_t f64_recip7(float64_t in)
{
    union ui64_f64 bits;
    bits.f = in;
    const unsigned int cls = f64_classify(in);

    if (cls == 0x001) {
        // -inf -> -0
        bits.ui = 0x8000000000000000;
    } else if (cls == 0x080) {
        // +inf -> +0
        bits.ui = 0x0;
    } else if (cls == 0x008) {
        // -0 -> -inf
        bits.ui = 0xfff0000000000000;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x010) {
        // +0 -> +inf
        bits.ui = 0x7ff0000000000000;
        softfloat_exceptionFlags |= softfloat_flag_infinite;
    } else if (cls == 0x100) {
        // signaling NaN
        softfloat_exceptionFlags |= softfloat_flag_invalid;
        bits.ui = defaultNaNF64UI;
    } else if (cls == 0x200) {
        // quiet NaN
        bits.ui = defaultNaNF64UI;
    } else {
        // Remaining inputs; 0x004 / 0x020 mark -/+ subnormals.
        const bool subnormal = (cls == 0x004 || cls == 0x020);
        bool round_abnormal = false;
        bits.ui = recip7(bits.ui, 11, 52,
                         softfloat_roundingMode, subnormal, &round_abnormal);
        if (round_abnormal) {
            softfloat_exceptionFlags |= softfloat_flag_inexact |
                                        softfloat_flag_overflow;
        }
    }

    return bits.f;
}

View file

@ -0,0 +1,14 @@
#include <stdint.h>
#include <softfloat_types.h>
// SoftFloat extensions, one set per precision (half / single / double):
//   fNN_classify - returns a class bit mask for the operand
//                  (bits 0..7: -inf, -normal, -subnormal, -0, +0,
//                   +subnormal, +normal, +inf; bit 8: sNaN; bit 9: qNaN)
//   fNN_rsqrte7  - table-driven reciprocal square-root estimate
//   fNN_recip7   - table-driven reciprocal estimate
// The estimate functions report exceptions through softfloat_exceptionFlags.

// Half precision.
uint_fast16_t f16_classify( float16_t );
float16_t f16_rsqrte7( float16_t );
float16_t f16_recip7( float16_t );
// Single precision.
uint_fast16_t f32_classify( float32_t );
float32_t f32_rsqrte7( float32_t );
float32_t f32_recip7( float32_t );
// Double precision.
uint_fast16_t f64_classify( float64_t );
float64_t f64_rsqrte7( float64_t );
float64_t f64_recip7( float64_t );

View file

@ -51,7 +51,7 @@ endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += $(SRC_DIR)/fpga.cpp $(SRC_DIR)/opae_sim.cpp

View file

@ -35,7 +35,7 @@ ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += $(SRC_DIR)/processor.cpp

View file

@ -17,8 +17,8 @@ CXXFLAGS += $(CONFIGS)
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp
# Shared sources; softfloat_ext.cpp is listed once (a duplicate entry is at
# best redundant and can lead to duplicate-symbol link errors).
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/execute_vector.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp
# Debugging
ifdef DEBUG

View file

@ -29,6 +29,7 @@ private:
uint16_t num_cores_;
uint16_t num_clusters_;
uint16_t socket_size_;
uint16_t vsize_;
uint16_t num_barriers_;
uint64_t local_mem_base_;
@ -39,6 +40,7 @@ public:
, num_cores_(num_cores)
, num_clusters_(NUM_CLUSTERS)
, socket_size_(SOCKET_SIZE)
, vsize_(VLEN / 8)
, num_barriers_(NUM_BARRIERS)
, local_mem_base_(LMEM_BASE_ADDR)
{}
@ -71,6 +73,10 @@ public:
return socket_size_;
}
uint16_t vsize() const {
return vsize_;
}
};
}

View file

@ -47,6 +47,7 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
{Opcode::FMSUB, InstType::R4},
{Opcode::FMNMADD, InstType::R4},
{Opcode::FMNMSUB, InstType::R4},
{Opcode::VSET, InstType::V},
{Opcode::EXT1, InstType::R},
{Opcode::EXT2, InstType::R4},
{Opcode::R_W, InstType::R},
@ -54,33 +55,6 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
{Opcode::TCU, InstType::I},
};
// Bit-field layout constants used to slice a 32-bit instruction word:
// width_* is a field's size in bits, shift_* is the bit position of its
// least-significant bit, and mask_* is the matching right-aligned mask.
enum Constants {
// Field widths.
width_opcode= 7,
width_reg = 5,
width_func2 = 2,
width_func3 = 3,
width_func7 = 7,
width_i_imm = 12,
width_j_imm = 20,
// Field positions, chained from bit 0 upwards:
// opcode @0, rd @7, func3 @12, rs1 @15, rs2 @20.
shift_opcode= 0,
shift_rd = width_opcode,
shift_func3 = shift_rd + width_reg,
shift_rs1 = shift_func3 + width_func3,
shift_rs2 = shift_rs1 + width_reg,
// func2 and func7 both start at bit 25; rs3 follows func2 at bit 27.
shift_func2 = shift_rs2 + width_reg,
shift_func7 = shift_rs2 + width_reg,
shift_rs3 = shift_func7 + width_func2,
// Right-aligned masks: (1 << width) - 1.
mask_opcode = (1 << width_opcode) - 1,
mask_reg = (1 << width_reg) - 1,
mask_func2 = (1 << width_func2) - 1,
mask_func3 = (1 << width_func3) - 1,
mask_func7 = (1 << width_func7) - 1,
mask_i_imm = (1 << width_i_imm) - 1,
mask_j_imm = (1 << width_j_imm) - 1,
};
static const char* op_string(const Instr &instr) {
auto opcode = instr.getOpcode();
auto func2 = instr.getFunc2();
@ -230,10 +204,14 @@ static const char* op_string(const Instr &instr) {
case Opcode::FENCE: return "FENCE";
case Opcode::FL:
switch (func3) {
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
case 0x0: return "VL8";
case 0x5: return "VL16";
case 0x6: return "VL32";
case 0x7: return "VL64";
default:
std::cout << "Could not decode float/vector load with func3: " << func3 << std::endl;
std::abort();
}
case Opcode::FS:
@ -241,7 +219,12 @@ static const char* op_string(const Instr &instr) {
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
case 0x0: return "VS8";
case 0x5: return "VS16";
case 0x6: return "VS32";
case 0x7: return "VS64";
default:
std::cout << "Could not decode float/vector store with func3: " << func3 << std::endl;
std::abort();
}
case Opcode::AMO: {
@ -390,6 +373,7 @@ static const char* op_string(const Instr &instr) {
case Opcode::FMSUB: return func2 ? "FMSUB.D" : "FMSUB.S";
case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
case Opcode::VSET: return "VSET";
case Opcode::EXT1:
switch (func7) {
case 0:
@ -421,6 +405,39 @@ static const char* op_string(const Instr &instr) {
}
}
inline void vec_log(std::ostream &os, const Instr &instr) {
if (instr.getVUseMask() & set_func3)
os << ", func3:" << instr.getFunc3();
if (instr.getVUseMask() & set_func6)
os << ", func6:" << instr.getFunc6();
if (instr.getVUseMask() & set_imm)
os << ", imm:" << instr.getImm();
if (instr.getVUseMask() & set_vlswidth)
os << ", width:" << instr.getVlsWidth();
if (instr.getVUseMask() & set_vmop)
os << ", mop:" << instr.getVmop();
if (instr.getVUseMask() & set_vumop)
os << ", umop:" << instr.getVumop();
if (instr.getVUseMask() & set_vnf)
os << ", nf:" << instr.getVnf();
if (instr.getVUseMask() & set_vmask)
os << ", vmask:" << instr.getVmask();
if (instr.getVUseMask() & set_vs3)
os << ", vs3:" << instr.getVs3();
if (instr.getVUseMask() & set_zimm)
os << ", zimm:" << ((instr.hasZimm()) ? "true" : "false");
if (instr.getVUseMask() & set_vlmul)
os << ", lmul:" << instr.getVlmul();
if (instr.getVUseMask() & set_vsew)
os << ", sew:" << instr.getVsew();
if (instr.getVUseMask() & set_vta)
os << ", ta:" << instr.getVta();
if (instr.getVUseMask() & set_vma)
os << ", ma:" << instr.getVma();
if (instr.getVUseMask() & set_vediv)
os << ", ediv:" << instr.getVediv();
}
namespace vortex {
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
os << op_string(instr);
@ -441,6 +458,13 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getImm() << std::dec;
}
if (instr.getOpcode() == Opcode::SYS && instr.getFunc3() >= 5) {
// CSRs with immediate values
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getRSrc(0);
}
// Log vector-specific vtype and vreg info
if (instr.isVec()) vec_log(os, instr);
return os;
}
}
@ -452,6 +476,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
auto func2 = (code >> shift_func2) & mask_func2;
auto func3 = (code >> shift_func3) & mask_func3;
auto func6 = (code >> shift_func6) & mask_func6;
auto func7 = (code >> shift_func7) & mask_func7;
auto rd = (code >> shift_rd) & mask_reg;
@ -466,6 +491,12 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
}
auto iType = op_it->second;
if (op == Opcode::FL || op == Opcode::FS) {
if (func3 != 0x2 && func3 != 0x3) {
iType = InstType::V;
}
}
switch (iType) {
case InstType::R:
switch (op) {
@ -659,7 +690,104 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
auto imm = (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
instr->setImm(sext(imm, width_j_imm+1));
} break;
case InstType::V:
instr->setVec(true);
switch (op) {
case Opcode::VSET: {
instr->setDestReg(rd, RegType::Integer);
instr->setFunc3(func3);
switch (func3) {
case 7: {
if (code >> (shift_vset - 1) == 0b10) { // vsetvl
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
} else {
auto zimm = (code >> shift_rs2) & mask_v_zimm;
instr->setZimm(true);
instr->setVlmul(zimm & mask_v_lmul);
instr->setVsew((zimm >> shift_v_sew) & mask_v_sew);
instr->setVta((zimm >> shift_v_ta) & mask_v_ta);
instr->setVma((zimm >> shift_v_ma) & mask_v_ma);
if ((code >> shift_vset)) { // vsetivli
instr->setImm(rs1);
} else { // vsetvli
instr->addSrcReg(rs1, RegType::Integer);
}
}
} break;
case 3: { // Vector - immediate arithmetic instructions
instr->setDestReg(rd, RegType::Vector);
instr->addSrcReg(rs2, RegType::Vector);
instr->setImm(rs1);
instr->setVmask((code >> shift_func7) & 0x1);
instr->setFunc6(func6);
} break;
default: { // Vector - vector/scalar arithmetic instructions
if (func3 == 1 && func6 == 16) {
instr->setDestReg(rd, RegType::Float);
} else if (func3 == 2 && func6 == 16) {
instr->setDestReg(rd, RegType::Integer);
} else {
instr->setDestReg(rd, RegType::Vector);
}
instr->addSrcReg(rs1, RegType::Vector);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask((code >> shift_func7) & 0x1);
instr->setFunc6(func6);
}
}
} break;
case Opcode::FL:
instr->addSrcReg(rs1, RegType::Integer);
instr->setVmop((code >> shift_vmop) & 0b11);
switch (instr->getVmop()) {
case 0b00:
instr->setVumop(rs2);
break;
case 0b10:
instr->addSrcReg(rs2, RegType::Integer);
break;
case 0b01:
case 0b11:
instr->addSrcReg(rs2, RegType::Vector);
break;
}
instr->setVsew(func3 & 0x3);
instr->setDestReg(rd, RegType::Vector);
instr->setVlsWidth(func3);
instr->setVmask((code >> shift_func7) & 0x1);
instr->setVnf((code >> shift_vnf) & mask_func3);
break;
case Opcode::FS:
instr->addSrcReg(rs1, RegType::Integer);
instr->setVmop((code >> shift_vmop) & 0b11);
switch (instr->getVmop()) {
case 0b00:
instr->setVumop(rs2);
break;
case 0b10:
instr->addSrcReg(rs2, RegType::Integer);
break;
case 0b01:
case 0b11:
instr->addSrcReg(rs2, RegType::Vector);
break;
}
instr->setVsew(func3 & 0x3);
instr->addSrcReg(rd, RegType::Vector);
instr->setVlsWidth(func3);
instr->setVmask((code >> shift_func7) & 0x1);
instr->setVmop((code >> shift_vmop) & 0b11);
instr->setVnf((code >> shift_vnf) & mask_func3);
break;
default:
std::abort();
}
break;
case InstType::R4:
instr->setDestReg(rd, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);

View file

@ -33,6 +33,7 @@ using namespace vortex;
// Allocates the register files of one warp from the architecture
// description; the contents are filled in separately by clear().
Emulator::warp_t::warp_t(const Arch& arch)
// Integer registers: one row of MAX_NUM_REGS entries per thread.
: ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
// Floating-point registers: one row of MAX_NUM_REGS 64-bit entries per thread.
, freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
// Vector registers: MAX_NUM_REGS registers of arch.vsize() bytes each.
// Unlike the two files above, this one is not indexed by thread.
, vreg_file(MAX_NUM_REGS, std::vector<Byte>(arch.vsize()))
, uuid(0)
{}
@ -64,6 +65,26 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
#endif
}
}
for (auto& reg_file : this->vreg_file) {
for (auto& reg : reg_file) {
#ifndef NDEBUG
reg = 0;
#else
reg = std::rand();
#endif
}
}
for (auto& reg_file : this->vreg_file) {
for (auto& reg : reg_file) {
#ifndef NDEBUG
reg = 0;
#else
reg = std::rand();
#endif
}
}
}
///////////////////////////////////////////////////////////////////////////////
@ -79,7 +100,12 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
// considered to be big enough to hold input tiles for one output tile.
// In future versions, scratchpad size should be fixed to an appropriate value.
, scratchpad(std::vector<Word>(32 * 32 * 32768))
, csrs_(arch.num_warps())
{
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
csrs_.at(i).resize(arch.num_threads());
}
this->clear();
}
@ -463,6 +489,32 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_FFLAGS: return warps_.at(wid).fcsr & 0x1F;
case VX_CSR_FRM: return (warps_.at(wid).fcsr >> 5);
case VX_CSR_FCSR: return warps_.at(wid).fcsr;
// Vector CSRs
case VX_CSR_VSTART:
return csrs_.at(wid).at(tid)[VX_CSR_VSTART];
case VX_CSR_VXSAT:
return csrs_.at(wid).at(tid)[VX_CSR_VXSAT];
case VX_CSR_VXRM:
return csrs_.at(wid).at(tid)[VX_CSR_VXRM];
case VX_CSR_VCSR: {
Word vxsat = csrs_.at(wid).at(tid)[VX_CSR_VXSAT];
Word vxrm = csrs_.at(wid).at(tid)[VX_CSR_VXRM];
return (vxrm << 1) | vxsat;
}
case VX_CSR_VL:
return csrs_.at(wid).at(tid)[VX_CSR_VL];
case VX_CSR_VTYPE:
return csrs_.at(wid).at(tid)[VX_CSR_VTYPE];
case VX_CSR_VLENB:
return VLEN / 8;
case VX_CSR_VCYCLE:
return csrs_.at(wid).at(tid)[VX_CSR_VCYCLE];
case VX_CSR_VTIME:
return csrs_.at(wid).at(tid)[VX_CSR_VTIME];
case VX_CSR_VINSTRET:
return csrs_.at(wid).at(tid)[VX_CSR_VINSTRET];
case VX_CSR_MHARTID: return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
case VX_CSR_THREAD_ID: return tid;
case VX_CSR_WARP_ID: return wid;
@ -578,6 +630,29 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
case VX_CSR_MSCRATCH:
csr_mscratch_ = value;
break;
// Vector CSRs
case VX_CSR_VSTART:
csrs_.at(wid).at(tid)[VX_CSR_VSTART] = value;
break;
case VX_CSR_VXSAT:
csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1;
break;
case VX_CSR_VXRM:
csrs_.at(wid).at(tid)[VX_CSR_VXRM] = value & 0b11;
break;
case VX_CSR_VCSR:
csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1;
csrs_.at(wid).at(tid)[VX_CSR_VXRM] = (value >> 1) & 0b11;
break;
case VX_CSR_VL: // read only, written by vset(i)vl(i)
csrs_.at(wid).at(tid)[VX_CSR_VL] = value;
break;
case VX_CSR_VTYPE: // read only, written by vset(i)vl(i)
csrs_.at(wid).at(tid)[VX_CSR_VTYPE] = value;
break;
case VX_CSR_VLENB: // read only, set to VLEN / 8
case VX_CSR_SATP:
#ifdef VM_ENABLE
// warps_.at(wid).fcsr = (warps_.at(wid).fcsr & ~0x1F) | (value & 0x1F);

View file

@ -28,6 +28,76 @@ class Core;
class Instr;
class instr_trace_t;
// Bit-field geometry used when decoding an instruction word.
// Three families of constants are defined:
//   width_*  - size of a field in bits
//   shift_*  - bit offset of a field (derived from the widths so the fields
//              stay contiguous)
//   mask_*   - (1 << width) - 1, i.e. the value mask applied after shifting
// The decoder extracts a field as `(code >> shift_x) & mask_x`.
enum Constants {
// --- field widths (bits) ---
width_opcode= 7,
width_reg = 5,
width_func2 = 2,
width_func3 = 3,
width_func6 = 6,
width_func7 = 7,
// 3 bits wide so that shift_vnf lands on bit 29; the decoder itself masks the
// vmop value with 0b11.
// NOTE(review): presumably this spans the RVV mop[1:0] + mew bits - confirm.
width_mop = 3,
width_vmask = 1,
width_i_imm = 12,
width_j_imm = 20,
width_v_zimm = 11,
width_v_ma = 1,
width_v_ta = 1,
width_v_sew = 3,
width_v_lmul = 3,
width_aq = 1,
width_rl = 1,
// --- field offsets within the instruction word ---
shift_opcode= 0,
// = 7
shift_rd = width_opcode,
// = 12
shift_func3 = shift_rd + width_reg,
// = 15
shift_rs1 = shift_func3 + width_func3,
// = 20
shift_rs2 = shift_rs1 + width_reg,
// func2 and func7 both start at bit 25 (alternative views of the same bits)
shift_func2 = shift_rs2 + width_reg,
shift_func7 = shift_rs2 + width_reg,
// = 27
shift_rs3 = shift_func7 + width_func2,
// = 26 (bit 25 is the vector mask bit, read via shift_func7)
shift_vmop = shift_func7 + width_vmask,
// = 29
shift_vnf = shift_vmop + width_mop,
// = 26
shift_func6 = shift_func7 + width_vmask,
// = 31
shift_vset = shift_func7 + width_func6,
// --- offsets inside the vset* immediate: lmul at bit 0, then sew (3),
//     ta (6) and ma (7).
// NOTE(review): presumably mirrors the RVV vtype layout - confirm.
shift_v_sew = width_v_lmul,
shift_v_ta = shift_v_sew + width_v_sew,
shift_v_ma = shift_v_ta + width_v_ta,
// --- value masks, one per width ---
mask_opcode = (1 << width_opcode) - 1,
mask_reg = (1 << width_reg) - 1,
mask_func2 = (1 << width_func2) - 1,
mask_func3 = (1 << width_func3) - 1,
mask_func6 = (1 << width_func6) - 1,
mask_func7 = (1 << width_func7) - 1,
mask_i_imm = (1 << width_i_imm) - 1,
mask_j_imm = (1 << width_j_imm) - 1,
mask_v_zimm = (1 << width_v_zimm) - 1,
mask_v_ma = (1 << width_v_ma) - 1,
mask_v_ta = (1 << width_v_ta) - 1,
mask_v_sew = (1 << width_v_sew) - 1,
mask_v_lmul = (1 << width_v_lmul) - 1,
};
// Unpacked vector type state; each warp_t carries one instance (`vtype`)
// next to its `vl` and `VLMAX` members.
// NOTE(review): the field names follow the RVV `vtype` CSR (vill, vma, vta,
// vsew, vlmul); the exact encoding stored in each field is not visible here -
// confirm against execute_vector.cpp.
struct vtype {
uint32_t vill;
uint32_t vma;
uint32_t vta;
uint32_t vsew;
uint32_t vlmul;
};
// One register value viewed through several types; all members alias the
// same storage. It is declared in this header because the
// executeVector/loadVector/storeVector member declarations take vectors of
// reg_data_t.
union reg_data_t {
// native-width views (Word / WordI / WordF are project typedefs)
Word u;
WordI i;
WordF f;
// fixed-width floating-point views
float f32;
double f64;
// fixed-width integer views
uint32_t u32;
uint64_t u64;
int32_t i32;
int64_t i64;
};
class Emulator {
public:
Emulator(const Arch &arch,
@ -61,6 +131,10 @@ public:
Word get_tc_size();
Word get_tc_num();
void dcache_read(void* data, uint64_t addr, uint32_t size);
void dcache_write(const void* data, uint64_t addr, uint32_t size);
private:
struct ipdom_entry_t {
@ -85,9 +159,14 @@ private:
ThreadMask tmask;
std::vector<std::vector<Word>> ireg_file;
std::vector<std::vector<uint64_t>>freg_file;
std::vector<std::vector<Byte>> vreg_file;
std::stack<ipdom_entry_t> ipdom_stack;
Byte fcsr;
uint32_t uuid;
struct vtype vtype;
uint32_t vl;
Word VLMAX;
};
struct wspawn_t {
@ -100,12 +179,14 @@ private:
void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace);
void executeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);
void loadVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
void storeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
void icache_read(void* data, uint64_t addr, uint32_t size);
void dcache_read(void* data, uint64_t addr, uint32_t size);
void dcache_write(const void* data, uint64_t addr, uint32_t size);
void dcache_amo_reserve(uint64_t addr);
bool dcache_amo_check(uint64_t addr);
@ -142,6 +223,7 @@ private:
uint32_t mat_size;
uint32_t tc_size;
uint32_t tc_num;
std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
};
}

View file

@ -25,22 +25,11 @@
#include "emulator.h"
#include "instr.h"
#include "core.h"
#include "processor_impl.h"
#include "VX_types.h"
using namespace vortex;
union reg_data_t {
Word u;
WordI i;
WordF f;
float f32;
double f64;
uint32_t u32;
uint64_t u64;
int32_t i32;
int64_t i64;
};
// Widens a 32-bit value to 64 bits with the upper 32 bits forced to all ones
// (the low word is kept unchanged). Used when a single-precision load (FLW)
// result is written to a 64-bit register slot.
inline uint64_t nan_box(uint32_t value) {
  const uint64_t upper_ones = 0xffffffff00000000;
  return upper_ones | value;
}
@ -128,6 +117,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
}
DPN(2, "}" << std::endl);
break;
case RegType::Vector:
break;
default:
break;
}
@ -678,41 +669,47 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
trace->src_regs[0] = {RegType::Integer, rsrc0};
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!warp.tmask.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t read_data = 0;
this->dcache_read(&read_data, mem_addr, data_bytes);
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
switch (func3) {
case 0: // RV32I: LB
case 1: // RV32I: LH
rddata[t].i = sext((Word)read_data, data_width);
break;
case 2:
if (opcode == Opcode::L) {
// RV32I: LW
if ((opcode == Opcode::L )
|| (opcode == Opcode::FL && func3 == 2)
|| (opcode == Opcode::FL && func3 == 3)) {
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!warp.tmask.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t read_data = 0;
this->dcache_read(&read_data, mem_addr, data_bytes);
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
switch (func3) {
case 0: // RV32I: LB
case 1: // RV32I: LH
rddata[t].i = sext((Word)read_data, data_width);
} else {
// RV32F: FLW
rddata[t].u64 = nan_box((uint32_t)read_data);
break;
case 2:
if (opcode == Opcode::L) {
// RV32I: LW
rddata[t].i = sext((Word)read_data, data_width);
} else {
// RV32F: FLW
rddata[t].u64 = nan_box((uint32_t)read_data);
}
break;
case 3: // RV64I: LD
// RV32D: FLD
case 4: // RV32I: LBU
case 5: // RV32I: LHU
case 6: // RV64I: LWU
rddata[t].u64 = read_data;
break;
default:
std::abort();
}
break;
case 3: // RV64I: LD
// RV32D: FLD
case 4: // RV32I: LBU
case 5: // RV32I: LHU
case 6: // RV64I: LWU
rddata[t].u64 = read_data;
break;
default:
std::abort();
}
rd_write = true;
} else {
loadVector(instr, wid, rsdata);
}
rd_write = true;
break;
}
case Opcode::S:
@ -724,23 +721,29 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
trace->src_regs[1] = {data_type, rsrc1};
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
uint32_t data_bytes = 1 << (func3 & 0x3);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!warp.tmask.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t write_data = rsdata[t][1].u64;
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
switch (func3) {
case 0:
case 1:
case 2:
case 3:
this->dcache_write(&write_data, mem_addr, data_bytes);
break;
default:
std::abort();
if ((opcode == Opcode::S)
|| (opcode == Opcode::FS && func3 == 2)
|| (opcode == Opcode::FS && func3 == 3)) {
uint32_t data_bytes = 1 << (func3 & 0x3);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!warp.tmask.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t write_data = rsdata[t][1].u64;
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
switch (func3) {
case 0:
case 1:
case 2:
case 3:
this->dcache_write(&write_data, mem_addr, data_bytes);
break;
default:
std::abort();
}
}
} else {
storeVector(instr, wid, rsdata);
}
break;
}
@ -925,7 +928,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!warp.tmask.test(t))
continue;
uint32_t frm = this->get_fpu_rm(func3, t, wid);
uint32_t frm = (func3 == 0x7) ? this->get_csr(VX_CSR_FRM, t, wid) : func3;
uint32_t fflags = 0;
switch (func7) {
case 0x00: { // RV32F: FADD.S
@ -1240,7 +1243,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
break;
}
}
this->update_fcrs(fflags, t, wid);
if (fflags) {
this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid);
this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid);
}
}
rd_write = true;
break;
@ -1294,7 +1300,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
default:
break;
}
this->update_fcrs(fflags, t, wid);
if (fflags) {
this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid);
this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid);
}
}
rd_write = true;
break;
@ -1586,6 +1595,13 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
std::abort();
}
} break;
case Opcode::VSET: {
auto func6 = instr.getFunc6();
if ((func3 == 0x7) || (func3 == 0x2 && func6 == 16) || (func3 == 0x1 && func6 == 16)) {
rd_write = true;
}
executeVector(instr, wid, rsdata, rddata);
} break;
default:
std::abort();
}
@ -1629,6 +1645,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
trace->dst_reg = {type, rdest};
break;
default:
std::cout << "Unrecognized register write back type: " << type << std::endl;
std::abort();
break;
}

4493
sim/simx/execute_vector.cpp Normal file

File diff suppressed because it is too large Load diff

View file

@ -42,6 +42,8 @@ enum class Opcode {
// RV64 Standard Extension
R_W = 0x3b,
I_W = 0x1b,
// Vector Extension
VSET = 0x57,
// Custom Extensions
EXT1 = 0x0b,
EXT2 = 0x2b,
@ -56,9 +58,28 @@ enum class InstType {
B,
U,
J,
V,
R4
};
// One-hot flags recording which optional instruction fields have been set.
// Each Instr setter (setFunc3, setFunc6, setImm, setVlsWidth, ...) ORs the
// matching flag into Instr::_vusemask, which is exposed through
// Instr::getVUseMask().
enum set_vuse_mask {
set_func3 = (1 << 0),
set_func6 = (1 << 1),
set_imm = (1 << 2),
set_vlswidth = (1 << 3),
set_vmop = (1 << 4),
set_vumop = (1 << 5),
set_vnf = (1 << 6),
set_vmask = (1 << 7),
set_vs3 = (1 << 8),
set_zimm = (1 << 9),
set_vlmul = (1 << 10),
set_vsew = (1 << 11),
set_vta = (1 << 12),
set_vma = (1 << 13),
set_vediv = (1 << 14)
};
class Instr {
public:
Instr()
@ -70,7 +91,22 @@ public:
, rdest_(0)
, func2_(0)
, func3_(0)
, func7_(0) {
, func6_(0)
, func7_(0)
, vmask_(0)
, vlsWidth_(0)
, vMop_(0)
, vUmop_(0)
, vNf_(0)
, vs3_(0)
, has_zimm_(false)
, vlmul_(0)
, vsew_(0)
, vta_(0)
, vma_(0)
, vediv_(0)
, _vusemask(0)
, _is_vec(false) {
for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
rsrc_type_[i] = RegType::None;
rsrc_[i] = 0;
@ -93,13 +129,28 @@ public:
num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1);
}
void setFunc2(uint32_t func2) { func2_ = func2; }
void setFunc3(uint32_t func3) { func3_ = func3; }
void setFunc3(uint32_t func3) { func3_ = func3; _vusemask |= set_func3; }
void setFunc6(uint32_t func6) { func6_ = func6; _vusemask |= set_func6; }
void setFunc7(uint32_t func7) { func7_ = func7; }
void setImm(uint32_t imm) { has_imm_ = true; imm_ = imm; }
void setImm(uint32_t imm) { has_imm_ = true; imm_ = imm; _vusemask |= set_imm; }
void setVlsWidth(uint32_t width) { vlsWidth_ = width; _vusemask |= set_vlswidth; }
void setVmop(uint32_t mop) { vMop_ = mop; _vusemask |= set_vmop; }
void setVumop(uint32_t umop) { vUmop_ = umop; _vusemask |= set_vumop; }
void setVnf(uint32_t nf) { vNf_ = nf; _vusemask |= set_vnf; }
void setVmask(uint32_t mask) { vmask_ = mask; _vusemask |= set_vmask; }
void setVs3(uint32_t vs) { vs3_ = vs; _vusemask |= set_vs3; }
void setZimm(bool has_zimm) { has_zimm_ = has_zimm; _vusemask |= set_zimm; }
void setVlmul(uint32_t lmul) { vlmul_ = lmul; _vusemask |= set_vlmul; }
void setVsew(uint32_t sew) { vsew_ = sew; _vusemask |= set_vsew; }
void setVta(uint32_t vta) { vta_ = vta; _vusemask |= set_vta; }
void setVma(uint32_t vma) { vma_ = vma; _vusemask |= set_vma; }
void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; _vusemask |= set_vediv; }
void setVec(bool is_vec) { _is_vec = is_vec; }
Opcode getOpcode() const { return opcode_; }
uint32_t getFunc2() const { return func2_; }
uint32_t getFunc3() const { return func3_; }
uint32_t getFunc6() const { return func6_; }
uint32_t getFunc7() const { return func7_; }
uint32_t getNRSrc() const { return num_rsrcs_; }
uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
@ -108,6 +159,21 @@ public:
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
uint32_t getImm() const { return imm_; }
uint32_t getVlsWidth() const { return vlsWidth_; }
uint32_t getVmop() const { return vMop_; }
uint32_t getVumop() const { return vUmop_; }
uint32_t getVnf() const { return vNf_; }
uint32_t getVmask() const { return vmask_; }
uint32_t getVs3() const { return vs3_; }
bool hasZimm() const { return has_zimm_; }
uint32_t getVlmul() const { return vlmul_; }
uint32_t getVsew() const { return 1 << (3 + vsew_); }
uint32_t getVsewO() const { return vsew_; }
uint32_t getVta() const { return vta_; }
uint32_t getVma() const { return vma_; }
uint32_t getVediv() const { return vediv_; }
uint32_t getVUseMask() const { return _vusemask; }
bool isVec() const { return _is_vec; }
private:
@ -125,8 +191,25 @@ private:
uint32_t rdest_;
uint32_t func2_;
uint32_t func3_;
uint32_t func6_;
uint32_t func7_;
// Vector
uint32_t vmask_;
uint32_t vlsWidth_;
uint32_t vMop_;
uint32_t vUmop_;
uint32_t vNf_;
uint32_t vs3_;
bool has_zimm_;
uint32_t vlmul_;
uint32_t vsew_;
uint32_t vta_;
uint32_t vma_;
uint32_t vediv_;
uint32_t _vusemask;
bool _is_vec;
friend std::ostream &operator<<(std::ostream &, const Instr&);
};

View file

@ -84,7 +84,8 @@ enum class RegType {
None,
Integer,
Float,
Count
Count,
Vector
};
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
@ -92,6 +93,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
case RegType::None: break;
case RegType::Integer: os << "x"; break;
case RegType::Float: os << "f"; break;
case RegType::Vector: os << "v"; break;
default: assert(false);
}
return os;

View file

@ -51,7 +51,7 @@ endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += $(SRC_DIR)/xrt.cpp $(SRC_DIR)/xrt_sim.cpp

View file

@ -0,0 +1,39 @@
## Running the testcases
```
XLEN=32 ./run-test.sh testcase1 testcase2
XLEN=64 ./run-test.sh testcase1 testcase2
# or to run all default testcases
XLEN=32 ./run-test.sh
XLEN=64 ./run-test.sh
```
## Adding a new testcase
The source code for the vector extension can be found in `sim/simx/execute_vector.cpp`.
If you add support for a new vector instruction, please open `run-test.sh` and add it to the default testcases.
This will ensure your instruction is included in the regression test suite.
## Updating the testcase binaries
As `riscv-vector-tests` is still under development,
we should periodically recompile the testcases and update the binaries.
To update the test case binaries run:
```
XLEN=32 make -C ../../../third_party/ riscv-vector-tests
XLEN=64 make -C ../../../third_party/ riscv-vector-tests
```
This requires Spike and Go to be installed on your machine.
Then run the testcases that you want to update; `run-test.sh` automatically copies the rebuilt binaries into the testcases directory, e.g.:
```
XLEN=64 ./run-test.sh testcase1 testcase2
```
Finally use git to add the updated testcases to your commit (-f required due to .gitignore):
```
git add -f testcase1 testcase2
```

View file

@ -0,0 +1,117 @@
#!/bin/bash

# Configuration; each of the first three may be overridden from the environment.
#   VLEN                 - selects the testcases/v<VLEN>x<XLEN> directory (default 256)
#   XLEN                 - selects the directory and the riscv<XLEN> tools (default 32)
#   RISCV_TOOLCHAIN_PATH - toolchain root (default $TOOLDIR/riscv<XLEN>-gnu-toolchain)
: "${VLEN:=256}"
: "${XLEN:=32}"
: "${RISCV_TOOLCHAIN_PATH:=${TOOLDIR}/riscv${XLEN}-gnu-toolchain}"

# Absolute directory holding this script, and the caller's working directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"
RESTORE_PREV_DIR="$(pwd)"

# The prebuilt test archive is published as several split parts under this URL.
VECTOR_TESTS_REPOSITORY=https://github.com/MichaelJSr/testcases/raw/main
VECTOR_TESTS_BASE_NAME=vector-tests.tar.bz2
#######################################
# Download the prebuilt vector test archive into the current directory and
# unpack it. The archive is published as 12 split parts
# (<base>.partaa .. <base>.partal) which are concatenated in order before
# extraction. All downloaded parts and the joined archive are removed
# afterwards, whether or not the operation succeeded.
# Globals:   VECTOR_TESTS_REPOSITORY (read), VECTOR_TESTS_BASE_NAME (read)
# Arguments: none
# Outputs:   tar's verbose file listing on stdout, diagnostics on stderr
# Returns:   0 on success, non-zero if a download, the join or the
#            extraction failed
#######################################
vector_tests()
{
  local suffix part_name status=0

  for suffix in {a..l}; do
    part_name="$VECTOR_TESTS_BASE_NAME.parta$suffix"
    # -O pins the output name so a stale file never yields "<name>.1",
    # which would otherwise be picked up by the ".part*" glob below.
    if ! wget -O "$part_name" "$VECTOR_TESTS_REPOSITORY/$part_name"; then
      echo "error: failed to download $VECTOR_TESTS_REPOSITORY/$part_name" >&2
      status=1
      break
    fi
  done

  if [ "$status" -eq 0 ]; then
    # The glob expands in sorted order, i.e. partaa .. partal.
    cat -- "$VECTOR_TESTS_BASE_NAME".part* > "$VECTOR_TESTS_BASE_NAME" \
      && tar -xvf "$VECTOR_TESTS_BASE_NAME"
    status=$?
    if [ "$status" -ne 0 ]; then
      echo "error: failed to unpack $VECTOR_TESTS_BASE_NAME" >&2
    fi
  fi

  rm -f -- "$VECTOR_TESTS_BASE_NAME"*
  return "$status"
}
# Select the testcases: the names given on the command line, or the default
# list when no argument is passed. `testcases` is always a bash array.
# Each name is used as a filename prefix ("$testcase"*) when the tests are
# copied and run, so one entry usually selects a whole family of test binaries.
if [ "$#" -eq 0 ]; then
  # Write out the test case name explicitly if its prefix would otherwise
  # collide with other test names.
  testcases=(vset vmv vslide vmerge vrgather \
             vlm.v vsm.v \
             vle8 vle16 vle32 \
             vse8 vse16 vse32 \
             vlseg vlsseg vluxseg vloxseg \
             vsseg vssseg vsuxseg vsoxseg \
             vlse8 vlse16 vlse32 \
             vsse8 vsse16 vsse32 \
             vloxei vluxei vsoxei vsuxei \
             vl1r vl2r vl4r vl8r \
             vs1r vs2r vs4r vs8r \
             vadd vsub vmin vmax vand vor vxor \
             vmseq vmsne vmslt vmsle vmsgt \
             vsll vsrl vsra vssr \
             vaadd vasub \
             vfmin vfmax vfcvt vfsqrt vfrsqrt7 vfrec7 vfclass vfmv vfslide vfmerge \
             vfadd vfredusum vfsub vfredosum vfredmin vfredmax vfsgnj vmf vfdiv vfrdiv vfmul vfrsub \
             vfmacc vfnmacc vfmsac vfnmsac vfmadd vfnmadd vfmsub vfnmsub \
             vredsum vredand vredor vredxor vredmin vredmax \
             vwred \
             vmand vmor vmxor vmnand vmnor vmxnor \
             vdiv vrem vmul vsmul \
             vmadd vnmsub vmacc vnmsac \
             vwadd vwsub vwmul vwmacc \
             vrsub vcompress vnclip vssub vsadd vnsra vnsrl \
             vadc vmadc vsbc vmsbc \
             vsext vzext \
             vid)
  # These additional tests are only part of the default set for XLEN=64.
  if [ "${XLEN:-32}" -eq 64 ]; then
    testcases+=(vle64 vse64 vlse64 vsse64 vfwcvt vfncvt \
                vfwadd vfwsub vfwmul vfwred vfwmacc vfwnmacc vfwmsac vfwnmsac)
  fi
else
  # Keep one array element per argument (previously this was flattened into
  # a single string, unlike the default branch).
  testcases=("$@")
fi
# Prepare the working directory testcases/v<VLEN>x<XLEN>. Every `cd` is
# checked: the cleanup below deletes files by glob and must never run in an
# unexpected directory.
cd "$SCRIPT_DIR" || { echo "error: cannot enter $SCRIPT_DIR" >&2; exit 1; }

# Fallback #2: If testcases directory exists, we will use existing testcases
if [ ! -d "$SCRIPT_DIR/testcases" ]; then
  mkdir testcases || exit 1
  cd testcases || exit 1
  # Fallback #3: Otherwise, download testcases
  if ! vector_tests; then
    echo "error: could not download the vector testcases" >&2
    # Best effort: drop the directory again (only succeeds if it is empty) so
    # that the next run retries the download instead of assuming it is done.
    cd "$SCRIPT_DIR" && rmdir testcases 2> /dev/null
    exit 1
  fi
fi

cd "$SCRIPT_DIR/testcases/v${VLEN}x${XLEN}" || {
  echo "error: no testcases found for VLEN=$VLEN XLEN=$XLEN" >&2
  exit 1
}

# Fallback #1: Copy locally generated testcases (assuming they exist)
local_tests_dir="$SCRIPT_DIR/../../../third_party/riscv-vector-tests/out/v${VLEN}x${XLEN}machine/bin/stage2"
rm -f -- *".ddr4.log"
# `testcases` may be an array or a whitespace-separated string, so it is
# expanded unquoted on purpose.
# shellcheck disable=SC2068
for testcase in ${testcases[@]}; do
  # Remove artifacts left over from a previous run; -f keeps a clean tree quiet.
  rm -f -- "$testcase"*.elf "$testcase"*.bin "$testcase"*.dump "$testcase"*.log
  if [ -d "$local_tests_dir" ]; then
    cp -f -- "$local_tests_dir"/"$testcase"* .
  fi
done
passed=0
failed=0
selected=0

# Count all available testcases in the current directory, excluding the
# generated *.elf, *.bin, *.dump and *.log artifacts to prevent double counting.
all=0
for entry in *; do
  [ -e "$entry" ] || [ -L "$entry" ] || continue
  case "$entry" in
    *.elf | *.bin | *.dump | *.log) continue ;;
  esac
  all=$((all + 1))
done

# REG_TESTS=1 informs the script to delete the previous binary after each vector test to save disk space
# Otherwise, the vector regression tests would run out of disk space eventually
# It defaults to 0 so that a plain `./run-test.sh` does not trip over an unset variable.
reg_tests=${REG_TESTS:-0}

# `testcases` may be an array or a whitespace-separated string, so it is
# expanded unquoted on purpose.
# shellcheck disable=SC2068
for testcase in ${testcases[@]}; do
  for f in "$testcase"* ; do
    # Skip artifacts produced earlier in this run for an overlapping prefix;
    # they are not test binaries.
    case "$f" in
      *.elf | *.bin | *.dump | *.log) continue ;;
    esac
    selected=$((selected + 1))
    # An unmatched glob stays literal: report it as a failure without
    # invoking the toolchain on a file that does not exist.
    if [ ! -e "$f" ]; then
      echo "$f FAILED (no matching test binary)"
      failed=$((failed + 1))
      continue
    fi
    ln -sf -- "$f" "$f.elf"
    "$RISCV_TOOLCHAIN_PATH"/bin/riscv"$XLEN"-unknown-elf-objdump -D "$f.elf" > "$f.dump"
    "$RISCV_TOOLCHAIN_PATH"/bin/riscv"$XLEN"-unknown-elf-objcopy -O binary "$f.elf" "$f.bin"
    "$SCRIPT_DIR"/../../../sim/simx/simx -c 1 "$f.bin" &> "$f.log"
    # A run counts as passed only when simx exits with status 13.
    if [ $? -eq 13 ]; then
      echo "$f PASSED"
      passed=$((passed + 1))
    else
      echo "$f FAILED"
      failed=$((failed + 1))
    fi
    if [ "$reg_tests" = "1" ]; then
      cat -- "$f.log"
      rm -f -- "$f".* "$f"
    fi
  done
done
# Return to the caller's directory and print the summary.
cd "$RESTORE_PREV_DIR"
echo "Passed $passed out of $selected selected vector tests."
echo "Total available vector tests: $all"
# The exit status is the number of failed tests, capped at 255: exit codes are
# taken modulo 256, so an uncapped count of exactly 256 failures would be
# reported as success.
exit $((failed > 255 ? 255 : failed))