add a quick'n'dirty implementation of RV32BK-accelerated AES-OCB, using the _m128i compatibility layer (spun off in its own header)

This commit is contained in:
Romain Dolbeau 2021-02-17 09:02:43 -05:00
parent 431fdc5288
commit 8ad11036be
22 changed files with 1943 additions and 292 deletions

View file

@ -0,0 +1,47 @@
# Build for the RV32BK-accelerated AES-256-OCB benchmark.
# Two toolchains: CC builds the harness for plain rv32ima; ALTCC (rv64 elf
# compiler, -march=rv32imab) compiles encrypt.c via assembly so the B/K
# crypto instructions can be used.
SRCs=encrypt.c try-anything.c verify.c
OBJs=$(SRCs:.c=.o)
SCLIBS=cpucycles.o kernelrandombytes.o
COMPDIR=~dolbeau2/LITEX/buildroot-rv32/output/host
ALTCOMPDIR=/opt/riscv64b
CC=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-gcc
ALTCC=$(ALTCOMPDIR)/bin/riscv64-unknown-elf-gcc
CXX=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-g++
STRIP=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-strip
NEWOPT=-march=rv32imab -mabi=ilp32 -I. -I.. -O3 -DRV32B #-fno-vectorize #-DUSE_EPI_CUSTOM
OPT=-march=rv32ima -mabi=ilp32 -I. -I.. -O3 #-fno-vectorize #-DUSE_EPI_CUSTOM
#ALTCC=$(CC)
#NEWOPT=$(OPT)
all: aeadaes256ocbtaglen128v1 aeadaes256ocbtaglen128v1_small
clean:
rm -f $(OBJs) *.S try.o try_small.o encrypt.o aeadaes256ocbtaglen128v1 aeadaes256ocbtaglen128v1_small
%.o: %.c
$(CC) $(OPT) $< -c -o $@
try.o: try.c
$(CC) $(OPT) $< -c -o $@
# _small variant compiles the same try.c with -DSMALL (reduced test sizes)
try_small.o: try.c
$(CC) $(OPT) $< -c -o $@ -DSMALL
# go through a .S intermediate: ALTCC emits the B/K instructions, then
# assembles them separately
encrypt.S: encrypt.c
$(ALTCC) $(NEWOPT) $< -S -o $@
encrypt.o: encrypt.S
$(ALTCC) $(NEWOPT) $< -c -o $@
aeadaes256ocbtaglen128v1: $(OBJs) encrypt.o try.o $(SCLIBS)
$(CXX) $(OPT) $^ -o $@
aeadaes256ocbtaglen128v1_small: $(OBJs) encrypt.o try_small.o $(SCLIBS)
$(CXX) $(OPT) $^ -o $@
kernelrandombytes.o: random.cpp
$(CXX) $(OPT) $< -c -o $@
# cycle counter: -O1 is enough and keeps the rdcycle loop intact
cpucycles.o: riscv.c
$(CC) $< -march=rv32ima -mabi=ilp32 -I. -O1 -c -o $@

View file

@ -0,0 +1,4 @@
/* SUPERCOP-style API parameters: AES-256 key, 96-bit nonce, 128-bit tag. */
#define CRYPTO_KEYBYTES 32
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 12
#define CRYPTO_ABYTES 16

View file

@ -0,0 +1,28 @@
/*
cpucycles riscv.h version 20190803
D. J. Bernstein
Romain Dolbeau
Public domain.
*/
#ifndef CPUCYCLES_riscv_h
#define CPUCYCLES_riscv_h
#ifdef __cplusplus
extern "C" {
#endif
/* Current cycle count (rdcycle) and estimated cycles per second. */
extern long long cpucycles_riscv(void);
extern long long cpucycles_riscv_persecond(void);
#ifdef __cplusplus
}
#endif
/* Generic cpucycles_* aliases, unless another implementation was chosen. */
#ifndef cpucycles_implementation
#define cpucycles_implementation "riscv"
#define cpucycles cpucycles_riscv
#define cpucycles_persecond cpucycles_riscv_persecond
#endif
#endif

View file

@ -0,0 +1,17 @@
/* Generic crypto_aead_* names mapped onto the aes256ocb primitive. */
#ifndef crypto_aead_H
#define crypto_aead_H
#include "crypto_aead_aeadaes256ocbtaglen128v1.h"
#define crypto_aead_encrypt crypto_aead_aeadaes256ocbtaglen128v1_encrypt
#define crypto_aead_decrypt crypto_aead_aeadaes256ocbtaglen128v1_decrypt
#define crypto_aead_KEYBYTES crypto_aead_aeadaes256ocbtaglen128v1_KEYBYTES
#define crypto_aead_NSECBYTES crypto_aead_aeadaes256ocbtaglen128v1_NSECBYTES
#define crypto_aead_NPUBBYTES crypto_aead_aeadaes256ocbtaglen128v1_NPUBBYTES
#define crypto_aead_ABYTES crypto_aead_aeadaes256ocbtaglen128v1_ABYTES
#define crypto_aead_NOOVERLAP crypto_aead_aeadaes256ocbtaglen128v1_NOOVERLAP
#define crypto_aead_PRIMITIVE "aeadaes256ocbtaglen128v1"
#define crypto_aead_IMPLEMENTATION crypto_aead_aeadaes256ocbtaglen128v1_IMPLEMENTATION
#define crypto_aead_VERSION crypto_aead_aeadaes256ocbtaglen128v1_VERSION
#endif

View file

@ -0,0 +1,31 @@
/* Primitive-level header: declares the rv32 implementation's entry points
   and maps the unsuffixed names onto them (SUPERCOP convention). */
#ifndef crypto_aead_aeadaes256ocbtaglen128v1_H
#define crypto_aead_aeadaes256ocbtaglen128v1_H
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_KEYBYTES 32
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_NSECBYTES 0
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_NPUBBYTES 12
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_ABYTES 16
#ifdef __cplusplus
extern "C" {
#endif
/* encrypt(c,clen, m,mlen, ad,adlen, nsec, npub, k); decrypt mirrors it. */
extern int crypto_aead_aeadaes256ocbtaglen128v1_rv32_encrypt(unsigned char *,unsigned long long *,const unsigned char *,unsigned long long,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *,const unsigned char *);
extern int crypto_aead_aeadaes256ocbtaglen128v1_rv32_decrypt(unsigned char *,unsigned long long *,unsigned char *,const unsigned char *,unsigned long long,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *);
#ifdef __cplusplus
}
#endif
#define crypto_aead_aeadaes256ocbtaglen128v1_encrypt crypto_aead_aeadaes256ocbtaglen128v1_rv32_encrypt
#define crypto_aead_aeadaes256ocbtaglen128v1_decrypt crypto_aead_aeadaes256ocbtaglen128v1_rv32_decrypt
#define crypto_aead_aeadaes256ocbtaglen128v1_KEYBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_KEYBYTES
#define crypto_aead_aeadaes256ocbtaglen128v1_NSECBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_NSECBYTES
#define crypto_aead_aeadaes256ocbtaglen128v1_NPUBBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_NPUBBYTES
#define crypto_aead_aeadaes256ocbtaglen128v1_ABYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_ABYTES
/* NOTE(review): _rv32_NOOVERLAP is never defined anywhere visible — confirm
   whether any caller actually uses crypto_aead_..._NOOVERLAP. */
#define crypto_aead_aeadaes256ocbtaglen128v1_NOOVERLAP crypto_aead_aeadaes256ocbtaglen128v1_rv32_NOOVERLAP
#define crypto_aead_aeadaes256ocbtaglen128v1_IMPLEMENTATION "crypto_aead/aeadaes256ocbtaglen128v1/dolbeau/aesenc-int"
#ifndef crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION "-"
#endif
#define crypto_aead_aeadaes256ocbtaglen128v1_VERSION crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION
#endif

View file

@ -0,0 +1,6 @@
/* 32-bit unsigned type (assumes 32-bit int, true on the rv32 targets here). */
#ifndef crypto_uint32_h
#define crypto_uint32_h
typedef unsigned int crypto_uint32;
#endif

View file

@ -0,0 +1,6 @@
/* 64-bit unsigned type (unsigned long long is at least 64 bits in C99). */
#ifndef crypto_uint64_h
#define crypto_uint64_h
typedef unsigned long long crypto_uint64;
#endif

View file

@ -0,0 +1,6 @@
/* 8-bit unsigned type used for raw byte buffers. */
#ifndef crypto_uint8_h
#define crypto_uint8_h
typedef unsigned char crypto_uint8;
#endif

View file

@ -0,0 +1,12 @@
/* Generic crypto_verify names mapped onto the 16-byte comparator. */
#ifndef crypto_verify_H
#define crypto_verify_H
#include "crypto_verify_16.h"
#define crypto_verify crypto_verify_16
#define crypto_verify_BYTES crypto_verify_16_BYTES
#define crypto_verify_PRIMITIVE "16"
#define crypto_verify_IMPLEMENTATION crypto_verify_16_IMPLEMENTATION
#define crypto_verify_VERSION crypto_verify_16_VERSION
#endif

View file

@ -0,0 +1,22 @@
/* Constant-time 16-byte comparison (returns 0 on equality). */
#ifndef crypto_verify_16_H
#define crypto_verify_16_H
#define crypto_verify_16_ref_BYTES 16
#ifdef __cplusplus
extern "C" {
#endif
extern int crypto_verify_16_ref(const unsigned char *,const unsigned char *);
#ifdef __cplusplus
}
#endif
#define crypto_verify_16 crypto_verify_16_ref
#define crypto_verify_16_BYTES crypto_verify_16_ref_BYTES
#define crypto_verify_16_IMPLEMENTATION "crypto_verify/16/ref"
#ifndef crypto_verify_16_ref_VERSION
#define crypto_verify_16_ref_VERSION "-"
#endif
#define crypto_verify_16_VERSION crypto_verify_16_ref_VERSION
#endif

View file

@ -0,0 +1,796 @@
/*
// CAESAR OCB v1 somewhat optimised code
// Info: http://www.cs.ucdavis.edu/~rogaway/ocb
//
// Written by Romain Dolbeau (romain@dolbeau.org),
// based on the reference implementation by Ted Krovetz (ted@krovetz.net).
//
// Phillip Rogaway holds patents relevant to OCB. See the following for
// his free patent grant: http://www.cs.ucdavis.edu/~rogaway/ocb/grant.htm
//
// This is free and unencumbered software released into the public domain.
//
// Anyone is free to copy, modify, publish, use, compile, sell, or
// distribute this software, either in source code form or as a compiled
// binary, for any purpose, commercial or non-commercial, and by any
// means.
//
// In jurisdictions that recognize copyright laws, the author or authors
// of this software dedicate any and all copyright interest in the
// software to the public domain. We make this dedication for the benefit
// of the public at large and to the detriment of our heirs and
// successors. We intend this dedication to be an overt act of
// relinquishment in perpetuity of all present and future rights to this
// software under copyright law.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// For more information, please refer to <http://unlicense.org/>
*/
#include <string.h>
#include "api.h"
#include "crypto_aead.h"
/* Local aliases for the SUPERCOP API sizes. */
#define KEYBYTES CRYPTO_KEYBYTES
#define NONCEBYTES CRYPTO_NPUBBYTES
#define TAGBYTES CRYPTO_ABYTES
#define ALIGN16 __attribute__((aligned(16)))
#define ALIGN32 __attribute__((aligned(32)))
#define ALIGN64 __attribute__((aligned(64)))
#define _bswap64(a) __builtin_bswap64(a)
#define _bswap(a) __builtin_bswap32(a)
/* Debug helper: dump a 128-bit value, most significant byte first. */
#define printv16c(p,v) \
{ \
ALIGN16 unsigned char temp[16]; \
_mm_store_si128(temp, v); \
int z; \
printf("%8s:%8s = ",p,#v); \
for (z = 15 ; z >= 0 ; z--) { \
printf("%02hhx", temp[z]); \
if ((z%4)==0) printf(" "); \
} \
printf("\n"); \
}
#include "m128_compat.h"
#include "new_instructions_support_k.h"
/* rotate-right via the RV32B ror instruction */
#define rotr(a,b) _rv32_ror(a,b)
/* AES-256 encryption key schedule using the RV32 Zkn aes32esi* instructions.
 * key: 8 words of key material; aes_edrk: receives the 60 round-key words
 * (14 rounds + initial whitening).
 * aes32esiN(x, y) applies SubWord to byte N of y and XORs it (rotated into
 * place) into x; four calls build the full SubWord(RotWord) step. */
static inline void aes256_Tsetkey_encrypt(const unsigned int key[], unsigned int *aes_edrk) {
unsigned int i = 0;
unsigned int rotl_aes_edrk;
unsigned int tmp8, tmp9, tmp10, tmp11;
unsigned int tmp12, tmp13, tmp14, tmp15;
unsigned int temp_lds;
unsigned int round = 0x00000001; /* rcon, doubled each iteration */
/* first 8 round-key words are the key itself */
tmp8 = (key[0]);
aes_edrk[0] = tmp8;
tmp9 = (key[1]);
aes_edrk[1] = tmp9;
tmp10 = (key[2]);
aes_edrk[2] = tmp10;
tmp11 = (key[3]);
aes_edrk[3] = tmp11;
tmp12 = (key[4]);
aes_edrk[4] = tmp12;
tmp13 = (key[5]);
aes_edrk[5] = tmp13;
tmp14 = (key[6]);
aes_edrk[6] = tmp14;
tmp15 = (key[7]);
aes_edrk[7] = tmp15;
/* each pass produces 8 words: one rcon/SubWord/RotWord expansion for the
   first half, one plain SubWord expansion for the second half */
for( i = 8; i < 56; /* i+=8 */ )
{
tmp8 = tmp8 ^ round;
round = round << 1;
rotl_aes_edrk = rotr(tmp15,8); /* RotWord */
tmp8 = aes32esi0(tmp8, rotl_aes_edrk);
tmp8 = aes32esi1(tmp8, rotl_aes_edrk);
tmp8 = aes32esi2(tmp8, rotl_aes_edrk);
tmp8 = aes32esi3(tmp8, rotl_aes_edrk);
aes_edrk[i++] = tmp8;
tmp9 = tmp9 ^ tmp8;
aes_edrk[i++] = tmp9;
tmp10 = tmp10 ^ tmp9;
aes_edrk[i++] = tmp10;
tmp11 = tmp11 ^ tmp10;
aes_edrk[i++] = tmp11;
tmp12 = aes32esi0(tmp12, tmp11); /* SubWord only (AES-256 middle step) */
tmp12 = aes32esi1(tmp12, tmp11);
tmp12 = aes32esi2(tmp12, tmp11);
tmp12 = aes32esi3(tmp12, tmp11);
aes_edrk[i++] = tmp12;
tmp13 = tmp13 ^ tmp12;
aes_edrk[i++] = tmp13;
tmp14 = tmp14 ^ tmp13;
aes_edrk[i++] = tmp14;
tmp15 = tmp15 ^ tmp14;
aes_edrk[i++] = tmp15;
}
/* final half-expansion: only 4 more words are needed to reach 60 */
tmp8 = tmp8 ^ round;
round = round << 1;
rotl_aes_edrk = rotr(tmp15,8);
tmp8 = aes32esi0(tmp8, rotl_aes_edrk);
tmp8 = aes32esi1(tmp8, rotl_aes_edrk);
tmp8 = aes32esi2(tmp8, rotl_aes_edrk);
tmp8 = aes32esi3(tmp8, rotl_aes_edrk);
aes_edrk[i++] = tmp8;
tmp9 = tmp9 ^ tmp8;
aes_edrk[i++] = tmp9;
tmp10 = tmp10 ^ tmp9;
aes_edrk[i++] = tmp10;
tmp11 = tmp11 ^ tmp10;
aes_edrk[i++] = tmp11;
}
/* Convert an encryption key schedule (erk) into the equal-inverse-cipher
 * decryption schedule (drk): reverse the round order and run each middle
 * round key through InvMixColumns (done as SubBytes via aes32esi* followed
 * by InvSubBytes+InvMixColumns via aes32dsmi*, which composes to just
 * InvMixColumns). */
static void aes256_key_enc2dec(unsigned int *erk, unsigned int *drk)
{
int i, j;
// first and last unchanged (but swapped)
for (i = 0; i < 4; i++) {
drk[i] = erk[i+56];
drk[i+56] = erk[i];
}
// convert & revert order
for (i = 1; i < 14; i++) {
for (j = 0 ; j < 4 ; j++) {
unsigned int ek, dk;
ek = erk[i*4+j];
dk = 0;
dk = aes32esi0(dk, ek);
dk = aes32esi1(dk, ek);
dk = aes32esi2(dk, ek);
dk = aes32esi3(dk, ek);
ek = 0;
ek = aes32dsmi0(ek, dk);
ek = aes32dsmi1(ek, dk);
ek = aes32dsmi2(ek, dk);
ek = aes32dsmi3(ek, dk);
drk[56-4*i+j] = ek;
}
}
}
/* One full AES encryption round (SubBytes+ShiftRows+MixColumns+AddRoundKey)
 * on state words Y0..Y3 into X0..X3, consuming 4 round-key words from
 * TAB[I] and advancing I. The rotated Y operands implement ShiftRows. */
#define AES_ROUND1T(TAB,I,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
{ \
X0 = aes32esmi0(TAB[I++],Y0); \
X0 = aes32esmi1(X0,Y1); \
X0 = aes32esmi2(X0,Y2); \
X0 = aes32esmi3(X0,Y3); \
X1 = aes32esmi0(TAB[I++],Y1); \
X1 = aes32esmi1(X1,Y2); \
X1 = aes32esmi2(X1,Y3); \
X1 = aes32esmi3(X1,Y0); \
X2 = aes32esmi0(TAB[I++],Y2); \
X2 = aes32esmi1(X2,Y3); \
X2 = aes32esmi2(X2,Y0); \
X2 = aes32esmi3(X2,Y1); \
X3 = aes32esmi0(TAB[I++],Y3); \
X3 = aes32esmi1(X3,Y0); \
X3 = aes32esmi2(X3,Y1); \
X3 = aes32esmi3(X3,Y2); \
}
/* using the K + B instructions */
/* using the K + B instructions */
/* Encrypt one 16-byte block (4 words) with a 14-round AES-256 schedule.
 * input/output are word pointers; aes_edrk holds 60 round-key words. */
static inline void aes256_1Tft_encrypt(const uint32_t *aes_edrk, const uint32_t *input, uint32_t *output)
{
unsigned int X0, X1, X2, X3, Y0, Y1, Y2, Y3;
unsigned int i = 0, j = 0;
unsigned int l_aes_nr = 14; /* AES-256 round count */
/* initial AddRoundKey */
X0 = ((input[0]) ^ aes_edrk[j++]);
X1 = ((input[1]) ^ aes_edrk[j++]);
X2 = ((input[2]) ^ aes_edrk[j++]);
X3 = ((input[3]) ^ aes_edrk[j++]);
/* 13 full rounds */
for (i = 4 ; i < (l_aes_nr<<2) ; ) {
AES_ROUND1T(aes_edrk, i, Y0, Y1, Y2, Y3, X0, X1, X2, X3 );
X0=Y0;
X1=Y1;
X2=Y2;
X3=Y3;
}
/* last round */
/* (no MixColumns: aes32esi* instead of aes32esmi*) */
Y0 = aes32esi0(aes_edrk[i], X0);
Y0 = aes32esi1(Y0, X1);
Y0 = aes32esi2(Y0, X2);
Y0 = aes32esi3(Y0, X3);
i++;
Y1 = aes32esi0(aes_edrk[i], X1);
Y1 = aes32esi1(Y1, X2);
Y1 = aes32esi2(Y1, X3);
Y1 = aes32esi3(Y1, X0);
i++;
Y2 = aes32esi0(aes_edrk[i], X2);
Y2 = aes32esi1(Y2, X3);
Y2 = aes32esi2(Y2, X0);
Y2 = aes32esi3(Y2, X1);
i++;
Y3 = aes32esi0(aes_edrk[i], X3);
Y3 = aes32esi1(Y3, X0);
Y3 = aes32esi2(Y3, X1);
Y3 = aes32esi3(Y3, X2);
output[0] = (Y0);
output[1] = (Y1);
output[2] = (Y2);
output[3] = (Y3);
}
/* One full AES decryption round (InvSubBytes+InvShiftRows+InvMixColumns+
 * AddRoundKey); note the reversed Y operand order vs. AES_ROUND1T, which
 * implements InvShiftRows. */
#define AES_ROUND_DKT(TAB,I,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
{ \
X0 = aes32dsmi0(TAB[I+0],Y0); \
X0 = aes32dsmi1(X0,Y3); \
X0 = aes32dsmi2(X0,Y2); \
X0 = aes32dsmi3(X0,Y1); \
X1 = aes32dsmi0(TAB[I+1],Y1); \
X1 = aes32dsmi1(X1,Y0); \
X1 = aes32dsmi2(X1,Y3); \
X1 = aes32dsmi3(X1,Y2); \
X2 = aes32dsmi0(TAB[I+2],Y2); \
X2 = aes32dsmi1(X2,Y1); \
X2 = aes32dsmi2(X2,Y0); \
X2 = aes32dsmi3(X2,Y3); \
X3 = aes32dsmi0(TAB[I+3],Y3); \
X3 = aes32dsmi1(X3,Y2); \
X3 = aes32dsmi2(X3,Y1); \
X3 = aes32dsmi3(X3,Y0); \
}
/* Decrypt one 16-byte block (4 words) with the converted (equal-inverse)
 * decryption schedule produced by aes256_key_enc2dec. */
void aes256_1Tft_decrypt(const unsigned int *aes_drk, const unsigned int *input, unsigned int *output)
{
const unsigned int aes_nr = 14; // aes256
unsigned int X0, X1, X2, X3, Y0, Y1, Y2, Y3;
unsigned int i;
/* initial AddRoundKey */
X0 = input[0]; X0 ^= aes_drk[0];
X1 = input[1]; X1 ^= aes_drk[1];
X2 = input[2]; X2 ^= aes_drk[2];
X3 = input[3]; X3 ^= aes_drk[3];
// for (i=1;i<aes_nr;i++)
i=1;
do
{
AES_ROUND_DKT(aes_drk, (i<<2), Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); /* round 1 */
X0=Y0;
X1=Y1;
X2=Y2;
X3=Y3;
i++;
}
while(i<aes_nr);
i=(i<<2);
/* last round */
/* (no InvMixColumns: aes32dsi* instead of aes32dsmi*) */
Y0 = aes32dsi0(aes_drk[i+0], X0);
Y0 = aes32dsi1(Y0, X3);
Y0 = aes32dsi2(Y0, X2);
Y0 = aes32dsi3(Y0, X1);
Y1 = aes32dsi0(aes_drk[i+1], X1);
Y1 = aes32dsi1(Y1, X0);
Y1 = aes32dsi2(Y1, X3);
Y1 = aes32dsi3(Y1, X2);
Y2 = aes32dsi0(aes_drk[i+2], X2);
Y2 = aes32dsi1(Y2, X1);
Y2 = aes32dsi2(Y2, X0);
Y2 = aes32dsi3(Y2, X3);
Y3 = aes32dsi0(aes_drk[i+3], X3);
Y3 = aes32dsi1(Y3, X2);
Y3 = aes32dsi2(Y3, X1);
Y3 = aes32dsi3(Y3, X0);
output[0] = Y0;
output[1] = Y1;
output[2] = Y2;
output[3] = Y3;
}
/** single, by-the-book AES-256 encryption of one 128-bit block.
 * The word-oriented core functions take uint32_t pointers, so the __m128i
 * views are cast explicitly (the original relied on implicit incompatible
 * pointer conversions, which C compilers reject or warn about). */
static inline __m128i aes256_1Tft__encrypt1_si128(const __m128i nv, const __m128i rkeys[15]) {
__m128i temp;
aes256_1Tft_encrypt((const uint32_t *)rkeys, (const uint32_t *)&nv, (uint32_t *)&temp);
return temp;
}
/** single, by-the-book AES-256 decryption of one 128-bit block. */
static inline __m128i aes256_1Tft__decrypt1_si128(const __m128i nv, const __m128i rkeys[15]) {
__m128i temp;
aes256_1Tft_decrypt((const unsigned int *)rkeys, (const unsigned int *)&nv, (unsigned int *)&temp);
return temp;
}
/* 16-byte block type used throughout the OCB code. */
typedef unsigned char block[16];
/* ------------------------------------------------------------------------- */
#if 0
/* byte-by-byte reference version, kept for documentation */
static inline void xor_block(block d, block s1, block s2) {
unsigned i;
for (i=0; i<16; i++)
d[i] = s1[i] ^ s2[i];
}
#else
/* 128 bits SSE doubling */
/* d = s1 ^ s2, 16 bytes at a time via the _m128i compatibility layer */
static inline void xor_block(unsigned char* d, const unsigned char* s1, const unsigned char* s2) {
__m128i dv = _mm_xor_si128(_mm_loadu_si128((const __m128i*)s1), _mm_loadu_si128((const __m128i*)s2));
_mm_storeu_si128((__m128i*)d,dv);
}
#endif
/* ------------------------------------------------------------------------- */
/* GF(2^128) doubling (the OCB "double" operation): shift left one bit and
 * conditionally XOR the reduction polynomial 0x87 into the low byte. */
#if 0
/* byte-by-byte reference version */
static inline void double_block(block d, block s) {
unsigned i;
unsigned char tmp = s[0];
for (i=0; i<15; i++)
d[i] = (s[i] << 1) | (s[i+1] >> 7);
d[15] = (s[15] << 1) ^ ((tmp >> 7) * 135);
}
#else
#if 0
/* 64 bits little-endian doubling, faster */
static inline void double_block(unsigned long long *d, const unsigned long long* s) {
unsigned long long sl = _bswap64(s[1]), sh = _bswap64(s[0]);
unsigned long long sl1 = sl << 1;
unsigned long long sh1 = sh << 1;
sh1 |= sl>>63;
sl1 ^= (((long long)sh>>63) & 135);
d[1]=_bswap64(sl1);
d[0]=_bswap64(sh1);
}
#else
/* 128 bits SSE, much faster */
/* _norev variant operates on an already byte-reversed (big-endian-in-words)
 * value: per-word shift-left-by-1 with the carries propagated via a one-word
 * rotate, and 135 folded in from the top word's sign. */
static inline __m128i double_block_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(135,1,1,1);
/* __m128i sv31 = _mm_srai_epi32(sv, 31); */
__m128i sv31 = wordsign128(sv);
__m128i sv31m = _mm_and_si128(sv31, mask);
/* __m128i sv31ms = _mm_shuffle_epi32(sv31m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv31ms = wordrotate1l128(sv31m);
__m128i sv1 = _mm_slli_epi32(sv, 1);
__m128i dv = _mm_xor_si128(sv31ms,sv1);
return dv;
}
/* byte-reversing wrapper for values stored in OCB's big-endian byte order */
static inline __m128i double_block_si128(const __m128i svr) {
/* const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); */
/* __m128i sv = _mm_shuffle_epi8(svr, rev); */
__m128i sv = bytereverse128(svr);
__m128i dv = double_block_si128_norev(sv);
/* return _mm_shuffle_epi8(dv, rev); */
return bytereverse128(dv);
}
/* memory-to-memory wrapper */
static inline void double_block(unsigned char *d, const unsigned char* s) {
__m128i sv = _mm_loadu_si128((const __m128i*)s);
__m128i dv = double_block_si128(sv);
_mm_storeu_si128((__m128i*)d,dv);
}
/* 128 bits SSE times 4 */
/* Precomputed reduction constants for multi-bit doubling: lk4[t] is the
 * 16-bit value to fold into the low half when the top (up to 6) bits of the
 * block are t — i.e. the GF(2^128) reduction of t * x^128 truncated to the
 * low bits.  Indexed by the shifted-out top bits in the double_block_N
 * variants below. */
static const unsigned short lk4[64] = {
0x0000, 0x0086, 0x010c, 0x018a, 0x0218, 0x029e, 0x0314, 0x0392,
0x0430, 0x04b6, 0x053c, 0x05ba, 0x0628, 0x06ae, 0x0724, 0x07a2,
0x0860, 0x08e6, 0x096c, 0x09ea, 0x0a78, 0x0afe, 0x0b74, 0x0bf2,
0x0c50, 0x0cd6, 0x0d5c, 0x0dda, 0x0e48, 0x0ece, 0x0f44, 0x0fc2,
0x10c0, 0x1046, 0x11cc, 0x114a, 0x12d8, 0x125e, 0x13d4, 0x1352,
0x14f0, 0x1476, 0x15fc, 0x157a, 0x16e8, 0x166e, 0x17e4, 0x1762,
0x18a0, 0x1826, 0x19ac, 0x192a, 0x1ab8, 0x1a3e, 0x1bb4, 0x1b32,
0x1c90, 0x1c16, 0x1d9c, 0x1d1a, 0x1e88, 0x1e0e, 0x1f84, 0x1f02
};
/* double_block_N_si128_norev: N consecutive GF(2^128) doublings in one step
 * (shift left by N, rotate the N carry bits between words, and fold in the
 * lk4[] reduction for the N bits shifted out of the top).  All operate on
 * byte-reversed values like double_block_si128_norev. */
static inline __m128i double_block_2_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(3,3,3,3);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv30x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xC0)>>6],0); */
__m128i sv30x = halfwordandzero(lk4[(idx&0xC0)>>6]);
__m128i sv30 = _mm_srli_epi32(sv, 30);
__m128i sv30m = _mm_and_si128(sv30, mask);
/* __m128i sv30ms = _mm_shuffle_epi32(sv30m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv30ms = wordrotate1l128(sv30m);
__m128i sv2 = _mm_slli_epi32(sv, 2);
__m128i dv = _mm_xor_si128(sv30ms,sv2);
__m128i final = _mm_xor_si128(dv, sv30x);
return final;
}
static inline __m128i double_block_3_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(7,7,7,7);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv29x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xE0)>>5],0); */
__m128i sv29x = halfwordandzero(lk4[(idx&0xE0)>>5]);
__m128i sv29 = _mm_srli_epi32(sv, 29);
__m128i sv29m = _mm_and_si128(sv29, mask);
/* __m128i sv29ms = _mm_shuffle_epi32(sv29m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv29ms = wordrotate1l128(sv29m);
__m128i sv3 = _mm_slli_epi32(sv, 3);
__m128i dv = _mm_xor_si128(sv29ms,sv3);
__m128i final = _mm_xor_si128(dv, sv29x);
return final;
}
static inline __m128i double_block_4_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(15,15,15,15);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv28x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xF0)>>4],0); */
__m128i sv28x = halfwordandzero(lk4[(idx&0xF0)>>4]);
__m128i sv28 = _mm_srli_epi32(sv, 28);
__m128i sv28m = _mm_and_si128(sv28, mask);
/* __m128i sv28ms = _mm_shuffle_epi32(sv28m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv28ms = wordrotate1l128(sv28m);
__m128i sv4 = _mm_slli_epi32(sv, 4);
__m128i dv = _mm_xor_si128(sv28ms,sv4);
__m128i final = _mm_xor_si128(dv, sv28x);
return final;
}
static inline __m128i double_block_5_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(31,31,31,31);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv27x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xF8)>>3],0); */
__m128i sv27x = halfwordandzero(lk4[(idx&0xF8)>>3]);
__m128i sv27 = _mm_srli_epi32(sv, 27);
__m128i sv27m = _mm_and_si128(sv27, mask);
/* __m128i sv27ms = _mm_shuffle_epi32(sv27m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv27ms = wordrotate1l128(sv27m);
__m128i sv5 = _mm_slli_epi32(sv, 5);
__m128i dv = _mm_xor_si128(sv27ms,sv5);
__m128i final = _mm_xor_si128(dv, sv27x);
return final;
}
static inline __m128i double_block_6_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(63,63,63,63);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv26x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xFC)>>2],0); */
__m128i sv26x = halfwordandzero(lk4[(idx&0xFC)>>2]);
__m128i sv26 = _mm_srli_epi32(sv, 26);
__m128i sv26m = _mm_and_si128(sv26, mask);
/* __m128i sv26ms = _mm_shuffle_epi32(sv26m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv26ms = wordrotate1l128(sv26m);
__m128i sv6 = _mm_slli_epi32(sv, 6);
__m128i dv = _mm_xor_si128(sv26ms,sv6);
__m128i final = _mm_xor_si128(dv, sv26x);
return final;
}
#endif
#endif
/* ------------------------------------------------------------------------- */
/* Compute L_{ntz(j)} directly from L_$ (OCB offset table entry for block j):
 * L_0 = double(L_$), L_k = double(L_{k-1}), so L_{ntz} = double^{ntz+1}(L_$).
 * The multi-bit doubling variants cover ntz <= 5 in one step; larger ntz
 * falls back to single doublings after a 6-bit step. */
static inline __m128i calc_L_i_si128(const __m128i ldollarvr, const unsigned j) {
/* const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); */
/* __m128i ldollarv = _mm_shuffle_epi8(ldollarvr, rev); */
__m128i ldollarv = bytereverse128(ldollarvr);
unsigned i;
__m128i lv;
unsigned ntz = __builtin_ctz(j);/* printf("ntz = %u\n", ntz); */
switch(ntz) {
case 0:
lv = double_block_si128_norev(ldollarv);
break;
case 1:
lv = double_block_2_si128_norev(ldollarv);
break;
case 2:
lv = double_block_3_si128_norev(ldollarv);
break;
case 3:
lv = double_block_4_si128_norev(ldollarv);
break;
case 4:
lv = double_block_5_si128_norev(ldollarv);
break;
default:
/* 6 doublings, then one more per remaining trailing zero */
lv = double_block_6_si128_norev(ldollarv);
for (i = 5; i < ntz ; i++)
lv = double_block_si128_norev(lv);
break;
}
/* return _mm_shuffle_epi8(lv, rev); */
return bytereverse128(lv);
}
/* Memory-to-memory wrapper: store L_{ntz(i)} (derived from L_$) into l. */
static inline void calc_L_i(block l, const block ldollar, const unsigned i) {
__m128i dollar_v = _mm_loadu_si128((const __m128i*)ldollar);
__m128i result_v = calc_L_i_si128(dollar_v, i);
_mm_storeu_si128((__m128i*)l, result_v);
}
/* Precompute the OCB offset table prelv[k] = L_k = double^{k+1}(L_$) for
 * k = 0 .. max-1.  Work is done in byte-reversed form so the cheap _norev
 * doubling can be used, reversing back on each store.
 * BUG FIX: the original ended with "return bytereverse128(lv);" — returning
 * a value from a void function and never storing the final entry, leaving
 * prelv[max-1] uninitialized.  It now stores it as intended. */
static inline void precompute_lv(__m128i prelv[32], const __m128i ldollarvr, const unsigned max) {
/* const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); */
/* __m128i ldollarv = _mm_shuffle_epi8(ldollarvr, rev); */
__m128i ldollarv = bytereverse128(ldollarvr);
unsigned i;
__m128i lv = double_block_si128_norev(ldollarv);
for (i = 0 ; i < max-1 ; i++) {
/* prelv[i] = _mm_shuffle_epi8(lv, rev); */
prelv[i] = bytereverse128(lv);
lv = double_block_si128_norev(lv);
}
/* store the last table entry */
prelv[i] = bytereverse128(lv);
}
/* ------------------------------------------------------------------------- */
/* OCB associated-data hash (the HASH(K,A) of the spec): PMAC-style sum of
 * ENCIPHER(K, A_i xor Offset_i) over full blocks, plus a padded final
 * partial block offset by L_*.  prelv must hold the precomputed L_k table. */
static void hash(block result, const unsigned char *k,
unsigned char *a, unsigned abytes,
const __m128i lstar,
const __m128i prelv[32], const __m128i aes_key[15]) {
__m128i offset, sum, tmp;
unsigned i;
/* Process any whole blocks */
/* Sum_0 = zeros(128) */
sum = _mm_setzero_si128();
/* Offset_0 = zeros(128) */
offset = _mm_setzero_si128();
i=1;
for (; i<=abytes/16; i++, a = a + 16) {
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
tmp = prelv[__builtin_ctz(i)];
offset = _mm_xor_si128(offset, tmp);
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)a));
tmp = aes256_1Tft__encrypt1_si128(tmp, aes_key);
sum = _mm_xor_si128(sum, tmp);
}
/* Process any final partial block; compute final hash value */
abytes = abytes % 16; /* Bytes in final block */
if (abytes > 0) {
/* Offset_* = Offset_m xor L_* */
offset = _mm_xor_si128(offset, lstar);
/* tmp = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */
unsigned char pad[16];
memset(pad, 0, 16);
memcpy(pad, a, abytes);
pad[abytes] = 0x80;
tmp = _mm_loadu_si128((const __m128i*)pad);
tmp = _mm_xor_si128(offset, tmp);
/* Sum = Sum_m xor ENCIPHER(K, tmp) */
tmp = aes256_1Tft__encrypt1_si128(tmp, aes_key);
sum = _mm_xor_si128(tmp, sum);
}
_mm_storeu_si128((__m128i*)result,sum);
}
/* ------------------------------------------------------------------------- */
/* OCB core: encrypts (encrypting!=0) or decrypts-and-verifies (encrypting==0)
 * `in` (inbytes) with associated data `a` (abytes) under key k and nonce n.
 * On encrypt the tag is appended to out; on decrypt the tag is expected at
 * the end of `in` and the return value is 0 iff it verifies (-1 otherwise).
 * FIX: Checksum_0 was initialized with `sum = _mm_xor_si128(sum,sum)`,
 * which reads `sum` uninitialized (undefined behavior); it now uses
 * _mm_setzero_si128().  Explicit casts were added where byte pointers feed
 * the word-oriented key-schedule routines. */
static int ocb_crypt(unsigned char *out, unsigned char *k, unsigned char *n,
                     unsigned char *a, unsigned abytes,
                     unsigned char *in, unsigned inbytes, int encrypting) {
    __m128i prelv[32];
    __m128i aes_decrypt_key[15];
    __m128i aes_encrypt_key[15];
    block ad_hash;
    __m128i lstar, ldollar, sum, offset, ktop, pad, nonce, tag, tmp, outv;
    block nonce_b, offset_b;
    unsigned char stretch[24];
    unsigned bottom, byteshift, bitshift, i, max;
    /* Setup AES and strip ciphertext of its tag */
    if ( ! encrypting ) {
        if (inbytes < TAGBYTES) return -1;
        inbytes -= TAGBYTES;
    }
    /* key schedule works on 32-bit words; NOTE(review): assumes k is
       sufficiently aligned for word access — true for the test harness */
    aes256_Tsetkey_encrypt((const unsigned int *)k, (unsigned int *)aes_encrypt_key);
    if ( ! encrypting ) {
        aes256_key_enc2dec((unsigned int *)aes_encrypt_key, (unsigned int *)aes_decrypt_key);
    }
    /* Key-dependent variables */
    /* L_* = ENCIPHER(K, zeros(128)) */
    tmp = _mm_setzero_si128();
    lstar = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key);
    /* L_$ = double(L_*) */
    ldollar = double_block_si128(lstar);
    /* only precompute what's really needed: the largest ntz(i) over all
       block indices i is bounded by log2 of the larger block count */
    max = abytes >= inbytes ? abytes/4 : inbytes/4;
    max = (max < 2 ? 2 : max);
    precompute_lv(prelv, ldollar, 31-__builtin_clz(max));
    /* Nonce-dependent and per-encryption variables */
    /* Nonce = zeros(127-bitlen(N)) || 1 || N */
    memset(nonce_b, 0, 16);
    memcpy(&nonce_b[16-NONCEBYTES], n, NONCEBYTES);
    nonce_b[0] = (unsigned char)(((TAGBYTES * 8) % 128) << 1); /* taglen field */
    nonce_b[16-NONCEBYTES-1] |= 0x01;
    /* bottom = str2num(Nonce[123..128]) */
    bottom = nonce_b[15] & 0x3F;
    /* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */
    nonce_b[15] &= 0xC0;
    nonce = _mm_loadu_si128((const __m128i*)nonce_b);
    ktop = aes256_1Tft__encrypt1_si128(nonce, aes_encrypt_key);
    /* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */
    _mm_storeu_si128((__m128i*)stretch, ktop);
    _mm_storel_epi64((__m128i*)(stretch+16), _mm_xor_si128(_mm_srli_si128(ktop,1), ktop));
    /* Offset_0 = Stretch[1+bottom..128+bottom] */
    byteshift = bottom/8;
    bitshift = bottom%8;
    if (bitshift != 0)
        for (i=0; i<16; i++)
            offset_b[i] = (stretch[i+byteshift] << bitshift) |
                          (stretch[i+byteshift+1] >> (8-bitshift));
    else
        for (i=0; i<16; i++)
            offset_b[i] = stretch[i+byteshift];
    offset = _mm_loadu_si128((const __m128i*)offset_b);
    /* Checksum_0 = zeros(128) (was xor-of-uninitialized-self: UB) */
    sum = _mm_setzero_si128();
    /* Hash associated data */
    hash(ad_hash, k, a, abytes, lstar, prelv, aes_encrypt_key);
    /* Process any whole blocks */
    i=1;
    if (encrypting) {
        for (; i<=inbytes/16; i++, in=in+16, out=out+16) {
            /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
            tmp = prelv[__builtin_ctz(i)];
            offset = _mm_xor_si128(offset, tmp);
            tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)in));
            /* Checksum_i = Checksum_{i-1} xor P_i */
            sum = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), sum);
            /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
            tmp = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key);
            outv = _mm_xor_si128(offset, tmp);
            _mm_storeu_si128((__m128i*)out, outv);
        }
    } else {
        for (; i<=inbytes/16; i++, in=in+16, out=out+16) {
            /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
            tmp = prelv[__builtin_ctz(i)];
            offset = _mm_xor_si128(offset, tmp);
            tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)in));
            /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
            tmp = aes256_1Tft__decrypt1_si128(tmp, aes_decrypt_key);
            outv = _mm_xor_si128(offset, tmp);
            _mm_storeu_si128((__m128i*)out, outv);
            /* Checksum_i = Checksum_{i-1} xor P_i */
            sum = _mm_xor_si128(outv, sum);
        }
    }
    /* Process any final partial block and compute raw tag */
    inbytes = inbytes % 16; /* Bytes in final block */
    if (inbytes > 0) {
        /* Offset_* = Offset_m xor L_* */
        offset = _mm_xor_si128(offset, lstar);
        /* Pad = ENCIPHER(K, Offset_*) */
        pad = aes256_1Tft__encrypt1_si128(offset, aes_encrypt_key);
        if (encrypting) {
            /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
            unsigned char tmp_b[16];
            unsigned char pad_b[16];
            memset(tmp_b, 0, 16);
            memcpy(tmp_b, in, inbytes);
            tmp_b[inbytes] = 0x80;
            tmp = _mm_loadu_si128((const __m128i*)tmp_b);
            sum = _mm_xor_si128(tmp, sum);
            /* C_* = P_* xor Pad[1..bitlen(P_*)] */
            pad = _mm_xor_si128(tmp, pad);
            _mm_storeu_si128((__m128i*)pad_b, pad);
            memcpy(out, pad_b, inbytes);
            out = out + inbytes;
        } else {
            /* P_* = C_* xor Pad[1..bitlen(C_*)] */
            unsigned char tmp_b[16];
            unsigned char pad_b[16];
            _mm_storeu_si128((__m128i*)pad_b, pad);
            memcpy(tmp_b, pad_b, 16);
            memcpy(tmp_b, in, inbytes);
            xor_block(tmp_b, pad_b, tmp_b);
            tmp_b[inbytes] = 0x80;
            memcpy(out, tmp_b, inbytes);
            tmp = _mm_loadu_si128((const __m128i*)tmp_b);
            /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
            sum = _mm_xor_si128(tmp, sum);
            in = in + inbytes;
        }
    }
    /* Tag = ENCIPHER(K, Checksum xor Offset xor L_$) xor HASH(K,A) */
    tmp = _mm_xor_si128(sum, offset);
    tmp = _mm_xor_si128(tmp, ldollar);
    tag = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key);
    tag = _mm_xor_si128(_mm_loadu_si128((const __m128i*)ad_hash), tag);
    if (encrypting) {
        unsigned char tag_b[16];
        _mm_storeu_si128((__m128i*)tag_b, tag);
        memcpy(out, tag_b, TAGBYTES);
        return 0;
    } else {
        unsigned char tag_b[16];
        _mm_storeu_si128((__m128i*)tag_b, tag);
        /* NOTE(review): memcmp is not constant-time; consider crypto_verify_16
           to avoid a timing side channel on tag comparison */
        return (memcmp(in, tag_b, TAGBYTES) ? -1 : 0); /* Check for validity */
    }
}
/* ------------------------------------------------------------------------- */
/* direction flags for ocb_crypt */
#define OCB_ENCRYPT 1
#define OCB_DECRYPT 0
/* Encrypt p (pbytes) into c; the tag is appended to c (always succeeds). */
void ocb_encrypt(unsigned char *c, unsigned char *k, unsigned char *n,
unsigned char *a, unsigned abytes,
unsigned char *p, unsigned pbytes) {
ocb_crypt(c, k, n, a, abytes, p, pbytes, OCB_ENCRYPT);
}
/* ------------------------------------------------------------------------- */
/* Decrypt c (cbytes, tag included) into p; returns 0 iff the tag verifies. */
int ocb_decrypt(unsigned char *p, unsigned char *k, unsigned char *n,
unsigned char *a, unsigned abytes,
unsigned char *c, unsigned cbytes) {
return ocb_crypt(p, k, n, a, abytes, c, cbytes, OCB_DECRYPT);
}
/* ------------------------------------------------------------------------- */
/* SUPERCOP AEAD encrypt entry point: ciphertext = OCB(m) || tag.
 * nsec is unused by this primitive; encryption cannot fail, so 0 is
 * always returned.  The casts drop const to match ocb_crypt's signature;
 * the buffers are not modified. */
int crypto_aead_encrypt(
unsigned char *c,unsigned long long *clen,
const unsigned char *m,unsigned long long mlen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
)
{
unsigned char *key_nc = (unsigned char *)k;
unsigned char *npub_nc = (unsigned char *)npub;
unsigned char *ad_nc = (unsigned char *)ad;
unsigned char *m_nc = (unsigned char *)m;
*clen = mlen + TAGBYTES;
ocb_crypt(c, key_nc, npub_nc, ad_nc, adlen, m_nc, mlen, OCB_ENCRYPT);
return 0;
}
/* SUPERCOP AEAD decrypt entry point: verifies the trailing tag and writes
 * the plaintext to m.  Returns 0 on success, -1 on authentication failure.
 * FIX: the original computed *mlen = clen - TAGBYTES unconditionally, which
 * wraps around (unsigned underflow) for clen < TAGBYTES; reject such
 * ciphertexts up front. */
int crypto_aead_decrypt(
unsigned char *m,unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c,unsigned long long clen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
)
{
if (clen < TAGBYTES) {
*mlen = 0;
return -1;
}
*mlen = clen - TAGBYTES;
return ocb_crypt(m, (unsigned char *)k, (unsigned char *)npub,
(unsigned char *)ad, adlen, (unsigned char *)c, clen, OCB_DECRYPT);
}

View file

@ -0,0 +1,14 @@
/* Fill a buffer with (pseudo-)random bytes; implemented in random.cpp. */
#ifndef kernelrandombytes_h
#define kernelrandombytes_h
#ifdef __cplusplus
extern "C" {
#endif
extern void kernelrandombytes(unsigned char *,unsigned long long);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,19 @@
#include <random>
#include <functional>

// Deterministic PRNG (default-seeded) standing in for kernel randomness in
// this test harness — NOT cryptographically secure.
// FIX: std::uniform_int_distribution<unsigned char> is undefined behavior
// per the C++ standard (character types are not permitted IntTypes); draw
// unsigned ints in [0,255] and truncate instead.  Also removed an unused
// local variable.
std::default_random_engine generator;
std::uniform_int_distribution<unsigned int> distribution(0, 255);
auto rbyte = std::bind(distribution, generator);

extern "C" {
// Fill x[0..xlen) with pseudo-random bytes.
void kernelrandombytes(unsigned char *x, unsigned long long xlen)
{
  while (xlen > 0) {
    *x = (unsigned char)rbyte();
    x++;
    xlen--;
  }
}
}

View file

@ -0,0 +1,83 @@
/*
cpucycles/riscv.c version 20190803
D. J. Bernstein
Romain Dolbeau
Public domain.
*/
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
/* Read the RISC-V cycle counter as a 64-bit value.  On rv32 the 64-bit
 * counter is split across rdcycle/rdcycleh, so the high half is read twice
 * and the sequence retried if it changed (counter rolled over mid-read).
 * FIX: the retry loop used a named asm label ("start:"), which is emitted
 * verbatim and causes duplicate-symbol assembly errors if the compiler
 * inlines or duplicates this code; a local numeric label (1: / 1b) is the
 * correct idiom for labels inside inline asm. */
long long cpucycles_riscv(void)
{
  long long result;
#if defined(__riscv_xlen)
#if __riscv_xlen == 64
  asm volatile("rdcycle %0" : "=r" (result));
#elif __riscv_xlen == 32
  unsigned int l, h, h2;
  asm volatile( "1:\n"
                "rdcycleh %0\n"
                "rdcycle %1\n"
                "rdcycleh %2\n"
                "bne %0, %2, 1b\n"
                : "=r" (h), "=r" (l), "=r" (h2));
  result = (((unsigned long long)h)<<32) | ((unsigned long long)l);
#else
#error "unknown __riscv_xlen"
#endif
#else // __riscv_xlen
#error "__riscv_xlen required for RISC-V support"
#endif // __riscv_xlen
  return result;
}
/* Wall-clock time in microseconds since the Unix epoch. */
static long long microseconds(void)
{
  struct timeval now;
  gettimeofday(&now, (struct timezone *)0);
  return (long long)1000000 * now.tv_sec + now.tv_usec;
}
/* Estimate the cycle counter frequency (cycles per second) by comparing
 * cycle-counter progress against gettimeofday over a >= 10ms window. */
static double guessfreq(void)
{
long long tb0; long long us0;
long long tb1; long long us1;
tb0 = cpucycles_riscv();
us0 = microseconds();
do {
tb1 = cpucycles_riscv();
us1 = microseconds();
} while (us1 - us0 < 10000 || tb1 - tb0 < 1000);
if (tb1 <= tb0) return 0; /* counter went backwards/wrapped: give up */
tb1 -= tb0;
us1 -= us0;
return ((double) tb1) / (0.000001 * (double) us1);
}
static long long cpufrequency = 0;
/* One-time calibration of the global cpufrequency: accept only when
   two consecutive guesses agree within 1%, retrying up to 100 times.
   May leave cpufrequency at 0 if no stable estimate is found. */
static void init(void)
{
double guess1;
double guess2;
int loop;
for (loop = 0;loop < 100;++loop) {
guess1 = guessfreq();
guess2 = guessfreq();
if (guess1 > 1.01 * guess2) continue;
if (guess2 > 1.01 * guess1) continue;
/* note: double averaged then truncated into the long long global */
cpufrequency = 0.5 * (guess1 + guess2);
break;
}
}
/* Cycle-counter frequency in Hz, calibrated lazily on first call. */
long long cpucycles_riscv_persecond(void)
{
if (cpufrequency == 0) init();
return cpufrequency;
}

View file

@ -0,0 +1,323 @@
/*
* try-anything.c version 20190729
* D. J. Bernstein
* Some portions adapted from TweetNaCl by Bernstein, Janssen, Lange, Schwabe.
* Public domain.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/resource.h>
#include "kernelrandombytes.h"
#include "cpucycles.h"
#include "crypto_uint8.h"
#include "crypto_uint32.h"
#include "crypto_uint64.h"
#include "try.h"
typedef crypto_uint8 u8;
typedef crypto_uint32 u32;
typedef crypto_uint64 u64;
#define FOR(i,n) for (i = 0;i < n;++i)
/* Rotate the 32-bit value x left by c bits (callers use 0 < c < 32). */
static u32 L32(u32 x,int c)
{
u32 hi = x << c;
u32 lo = (x & 0xffffffff) >> (32 - c);
return hi | lo;
}
/* Load a 32-bit little-endian value from the 4 bytes at x. */
static u32 ld32(const u8 *x)
{
int i;
u32 u = x[3];
for (i = 2;i >= 0;--i) u = (u << 8) | x[i];
return u;
}
/* Store u into x[0..3] as little-endian bytes. */
static void st32(u8 *x,u32 u)
{
x[0] = u;
x[1] = u >> 8;
x[2] = u >> 16;
x[3] = u >> 24;
}
static const u8 sigma[17] = "expand 32-byte k";
/* Salsa20 core permutation (TweetNaCl layout): expand the 32-byte key
   k and 16-byte block in (with the "expand 32-byte k" constants in
   sigma) into 64 bytes of keystream at out. */
static void core(u8 *out,const u8 *in,const u8 *k)
{
u32 w[16],x[16],y[16],t[4];
int i,j,m;
/* initial 4x4 state: constants on the diagonal, key words around
   them, nonce+counter words in the middle */
FOR(i,4) {
x[5*i] = ld32(sigma+4*i);
x[1+i] = ld32(k+4*i);
x[6+i] = ld32(in+4*i);
x[11+i] = ld32(k+16+4*i);
}
FOR(i,16) y[i] = x[i];
/* 20 rounds; each iteration quarter-rounds the 4 columns, and the
   w[] write pattern transposes so the next iteration acts on rows */
FOR(i,20) {
FOR(j,4) {
FOR(m,4) t[m] = x[(5*j+4*m)%16];
t[1] ^= L32(t[0]+t[3], 7);
t[2] ^= L32(t[1]+t[0], 9);
t[3] ^= L32(t[2]+t[1],13);
t[0] ^= L32(t[3]+t[2],18);
FOR(m,4) w[4*j+(j+m)%4] = t[m];
}
FOR(m,16) x[m] = w[m];
}
/* feed-forward: add the initial state back in and serialize LE */
FOR(i,16) st32(out + 4 * i,x[i] + y[i]);
}
/* Salsa20 keystream generator: write b bytes to c using the 8-byte
   nonce n and 32-byte key k (used here as a deterministic PRG). */
static void salsa20(u8 *c,u64 b,const u8 *n,const u8 *k)
{
u8 z[16],x[64];
u32 u,i;
if (!b) return;
/* z = nonce in z[0..7], 64-bit little-endian block counter in z[8..15] */
FOR(i,16) z[i] = 0;
FOR(i,8) z[i] = n[i];
while (b >= 64) {
core(x,z,k);
FOR(i,64) c[i] = x[i];
/* increment the block counter with byte-wise carry */
u = 1;
for (i = 8;i < 16;++i) {
u += (u32) z[i];
z[i] = u;
u >>= 8;
}
b -= 64;
c += 64;
}
/* final partial block */
if (b) {
core(x,z,k);
FOR(i,b) c[i] = x[i];
}
}
/* 64-bit little-endian increment of the 8-byte counter n. */
static void increment(u8 *n)
{
int i;
for (i = 0;i < 8;++i) {
if (++n[i]) break;  /* stop once a byte does not wrap to zero */
}
}
/* Deterministic test-input stream: each call returns the next xlen
   bytes of Salsa20 keystream under a fixed key, advancing the static
   nonce so successive calls produce distinct data. */
static void testvector(unsigned char *x,unsigned long long xlen)
{
const static unsigned char testvector_k[33] = "generate inputs for test vectors";
static unsigned char testvector_n[8];
salsa20(x,xlen,testvector_n,testvector_k);
increment(testvector_n);
}
/* Deterministic 64-bit PRNG: the next 8 testvector bytes assembled
   little-endian into an unsigned long long. */
unsigned long long myrandom(void)
{
unsigned char x[8];
unsigned long long result = 0;
int i;
testvector(x,8);
for (i = 7;i >= 0;--i) result = (result << 8) | x[i];
return result;
}
/* Deterministic guard-byte stream (separate key from testvector):
   fills x[0..xlen-1] with pad bytes used to detect buffer overwrites. */
static void canary(unsigned char *x,unsigned long long xlen)
{
const static unsigned char canary_k[33] = "generate pad to catch overwrites";
static unsigned char canary_n[8];
salsa20(x,xlen,canary_n,canary_k);
increment(canary_n);
}
/* Refresh the 16-byte guard zones on both sides of x and mirror them
   into the shadow buffer x2, so a later memcmp flags real writes only. */
void double_canary(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
canary(x - 16,16);
canary(x + xlen,16);
memcpy(x2 - 16,x - 16,16);
memcpy(x2 + xlen,x + xlen,16);
}
/* Fill x[0..xlen-1] with deterministic test data, place 16-byte
   canaries immediately before and after it, and snapshot the whole
   region (data + both canaries) into the shadow buffer x2. */
void input_prepare(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
testvector(x,xlen);
canary(x - 16,16);
canary(x + xlen,16);
memcpy(x2 - 16,x - 16,xlen + 32);
}
/* Abort (exit 111) if an input buffer — including its 16-byte guard
   zones on both sides — no longer matches its shadow copy x2. */
void input_compare(const unsigned char *x2,const unsigned char *x,unsigned long long xlen,const char *fun)
{
if (memcmp(x2 - 16,x - 16,xlen + 32) == 0) return;
fprintf(stderr,"%s overwrites input\n",fun);
exit(111);
}
/* Fill the output region plus 16-byte borders with canary bytes and
   snapshot it into x2; the primitive under test must then only write
   inside x[0..xlen-1]. */
void output_prepare(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
canary(x - 16,xlen + 32);
memcpy(x2 - 16,x - 16,xlen + 32);
}
/* Abort (exit 111) if the 16-byte guard zone before or after the
   output buffer x differs from its shadow copy x2. */
void output_compare(const unsigned char *x2,const unsigned char *x,unsigned long long xlen,const char *fun)
{
/* leading canary first, so the "before" diagnostic wins if both broke */
if (memcmp(x2 - 16,x - 16,16) != 0) {
fprintf(stderr,"%s writes before output\n",fun);
exit(111);
}
if (memcmp(x2 + xlen,x + xlen,16) != 0) {
fprintf(stderr,"%s writes after output\n",fun);
exit(111);
}
}
static unsigned char checksum_state[64];
static char checksum_hex[65];
/* Absorb x[0..xlen-1] into the global 64-byte checksum_state, using
   the Salsa20 core as a compression function in 16-byte blocks. */
void checksum(const unsigned char *x,unsigned long long xlen)
{
u8 block[16];
int i;
while (xlen >= 16) {
core(checksum_state,x,checksum_state);
x += 16;
xlen -= 16;
}
/* pad the final (possibly empty) partial block: data, then a 1 byte,
   zeros to the end; xlen < 16 here so the int loop index is safe */
FOR(i,16) block[i] = 0;
FOR(i,xlen) block[i] = x[i];
block[xlen] = 1;
checksum_state[0] ^= 1;  /* domain-separate the final compression */
core(checksum_state,block,checksum_state);
}
/* Print s followed by a space, mapping whitespace characters to '_';
   an empty string prints as '-'. Keeps the report single-line. */
static void printword(const char *s)
{
if (!*s) putchar('-');
for (;*s;++s) {
switch (*s) {
case ' ':
case '\t':
case '\r':
case '\n':
putchar('_');
break;
default:
putchar(*s);
break;
}
}
putchar(' ');
}
/* Print a number followed by a single space (benchmark report field). */
static void printnum(long long x)
{
printf("%lld ",x);
}
/* Print the reason to stderr and exit with the conventional
   "permanent failure" status 111. Never returns. */
void fail(const char *why)
{
fprintf(stderr,"%s\n",why);
exit(111);
}
/* Allocate len bytes at a 64-byte-aligned address, zeroed; the slack
   around the usable region is filled with junk so stray reads/writes
   are more likely to be caught. Memory is deliberately never freed. */
unsigned char *alignedcalloc(unsigned long long len)
{
unsigned char *p = (unsigned char *) calloc(1,len + 256);
long long i;
if (!p) fail("out of memory");
/* will never deallocate so shifting is ok */
for (i = 0;i < len + 256;++i) p[i] = random();
p += 64;
p += 63 & (-(unsigned long) p);  /* round up to a 64-byte boundary */
memset(p,0,len);
return p;
}
#define TIMINGS 63
static long long cycles[TIMINGS + 1];
/* Drop resource limits (open files, processes, core dumps) to zero so
   the primitive under test cannot open files, fork, or dump core. */
void limits()
{
#ifdef RLIM_INFINITY
struct rlimit r;
r.rlim_cur = 0;
r.rlim_max = 0;
#ifdef RLIMIT_NOFILE
setrlimit(RLIMIT_NOFILE,&r);
#endif
#ifdef RLIMIT_NPROC
setrlimit(RLIMIT_NPROC,&r);
#endif
#ifdef RLIMIT_CORE
setrlimit(RLIMIT_CORE,&r);
#endif
#endif
}
static unsigned char randombyte[1];
/* Benchmark driver: run the correctness tests (accumulating the
   global checksum), time doit() TIMINGS times, and print one report
   line: checksum, median cycles, test cycles, frequency, name. */
int main()
{
long long i;
long long j;
long long abovej;
long long belowj;
long long checksumcycles;
long long cyclespersecond;
/* warm up / fault in the cycle counter before calibrating */
cycles[0] = cpucycles();
cycles[1] = cpucycles();
cyclespersecond = cpucycles_persecond();
kernelrandombytes(randombyte,1);
preallocate();
limits();
allocate();
srandom(getpid());
/* time the full correctness-test run */
cycles[0] = cpucycles();
test();
cycles[1] = cpucycles();
checksumcycles = cycles[1] - cycles[0];
predoit();
/* first pass only samples the counter (warm-up); second pass times doit() */
for (i = 0;i <= TIMINGS;++i) {
cycles[i] = cpucycles();
}
for (i = 0;i <= TIMINGS;++i) {
cycles[i] = cpucycles();
doit();
}
/* convert samples to per-iteration deltas */
for (i = 0;i < TIMINGS;++i) cycles[i] = cycles[i + 1] - cycles[i];
/* pick j = a median element (at most half below, at most half above) */
for (j = 0;j < TIMINGS;++j) {
belowj = 0;
for (i = 0;i < TIMINGS;++i) if (cycles[i] < cycles[j]) ++belowj;
abovej = 0;
for (i = 0;i < TIMINGS;++i) if (cycles[i] > cycles[j]) ++abovej;
if (belowj * 2 < TIMINGS && abovej * 2 < TIMINGS) break;
}
/* hex-encode the first 32 bytes of the checksum state */
for (i = 0;i < 32;++i) {
checksum_hex[2 * i] = "0123456789abcdef"[15 & (checksum_state[i] >> 4)];
checksum_hex[2 * i + 1] = "0123456789abcdef"[15 & checksum_state[i]];
}
checksum_hex[2 * i] = 0;
printword(checksum_hex);
printnum(cycles[j]);
printnum(checksumcycles);
printnum(cyclespersecond);
printword(primitiveimplementation);
printf("\n");
return 0;
}

View file

@ -0,0 +1,242 @@
/*
* crypto_aead/try.c version 20200406
* D. J. Bernstein
* Public domain.
* Auto-generated by trygen.py; do not edit.
*/
#include "crypto_aead.h"
#include "try.h"
const char *primitiveimplementation = crypto_aead_IMPLEMENTATION;
#define TUNE_BYTES 1536
#ifdef SMALL
#define MAXTEST_BYTES 128
#else
#define MAXTEST_BYTES 4096
#endif
#ifdef SMALL
#define LOOPS 64
#else
#define LOOPS 512
#endif
static unsigned char *k;
static unsigned char *s;
static unsigned char *p;
static unsigned char *a;
static unsigned char *m;
static unsigned char *c;
static unsigned char *t;
static unsigned char *r;
static unsigned char *k2;
static unsigned char *s2;
static unsigned char *p2;
static unsigned char *a2;
static unsigned char *m2;
static unsigned char *c2;
static unsigned char *t2;
static unsigned char *r2;
#define klen crypto_aead_KEYBYTES
#define slen crypto_aead_NSECBYTES
#define plen crypto_aead_NPUBBYTES
unsigned long long alen;
unsigned long long mlen;
unsigned long long clen;
unsigned long long tlen;
#define rlen crypto_aead_NSECBYTES
void preallocate(void)
{
}
void allocate(void)
{
unsigned long long alloclen = 0;
if (alloclen < TUNE_BYTES) alloclen = TUNE_BYTES;
if (alloclen < MAXTEST_BYTES + crypto_aead_ABYTES) alloclen = MAXTEST_BYTES + crypto_aead_ABYTES;
if (alloclen < crypto_aead_KEYBYTES) alloclen = crypto_aead_KEYBYTES;
if (alloclen < crypto_aead_NSECBYTES) alloclen = crypto_aead_NSECBYTES;
if (alloclen < crypto_aead_NPUBBYTES) alloclen = crypto_aead_NPUBBYTES;
if (alloclen < crypto_aead_NSECBYTES) alloclen = crypto_aead_NSECBYTES;
k = alignedcalloc(alloclen);
s = alignedcalloc(alloclen);
p = alignedcalloc(alloclen);
a = alignedcalloc(alloclen);
m = alignedcalloc(alloclen);
c = alignedcalloc(alloclen);
t = alignedcalloc(alloclen);
r = alignedcalloc(alloclen);
k2 = alignedcalloc(alloclen);
s2 = alignedcalloc(alloclen);
p2 = alignedcalloc(alloclen);
a2 = alignedcalloc(alloclen);
m2 = alignedcalloc(alloclen);
c2 = alignedcalloc(alloclen);
t2 = alignedcalloc(alloclen);
r2 = alignedcalloc(alloclen);
}
void predoit(void)
{
}
void doit(void)
{
crypto_aead_encrypt(c,&clen,m,TUNE_BYTES,a,TUNE_BYTES,s,p,k);
crypto_aead_decrypt(t,&tlen,r,c,clen,a,TUNE_BYTES,p,k);
}
void test(void)
{
unsigned long long loop;
for (loop = 0;loop < LOOPS;++loop) {
mlen = myrandom() % (MAXTEST_BYTES + 1);
alen = myrandom() % (MAXTEST_BYTES + 1);
clen = mlen + crypto_aead_ABYTES;
output_prepare(c2,c,clen);
input_prepare(m2,m,mlen);
input_prepare(a2,a,alen);
input_prepare(s2,s,slen);
input_prepare(p2,p,plen);
input_prepare(k2,k,klen);
if (crypto_aead_encrypt(c,&clen,m,mlen,a,alen,s,p,k) != 0) fail("crypto_aead_encrypt returns nonzero");
if (clen < mlen) fail("crypto_aead_encrypt returns smaller output than input");
if (clen > mlen + crypto_aead_ABYTES) fail("crypto_aead_encrypt returns more than crypto_aead_ABYTES extra bytes");
checksum(c,clen);
output_compare(c2,c,clen,"crypto_aead_encrypt");
input_compare(m2,m,mlen,"crypto_aead_encrypt");
input_compare(a2,a,alen,"crypto_aead_encrypt");
input_compare(s2,s,slen,"crypto_aead_encrypt");
input_compare(p2,p,plen,"crypto_aead_encrypt");
input_compare(k2,k,klen,"crypto_aead_encrypt");
double_canary(c2,c,clen);
double_canary(m2,m,mlen);
double_canary(a2,a,alen);
double_canary(s2,s,slen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_encrypt(c2,&clen,m2,mlen,a2,alen,s2,p2,k2) != 0) fail("crypto_aead_encrypt returns nonzero");
if (memcmp(c2,c,clen) != 0) fail("crypto_aead_encrypt is nondeterministic");
#if crypto_aead_NOOVERLAP == 1
#else
double_canary(c2,c,clen);
double_canary(m2,m,mlen);
double_canary(a2,a,alen);
double_canary(s2,s,slen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_encrypt(m2,&clen,m2,mlen,a,alen,s,p,k) != 0) fail("crypto_aead_encrypt with m=c overlap returns nonzero");
if (memcmp(m2,c,clen) != 0) fail("crypto_aead_encrypt does not handle m=c overlap");
memcpy(m2,m,mlen);
if (crypto_aead_encrypt(a2,&clen,m,mlen,a2,alen,s,p,k) != 0) fail("crypto_aead_encrypt with a=c overlap returns nonzero");
if (memcmp(a2,c,clen) != 0) fail("crypto_aead_encrypt does not handle a=c overlap");
memcpy(a2,a,alen);
if (crypto_aead_encrypt(s2,&clen,m,mlen,a,alen,s2,p,k) != 0) fail("crypto_aead_encrypt with s=c overlap returns nonzero");
if (memcmp(s2,c,clen) != 0) fail("crypto_aead_encrypt does not handle s=c overlap");
memcpy(s2,s,slen);
if (crypto_aead_encrypt(p2,&clen,m,mlen,a,alen,s,p2,k) != 0) fail("crypto_aead_encrypt with p=c overlap returns nonzero");
if (memcmp(p2,c,clen) != 0) fail("crypto_aead_encrypt does not handle p=c overlap");
memcpy(p2,p,plen);
if (crypto_aead_encrypt(k2,&clen,m,mlen,a,alen,s,p,k2) != 0) fail("crypto_aead_encrypt with k=c overlap returns nonzero");
if (memcmp(k2,c,clen) != 0) fail("crypto_aead_encrypt does not handle k=c overlap");
memcpy(k2,k,klen);
#endif
tlen = clen;
output_prepare(t2,t,tlen);
output_prepare(r2,r,rlen);
memcpy(c2,c,clen);
double_canary(c2,c,clen);
memcpy(a2,a,alen);
double_canary(a2,a,alen);
memcpy(p2,p,plen);
double_canary(p2,p,plen);
memcpy(k2,k,klen);
double_canary(k2,k,klen);
if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt returns nonzero");
if (tlen != mlen) fail("crypto_aead_decrypt does not match mlen");
if (memcmp(t,m,mlen) != 0) fail("crypto_aead_decrypt does not match m");
if (memcmp(r,s,slen) != 0) fail("crypto_aead_decrypt does not match s");
checksum(t,tlen);
checksum(r,rlen);
output_compare(t2,t,clen,"crypto_aead_decrypt");
output_compare(r2,r,rlen,"crypto_aead_decrypt");
input_compare(c2,c,clen,"crypto_aead_decrypt");
input_compare(a2,a,alen,"crypto_aead_decrypt");
input_compare(p2,p,plen,"crypto_aead_decrypt");
input_compare(k2,k,klen,"crypto_aead_decrypt");
double_canary(t2,t,tlen);
double_canary(r2,r,rlen);
double_canary(c2,c,clen);
double_canary(a2,a,alen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_decrypt(t2,&tlen,r2,c2,clen,a2,alen,p2,k2) != 0) fail("crypto_aead_decrypt returns nonzero");
if (memcmp(t2,t,tlen) != 0) fail("crypto_aead_decrypt is nondeterministic");
if (memcmp(r2,r,rlen) != 0) fail("crypto_aead_decrypt is nondeterministic");
#if crypto_aead_NOOVERLAP == 1
#else
double_canary(t2,t,tlen);
double_canary(r2,r,rlen);
double_canary(c2,c,clen);
double_canary(a2,a,alen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_decrypt(c2,&tlen,r,c2,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt with c=t overlap returns nonzero");
if (memcmp(c2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle c=t overlap");
memcpy(c2,c,clen);
if (crypto_aead_decrypt(a2,&tlen,r,c,clen,a2,alen,p,k) != 0) fail("crypto_aead_decrypt with a=t overlap returns nonzero");
if (memcmp(a2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle a=t overlap");
memcpy(a2,a,alen);
if (crypto_aead_decrypt(p2,&tlen,r,c,clen,a,alen,p2,k) != 0) fail("crypto_aead_decrypt with p=t overlap returns nonzero");
if (memcmp(p2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle p=t overlap");
memcpy(p2,p,plen);
if (crypto_aead_decrypt(k2,&tlen,r,c,clen,a,alen,p,k2) != 0) fail("crypto_aead_decrypt with k=t overlap returns nonzero");
if (memcmp(k2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle k=t overlap");
memcpy(k2,k,klen);
#endif
#if crypto_aead_NOOVERLAP == 1
#else
double_canary(t2,t,tlen);
double_canary(r2,r,rlen);
double_canary(c2,c,clen);
double_canary(a2,a,alen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_decrypt(t,&tlen,c2,c2,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt with c=r overlap returns nonzero");
if (memcmp(c2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle c=r overlap");
memcpy(c2,c,clen);
if (crypto_aead_decrypt(t,&tlen,a2,c,clen,a2,alen,p,k) != 0) fail("crypto_aead_decrypt with a=r overlap returns nonzero");
if (memcmp(a2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle a=r overlap");
memcpy(a2,a,alen);
if (crypto_aead_decrypt(t,&tlen,p2,c,clen,a,alen,p2,k) != 0) fail("crypto_aead_decrypt with p=r overlap returns nonzero");
if (memcmp(p2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle p=r overlap");
memcpy(p2,p,plen);
if (crypto_aead_decrypt(t,&tlen,k2,c,clen,a,alen,p,k2) != 0) fail("crypto_aead_decrypt with k=r overlap returns nonzero");
if (memcmp(k2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle k=r overlap");
memcpy(k2,k,klen);
#endif
c[myrandom() % clen] += 1 + (myrandom() % 255);
if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0)
if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0))
fail("crypto_aead_decrypt allows trivial forgeries");
c[myrandom() % clen] += 1 + (myrandom() % 255);
if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0)
if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0))
fail("crypto_aead_decrypt allows trivial forgeries");
c[myrandom() % clen] += 1 + (myrandom() % 255);
if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0)
if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0))
fail("crypto_aead_decrypt allows trivial forgeries");
}
}

View file

@ -0,0 +1,21 @@
#include <stdlib.h>
#include <string.h>
/* provided by try.c: */
extern const char *primitiveimplementation;
extern void preallocate(void);
extern void allocate(void); /* fix: removed stray ';;' (an empty file-scope declaration is invalid in strict C) */
extern void test(void);
extern void predoit(void);
extern void doit(void);
/* provided by try-anything.c: */
extern void fail(const char *);
extern unsigned char *alignedcalloc(unsigned long long);
extern void checksum(const unsigned char *,unsigned long long);
extern void double_canary(unsigned char *,unsigned char *,unsigned long long);
extern void input_prepare(unsigned char *,unsigned char *,unsigned long long);
extern void output_prepare(unsigned char *,unsigned char *,unsigned long long);
extern void input_compare(const unsigned char *,const unsigned char *,unsigned long long,const char *);
extern void output_compare(const unsigned char *,const unsigned char *,unsigned long long,const char *);
extern unsigned long long myrandom(void);

View file

@ -0,0 +1,24 @@
#include "crypto_verify.h"
/* Constant-time comparison of 16 bytes: returns 0 iff x equals y,
   -1 otherwise, without any data-dependent branch. */
int crypto_verify(const unsigned char *x,const unsigned char *y)
{
unsigned int differentbits = 0;
int i;
for (i = 0;i < 16;++i) differentbits |= x[i] ^ y[i];
/* d == 0   -> (d-1)>>8 has bit 0 set -> returns 0
   1<=d<256 -> (d-1)>>8 has bit 0 clear -> returns -1 */
return (1 & ((differentbits - 1) >> 8)) - 1;
}

View file

@ -1,17 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include "osfreq.c"
long long cpucycles_riscv(void)
{
unsigned long long result;
asm volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax"
: "=a" (result) :: "%rdx");
return result;
}
long long cpucycles_riscv_persecond(void)
{
return osfreq();
}

View file

@ -1,93 +0,0 @@
static double osfreq(void)
{
FILE *f;
char *x;
double result;
int s;
f = fopen("/etc/cpucyclespersecond", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return result;
}
f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return 1000.0 * result;
}
f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return 1000.0 * result;
}
f = fopen("/sys/devices/system/cpu/cpu0/clock_tick", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return result;
}
f = fopen("/proc/cpuinfo","r");
if (f) {
for (;;) {
s = fscanf(f,"cpu MHz : %lf",&result);
if (s > 0) break;
if (s == 0) s = fscanf(f,"%*[^\n]\n");
if (s < 0) { result = 0; break; }
}
fclose(f);
if (result) return 1000000.0 * result;
}
f = fopen("/proc/cpuinfo","r");
if (f) {
for (;;) {
s = fscanf(f,"clock : %lf",&result);
if (s > 0) break;
if (s == 0) s = fscanf(f,"%*[^\n]\n");
if (s < 0) { result = 0; break; }
}
fclose(f);
if (result) return 1000000.0 * result;
}
f = popen("sysctl hw.cpufrequency 2>/dev/null","r");
if (f) {
s = fscanf(f,"hw.cpufrequency: %lf",&result);
pclose(f);
if (s > 0) if (result > 0) return result;
}
f = popen("/usr/sbin/lsattr -E -l proc0 -a frequency 2>/dev/null","r");
if (f) {
s = fscanf(f,"frequency %lf",&result);
pclose(f);
if (s > 0) return result;
}
f = popen("/usr/sbin/psrinfo -v 2>/dev/null","r");
if (f) {
for (;;) {
s = fscanf(f," The %*s processor operates at %lf MHz",&result);
if (s > 0) break;
if (s == 0) s = fscanf(f,"%*[^\n]\n");
if (s < 0) { result = 0; break; }
}
pclose(f);
if (result) return 1000000.0 * result;
}
x = getenv("cpucyclespersecond");
if (x) {
s = sscanf(x,"%lf",&result);
if (s > 0) return result;
}
return 0;
}

View file

@ -249,190 +249,9 @@ static inline int64_t _rv64_clmulh(int64_t rs1, int64_t rs2)
/* this is basically Supercop's crypto_aead/aes256gcmv1/dolbeau/aesenc-int,
but without the unrolling.
So we have a thin compatibility layer to SSE's __m128i data format
and associated instructions to support GHASH & the full algo.
*/
/* ouch */
typedef struct {
uint64_t l;
uint64_t h;
} __m128i;
//#define _mm_loadu_si128(a) (*(const __m128i*)a)
static inline __m128i _mm_loadu_si128(const __m128i *ptr) {
__m128i r;
r.l = ((const uint64_t*)ptr)[0];
r.h = ((const uint64_t*)ptr)[1];
return r;
}
//#define _mm_storeu_si128(x,a) (*(__m128i*)x)=a
static inline void _mm_storeu_si128(__m128i *ptr, const __m128i data) {
((uint64_t*)ptr)[0] = data.l;
((uint64_t*)ptr)[1] = data.h;
}
static inline __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i b, const int x) {
__m128i r;
switch (x) {
case 0x00:
r.l = _rv64_clmul(a.l, b.l);
r.h = _rv64_clmulh(a.l, b.l);
break;
case 0x01:
r.l = _rv64_clmul(a.l, b.h);
r.h = _rv64_clmulh(a.l, b.h);
break;
case 0x10:
r.l = _rv64_clmul(a.h, b.l);
r.h = _rv64_clmulh(a.h, b.l);
break;
case 0x11:
r.l = _rv64_clmul(a.h, b.h);
r.h = _rv64_clmulh(a.h, b.h);
break;
}
return r;
}
/*
static inline __m128i (const __m128i a, const __m128i b) {
__m128i r;
return r;
}
*/
static inline __m128i _mm_xor_si128(const __m128i a, const __m128i b) {
__m128i r;
r.l = a.l ^ b.l;
r.h = a.h ^ b.h;
return r;
}
static inline __m128i _mm_or_si128(const __m128i a, const __m128i b) {
__m128i r;
r.l = a.l | b.l;
r.h = a.h | b.h;
return r;
}
static inline __m128i _mm_and_si128(const __m128i a, const __m128i b) {
__m128i r;
r.l = a.l & b.l;
r.h = a.h & b.h;
return r;
}
static inline __m128i _mm_slli_si128(const __m128i a, const int b) {
__m128i r;
switch (b) {
case 4:
r.l = a.l << 32;
r.h = a.h << 32 | a.l >> 32;
break;
case 8:
r.l = 0;
r.h = a.l;
break;
case 12:
r.l = 0;
r.h = a.l << 32;
break;
}
return r;
}
static inline __m128i _mm_srli_si128(const __m128i a, const int b) {
__m128i r;
switch (b) {
case 4:
r.l = a.l >> 32 | a.h << 32;
r.h = a.h >> 32;
break;
case 8:
r.l = a.h;
r.h = 0;
break;
case 12:
r.l = a.h >> 32;
r.h = 0;
break;
}
return r;
}
static inline __m128i _mm_srli_epi32(const __m128i a, const int b) {
__m128i r;
r.l = ((a.l & 0x00000000FFFFFFFFull) >> b) | (((a.l & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull);
r.h = ((a.h & 0x00000000FFFFFFFFull) >> b) | (((a.h & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull);
return r;
}
static inline __m128i _mm_slli_epi32(const __m128i a, const int b) {
__m128i r;
r.l = (((a.l & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.l & 0xFFFFFFFF00000000ull) << b);
r.h = (((a.h & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.h & 0xFFFFFFFF00000000ull) << b);
return r;
}
static inline __m128i _mm_insert_epi64(const __m128i a, const uint64_t x, const int b) {
__m128i r;
if (b == 0) {
r.l = x;
r.h = a.h;
} else {
r.l = a.l;
r.h = x;
}
return r;
}
static inline __m128i _mm_setzero_si128(void) {
__m128i r;
r.l = 0;
r.h = 0;
return r;
}
static inline __m128i _mm_set1_epi32(const uint32_t x) {
__m128i r;
r.l = x | ((uint64_t)x) << 32;
r.h = x | ((uint64_t)x) << 32;
return r;
}
static inline uint64_t bytereverse64(const uint64_t a) {
uint64_t r;
r = (uint32_t)_rv32_grev((a>>32), 24) | (((uint64_t)_rv32_grev((a&0xFFFFFFFF), 24))<<32);
return r;
}
static inline __m128i bytereverse128(const __m128i a) {
__m128i r;
r.l = bytereverse64(a.h);
r.h = bytereverse64(a.l);
return r;
}
static inline uint64_t bitreverse64(const uint64_t a) {
uint64_t r;
r = (uint32_t)_rv32_grev((a&0xFFFFFFFF), 7) | (((uint64_t)_rv32_grev((a>>32), 7))<<32);
return r;
}
static inline __m128i bitreverse128(const __m128i a) {
__m128i r;
r.l = bitreverse64(a.l);
r.h = bitreverse64(a.h);
return r;
}
static inline uint64_t wordreverse64(const uint64_t a) {
uint64_t r;
r = (a>>32)|(a<<32);
return r;
}
static inline __m128i wordreverse128(const __m128i a) {
__m128i r;
r.l = wordreverse64(a.h);
r.h = wordreverse64(a.l);
return r;
}
static inline __m128i doublewordreverse128(const __m128i a) {
__m128i r;
r.l = a.h;
r.h = a.l;
return r;
}
#include "m128_compat.h"
static inline void addmul_rv(unsigned char *c,
const unsigned char *a, int xlen,

241
m128_compat.h Normal file
View file

@ -0,0 +1,241 @@
/*
* A thin compatibility layer to SSE's __m128i data format
* and associated instructions to support GHASH & the full algo.
*/
#ifndef __M128_COMPAT_H__
#define __M128_COMPAT_H__
#include "new_instructions_support_b.h"
#include <stdio.h>
/* ouch */
/* SSE 128-bit vector emulated as two 64-bit halves:
   l = low quadword (bits 0..63), h = high quadword (bits 64..127). */
typedef struct {
uint64_t l;
uint64_t h;
} __m128i;
/* Unaligned 128-bit load (alignment is irrelevant in this emulation). */
static inline __m128i _mm_loadu_si128(const __m128i *ptr) {
const uint64_t *words = (const uint64_t *)ptr;
__m128i r = { words[0], words[1] };
return r;
}
/* Unaligned 128-bit store. */
static inline void _mm_storeu_si128(__m128i *ptr, const __m128i data) {
uint64_t *words = (uint64_t *)ptr;
words[0] = data.l;
words[1] = data.h;
}
/* Aligned 128-bit store — identical to the unaligned store here. */
static inline void _mm_store_si128(__m128i *ptr, const __m128i data) {
uint64_t *words = (uint64_t *)ptr;
words[0] = data.l;
words[1] = data.h;
}
/* Store only the low 64 bits of data (SSE MOVQ-to-memory). */
static inline void _mm_storel_epi64 (__m128i *ptr, const __m128i data) {
((uint64_t*)ptr)[0] = data.l;
}
/* Emulate PCLMULQDQ: carry-less 64x64->128 multiply of one half of a
   by one half of b, selected by the immediate x; the low/high product
   words come from the RV64B clmul/clmulh pair.
   NOTE(review): the 0x01/0x10 mapping here is swapped relative to the
   Intel definition (Intel 0x01 selects a's HIGH half and b's low).
   This is harmless when the two cross products are XORed together, as
   in GHASH-style code, but verify before any asymmetric use.
   Fix: r is now zero-initialized and unknown selectors are reported,
   instead of returning an uninitialized value (undefined behavior). */
static inline __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i b, const int x) {
__m128i r = { 0, 0 };
switch (x) {
case 0x00:
r.l = _rv64_clmul(a.l, b.l);
r.h = _rv64_clmulh(a.l, b.l);
break;
case 0x01:
r.l = _rv64_clmul(a.l, b.h);
r.h = _rv64_clmulh(a.l, b.h);
break;
case 0x10:
r.l = _rv64_clmul(a.h, b.l);
r.h = _rv64_clmulh(a.h, b.l);
break;
case 0x11:
r.l = _rv64_clmul(a.h, b.h);
r.h = _rv64_clmulh(a.h, b.h);
break;
default:
/* fail loudly, matching the other helpers in this header */
fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, x);
break;
}
return r;
}
/*
static inline __m128i (const __m128i a, const __m128i b) {
__m128i r;
return r;
}
*/
/* Bitwise XOR of two 128-bit values. */
static inline __m128i _mm_xor_si128(const __m128i a, const __m128i b) {
__m128i r = { a.l ^ b.l, a.h ^ b.h };
return r;
}
/* Bitwise OR of two 128-bit values. */
static inline __m128i _mm_or_si128(const __m128i a, const __m128i b) {
__m128i r = { a.l | b.l, a.h | b.h };
return r;
}
/* Bitwise AND of two 128-bit values. */
static inline __m128i _mm_and_si128(const __m128i a, const __m128i b) {
__m128i r = { a.l & b.l, a.h & b.h };
return r;
}
/* Emulate PSLLDQ: shift the 128-bit value left by b BYTES. Only the
   shift counts this codebase uses (4, 8, 12) are implemented; other
   counts are reported on stderr.
   Fix: r is zero-initialized so the unimplemented path returns a
   defined value instead of an uninitialized one (undefined behavior). */
static inline __m128i _mm_slli_si128(const __m128i a, const int b) {
__m128i r = { 0, 0 };
switch (b) {
case 4:
r.l = a.l << 32;
r.h = a.h << 32 | a.l >> 32;
break;
case 8:
r.l = 0;
r.h = a.l;
break;
case 12:
r.l = 0;
r.h = a.l << 32;
break;
default:
fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, b);
break;
}
return r;
}
/* Emulate PSRLDQ: shift the 128-bit value right by b BYTES. Only the
   shift counts this codebase uses (1, 4, 8, 12) are implemented; other
   counts are reported on stderr.
   Fix: r is zero-initialized so the unimplemented path returns a
   defined value instead of an uninitialized one (undefined behavior). */
static inline __m128i _mm_srli_si128(const __m128i a, const int b) {
__m128i r = { 0, 0 };
switch (b) {
case 1:
r.l = a.l >> 8 | a.h << 56;
r.h = a.h >> 8;
break;
case 4:
r.l = a.l >> 32 | a.h << 32;
r.h = a.h >> 32;
break;
case 8:
r.l = a.h;
r.h = 0;
break;
case 12:
r.l = a.h >> 32;
r.h = 0;
break;
default:
fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, b);
break;
}
return r;
}
/* Logical right shift of each of the four 32-bit lanes by b (0..31). */
static inline __m128i _mm_srli_epi32(const __m128i a, const int b) {
const uint32_t l0 = (uint32_t)a.l, l1 = (uint32_t)(a.l >> 32);
const uint32_t h0 = (uint32_t)a.h, h1 = (uint32_t)(a.h >> 32);
__m128i r;
r.l = (uint64_t)(l0 >> b) | ((uint64_t)(l1 >> b) << 32);
r.h = (uint64_t)(h0 >> b) | ((uint64_t)(h1 >> b) << 32);
return r;
}
/* Logical left shift of each of the four 32-bit lanes by b (0..31);
   bits shifted past a lane boundary are discarded. */
static inline __m128i _mm_slli_epi32(const __m128i a, const int b) {
const uint32_t l0 = (uint32_t)a.l, l1 = (uint32_t)(a.l >> 32);
const uint32_t h0 = (uint32_t)a.h, h1 = (uint32_t)(a.h >> 32);
__m128i r;
r.l = (uint64_t)(uint32_t)(l0 << b) | ((uint64_t)(uint32_t)(l1 << b) << 32);
r.h = (uint64_t)(uint32_t)(h0 << b) | ((uint64_t)(uint32_t)(h1 << b) << 32);
return r;
}
/* static inline __m128i _mm_srai_epi32(const __m128i a, const int b) { */
/* __m128i r; */
/* r.l = (((int32_t)(a.l & 0x00000000FFFFFFFFull)) >> b) | ((((int32_t)(a.l & 0xFFFFFFFF00000000ull)) >> b) & 0xFFFFFFFF00000000ull); */
/* r.h = (((int32_t)(a.h & 0x00000000FFFFFFFFull)) >> b) | ((((int32_t)(a.h & 0xFFFFFFFF00000000ull)) >> b) & 0xFFFFFFFF00000000ull); */
/* return r; */
/* } */
/* Return a with 64-bit lane b replaced by x (0 = low, nonzero = high). */
static inline __m128i _mm_insert_epi64(const __m128i a, const uint64_t x, const int b) {
__m128i r = a;
if (b == 0) {
r.l = x;
} else {
r.h = x;
}
return r;
}
/* The all-zero 128-bit value. */
static inline __m128i _mm_setzero_si128(void) {
__m128i r = { 0, 0 };
return r;
}
/* Broadcast x into all four 32-bit lanes. */
static inline __m128i _mm_set1_epi32(const uint32_t x) {
const uint64_t pair = (((uint64_t)x) << 32) | x;
__m128i r = { pair, pair };
return r;
}
/* Pack four 32-bit words into a vector; e0 is the least-significant lane. */
static inline __m128i _mm_set_epi32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
__m128i r = { (((uint64_t)e1) << 32) | e0, (((uint64_t)e3) << 32) | e2 };
return r;
}
/* non-intel stuff, used to replace some common use cases */
/* Full byte-swap of a 64-bit value: swap the two 32-bit words and
   byte-reverse each with grev.
   NOTE(review): assumes _rv32_grev(w, 24) byte-reverses a 32-bit word
   (RV32B rev8 semantics) — confirm against the intrinsics header. */
static inline uint64_t bytereverse64(const uint64_t a) {
uint64_t r;
r = (uint32_t)_rv32_grev((a>>32), 24) | (((uint64_t)_rv32_grev((a&0xFFFFFFFF), 24))<<32);
return r;
}
/* Reverse the byte order of the whole 128-bit value: the halves are
   swapped and each half is byte-swapped (a 16-byte endianness flip). */
static inline __m128i bytereverse128(const __m128i a) {
__m128i r;
r.l = bytereverse64(a.h);
r.h = bytereverse64(a.l);
return r;
}
/* Reverse the bits INSIDE each byte of a, keeping byte positions
   (used for GHASH bit reflection).
   NOTE(review): assumes _rv32_grev(w, 7) bit-reverses within each byte
   of a 32-bit word — confirm against the intrinsics header. */
static inline uint64_t bitreverse64(const uint64_t a) {
uint64_t r;
r = (uint32_t)_rv32_grev((a&0xFFFFFFFF), 7) | (((uint64_t)_rv32_grev((a>>32), 7))<<32);
return r;
}
/* Apply the per-byte bit reversal to both halves; unlike
   bytereverse128, byte and half order are preserved. */
static inline __m128i bitreverse128(const __m128i a) {
__m128i r;
r.l = bitreverse64(a.l);
r.h = bitreverse64(a.h);
return r;
}
/* Swap the upper and lower 32-bit words of a. */
static inline uint64_t wordreverse64(const uint64_t a) {
return (a << 32) | (a >> 32);
}
/* Reverse the order of the four 32-bit words of the 128-bit value. */
static inline __m128i wordreverse128(const __m128i a) {
__m128i r;
r.l = (a.h << 32) | (a.h >> 32);
r.h = (a.l << 32) | (a.l >> 32);
return r;
}
/* Swap the two 64-bit halves of the 128-bit value. */
static inline __m128i doublewordreverse128(const __m128i a) {
__m128i r = { a.h, a.l };
return r;
}
/* Rotate the four 32-bit lanes one position toward the high end:
   from least to most significant, (w0,w1,w2,w3) -> (w3,w0,w1,w2). */
static inline __m128i wordrotate1l128(const __m128i a) {
__m128i r;
/* i.e. epi32 _MM_SHUFFLE(2,1,0,3) */
r.l = (a.h >> 32) | (a.l << 32);
r.h = (a.l >> 32) | (a.h << 32);
return r;
}
/* Zero-extend a 16-bit value into the low lane of a 128-bit vector. */
static inline __m128i halfwordandzero(const uint16_t a) {
__m128i r = { (uint64_t)a, 0 };
return r;
}
/* Per-lane sign mask: each 32-bit lane becomes 0xFFFFFFFF if its own
   sign bit (bit 31 of the lane) is set, else 0 — equivalent to
   _mm_srai_epi32(a, 31). */
static inline __m128i wordsign128(const __m128i a) {
__m128i r;
r.l = (a.l & 0x0000000080000000ull ? 0x00000000FFFFFFFFull : 0) | (a.l & 0x8000000000000000ull ? 0xFFFFFFFF00000000ull : 0);
r.h = (a.h & 0x0000000080000000ull ? 0x00000000FFFFFFFFull : 0) | (a.h & 0x8000000000000000ull ? 0xFFFFFFFF00000000ull : 0);
return r;
}
#endif // __M128_COMPAT_H__