From 8ad11036be11a77c261328233168ad080286da33 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Wed, 17 Feb 2021 09:02:43 -0500 Subject: [PATCH] add a quick'n'dirty implementation of RV32BK-accelerated AES-OCB, using the _m128i compatibility layer (spun off in its own header) --- aeadaes256ocbtaglen128v1-rv32/Makefile | 47 ++ aeadaes256ocbtaglen128v1-rv32/api.h | 4 + aeadaes256ocbtaglen128v1-rv32/cpucycles.h | 28 + aeadaes256ocbtaglen128v1-rv32/crypto_aead.h | 17 + .../crypto_aead_aeadaes256ocbtaglen128v1.h | 31 + aeadaes256ocbtaglen128v1-rv32/crypto_uint32.h | 6 + aeadaes256ocbtaglen128v1-rv32/crypto_uint64.h | 6 + aeadaes256ocbtaglen128v1-rv32/crypto_uint8.h | 6 + aeadaes256ocbtaglen128v1-rv32/crypto_verify.h | 12 + .../crypto_verify_16.h | 22 + aeadaes256ocbtaglen128v1-rv32/encrypt.c | 796 ++++++++++++++++++ .../kernelrandombytes.h | 14 + aeadaes256ocbtaglen128v1-rv32/random.cpp | 19 + aeadaes256ocbtaglen128v1-rv32/riscv.c | 83 ++ aeadaes256ocbtaglen128v1-rv32/try-anything.c | 323 +++++++ aeadaes256ocbtaglen128v1-rv32/try.c | 242 ++++++ aeadaes256ocbtaglen128v1-rv32/try.h | 21 + aeadaes256ocbtaglen128v1-rv32/verify.c | 24 + aes256decrypt-rv32/amd64cpuinfo.c | 17 - aes256decrypt-rv32/osfreq.c | 93 -- aes256gcmv1standalone-rv32/encrypt.c | 183 +--- m128_compat.h | 241 ++++++ 22 files changed, 1943 insertions(+), 292 deletions(-) create mode 100644 aeadaes256ocbtaglen128v1-rv32/Makefile create mode 100644 aeadaes256ocbtaglen128v1-rv32/api.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/cpucycles.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/crypto_aead.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/crypto_aead_aeadaes256ocbtaglen128v1.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/crypto_uint32.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/crypto_uint64.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/crypto_uint8.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/crypto_verify.h create mode 100644 
aeadaes256ocbtaglen128v1-rv32/crypto_verify_16.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/encrypt.c create mode 100644 aeadaes256ocbtaglen128v1-rv32/kernelrandombytes.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/random.cpp create mode 100644 aeadaes256ocbtaglen128v1-rv32/riscv.c create mode 100644 aeadaes256ocbtaglen128v1-rv32/try-anything.c create mode 100644 aeadaes256ocbtaglen128v1-rv32/try.c create mode 100644 aeadaes256ocbtaglen128v1-rv32/try.h create mode 100644 aeadaes256ocbtaglen128v1-rv32/verify.c delete mode 100644 aes256decrypt-rv32/amd64cpuinfo.c delete mode 100644 aes256decrypt-rv32/osfreq.c create mode 100644 m128_compat.h diff --git a/aeadaes256ocbtaglen128v1-rv32/Makefile b/aeadaes256ocbtaglen128v1-rv32/Makefile new file mode 100644 index 0000000..f195c70 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/Makefile @@ -0,0 +1,47 @@ +SRCs=encrypt.c try-anything.c verify.c +OBJs=$(SRCs:.c=.o) +SCLIBS=cpucycles.o kernelrandombytes.o + +COMPDIR=~dolbeau2/LITEX/buildroot-rv32/output/host +ALTCOMPDIR=/opt/riscv64b + +CC=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-gcc +ALTCC=$(ALTCOMPDIR)/bin/riscv64-unknown-elf-gcc +CXX=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-g++ +STRIP=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-strip +NEWOPT=-march=rv32imab -mabi=ilp32 -I. -I.. -O3 -DRV32B #-fno-vectorize #-DUSE_EPI_CUSTOM +OPT=-march=rv32ima -mabi=ilp32 -I. -I.. 
-O3 #-fno-vectorize #-DUSE_EPI_CUSTOM +#ALTCC=$(CC) +#NEWOPT=$(OPT) + +all: aeadaes256ocbtaglen128v1 aeadaes256ocbtaglen128v1_small + +clean: + rm -f $(OBJs) *.S try.o try_small.o encrypt.o aeadaes256ocbtaglen128v1 aeadaes256ocbtaglen128v1_small + +%.o: %.c + $(CC) $(OPT) $< -c -o $@ + +try.o: try.c + $(CC) $(OPT) $< -c -o $@ + +try_small.o: try.c + $(CC) $(OPT) $< -c -o $@ -DSMALL + +encrypt.S: encrypt.c + $(ALTCC) $(NEWOPT) $< -S -o $@ + +encrypt.o: encrypt.S + $(ALTCC) $(NEWOPT) $< -c -o $@ + +aeadaes256ocbtaglen128v1: $(OBJs) encrypt.o try.o $(SCLIBS) + $(CXX) $(OPT) $^ -o $@ + +aeadaes256ocbtaglen128v1_small: $(OBJs) encrypt.o try_small.o $(SCLIBS) + $(CXX) $(OPT) $^ -o $@ + +kernelrandombytes.o: random.cpp + $(CXX) $(OPT) $< -c -o $@ + +cpucycles.o: riscv.c + $(CC) $< -march=rv32ima -mabi=ilp32 -I. -O1 -c -o $@ diff --git a/aeadaes256ocbtaglen128v1-rv32/api.h b/aeadaes256ocbtaglen128v1-rv32/api.h new file mode 100644 index 0000000..d507767 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/api.h @@ -0,0 +1,4 @@ +#define CRYPTO_KEYBYTES 32 +#define CRYPTO_NSECBYTES 0 +#define CRYPTO_NPUBBYTES 12 +#define CRYPTO_ABYTES 16 diff --git a/aeadaes256ocbtaglen128v1-rv32/cpucycles.h b/aeadaes256ocbtaglen128v1-rv32/cpucycles.h new file mode 100644 index 0000000..ae1b7ba --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/cpucycles.h @@ -0,0 +1,28 @@ +/* +cpucycles riscv.h version 20190803 +D. J. Bernstein +Romain Dolbeau +Public domain. 
+*/ + +#ifndef CPUCYCLES_riscv_h +#define CPUCYCLES_riscv_h + +#ifdef __cplusplus +extern "C" { +#endif + +extern long long cpucycles_riscv(void); +extern long long cpucycles_riscv_persecond(void); + +#ifdef __cplusplus +} +#endif + +#ifndef cpucycles_implementation +#define cpucycles_implementation "riscv" +#define cpucycles cpucycles_riscv +#define cpucycles_persecond cpucycles_riscv_persecond +#endif + +#endif diff --git a/aeadaes256ocbtaglen128v1-rv32/crypto_aead.h b/aeadaes256ocbtaglen128v1-rv32/crypto_aead.h new file mode 100644 index 0000000..79bf8ec --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/crypto_aead.h @@ -0,0 +1,17 @@ +#ifndef crypto_aead_H +#define crypto_aead_H + +#include "crypto_aead_aeadaes256ocbtaglen128v1.h" + +#define crypto_aead_encrypt crypto_aead_aeadaes256ocbtaglen128v1_encrypt +#define crypto_aead_decrypt crypto_aead_aeadaes256ocbtaglen128v1_decrypt +#define crypto_aead_KEYBYTES crypto_aead_aeadaes256ocbtaglen128v1_KEYBYTES +#define crypto_aead_NSECBYTES crypto_aead_aeadaes256ocbtaglen128v1_NSECBYTES +#define crypto_aead_NPUBBYTES crypto_aead_aeadaes256ocbtaglen128v1_NPUBBYTES +#define crypto_aead_ABYTES crypto_aead_aeadaes256ocbtaglen128v1_ABYTES +#define crypto_aead_NOOVERLAP crypto_aead_aeadaes256ocbtaglen128v1_NOOVERLAP +#define crypto_aead_PRIMITIVE "aeadaes256ocbtaglen128v1" +#define crypto_aead_IMPLEMENTATION crypto_aead_aeadaes256ocbtaglen128v1_IMPLEMENTATION +#define crypto_aead_VERSION crypto_aead_aeadaes256ocbtaglen128v1_VERSION + +#endif diff --git a/aeadaes256ocbtaglen128v1-rv32/crypto_aead_aeadaes256ocbtaglen128v1.h b/aeadaes256ocbtaglen128v1-rv32/crypto_aead_aeadaes256ocbtaglen128v1.h new file mode 100644 index 0000000..d9fcf49 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/crypto_aead_aeadaes256ocbtaglen128v1.h @@ -0,0 +1,31 @@ +#ifndef crypto_aead_aeadaes256ocbtaglen128v1_H +#define crypto_aead_aeadaes256ocbtaglen128v1_H + +#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_KEYBYTES 32 +#define 
crypto_aead_aeadaes256ocbtaglen128v1_rv32_NSECBYTES 0 +#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_NPUBBYTES 12 +#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_ABYTES 16 + +#ifdef __cplusplus +extern "C" { +#endif +extern int crypto_aead_aeadaes256ocbtaglen128v1_rv32_encrypt(unsigned char *,unsigned long long *,const unsigned char *,unsigned long long,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *,const unsigned char *); +extern int crypto_aead_aeadaes256ocbtaglen128v1_rv32_decrypt(unsigned char *,unsigned long long *,unsigned char *,const unsigned char *,unsigned long long,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *); +#ifdef __cplusplus +} +#endif + +#define crypto_aead_aeadaes256ocbtaglen128v1_encrypt crypto_aead_aeadaes256ocbtaglen128v1_rv32_encrypt +#define crypto_aead_aeadaes256ocbtaglen128v1_decrypt crypto_aead_aeadaes256ocbtaglen128v1_rv32_decrypt +#define crypto_aead_aeadaes256ocbtaglen128v1_KEYBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_KEYBYTES +#define crypto_aead_aeadaes256ocbtaglen128v1_NSECBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_NSECBYTES +#define crypto_aead_aeadaes256ocbtaglen128v1_NPUBBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_NPUBBYTES +#define crypto_aead_aeadaes256ocbtaglen128v1_ABYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_ABYTES +#define crypto_aead_aeadaes256ocbtaglen128v1_NOOVERLAP crypto_aead_aeadaes256ocbtaglen128v1_rv32_NOOVERLAP +#define crypto_aead_aeadaes256ocbtaglen128v1_IMPLEMENTATION "crypto_aead/aeadaes256ocbtaglen128v1/dolbeau/aesenc-int" +#ifndef crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION +#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION "-" +#endif +#define crypto_aead_aeadaes256ocbtaglen128v1_VERSION crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION + +#endif diff --git a/aeadaes256ocbtaglen128v1-rv32/crypto_uint32.h b/aeadaes256ocbtaglen128v1-rv32/crypto_uint32.h new file mode 100644 index 
0000000..21020d7 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/crypto_uint32.h @@ -0,0 +1,6 @@ +#ifndef crypto_uint32_h +#define crypto_uint32_h + +typedef unsigned int crypto_uint32; + +#endif diff --git a/aeadaes256ocbtaglen128v1-rv32/crypto_uint64.h b/aeadaes256ocbtaglen128v1-rv32/crypto_uint64.h new file mode 100644 index 0000000..5aa0070 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/crypto_uint64.h @@ -0,0 +1,6 @@ +#ifndef crypto_uint64_h +#define crypto_uint64_h + +typedef unsigned long long crypto_uint64; + +#endif diff --git a/aeadaes256ocbtaglen128v1-rv32/crypto_uint8.h b/aeadaes256ocbtaglen128v1-rv32/crypto_uint8.h new file mode 100644 index 0000000..f17b77e --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/crypto_uint8.h @@ -0,0 +1,6 @@ +#ifndef crypto_uint8_h +#define crypto_uint8_h + +typedef unsigned char crypto_uint8; + +#endif diff --git a/aeadaes256ocbtaglen128v1-rv32/crypto_verify.h b/aeadaes256ocbtaglen128v1-rv32/crypto_verify.h new file mode 100644 index 0000000..c8d8513 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/crypto_verify.h @@ -0,0 +1,12 @@ +#ifndef crypto_verify_H +#define crypto_verify_H + +#include "crypto_verify_16.h" + +#define crypto_verify crypto_verify_16 +#define crypto_verify_BYTES crypto_verify_16_BYTES +#define crypto_verify_PRIMITIVE "16" +#define crypto_verify_IMPLEMENTATION crypto_verify_16_IMPLEMENTATION +#define crypto_verify_VERSION crypto_verify_16_VERSION + +#endif diff --git a/aeadaes256ocbtaglen128v1-rv32/crypto_verify_16.h b/aeadaes256ocbtaglen128v1-rv32/crypto_verify_16.h new file mode 100644 index 0000000..4d21a68 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/crypto_verify_16.h @@ -0,0 +1,22 @@ +#ifndef crypto_verify_16_H +#define crypto_verify_16_H + +#define crypto_verify_16_ref_BYTES 16 + +#ifdef __cplusplus +extern "C" { +#endif +extern int crypto_verify_16_ref(const unsigned char *,const unsigned char *); +#ifdef __cplusplus +} +#endif + +#define crypto_verify_16 crypto_verify_16_ref +#define 
crypto_verify_16_BYTES crypto_verify_16_ref_BYTES +#define crypto_verify_16_IMPLEMENTATION "crypto_verify/16/ref" +#ifndef crypto_verify_16_ref_VERSION +#define crypto_verify_16_ref_VERSION "-" +#endif +#define crypto_verify_16_VERSION crypto_verify_16_ref_VERSION + +#endif diff --git a/aeadaes256ocbtaglen128v1-rv32/encrypt.c b/aeadaes256ocbtaglen128v1-rv32/encrypt.c new file mode 100644 index 0000000..b32f79e --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/encrypt.c @@ -0,0 +1,796 @@ +/* +// CAESAR OCB v1 somewhat optimised code +// Info: http://www.cs.ucdavis.edu/~rogaway/ocb +// +// Written by Romain Dolbeau (romain@dolbeau.org), +// based on the reference implementation by Ted Krovetz (ted@krovetz.net). +// +// Phillip Rogaway holds patents relevant to OCB. See the following for +// his free patent grant: http://www.cs.ucdavis.edu/~rogaway/ocb/grant.htm +// +// This is free and unencumbered software released into the public domain. +// +// Anyone is free to copy, modify, publish, use, compile, sell, or +// distribute this software, either in source code form or as a compiled +// binary, for any purpose, commercial or non-commercial, and by any +// means. +// +// In jurisdictions that recognize copyright laws, the author or authors +// of this software dedicate any and all copyright interest in the +// software to the public domain. We make this dedication for the benefit +// of the public at large and to the detriment of our heirs and +// successors. We intend this dedication to be an overt act of +// relinquishment in perpetuity of all present and future rights to this +// software under copyright law. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// For more information, please refer to +*/ + +#include + +#include "api.h" +#include "crypto_aead.h" +#define KEYBYTES CRYPTO_KEYBYTES +#define NONCEBYTES CRYPTO_NPUBBYTES +#define TAGBYTES CRYPTO_ABYTES + +#define ALIGN16 __attribute__((aligned(16))) +#define ALIGN32 __attribute__((aligned(32))) +#define ALIGN64 __attribute__((aligned(64))) +#define _bswap64(a) __builtin_bswap64(a) +#define _bswap(a) __builtin_bswap32(a) + +#define printv16c(p,v) \ + { \ + ALIGN16 unsigned char temp[16]; \ + _mm_store_si128(temp, v); \ + int z; \ + printf("%8s:%8s = ",p,#v); \ + for (z = 15 ; z >= 0 ; z--) { \ + printf("%02hhx", temp[z]); \ + if ((z%4)==0) printf(" "); \ + } \ + printf("\n"); \ + } + +#include "m128_compat.h" + +#include "new_instructions_support_k.h" + +#define rotr(a,b) _rv32_ror(a,b) + +static inline void aes256_Tsetkey_encrypt(const unsigned int key[], unsigned int *aes_edrk) { + unsigned int i = 0; + unsigned int rotl_aes_edrk; + unsigned int tmp8, tmp9, tmp10, tmp11; + unsigned int tmp12, tmp13, tmp14, tmp15; + unsigned int temp_lds; + unsigned int round = 0x00000001; + + tmp8 = (key[0]); + aes_edrk[0] = tmp8; + tmp9 = (key[1]); + aes_edrk[1] = tmp9; + tmp10 = (key[2]); + aes_edrk[2] = tmp10; + tmp11 = (key[3]); + aes_edrk[3] = tmp11; + tmp12 = (key[4]); + aes_edrk[4] = tmp12; + tmp13 = (key[5]); + aes_edrk[5] = tmp13; + tmp14 = (key[6]); + aes_edrk[6] = tmp14; + tmp15 = (key[7]); + aes_edrk[7] = tmp15; + + for( i = 8; i < 56; /* i+=8 */ ) + { + tmp8 = tmp8 ^ round; + round = round << 1; + rotl_aes_edrk = rotr(tmp15,8); + tmp8 = aes32esi0(tmp8, rotl_aes_edrk); + tmp8 = aes32esi1(tmp8, rotl_aes_edrk); + tmp8 = aes32esi2(tmp8, rotl_aes_edrk); + tmp8 = aes32esi3(tmp8, rotl_aes_edrk); + + aes_edrk[i++] = 
tmp8; + tmp9 = tmp9 ^ tmp8; + aes_edrk[i++] = tmp9; + tmp10 = tmp10 ^ tmp9; + aes_edrk[i++] = tmp10; + tmp11 = tmp11 ^ tmp10; + aes_edrk[i++] = tmp11; + + tmp12 = aes32esi0(tmp12, tmp11); + tmp12 = aes32esi1(tmp12, tmp11); + tmp12 = aes32esi2(tmp12, tmp11); + tmp12 = aes32esi3(tmp12, tmp11); + + aes_edrk[i++] = tmp12; + tmp13 = tmp13 ^ tmp12; + aes_edrk[i++] = tmp13; + tmp14 = tmp14 ^ tmp13; + aes_edrk[i++] = tmp14; + tmp15 = tmp15 ^ tmp14; + aes_edrk[i++] = tmp15; + } + + tmp8 = tmp8 ^ round; + round = round << 1; + rotl_aes_edrk = rotr(tmp15,8); + tmp8 = aes32esi0(tmp8, rotl_aes_edrk); + tmp8 = aes32esi1(tmp8, rotl_aes_edrk); + tmp8 = aes32esi2(tmp8, rotl_aes_edrk); + tmp8 = aes32esi3(tmp8, rotl_aes_edrk); + + aes_edrk[i++] = tmp8; + tmp9 = tmp9 ^ tmp8; + aes_edrk[i++] = tmp9; + tmp10 = tmp10 ^ tmp9; + aes_edrk[i++] = tmp10; + tmp11 = tmp11 ^ tmp10; + aes_edrk[i++] = tmp11; +} + +static void aes256_key_enc2dec(unsigned int *erk, unsigned int *drk) +{ + int i, j; + // first and last unchanged (but swapped) + for (i = 0; i < 4; i++) { + drk[i] = erk[i+56]; + drk[i+56] = erk[i]; + } + // convert & revert order + for (i = 1; i < 14; i++) { + for (j = 0 ; j < 4 ; j++) { + unsigned int ek, dk; + ek = erk[i*4+j]; + + dk = 0; + dk = aes32esi0(dk, ek); + dk = aes32esi1(dk, ek); + dk = aes32esi2(dk, ek); + dk = aes32esi3(dk, ek); + + ek = 0; + ek = aes32dsmi0(ek, dk); + ek = aes32dsmi1(ek, dk); + ek = aes32dsmi2(ek, dk); + ek = aes32dsmi3(ek, dk); + + drk[56-4*i+j] = ek; + } + } +} + +#define AES_ROUND1T(TAB,I,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \ + { \ + X0 = aes32esmi0(TAB[I++],Y0); \ + X0 = aes32esmi1(X0,Y1); \ + X0 = aes32esmi2(X0,Y2); \ + X0 = aes32esmi3(X0,Y3); \ + X1 = aes32esmi0(TAB[I++],Y1); \ + X1 = aes32esmi1(X1,Y2); \ + X1 = aes32esmi2(X1,Y3); \ + X1 = aes32esmi3(X1,Y0); \ + X2 = aes32esmi0(TAB[I++],Y2); \ + X2 = aes32esmi1(X2,Y3); \ + X2 = aes32esmi2(X2,Y0); \ + X2 = aes32esmi3(X2,Y1); \ + X3 = aes32esmi0(TAB[I++],Y3); \ + X3 = aes32esmi1(X3,Y0); \ + X3 = 
aes32esmi2(X3,Y1); \ + X3 = aes32esmi3(X3,Y2); \ + } + +/* using the K + B instructions */ +static inline void aes256_1Tft_encrypt(const uint32_t *aes_edrk, const uint32_t *input, uint32_t *output) +{ + unsigned int X0, X1, X2, X3, Y0, Y1, Y2, Y3; + unsigned int i = 0, j = 0; + unsigned int l_aes_nr = 14; + + X0 = ((input[0]) ^ aes_edrk[j++]); + X1 = ((input[1]) ^ aes_edrk[j++]); + X2 = ((input[2]) ^ aes_edrk[j++]); + X3 = ((input[3]) ^ aes_edrk[j++]); + + for (i = 4 ; i < (l_aes_nr<<2) ; ) { + + AES_ROUND1T(aes_edrk, i, Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); + + X0=Y0; + X1=Y1; + X2=Y2; + X3=Y3; + } + /* last round */ + + Y0 = aes32esi0(aes_edrk[i], X0); + Y0 = aes32esi1(Y0, X1); + Y0 = aes32esi2(Y0, X2); + Y0 = aes32esi3(Y0, X3); + i++; + Y1 = aes32esi0(aes_edrk[i], X1); + Y1 = aes32esi1(Y1, X2); + Y1 = aes32esi2(Y1, X3); + Y1 = aes32esi3(Y1, X0); + i++; + Y2 = aes32esi0(aes_edrk[i], X2); + Y2 = aes32esi1(Y2, X3); + Y2 = aes32esi2(Y2, X0); + Y2 = aes32esi3(Y2, X1); + i++; + Y3 = aes32esi0(aes_edrk[i], X3); + Y3 = aes32esi1(Y3, X0); + Y3 = aes32esi2(Y3, X1); + Y3 = aes32esi3(Y3, X2); + + output[0] = (Y0); + output[1] = (Y1); + output[2] = (Y2); + output[3] = (Y3); +} + + +#define AES_ROUND_DKT(TAB,I,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \ + { \ + X0 = aes32dsmi0(TAB[I+0],Y0); \ + X0 = aes32dsmi1(X0,Y3); \ + X0 = aes32dsmi2(X0,Y2); \ + X0 = aes32dsmi3(X0,Y1); \ + X1 = aes32dsmi0(TAB[I+1],Y1); \ + X1 = aes32dsmi1(X1,Y0); \ + X1 = aes32dsmi2(X1,Y3); \ + X1 = aes32dsmi3(X1,Y2); \ + X2 = aes32dsmi0(TAB[I+2],Y2); \ + X2 = aes32dsmi1(X2,Y1); \ + X2 = aes32dsmi2(X2,Y0); \ + X2 = aes32dsmi3(X2,Y3); \ + X3 = aes32dsmi0(TAB[I+3],Y3); \ + X3 = aes32dsmi1(X3,Y2); \ + X3 = aes32dsmi2(X3,Y1); \ + X3 = aes32dsmi3(X3,Y0); \ + } + +void aes256_1Tft_decrypt(const unsigned int *aes_drk, const unsigned int *input, unsigned int *output) +{ + const unsigned int aes_nr = 14; // aes256 + unsigned int X0, X1, X2, X3, Y0, Y1, Y2, Y3; + unsigned int i; + + X0 = input[0]; X0 ^= aes_drk[0]; + X1 = input[1]; X1 
^= aes_drk[1]; + X2 = input[2]; X2 ^= aes_drk[2]; + X3 = input[3]; X3 ^= aes_drk[3]; + + // for (i=1;i> 7); + d[15] = (s[15] << 1) ^ ((tmp >> 7) * 135); +} +#else +#if 0 +/* 64 bits little-endian doubling, faster */ +static inline void double_block(unsigned long long *d, const unsigned long long* s) { + unsigned long long sl = _bswap64(s[1]), sh = _bswap64(s[0]); + unsigned long long sl1 = sl << 1; + unsigned long long sh1 = sh << 1; + sh1 |= sl>>63; + sl1 ^= (((long long)sh>>63) & 135); + d[1]=_bswap64(sl1); + d[0]=_bswap64(sh1); +} +#else +/* 128 bits SSE, much faster */ +static inline __m128i double_block_si128_norev(const __m128i sv) { + const __m128i mask = _mm_set_epi32(135,1,1,1); + /* __m128i sv31 = _mm_srai_epi32(sv, 31); */ + __m128i sv31 = wordsign128(sv); + __m128i sv31m = _mm_and_si128(sv31, mask); + /* __m128i sv31ms = _mm_shuffle_epi32(sv31m, _MM_SHUFFLE(2,1,0,3)); */ + __m128i sv31ms = wordrotate1l128(sv31m); + __m128i sv1 = _mm_slli_epi32(sv, 1); + __m128i dv = _mm_xor_si128(sv31ms,sv1); + return dv; +} +static inline __m128i double_block_si128(const __m128i svr) { + /* const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); */ + /* __m128i sv = _mm_shuffle_epi8(svr, rev); */ + __m128i sv = bytereverse128(svr); + __m128i dv = double_block_si128_norev(sv); + /* return _mm_shuffle_epi8(dv, rev); */ + return bytereverse128(dv); +} +static inline void double_block(unsigned char *d, const unsigned char* s) { + __m128i sv = _mm_loadu_si128((const __m128i*)s); + __m128i dv = double_block_si128(sv); + _mm_storeu_si128((__m128i*)d,dv); +} +/* 128 bits SSE times 4 */ +static const unsigned short lk4[64] = { +0x0000, 0x0086, 0x010c, 0x018a, 0x0218, 0x029e, 0x0314, 0x0392, +0x0430, 0x04b6, 0x053c, 0x05ba, 0x0628, 0x06ae, 0x0724, 0x07a2, +0x0860, 0x08e6, 0x096c, 0x09ea, 0x0a78, 0x0afe, 0x0b74, 0x0bf2, +0x0c50, 0x0cd6, 0x0d5c, 0x0dda, 0x0e48, 0x0ece, 0x0f44, 0x0fc2, +0x10c0, 0x1046, 0x11cc, 0x114a, 0x12d8, 0x125e, 0x13d4, 0x1352, +0x14f0, 
0x1476, 0x15fc, 0x157a, 0x16e8, 0x166e, 0x17e4, 0x1762, +0x18a0, 0x1826, 0x19ac, 0x192a, 0x1ab8, 0x1a3e, 0x1bb4, 0x1b32, +0x1c90, 0x1c16, 0x1d9c, 0x1d1a, 0x1e88, 0x1e0e, 0x1f84, 0x1f02 +}; +static inline __m128i double_block_2_si128_norev(const __m128i sv) { + const __m128i mask = _mm_set_epi32(3,3,3,3); + const int idx = _mm_extract_epi8(sv,15); + /* __m128i sv30x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xC0)>>6],0); */ + __m128i sv30x = halfwordandzero(lk4[(idx&0xC0)>>6]); + + __m128i sv30 = _mm_srli_epi32(sv, 30); + __m128i sv30m = _mm_and_si128(sv30, mask); + /* __m128i sv30ms = _mm_shuffle_epi32(sv30m, _MM_SHUFFLE(2,1,0,3)); */ + __m128i sv30ms = wordrotate1l128(sv30m); + __m128i sv2 = _mm_slli_epi32(sv, 2); + __m128i dv = _mm_xor_si128(sv30ms,sv2); + __m128i final = _mm_xor_si128(dv, sv30x); + return final; +} +static inline __m128i double_block_3_si128_norev(const __m128i sv) { + const __m128i mask = _mm_set_epi32(7,7,7,7); + const int idx = _mm_extract_epi8(sv,15); + /* __m128i sv29x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xE0)>>5],0); */ + __m128i sv29x = halfwordandzero(lk4[(idx&0xE0)>>5]); + + __m128i sv29 = _mm_srli_epi32(sv, 29); + __m128i sv29m = _mm_and_si128(sv29, mask); + /* __m128i sv29ms = _mm_shuffle_epi32(sv29m, _MM_SHUFFLE(2,1,0,3)); */ + __m128i sv29ms = wordrotate1l128(sv29m); + __m128i sv3 = _mm_slli_epi32(sv, 3); + __m128i dv = _mm_xor_si128(sv29ms,sv3); + __m128i final = _mm_xor_si128(dv, sv29x); + return final; +} +static inline __m128i double_block_4_si128_norev(const __m128i sv) { + const __m128i mask = _mm_set_epi32(15,15,15,15); + const int idx = _mm_extract_epi8(sv,15); + /* __m128i sv28x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xF0)>>4],0); */ + __m128i sv28x = halfwordandzero(lk4[(idx&0xF0)>>4]); + + __m128i sv28 = _mm_srli_epi32(sv, 28); + __m128i sv28m = _mm_and_si128(sv28, mask); + /* __m128i sv28ms = _mm_shuffle_epi32(sv28m, _MM_SHUFFLE(2,1,0,3)); */ + __m128i sv28ms = wordrotate1l128(sv28m); + __m128i 
sv4 = _mm_slli_epi32(sv, 4); + __m128i dv = _mm_xor_si128(sv28ms,sv4); + __m128i final = _mm_xor_si128(dv, sv28x); + return final; +} +static inline __m128i double_block_5_si128_norev(const __m128i sv) { + const __m128i mask = _mm_set_epi32(31,31,31,31); + const int idx = _mm_extract_epi8(sv,15); + /* __m128i sv27x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xF8)>>3],0); */ + __m128i sv27x = halfwordandzero(lk4[(idx&0xF8)>>3]); + + __m128i sv27 = _mm_srli_epi32(sv, 27); + __m128i sv27m = _mm_and_si128(sv27, mask); + /* __m128i sv27ms = _mm_shuffle_epi32(sv27m, _MM_SHUFFLE(2,1,0,3)); */ + __m128i sv27ms = wordrotate1l128(sv27m); + __m128i sv5 = _mm_slli_epi32(sv, 5); + __m128i dv = _mm_xor_si128(sv27ms,sv5); + __m128i final = _mm_xor_si128(dv, sv27x); + return final; +} +static inline __m128i double_block_6_si128_norev(const __m128i sv) { + const __m128i mask = _mm_set_epi32(63,63,63,63); + const int idx = _mm_extract_epi8(sv,15); + /* __m128i sv26x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xFC)>>2],0); */ + __m128i sv26x = halfwordandzero(lk4[(idx&0xFC)>>2]); + + __m128i sv26 = _mm_srli_epi32(sv, 26); + __m128i sv26m = _mm_and_si128(sv26, mask); + /* __m128i sv26ms = _mm_shuffle_epi32(sv26m, _MM_SHUFFLE(2,1,0,3)); */ + __m128i sv26ms = wordrotate1l128(sv26m); + __m128i sv6 = _mm_slli_epi32(sv, 6); + __m128i dv = _mm_xor_si128(sv26ms,sv6); + __m128i final = _mm_xor_si128(dv, sv26x); + return final; +} +#endif +#endif + +/* ------------------------------------------------------------------------- */ +static inline __m128i calc_L_i_si128(const __m128i ldollarvr, const unsigned j) { + /* const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); */ + /* __m128i ldollarv = _mm_shuffle_epi8(ldollarvr, rev); */ + __m128i ldollarv = bytereverse128(ldollarvr); + unsigned i; + __m128i lv; + unsigned ntz = __builtin_ctz(j);/* printf("ntz = %u\n", ntz); */ + switch(ntz) { + case 0: + lv = double_block_si128_norev(ldollarv); + break; + case 1: + lv = 
double_block_2_si128_norev(ldollarv); + break; + case 2: + lv = double_block_3_si128_norev(ldollarv); + break; + case 3: + lv = double_block_4_si128_norev(ldollarv); + break; + case 4: + lv = double_block_5_si128_norev(ldollarv); + break; + default: + lv = double_block_6_si128_norev(ldollarv); + for (i = 5; i < ntz ; i++) + lv = double_block_si128_norev(lv); + break; + } + /* return _mm_shuffle_epi8(lv, rev); */ + return bytereverse128(lv); +} +static inline void calc_L_i(block l, const block ldollar, const unsigned i) { + __m128i ldollarv = _mm_loadu_si128((const __m128i*)ldollar); + __m128i lv = calc_L_i_si128(ldollarv, i); + _mm_storeu_si128((__m128i*)l,lv); +} +static inline void precompute_lv(__m128i prelv[32], const __m128i ldollarvr, const unsigned max) { + /* const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); */ + /* __m128i ldollarv = _mm_shuffle_epi8(ldollarvr, rev); */ + __m128i ldollarv = bytereverse128(ldollarvr); + unsigned i; + __m128i lv = double_block_si128_norev(ldollarv); + for (i = 0 ; i < max-1 ; i++) { + /* prelv[i] = _mm_shuffle_epi8(lv, rev); */ + prelv[i] = bytereverse128(lv); + lv = double_block_si128_norev(lv); + } + /* prelv[i] = _mm_shuffle_epi8(lv, rev); */ + prelv[i] = bytereverse128(lv); +} + +/* ------------------------------------------------------------------------- */ + +static void hash(block result, const unsigned char *k, + unsigned char *a, unsigned abytes, + const __m128i lstar, + const __m128i prelv[32], const __m128i aes_key[15]) { + __m128i offset, sum, tmp; + unsigned i; + + /* Process any whole blocks */ + /* Sum_0 = zeros(128) */ + sum = _mm_setzero_si128(); + /* Offset_0 = zeros(128) */ + offset = _mm_setzero_si128(); + i=1; + for (; i<=abytes/16; i++, a = a + 16) { + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + tmp = prelv[__builtin_ctz(i)]; + offset = _mm_xor_si128(offset, tmp); + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + tmp = _mm_xor_si128(offset, _mm_loadu_si128((const
__m128i*)a)); + tmp = aes256_1Tft__encrypt1_si128(tmp, aes_key); + sum = _mm_xor_si128(sum, tmp); + } + + /* Process any final partial block; compute final hash value */ + + abytes = abytes % 16; /* Bytes in final block */ + if (abytes > 0) { + /* Offset_* = Offset_m xor L_* */ + offset = _mm_xor_si128(offset, lstar); + /* tmp = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */ + unsigned char pad[16]; + memset(pad, 0, 16); + memcpy(pad, a, abytes); + pad[abytes] = 0x80; + tmp = _mm_loadu_si128((const __m128i*)pad); + tmp = _mm_xor_si128(offset, tmp); + /* Sum = Sum_m xor ENCIPHER(K, tmp) */ + tmp = aes256_1Tft__encrypt1_si128(tmp, aes_key); + sum = _mm_xor_si128(tmp, sum); + } + + _mm_storeu_si128((__m128i*)result,sum); +} + +/* ------------------------------------------------------------------------- */ + +static int ocb_crypt(unsigned char *out, unsigned char *k, unsigned char *n, + unsigned char *a, unsigned abytes, + unsigned char *in, unsigned inbytes, int encrypting) { + __m128i prelv[32]; + __m128i aes_decrypt_key[15]; + __m128i aes_encrypt_key[15]; + block ad_hash; + __m128i lstar, ldollar, sum, offset, ktop, pad, nonce, tag, tmp, outv; + block nonce_b, offset_b; + unsigned char stretch[24]; + unsigned bottom, byteshift, bitshift, i, max; + + /* Setup AES and strip ciphertext of its tag */ + if ( ! encrypting ) { + if (inbytes < TAGBYTES) return -1; + inbytes -= TAGBYTES; + } + aes256_Tsetkey_encrypt(k, aes_encrypt_key); + if ( ! encrypting ) { + aes256_key_enc2dec(aes_encrypt_key, aes_decrypt_key); + } + + /* Key-dependent variables */ + + /* L_* = ENCIPHER(K, zeros(128)) */ + tmp = _mm_setzero_si128(); + lstar = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key); + /* L_$ = double(L_*) */ + ldollar = double_block_si128(lstar); + max = abytes >= inbytes ? abytes/4 : inbytes/4; + max = (max < 2 ? 
2 : max); + /* only precompute what's really needed; + look at the number of leading zero (to find the leftmost bit set to one) + all trailing zero will be at the right of it so we have an upper bound + */ + precompute_lv(prelv,ldollar,31-__builtin_clz(max)); + + /* Nonce-dependent and per-encryption variables */ + + /* Nonce = zeros(127-bitlen(N)) || 1 || N */ + memset(nonce_b, 0, 16); + memcpy(&nonce_b[16-NONCEBYTES],n,NONCEBYTES); + nonce_b[0] = (unsigned char)(((TAGBYTES * 8) % 128) << 1); + nonce_b[16-NONCEBYTES-1] |= 0x01; + /* bottom = str2num(Nonce[123..128]) */ + bottom = nonce_b[15] & 0x3F; + /* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */ + nonce_b[15] &= 0xC0; + nonce = _mm_loadu_si128((const __m128i*)nonce_b); + ktop = aes256_1Tft__encrypt1_si128(nonce, aes_encrypt_key); + /* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */ + _mm_storeu_si128((__m128i*)stretch, ktop); + _mm_storel_epi64((__m128i*)(stretch+16), _mm_xor_si128(_mm_srli_si128(ktop,1), ktop)); + /* Offset_0 = Stretch[1+bottom..128+bottom] */ + byteshift = bottom/8; + bitshift = bottom%8; + if (bitshift != 0) + for (i=0; i<16; i++) + offset_b[i] = (stretch[i+byteshift] << bitshift) | + (stretch[i+byteshift+1] >> (8-bitshift)); + else + for (i=0; i<16; i++) + offset_b[i] = stretch[i+byteshift]; + offset = _mm_loadu_si128((const __m128i*)offset_b); + /* Checksum_0 = zeros(128) */ + sum = _mm_xor_si128(sum,sum); + + /* Hash associated data */ + hash(ad_hash, k, a, abytes, lstar, prelv, aes_encrypt_key); + + /* Process any whole blocks */ + i=1; + if (encrypting) { + + for (; i<=inbytes/16; i++, in=in+16, out=out+16) { + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + tmp = prelv[__builtin_ctz(i)]; + + offset = _mm_xor_si128(offset, tmp); + tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)in)); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + sum = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), sum); + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + tmp = 
aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key); + outv = _mm_xor_si128(offset, tmp); + _mm_storeu_si128((__m128i*)out, outv); + } + } else { + + for (; i<=inbytes/16; i++, in=in+16, out=out+16) { + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + tmp= prelv[__builtin_ctz(i)]; + offset = _mm_xor_si128(offset, tmp); + tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)in)); + + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + tmp = aes256_1Tft__decrypt1_si128(tmp, aes_decrypt_key); + outv = _mm_xor_si128(offset, tmp); + _mm_storeu_si128((__m128i*)out, outv); + /* Checksum_i = Checksum_{i-1} xor P_i */ + sum = _mm_xor_si128(outv, sum); + } + } + + /* Process any final partial block and compute raw tag */ + + inbytes = inbytes % 16; /* Bytes in final block */ + if (inbytes > 0) { + /* Offset_* = Offset_m xor L_* */ + offset = _mm_xor_si128(offset, lstar); + /* Pad = ENCIPHER(K, Offset_*) */ + pad = aes256_1Tft__encrypt1_si128(offset, aes_encrypt_key); + + if (encrypting) { + /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */ + unsigned char tmp_b[16]; + unsigned char pad_b[16]; + memset(tmp_b, 0, 16); + memcpy(tmp_b, in, inbytes); + tmp_b[inbytes] = 0x80; + tmp = _mm_loadu_si128((const __m128i*)tmp_b); + sum = _mm_xor_si128(tmp, sum); + /* C_* = P_* xor Pad[1..bitlen(P_*)] */ + pad = _mm_xor_si128(tmp, pad); + _mm_storeu_si128((__m128i*)pad_b, pad); + memcpy(out, pad_b, inbytes); + out = out + inbytes; + } else { + /* P_* = C_* xor Pad[1..bitlen(C_*)] */ + unsigned char tmp_b[16]; + unsigned char pad_b[16]; + _mm_storeu_si128((__m128i*)pad_b, pad); + memcpy(tmp_b, pad_b, 16); + memcpy(tmp_b, in, inbytes); + xor_block(tmp_b,pad_b,tmp_b); + tmp_b[inbytes] = 0x80; + memcpy(out, tmp_b, inbytes); + tmp = _mm_loadu_si128((const __m128i*)tmp_b); + /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */ + sum = _mm_xor_si128(tmp, sum); + in = in + inbytes; + } + } + + /* Tag = ENCIPHER(K, Checksum xor Offset xor L_$) xor 
HASH(K,A) */ + tmp = _mm_xor_si128(sum, offset); + tmp = _mm_xor_si128(tmp, ldollar); + tag = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key); + tag = _mm_xor_si128(_mm_loadu_si128((const __m128i*)ad_hash), tag); + + if (encrypting) { + unsigned char tag_b[16]; + _mm_storeu_si128((__m128i*)tag_b, tag); + memcpy(out, tag_b, TAGBYTES); + return 0; + } else { + unsigned char tag_b[16]; + _mm_storeu_si128((__m128i*)tag_b, tag); + return (memcmp(in,tag_b,TAGBYTES) ? -1 : 0); /* Check for validity */ + } +} + +/* ------------------------------------------------------------------------- */ + +#define OCB_ENCRYPT 1 +#define OCB_DECRYPT 0 + +void ocb_encrypt(unsigned char *c, unsigned char *k, unsigned char *n, + unsigned char *a, unsigned abytes, + unsigned char *p, unsigned pbytes) { + ocb_crypt(c, k, n, a, abytes, p, pbytes, OCB_ENCRYPT); +} + +/* ------------------------------------------------------------------------- */ + +int ocb_decrypt(unsigned char *p, unsigned char *k, unsigned char *n, + unsigned char *a, unsigned abytes, + unsigned char *c, unsigned cbytes) { + return ocb_crypt(p, k, n, a, abytes, c, cbytes, OCB_DECRYPT); +} + +/* ------------------------------------------------------------------------- */ + +int crypto_aead_encrypt( +unsigned char *c,unsigned long long *clen, +const unsigned char *m,unsigned long long mlen, +const unsigned char *ad,unsigned long long adlen, +const unsigned char *nsec, +const unsigned char *npub, +const unsigned char *k +) +{ + *clen = mlen + TAGBYTES; + ocb_crypt(c, (unsigned char *)k, (unsigned char *)npub, (unsigned char *)ad, + adlen, (unsigned char *)m, mlen, OCB_ENCRYPT); + return 0; +} + +int crypto_aead_decrypt( +unsigned char *m,unsigned long long *mlen, +unsigned char *nsec, +const unsigned char *c,unsigned long long clen, +const unsigned char *ad,unsigned long long adlen, +const unsigned char *npub, +const unsigned char *k +) +{ + *mlen = clen - TAGBYTES; + return ocb_crypt(m, (unsigned char *)k, (unsigned char 
*)npub, + (unsigned char *)ad, adlen, (unsigned char *)c, clen, OCB_DECRYPT); +} + diff --git a/aeadaes256ocbtaglen128v1-rv32/kernelrandombytes.h b/aeadaes256ocbtaglen128v1-rv32/kernelrandombytes.h new file mode 100644 index 0000000..2248f60 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/kernelrandombytes.h @@ -0,0 +1,14 @@ +#ifndef kernelrandombytes_h +#define kernelrandombytes_h + +#ifdef __cplusplus +extern "C" { +#endif + +extern void kernelrandombytes(unsigned char *,unsigned long long); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/aeadaes256ocbtaglen128v1-rv32/random.cpp b/aeadaes256ocbtaglen128v1-rv32/random.cpp new file mode 100644 index 0000000..53fe546 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/random.cpp @@ -0,0 +1,19 @@ +#include +#include + +std::default_random_engine generator; +std::uniform_int_distribution distribution(0,255); +auto rbyte = std::bind ( distribution, generator ); + +extern "C" { + void kernelrandombytes(unsigned char *x,unsigned long long xlen) + { + int i; + + while (xlen > 0) { + *x = rbyte(); + x++; + xlen--; + } + } +} diff --git a/aeadaes256ocbtaglen128v1-rv32/riscv.c b/aeadaes256ocbtaglen128v1-rv32/riscv.c new file mode 100644 index 0000000..2ed0c0b --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/riscv.c @@ -0,0 +1,83 @@ +/* +cpucycles/riscv.c version 20190803 +D. J. Bernstein +Romain Dolbeau +Public domain. 
+*/ + +#include +#include +#include + +long long cpucycles_riscv(void) +{ + long long result; +#if defined(__riscv_xlen) +#if __riscv_xlen == 64 + asm volatile("rdcycle %0" : "=r" (result)); +#elif __riscv_xlen == 32 + unsigned int l, h, h2; + asm volatile( "start:\n" + "rdcycleh %0\n" + "rdcycle %1\n" + "rdcycleh %2\n" + "bne %0, %2, start\n" + : "=r" (h), "=r" (l), "=r" (h2)); + + result = (((unsigned long long)h)<<32) | ((unsigned long long)l); +#else +#error "unknown __riscv_xlen" +#endif +#else // __riscv_xlen +#error "__riscv_xlen required for RISC-V support" +#endif // __riscv_xlen + return result; +} + +static long long microseconds(void) +{ + struct timeval t; + gettimeofday(&t,(struct timezone *) 0); + return t.tv_sec * (long long) 1000000 + t.tv_usec; +} + +static double guessfreq(void) +{ + long long tb0; long long us0; + long long tb1; long long us1; + + tb0 = cpucycles_riscv(); + us0 = microseconds(); + do { + tb1 = cpucycles_riscv(); + us1 = microseconds(); + } while (us1 - us0 < 10000 || tb1 - tb0 < 1000); + if (tb1 <= tb0) return 0; + tb1 -= tb0; + us1 -= us0; + return ((double) tb1) / (0.000001 * (double) us1); +} + +static long long cpufrequency = 0; + +static void init(void) +{ + double guess1; + double guess2; + int loop; + + for (loop = 0;loop < 100;++loop) { + guess1 = guessfreq(); + guess2 = guessfreq(); + if (guess1 > 1.01 * guess2) continue; + if (guess2 > 1.01 * guess1) continue; + cpufrequency = 0.5 * (guess1 + guess2); + break; + } +} + +long long cpucycles_riscv_persecond(void) +{ + if (!cpufrequency) init(); + return cpufrequency; +} diff --git a/aeadaes256ocbtaglen128v1-rv32/try-anything.c b/aeadaes256ocbtaglen128v1-rv32/try-anything.c new file mode 100644 index 0000000..84517c4 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/try-anything.c @@ -0,0 +1,323 @@ +/* + * try-anything.c version 20190729 + * D. J. Bernstein + * Some portions adapted from TweetNaCl by Bernstein, Janssen, Lange, Schwabe. + * Public domain. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "kernelrandombytes.h" +#include "cpucycles.h" +#include "crypto_uint8.h" +#include "crypto_uint32.h" +#include "crypto_uint64.h" +#include "try.h" + +typedef crypto_uint8 u8; +typedef crypto_uint32 u32; +typedef crypto_uint64 u64; + +#define FOR(i,n) for (i = 0;i < n;++i) + +static u32 L32(u32 x,int c) { return (x << c) | ((x&0xffffffff) >> (32 - c)); } + +static u32 ld32(const u8 *x) +{ + u32 u = x[3]; + u = (u<<8)|x[2]; + u = (u<<8)|x[1]; + return (u<<8)|x[0]; +} + +static void st32(u8 *x,u32 u) +{ + int i; + FOR(i,4) { x[i] = u; u >>= 8; } +} + +static const u8 sigma[17] = "expand 32-byte k"; + +static void core(u8 *out,const u8 *in,const u8 *k) +{ + u32 w[16],x[16],y[16],t[4]; + int i,j,m; + + FOR(i,4) { + x[5*i] = ld32(sigma+4*i); + x[1+i] = ld32(k+4*i); + x[6+i] = ld32(in+4*i); + x[11+i] = ld32(k+16+4*i); + } + + FOR(i,16) y[i] = x[i]; + + FOR(i,20) { + FOR(j,4) { + FOR(m,4) t[m] = x[(5*j+4*m)%16]; + t[1] ^= L32(t[0]+t[3], 7); + t[2] ^= L32(t[1]+t[0], 9); + t[3] ^= L32(t[2]+t[1],13); + t[0] ^= L32(t[3]+t[2],18); + FOR(m,4) w[4*j+(j+m)%4] = t[m]; + } + FOR(m,16) x[m] = w[m]; + } + + FOR(i,16) st32(out + 4 * i,x[i] + y[i]); +} + +static void salsa20(u8 *c,u64 b,const u8 *n,const u8 *k) +{ + u8 z[16],x[64]; + u32 u,i; + if (!b) return; + FOR(i,16) z[i] = 0; + FOR(i,8) z[i] = n[i]; + while (b >= 64) { + core(x,z,k); + FOR(i,64) c[i] = x[i]; + u = 1; + for (i = 8;i < 16;++i) { + u += (u32) z[i]; + z[i] = u; + u >>= 8; + } + b -= 64; + c += 64; + } + if (b) { + core(x,z,k); + FOR(i,b) c[i] = x[i]; + } +} + +static void increment(u8 *n) +{ + if (!++n[0]) + if (!++n[1]) + if (!++n[2]) + if (!++n[3]) + if (!++n[4]) + if (!++n[5]) + if (!++n[6]) + if (!++n[7]) + ; +} + +static void testvector(unsigned char *x,unsigned long long xlen) +{ + const static unsigned char testvector_k[33] = "generate inputs for test vectors"; + static unsigned char testvector_n[8]; + 
salsa20(x,xlen,testvector_n,testvector_k); + increment(testvector_n); +} + +unsigned long long myrandom(void) +{ + unsigned char x[8]; + unsigned long long result; + testvector(x,8); + result = x[7]; + result = (result<<8)|x[6]; + result = (result<<8)|x[5]; + result = (result<<8)|x[4]; + result = (result<<8)|x[3]; + result = (result<<8)|x[2]; + result = (result<<8)|x[1]; + result = (result<<8)|x[0]; + return result; +} + +static void canary(unsigned char *x,unsigned long long xlen) +{ + const static unsigned char canary_k[33] = "generate pad to catch overwrites"; + static unsigned char canary_n[8]; + salsa20(x,xlen,canary_n,canary_k); + increment(canary_n); +} + +void double_canary(unsigned char *x2,unsigned char *x,unsigned long long xlen) +{ + canary(x - 16,16); + canary(x + xlen,16); + memcpy(x2 - 16,x - 16,16); + memcpy(x2 + xlen,x + xlen,16); +} + +void input_prepare(unsigned char *x2,unsigned char *x,unsigned long long xlen) +{ + testvector(x,xlen); + canary(x - 16,16); + canary(x + xlen,16); + memcpy(x2 - 16,x - 16,xlen + 32); +} + +void input_compare(const unsigned char *x2,const unsigned char *x,unsigned long long xlen,const char *fun) +{ + if (memcmp(x2 - 16,x - 16,xlen + 32)) { + fprintf(stderr,"%s overwrites input\n",fun); + exit(111); + } +} + +void output_prepare(unsigned char *x2,unsigned char *x,unsigned long long xlen) +{ + canary(x - 16,xlen + 32); + memcpy(x2 - 16,x - 16,xlen + 32); +} + +void output_compare(const unsigned char *x2,const unsigned char *x,unsigned long long xlen,const char *fun) +{ + if (memcmp(x2 - 16,x - 16,16)) { + fprintf(stderr,"%s writes before output\n",fun); + exit(111); + } + if (memcmp(x2 + xlen,x + xlen,16)) { + fprintf(stderr,"%s writes after output\n",fun); + exit(111); + } +} + +static unsigned char checksum_state[64]; +static char checksum_hex[65]; + +void checksum(const unsigned char *x,unsigned long long xlen) +{ + u8 block[16]; + int i; + while (xlen >= 16) { + core(checksum_state,x,checksum_state); + x += 16; + 
xlen -= 16; + } + FOR(i,16) block[i] = 0; + FOR(i,xlen) block[i] = x[i]; + block[xlen] = 1; + checksum_state[0] ^= 1; + core(checksum_state,block,checksum_state); +} + +static void printword(const char *s) +{ + if (!*s) putchar('-'); + while (*s) { + if (*s == ' ') putchar('_'); + else if (*s == '\t') putchar('_'); + else if (*s == '\r') putchar('_'); + else if (*s == '\n') putchar('_'); + else putchar(*s); + ++s; + } + putchar(' '); +} + +static void printnum(long long x) +{ + printf("%lld ",x); +} + +void fail(const char *why) +{ + fprintf(stderr,"%s\n",why); + exit(111); +} + +unsigned char *alignedcalloc(unsigned long long len) +{ + unsigned char *x = (unsigned char *) calloc(1,len + 256); + long long i; + if (!x) fail("out of memory"); + /* will never deallocate so shifting is ok */ + for (i = 0;i < len + 256;++i) x[i] = random(); + x += 64; + x += 63 & (-(unsigned long) x); + for (i = 0;i < len;++i) x[i] = 0; + return x; +} + +#define TIMINGS 63 +static long long cycles[TIMINGS + 1]; + +void limits() +{ +#ifdef RLIM_INFINITY + struct rlimit r; + r.rlim_cur = 0; + r.rlim_max = 0; +#ifdef RLIMIT_NOFILE + setrlimit(RLIMIT_NOFILE,&r); +#endif +#ifdef RLIMIT_NPROC + setrlimit(RLIMIT_NPROC,&r); +#endif +#ifdef RLIMIT_CORE + setrlimit(RLIMIT_CORE,&r); +#endif +#endif +} + +static unsigned char randombyte[1]; + +int main() +{ + long long i; + long long j; + long long abovej; + long long belowj; + long long checksumcycles; + long long cyclespersecond; + + cycles[0] = cpucycles(); + cycles[1] = cpucycles(); + cyclespersecond = cpucycles_persecond(); + + kernelrandombytes(randombyte,1); + preallocate(); + limits(); + + allocate(); + srandom(getpid()); + + cycles[0] = cpucycles(); + test(); + cycles[1] = cpucycles(); + checksumcycles = cycles[1] - cycles[0]; + + predoit(); + for (i = 0;i <= TIMINGS;++i) { + cycles[i] = cpucycles(); + } + for (i = 0;i <= TIMINGS;++i) { + cycles[i] = cpucycles(); + doit(); + } + for (i = 0;i < TIMINGS;++i) cycles[i] = cycles[i + 1] - 
cycles[i]; + for (j = 0;j < TIMINGS;++j) { + belowj = 0; + for (i = 0;i < TIMINGS;++i) if (cycles[i] < cycles[j]) ++belowj; + abovej = 0; + for (i = 0;i < TIMINGS;++i) if (cycles[i] > cycles[j]) ++abovej; + if (belowj * 2 < TIMINGS && abovej * 2 < TIMINGS) break; + } + + for (i = 0;i < 32;++i) { + checksum_hex[2 * i] = "0123456789abcdef"[15 & (checksum_state[i] >> 4)]; + checksum_hex[2 * i + 1] = "0123456789abcdef"[15 & checksum_state[i]]; + } + checksum_hex[2 * i] = 0; + + printword(checksum_hex); + printnum(cycles[j]); + printnum(checksumcycles); + printnum(cyclespersecond); + printword(primitiveimplementation); + printf("\n"); + return 0; +} diff --git a/aeadaes256ocbtaglen128v1-rv32/try.c b/aeadaes256ocbtaglen128v1-rv32/try.c new file mode 100644 index 0000000..687a21d --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/try.c @@ -0,0 +1,242 @@ +/* + * crypto_aead/try.c version 20200406 + * D. J. Bernstein + * Public domain. + * Auto-generated by trygen.py; do not edit. + */ + +#include "crypto_aead.h" +#include "try.h" + +const char *primitiveimplementation = crypto_aead_IMPLEMENTATION; + +#define TUNE_BYTES 1536 +#ifdef SMALL +#define MAXTEST_BYTES 128 +#else +#define MAXTEST_BYTES 4096 +#endif +#ifdef SMALL +#define LOOPS 64 +#else +#define LOOPS 512 +#endif + +static unsigned char *k; +static unsigned char *s; +static unsigned char *p; +static unsigned char *a; +static unsigned char *m; +static unsigned char *c; +static unsigned char *t; +static unsigned char *r; +static unsigned char *k2; +static unsigned char *s2; +static unsigned char *p2; +static unsigned char *a2; +static unsigned char *m2; +static unsigned char *c2; +static unsigned char *t2; +static unsigned char *r2; +#define klen crypto_aead_KEYBYTES +#define slen crypto_aead_NSECBYTES +#define plen crypto_aead_NPUBBYTES +unsigned long long alen; +unsigned long long mlen; +unsigned long long clen; +unsigned long long tlen; +#define rlen crypto_aead_NSECBYTES + +void preallocate(void) +{ +} + +void 
allocate(void) +{ + unsigned long long alloclen = 0; + if (alloclen < TUNE_BYTES) alloclen = TUNE_BYTES; + if (alloclen < MAXTEST_BYTES + crypto_aead_ABYTES) alloclen = MAXTEST_BYTES + crypto_aead_ABYTES; + if (alloclen < crypto_aead_KEYBYTES) alloclen = crypto_aead_KEYBYTES; + if (alloclen < crypto_aead_NSECBYTES) alloclen = crypto_aead_NSECBYTES; + if (alloclen < crypto_aead_NPUBBYTES) alloclen = crypto_aead_NPUBBYTES; + if (alloclen < crypto_aead_NSECBYTES) alloclen = crypto_aead_NSECBYTES; + k = alignedcalloc(alloclen); + s = alignedcalloc(alloclen); + p = alignedcalloc(alloclen); + a = alignedcalloc(alloclen); + m = alignedcalloc(alloclen); + c = alignedcalloc(alloclen); + t = alignedcalloc(alloclen); + r = alignedcalloc(alloclen); + k2 = alignedcalloc(alloclen); + s2 = alignedcalloc(alloclen); + p2 = alignedcalloc(alloclen); + a2 = alignedcalloc(alloclen); + m2 = alignedcalloc(alloclen); + c2 = alignedcalloc(alloclen); + t2 = alignedcalloc(alloclen); + r2 = alignedcalloc(alloclen); +} + +void predoit(void) +{ +} + +void doit(void) +{ + crypto_aead_encrypt(c,&clen,m,TUNE_BYTES,a,TUNE_BYTES,s,p,k); + crypto_aead_decrypt(t,&tlen,r,c,clen,a,TUNE_BYTES,p,k); +} + +void test(void) +{ + unsigned long long loop; + + for (loop = 0;loop < LOOPS;++loop) { + mlen = myrandom() % (MAXTEST_BYTES + 1); + alen = myrandom() % (MAXTEST_BYTES + 1); + + clen = mlen + crypto_aead_ABYTES; + output_prepare(c2,c,clen); + input_prepare(m2,m,mlen); + input_prepare(a2,a,alen); + input_prepare(s2,s,slen); + input_prepare(p2,p,plen); + input_prepare(k2,k,klen); + if (crypto_aead_encrypt(c,&clen,m,mlen,a,alen,s,p,k) != 0) fail("crypto_aead_encrypt returns nonzero"); + if (clen < mlen) fail("crypto_aead_encrypt returns smaller output than input"); + if (clen > mlen + crypto_aead_ABYTES) fail("crypto_aead_encrypt returns more than crypto_aead_ABYTES extra bytes"); + checksum(c,clen); + output_compare(c2,c,clen,"crypto_aead_encrypt"); + input_compare(m2,m,mlen,"crypto_aead_encrypt"); + 
input_compare(a2,a,alen,"crypto_aead_encrypt"); + input_compare(s2,s,slen,"crypto_aead_encrypt"); + input_compare(p2,p,plen,"crypto_aead_encrypt"); + input_compare(k2,k,klen,"crypto_aead_encrypt"); + + double_canary(c2,c,clen); + double_canary(m2,m,mlen); + double_canary(a2,a,alen); + double_canary(s2,s,slen); + double_canary(p2,p,plen); + double_canary(k2,k,klen); + if (crypto_aead_encrypt(c2,&clen,m2,mlen,a2,alen,s2,p2,k2) != 0) fail("crypto_aead_encrypt returns nonzero"); + if (memcmp(c2,c,clen) != 0) fail("crypto_aead_encrypt is nondeterministic"); + +#if crypto_aead_NOOVERLAP == 1 +#else + double_canary(c2,c,clen); + double_canary(m2,m,mlen); + double_canary(a2,a,alen); + double_canary(s2,s,slen); + double_canary(p2,p,plen); + double_canary(k2,k,klen); + if (crypto_aead_encrypt(m2,&clen,m2,mlen,a,alen,s,p,k) != 0) fail("crypto_aead_encrypt with m=c overlap returns nonzero"); + if (memcmp(m2,c,clen) != 0) fail("crypto_aead_encrypt does not handle m=c overlap"); + memcpy(m2,m,mlen); + if (crypto_aead_encrypt(a2,&clen,m,mlen,a2,alen,s,p,k) != 0) fail("crypto_aead_encrypt with a=c overlap returns nonzero"); + if (memcmp(a2,c,clen) != 0) fail("crypto_aead_encrypt does not handle a=c overlap"); + memcpy(a2,a,alen); + if (crypto_aead_encrypt(s2,&clen,m,mlen,a,alen,s2,p,k) != 0) fail("crypto_aead_encrypt with s=c overlap returns nonzero"); + if (memcmp(s2,c,clen) != 0) fail("crypto_aead_encrypt does not handle s=c overlap"); + memcpy(s2,s,slen); + if (crypto_aead_encrypt(p2,&clen,m,mlen,a,alen,s,p2,k) != 0) fail("crypto_aead_encrypt with p=c overlap returns nonzero"); + if (memcmp(p2,c,clen) != 0) fail("crypto_aead_encrypt does not handle p=c overlap"); + memcpy(p2,p,plen); + if (crypto_aead_encrypt(k2,&clen,m,mlen,a,alen,s,p,k2) != 0) fail("crypto_aead_encrypt with k=c overlap returns nonzero"); + if (memcmp(k2,c,clen) != 0) fail("crypto_aead_encrypt does not handle k=c overlap"); + memcpy(k2,k,klen); +#endif + + tlen = clen; + output_prepare(t2,t,tlen); + 
output_prepare(r2,r,rlen); + memcpy(c2,c,clen); + double_canary(c2,c,clen); + memcpy(a2,a,alen); + double_canary(a2,a,alen); + memcpy(p2,p,plen); + double_canary(p2,p,plen); + memcpy(k2,k,klen); + double_canary(k2,k,klen); + if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt returns nonzero"); + if (tlen != mlen) fail("crypto_aead_decrypt does not match mlen"); + if (memcmp(t,m,mlen) != 0) fail("crypto_aead_decrypt does not match m"); + if (memcmp(r,s,slen) != 0) fail("crypto_aead_decrypt does not match s"); + checksum(t,tlen); + checksum(r,rlen); + output_compare(t2,t,clen,"crypto_aead_decrypt"); + output_compare(r2,r,rlen,"crypto_aead_decrypt"); + input_compare(c2,c,clen,"crypto_aead_decrypt"); + input_compare(a2,a,alen,"crypto_aead_decrypt"); + input_compare(p2,p,plen,"crypto_aead_decrypt"); + input_compare(k2,k,klen,"crypto_aead_decrypt"); + + double_canary(t2,t,tlen); + double_canary(r2,r,rlen); + double_canary(c2,c,clen); + double_canary(a2,a,alen); + double_canary(p2,p,plen); + double_canary(k2,k,klen); + if (crypto_aead_decrypt(t2,&tlen,r2,c2,clen,a2,alen,p2,k2) != 0) fail("crypto_aead_decrypt returns nonzero"); + if (memcmp(t2,t,tlen) != 0) fail("crypto_aead_decrypt is nondeterministic"); + if (memcmp(r2,r,rlen) != 0) fail("crypto_aead_decrypt is nondeterministic"); + +#if crypto_aead_NOOVERLAP == 1 +#else + double_canary(t2,t,tlen); + double_canary(r2,r,rlen); + double_canary(c2,c,clen); + double_canary(a2,a,alen); + double_canary(p2,p,plen); + double_canary(k2,k,klen); + if (crypto_aead_decrypt(c2,&tlen,r,c2,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt with c=t overlap returns nonzero"); + if (memcmp(c2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle c=t overlap"); + memcpy(c2,c,clen); + if (crypto_aead_decrypt(a2,&tlen,r,c,clen,a2,alen,p,k) != 0) fail("crypto_aead_decrypt with a=t overlap returns nonzero"); + if (memcmp(a2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle a=t overlap"); + 
memcpy(a2,a,alen); + if (crypto_aead_decrypt(p2,&tlen,r,c,clen,a,alen,p2,k) != 0) fail("crypto_aead_decrypt with p=t overlap returns nonzero"); + if (memcmp(p2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle p=t overlap"); + memcpy(p2,p,plen); + if (crypto_aead_decrypt(k2,&tlen,r,c,clen,a,alen,p,k2) != 0) fail("crypto_aead_decrypt with k=t overlap returns nonzero"); + if (memcmp(k2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle k=t overlap"); + memcpy(k2,k,klen); +#endif + +#if crypto_aead_NOOVERLAP == 1 +#else + double_canary(t2,t,tlen); + double_canary(r2,r,rlen); + double_canary(c2,c,clen); + double_canary(a2,a,alen); + double_canary(p2,p,plen); + double_canary(k2,k,klen); + if (crypto_aead_decrypt(t,&tlen,c2,c2,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt with c=r overlap returns nonzero"); + if (memcmp(c2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle c=r overlap"); + memcpy(c2,c,clen); + if (crypto_aead_decrypt(t,&tlen,a2,c,clen,a2,alen,p,k) != 0) fail("crypto_aead_decrypt with a=r overlap returns nonzero"); + if (memcmp(a2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle a=r overlap"); + memcpy(a2,a,alen); + if (crypto_aead_decrypt(t,&tlen,p2,c,clen,a,alen,p2,k) != 0) fail("crypto_aead_decrypt with p=r overlap returns nonzero"); + if (memcmp(p2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle p=r overlap"); + memcpy(p2,p,plen); + if (crypto_aead_decrypt(t,&tlen,k2,c,clen,a,alen,p,k2) != 0) fail("crypto_aead_decrypt with k=r overlap returns nonzero"); + if (memcmp(k2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle k=r overlap"); + memcpy(k2,k,klen); +#endif + + c[myrandom() % clen] += 1 + (myrandom() % 255); + if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0) + if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0)) + fail("crypto_aead_decrypt allows trivial forgeries"); + c[myrandom() % clen] += 1 + (myrandom() % 255); + if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0) + 
if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0)) + fail("crypto_aead_decrypt allows trivial forgeries"); + c[myrandom() % clen] += 1 + (myrandom() % 255); + if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0) + if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0)) + fail("crypto_aead_decrypt allows trivial forgeries"); + } +} diff --git a/aeadaes256ocbtaglen128v1-rv32/try.h b/aeadaes256ocbtaglen128v1-rv32/try.h new file mode 100644 index 0000000..47db359 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/try.h @@ -0,0 +1,21 @@ +#include +#include + +/* provided by try.c: */ +extern const char *primitiveimplementation; +extern void preallocate(void); +extern void allocate(void);; +extern void test(void); +extern void predoit(void); +extern void doit(void); + +/* provided by try-anything.c: */ +extern void fail(const char *); +extern unsigned char *alignedcalloc(unsigned long long); +extern void checksum(const unsigned char *,unsigned long long); +extern void double_canary(unsigned char *,unsigned char *,unsigned long long); +extern void input_prepare(unsigned char *,unsigned char *,unsigned long long); +extern void output_prepare(unsigned char *,unsigned char *,unsigned long long); +extern void input_compare(const unsigned char *,const unsigned char *,unsigned long long,const char *); +extern void output_compare(const unsigned char *,const unsigned char *,unsigned long long,const char *); +extern unsigned long long myrandom(void); diff --git a/aeadaes256ocbtaglen128v1-rv32/verify.c b/aeadaes256ocbtaglen128v1-rv32/verify.c new file mode 100644 index 0000000..d356060 --- /dev/null +++ b/aeadaes256ocbtaglen128v1-rv32/verify.c @@ -0,0 +1,24 @@ +#include "crypto_verify.h" + +int crypto_verify(const unsigned char *x,const unsigned char *y) +{ + unsigned int differentbits = 0; +#define F(i) differentbits |= x[i] ^ y[i]; + F(0) + F(1) + F(2) + F(3) + F(4) + F(5) + F(6) + F(7) + F(8) + F(9) + F(10) + F(11) + F(12) + F(13) + 
F(14) + F(15) + return (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/aes256decrypt-rv32/amd64cpuinfo.c b/aes256decrypt-rv32/amd64cpuinfo.c deleted file mode 100644 index 076e7eb..0000000 --- a/aes256decrypt-rv32/amd64cpuinfo.c +++ /dev/null @@ -1,17 +0,0 @@ -#include -#include -#include -#include "osfreq.c" - -long long cpucycles_riscv(void) -{ - unsigned long long result; - asm volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax" - : "=a" (result) :: "%rdx"); - return result; -} - -long long cpucycles_riscv_persecond(void) -{ - return osfreq(); -} diff --git a/aes256decrypt-rv32/osfreq.c b/aes256decrypt-rv32/osfreq.c deleted file mode 100644 index b705fa4..0000000 --- a/aes256decrypt-rv32/osfreq.c +++ /dev/null @@ -1,93 +0,0 @@ -static double osfreq(void) -{ - FILE *f; - char *x; - double result; - int s; - - f = fopen("/etc/cpucyclespersecond", "r"); - if (f) { - s = fscanf(f,"%lf",&result); - fclose(f); - if (s > 0) return result; - } - - f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed", "r"); - if (f) { - s = fscanf(f,"%lf",&result); - fclose(f); - if (s > 0) return 1000.0 * result; - } - - f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq", "r"); - if (f) { - s = fscanf(f,"%lf",&result); - fclose(f); - if (s > 0) return 1000.0 * result; - } - - f = fopen("/sys/devices/system/cpu/cpu0/clock_tick", "r"); - if (f) { - s = fscanf(f,"%lf",&result); - fclose(f); - if (s > 0) return result; - } - - f = fopen("/proc/cpuinfo","r"); - if (f) { - for (;;) { - s = fscanf(f,"cpu MHz : %lf",&result); - if (s > 0) break; - if (s == 0) s = fscanf(f,"%*[^\n]\n"); - if (s < 0) { result = 0; break; } - } - fclose(f); - if (result) return 1000000.0 * result; - } - - f = fopen("/proc/cpuinfo","r"); - if (f) { - for (;;) { - s = fscanf(f,"clock : %lf",&result); - if (s > 0) break; - if (s == 0) s = fscanf(f,"%*[^\n]\n"); - if (s < 0) { result = 0; break; } - } - fclose(f); - if (result) return 1000000.0 * result; - } - - f = 
popen("sysctl hw.cpufrequency 2>/dev/null","r"); - if (f) { - s = fscanf(f,"hw.cpufrequency: %lf",&result); - pclose(f); - if (s > 0) if (result > 0) return result; - } - - f = popen("/usr/sbin/lsattr -E -l proc0 -a frequency 2>/dev/null","r"); - if (f) { - s = fscanf(f,"frequency %lf",&result); - pclose(f); - if (s > 0) return result; - } - - f = popen("/usr/sbin/psrinfo -v 2>/dev/null","r"); - if (f) { - for (;;) { - s = fscanf(f," The %*s processor operates at %lf MHz",&result); - if (s > 0) break; - if (s == 0) s = fscanf(f,"%*[^\n]\n"); - if (s < 0) { result = 0; break; } - } - pclose(f); - if (result) return 1000000.0 * result; - } - - x = getenv("cpucyclespersecond"); - if (x) { - s = sscanf(x,"%lf",&result); - if (s > 0) return result; - } - - return 0; -} diff --git a/aes256gcmv1standalone-rv32/encrypt.c b/aes256gcmv1standalone-rv32/encrypt.c index e1e9965..cf45407 100644 --- a/aes256gcmv1standalone-rv32/encrypt.c +++ b/aes256gcmv1standalone-rv32/encrypt.c @@ -249,190 +249,9 @@ static inline int64_t _rv64_clmulh(int64_t rs1, int64_t rs2) /* this is basically Supercop's crypto_aead/aes256gcmv1/dolbeau/aesenc-int, but without the unrolling. - So we have a thin compatibility layer to SSE's __m128i data format - and associated instructions to support GHASH & the full algo. 
*/ -/* ouch */ -typedef struct { - uint64_t l; - uint64_t h; -} __m128i; - -//#define _mm_loadu_si128(a) (*(const __m128i*)a) -static inline __m128i _mm_loadu_si128(const __m128i *ptr) { - __m128i r; - r.l = ((const uint64_t*)ptr)[0]; - r.h = ((const uint64_t*)ptr)[1]; - return r; -} - -//#define _mm_storeu_si128(x,a) (*(__m128i*)x)=a -static inline void _mm_storeu_si128(__m128i *ptr, const __m128i data) { - ((uint64_t*)ptr)[0] = data.l; - ((uint64_t*)ptr)[1] = data.h; -} - -static inline __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i b, const int x) { - __m128i r; - switch (x) { - case 0x00: - r.l = _rv64_clmul(a.l, b.l); - r.h = _rv64_clmulh(a.l, b.l); - break; - case 0x01: - r.l = _rv64_clmul(a.l, b.h); - r.h = _rv64_clmulh(a.l, b.h); - break; - case 0x10: - r.l = _rv64_clmul(a.h, b.l); - r.h = _rv64_clmulh(a.h, b.l); - break; - case 0x11: - r.l = _rv64_clmul(a.h, b.h); - r.h = _rv64_clmulh(a.h, b.h); - break; - } - return r; -} - -/* -static inline __m128i (const __m128i a, const __m128i b) { - __m128i r; - return r; -} -*/ -static inline __m128i _mm_xor_si128(const __m128i a, const __m128i b) { - __m128i r; - r.l = a.l ^ b.l; - r.h = a.h ^ b.h; - return r; -} -static inline __m128i _mm_or_si128(const __m128i a, const __m128i b) { - __m128i r; - r.l = a.l | b.l; - r.h = a.h | b.h; - return r; -} -static inline __m128i _mm_and_si128(const __m128i a, const __m128i b) { - __m128i r; - r.l = a.l & b.l; - r.h = a.h & b.h; - return r; -} -static inline __m128i _mm_slli_si128(const __m128i a, const int b) { - __m128i r; - switch (b) { - case 4: - r.l = a.l << 32; - r.h = a.h << 32 | a.l >> 32; - break; - case 8: - r.l = 0; - r.h = a.l; - break; - case 12: - r.l = 0; - r.h = a.l << 32; - break; - } - return r; -} -static inline __m128i _mm_srli_si128(const __m128i a, const int b) { - __m128i r; - switch (b) { - case 4: - r.l = a.l >> 32 | a.h << 32; - r.h = a.h >> 32; - break; - case 8: - r.l = a.h; - r.h = 0; - break; - case 12: - r.l = a.h >> 32; - r.h = 
0; - break; - } - return r; -} -static inline __m128i _mm_srli_epi32(const __m128i a, const int b) { - __m128i r; - r.l = ((a.l & 0x00000000FFFFFFFFull) >> b) | (((a.l & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull); - r.h = ((a.h & 0x00000000FFFFFFFFull) >> b) | (((a.h & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull); - return r; -} -static inline __m128i _mm_slli_epi32(const __m128i a, const int b) { - __m128i r; - r.l = (((a.l & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.l & 0xFFFFFFFF00000000ull) << b); - r.h = (((a.h & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.h & 0xFFFFFFFF00000000ull) << b); - return r; -} -static inline __m128i _mm_insert_epi64(const __m128i a, const uint64_t x, const int b) { - __m128i r; - if (b == 0) { - r.l = x; - r.h = a.h; - } else { - r.l = a.l; - r.h = x; - } - return r; -} -static inline __m128i _mm_setzero_si128(void) { - __m128i r; - r.l = 0; - r.h = 0; - return r; -} -static inline __m128i _mm_set1_epi32(const uint32_t x) { - __m128i r; - r.l = x | ((uint64_t)x) << 32; - r.h = x | ((uint64_t)x) << 32; - return r; -} - -static inline uint64_t bytereverse64(const uint64_t a) { - uint64_t r; - r = (uint32_t)_rv32_grev((a>>32), 24) | (((uint64_t)_rv32_grev((a&0xFFFFFFFF), 24))<<32); - return r; -} -static inline __m128i bytereverse128(const __m128i a) { - __m128i r; - r.l = bytereverse64(a.h); - r.h = bytereverse64(a.l); - return r; -} - -static inline uint64_t bitreverse64(const uint64_t a) { - uint64_t r; - r = (uint32_t)_rv32_grev((a&0xFFFFFFFF), 7) | (((uint64_t)_rv32_grev((a>>32), 7))<<32); - return r; -} -static inline __m128i bitreverse128(const __m128i a) { - __m128i r; - r.l = bitreverse64(a.l); - r.h = bitreverse64(a.h); - return r; -} - -static inline uint64_t wordreverse64(const uint64_t a) { - uint64_t r; - r = (a>>32)|(a<<32); - return r; -} -static inline __m128i wordreverse128(const __m128i a) { - __m128i r; - r.l = wordreverse64(a.h); - r.h = wordreverse64(a.l); - 
return r; -} -static inline __m128i doublewordreverse128(const __m128i a) { - __m128i r; - r.l = a.h; - r.h = a.l; - return r; -} +#include "m128_compat.h" static inline void addmul_rv(unsigned char *c, const unsigned char *a, int xlen, diff --git a/m128_compat.h b/m128_compat.h new file mode 100644 index 0000000..2f79b51 --- /dev/null +++ b/m128_compat.h @@ -0,0 +1,241 @@ +/* + * A thin compatibility layer to SSE's __m128i data format + * and associated instructions to support GHASH & the full algo. +*/ + +#ifndef __M128_COMPAT_H__ +#define __M128_COMPAT_H__ + +#include "new_instructions_support_b.h" + +#include + +/* ouch */ +typedef struct { + uint64_t l; + uint64_t h; +} __m128i; + +//#define _mm_loadu_si128(a) (*(const __m128i*)a) +static inline __m128i _mm_loadu_si128(const __m128i *ptr) { + __m128i r; + r.l = ((const uint64_t*)ptr)[0]; + r.h = ((const uint64_t*)ptr)[1]; + return r; +} + +//#define _mm_storeu_si128(x,a) (*(__m128i*)x)=a +static inline void _mm_storeu_si128(__m128i *ptr, const __m128i data) { + ((uint64_t*)ptr)[0] = data.l; + ((uint64_t*)ptr)[1] = data.h; +} +static inline void _mm_store_si128(__m128i *ptr, const __m128i data) { + ((uint64_t*)ptr)[0] = data.l; + ((uint64_t*)ptr)[1] = data.h; +} +static inline void _mm_storel_epi64 (__m128i *ptr, const __m128i data) { + ((uint64_t*)ptr)[0] = data.l; +} + +static inline __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i b, const int x) { + __m128i r; + switch (x) { + case 0x00: + r.l = _rv64_clmul(a.l, b.l); + r.h = _rv64_clmulh(a.l, b.l); + break; + case 0x01: + r.l = _rv64_clmul(a.l, b.h); + r.h = _rv64_clmulh(a.l, b.h); + break; + case 0x10: + r.l = _rv64_clmul(a.h, b.l); + r.h = _rv64_clmulh(a.h, b.l); + break; + case 0x11: + r.l = _rv64_clmul(a.h, b.h); + r.h = _rv64_clmulh(a.h, b.h); + break; + } + return r; +} + +/* +static inline __m128i (const __m128i a, const __m128i b) { + __m128i r; + return r; +} +*/ +static inline __m128i _mm_xor_si128(const __m128i a, const __m128i b) { + 
__m128i r; + r.l = a.l ^ b.l; + r.h = a.h ^ b.h; + return r; +} +static inline __m128i _mm_or_si128(const __m128i a, const __m128i b) { + __m128i r; + r.l = a.l | b.l; + r.h = a.h | b.h; + return r; +} +static inline __m128i _mm_and_si128(const __m128i a, const __m128i b) { + __m128i r; + r.l = a.l & b.l; + r.h = a.h & b.h; + return r; +} +static inline __m128i _mm_slli_si128(const __m128i a, const int b) { + __m128i r; + switch (b) { + case 4: + r.l = a.l << 32; + r.h = a.h << 32 | a.l >> 32; + break; + case 8: + r.l = 0; + r.h = a.l; + break; + case 12: + r.l = 0; + r.h = a.l << 32; + break; + default: + fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, b); + break; + } + return r; +} +static inline __m128i _mm_srli_si128(const __m128i a, const int b) { + __m128i r; + switch (b) { + case 1: + r.l = a.l >> 8 | a.h << 56; + r.h = a.h >> 8; + break; + case 4: + r.l = a.l >> 32 | a.h << 32; + r.h = a.h >> 32; + break; + case 8: + r.l = a.h; + r.h = 0; + break; + case 12: + r.l = a.h >> 32; + r.h = 0; + break; + default: + fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, b); + break; + } + return r; +} +static inline __m128i _mm_srli_epi32(const __m128i a, const int b) { + __m128i r; + r.l = ((a.l & 0x00000000FFFFFFFFull) >> b) | (((a.l & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull); + r.h = ((a.h & 0x00000000FFFFFFFFull) >> b) | (((a.h & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull); + return r; +} +static inline __m128i _mm_slli_epi32(const __m128i a, const int b) { + __m128i r; + r.l = (((a.l & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.l & 0xFFFFFFFF00000000ull) << b); + r.h = (((a.h & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.h & 0xFFFFFFFF00000000ull) << b); + return r; +} +/* static inline __m128i _mm_srai_epi32(const __m128i a, const int b) { */ +/* __m128i r; */ +/* r.l = (((int32_t)(a.l & 0x00000000FFFFFFFFull)) >> b) | ((((int32_t)(a.l & 0xFFFFFFFF00000000ull)) >> b) & 
0xFFFFFFFF00000000ull); */ +/* r.h = (((int32_t)(a.h & 0x00000000FFFFFFFFull)) >> b) | ((((int32_t)(a.h & 0xFFFFFFFF00000000ull)) >> b) & 0xFFFFFFFF00000000ull); */ +/* return r; */ +/* } */ +static inline __m128i _mm_insert_epi64(const __m128i a, const uint64_t x, const int b) { + __m128i r; + if (b == 0) { + r.l = x; + r.h = a.h; + } else { + r.l = a.l; + r.h = x; + } + return r; +} +static inline __m128i _mm_setzero_si128(void) { + __m128i r; + r.l = 0; + r.h = 0; + return r; +} +static inline __m128i _mm_set1_epi32(const uint32_t x) { + __m128i r; + r.l = x | ((uint64_t)x) << 32; + r.h = x | ((uint64_t)x) << 32; + return r; +} +static inline __m128i _mm_set_epi32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { + __m128i r; + r.l = (uint64_t)e0 | ((uint64_t)e1) << 32; + r.h = (uint64_t)e2 | ((uint64_t)e3) << 32; + return r; +} +/* non-intel stuff, used to replace some common use cases */ +static inline uint64_t bytereverse64(const uint64_t a) { + uint64_t r; + r = (uint32_t)_rv32_grev((a>>32), 24) | (((uint64_t)_rv32_grev((a&0xFFFFFFFF), 24))<<32); + return r; +} +static inline __m128i bytereverse128(const __m128i a) { + __m128i r; + r.l = bytereverse64(a.h); + r.h = bytereverse64(a.l); + return r; +} + +static inline uint64_t bitreverse64(const uint64_t a) { + uint64_t r; + r = (uint32_t)_rv32_grev((a&0xFFFFFFFF), 7) | (((uint64_t)_rv32_grev((a>>32), 7))<<32); + return r; +} +static inline __m128i bitreverse128(const __m128i a) { + __m128i r; + r.l = bitreverse64(a.l); + r.h = bitreverse64(a.h); + return r; +} + +static inline uint64_t wordreverse64(const uint64_t a) { + uint64_t r; + r = (a>>32)|(a<<32); + return r; +} +static inline __m128i wordreverse128(const __m128i a) { + __m128i r; + r.l = wordreverse64(a.h); + r.h = wordreverse64(a.l); + return r; +} +static inline __m128i doublewordreverse128(const __m128i a) { + __m128i r; + r.l = a.h; + r.h = a.l; + return r; +} +static inline __m128i wordrotate1l128(const __m128i a) { + __m128i r; + /* i.e. 
epi32 _MM_SHUFFLE(2,1,0,3) */ + r.l = (a.h >> 32) | (a.l << 32); + r.h = (a.l >> 32) | (a.h << 32); + return r; +} +static inline __m128i halfwordandzero(const uint16_t a) { + __m128i r; + r.l = a; + r.h = 0; + return r; +} +static inline __m128i wordsign128(const __m128i a) { + __m128i r; + r.l = (a.l & 0x0000000080000000ull ? 0x00000000FFFFFFFFull : 0) | (a.l & 0x8000000000000000ull ? 0xFFFFFFFF00000000ull : 0); + r.h = (a.h & 0x0000000080000000ull ? 0x00000000FFFFFFFFull : 0) | (a.h & 0x8000000000000000ull ? 0xFFFFFFFF00000000ull : 0); + return r; +} +#endif // __M128_COMPAT_H__