add a quick'n'dirty implementation of RV32BK-accelerated AES-OCB, using the _m128i compatibility layer (spun off in its own header)

This commit is contained in:
Romain Dolbeau 2021-02-17 09:02:43 -05:00
parent 431fdc5288
commit 8ad11036be
22 changed files with 1943 additions and 292 deletions

View file

@ -0,0 +1,47 @@
# Build for the RV32BK-accelerated AES-256-OCB benchmark.
# Two toolchains: CC builds the harness for plain rv32ima; ALTCC (rv64 elf
# compiler, -march=rv32imab) compiles encrypt.c via assembly so the B/K
# crypto instructions can be used.
SRCs=encrypt.c try-anything.c verify.c
OBJs=$(SRCs:.c=.o)
SCLIBS=cpucycles.o kernelrandombytes.o
COMPDIR=~dolbeau2/LITEX/buildroot-rv32/output/host
ALTCOMPDIR=/opt/riscv64b
CC=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-gcc
ALTCC=$(ALTCOMPDIR)/bin/riscv64-unknown-elf-gcc
CXX=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-g++
STRIP=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-strip
NEWOPT=-march=rv32imab -mabi=ilp32 -I. -I.. -O3 -DRV32B #-fno-vectorize #-DUSE_EPI_CUSTOM
OPT=-march=rv32ima -mabi=ilp32 -I. -I.. -O3 #-fno-vectorize #-DUSE_EPI_CUSTOM
#ALTCC=$(CC)
#NEWOPT=$(OPT)
all: aeadaes256ocbtaglen128v1 aeadaes256ocbtaglen128v1_small
clean:
rm -f $(OBJs) *.S try.o try_small.o encrypt.o aeadaes256ocbtaglen128v1 aeadaes256ocbtaglen128v1_small
%.o: %.c
$(CC) $(OPT) $< -c -o $@
try.o: try.c
$(CC) $(OPT) $< -c -o $@
# _small variant compiles the same try.c with -DSMALL (reduced test sizes)
try_small.o: try.c
$(CC) $(OPT) $< -c -o $@ -DSMALL
# go through a .S intermediate: ALTCC emits the B/K instructions, then
# assembles them separately
encrypt.S: encrypt.c
$(ALTCC) $(NEWOPT) $< -S -o $@
encrypt.o: encrypt.S
$(ALTCC) $(NEWOPT) $< -c -o $@
aeadaes256ocbtaglen128v1: $(OBJs) encrypt.o try.o $(SCLIBS)
$(CXX) $(OPT) $^ -o $@
aeadaes256ocbtaglen128v1_small: $(OBJs) encrypt.o try_small.o $(SCLIBS)
$(CXX) $(OPT) $^ -o $@
kernelrandombytes.o: random.cpp
$(CXX) $(OPT) $< -c -o $@
# cycle counter: -O1 is enough and keeps the rdcycle loop intact
cpucycles.o: riscv.c
$(CC) $< -march=rv32ima -mabi=ilp32 -I. -O1 -c -o $@

View file

@ -0,0 +1,4 @@
/* SUPERCOP-style API parameters: AES-256 key, 96-bit nonce, 128-bit tag. */
#define CRYPTO_KEYBYTES 32
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 12
#define CRYPTO_ABYTES 16

View file

@ -0,0 +1,28 @@
/*
cpucycles riscv.h version 20190803
D. J. Bernstein
Romain Dolbeau
Public domain.
*/
#ifndef CPUCYCLES_riscv_h
#define CPUCYCLES_riscv_h
#ifdef __cplusplus
extern "C" {
#endif
/* Current cycle count (rdcycle) and estimated cycles per second. */
extern long long cpucycles_riscv(void);
extern long long cpucycles_riscv_persecond(void);
#ifdef __cplusplus
}
#endif
/* Generic cpucycles_* aliases, unless another implementation was chosen. */
#ifndef cpucycles_implementation
#define cpucycles_implementation "riscv"
#define cpucycles cpucycles_riscv
#define cpucycles_persecond cpucycles_riscv_persecond
#endif
#endif

View file

@ -0,0 +1,17 @@
/* Generic crypto_aead_* names mapped onto the aes256ocb primitive. */
#ifndef crypto_aead_H
#define crypto_aead_H
#include "crypto_aead_aeadaes256ocbtaglen128v1.h"
#define crypto_aead_encrypt crypto_aead_aeadaes256ocbtaglen128v1_encrypt
#define crypto_aead_decrypt crypto_aead_aeadaes256ocbtaglen128v1_decrypt
#define crypto_aead_KEYBYTES crypto_aead_aeadaes256ocbtaglen128v1_KEYBYTES
#define crypto_aead_NSECBYTES crypto_aead_aeadaes256ocbtaglen128v1_NSECBYTES
#define crypto_aead_NPUBBYTES crypto_aead_aeadaes256ocbtaglen128v1_NPUBBYTES
#define crypto_aead_ABYTES crypto_aead_aeadaes256ocbtaglen128v1_ABYTES
#define crypto_aead_NOOVERLAP crypto_aead_aeadaes256ocbtaglen128v1_NOOVERLAP
#define crypto_aead_PRIMITIVE "aeadaes256ocbtaglen128v1"
#define crypto_aead_IMPLEMENTATION crypto_aead_aeadaes256ocbtaglen128v1_IMPLEMENTATION
#define crypto_aead_VERSION crypto_aead_aeadaes256ocbtaglen128v1_VERSION
#endif

View file

@ -0,0 +1,31 @@
/* Primitive-level header: declares the rv32 implementation's entry points
   and maps the unsuffixed names onto them (SUPERCOP convention). */
#ifndef crypto_aead_aeadaes256ocbtaglen128v1_H
#define crypto_aead_aeadaes256ocbtaglen128v1_H
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_KEYBYTES 32
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_NSECBYTES 0
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_NPUBBYTES 12
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_ABYTES 16
#ifdef __cplusplus
extern "C" {
#endif
/* encrypt(c,clen, m,mlen, ad,adlen, nsec, npub, k); decrypt mirrors it. */
extern int crypto_aead_aeadaes256ocbtaglen128v1_rv32_encrypt(unsigned char *,unsigned long long *,const unsigned char *,unsigned long long,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *,const unsigned char *);
extern int crypto_aead_aeadaes256ocbtaglen128v1_rv32_decrypt(unsigned char *,unsigned long long *,unsigned char *,const unsigned char *,unsigned long long,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *);
#ifdef __cplusplus
}
#endif
#define crypto_aead_aeadaes256ocbtaglen128v1_encrypt crypto_aead_aeadaes256ocbtaglen128v1_rv32_encrypt
#define crypto_aead_aeadaes256ocbtaglen128v1_decrypt crypto_aead_aeadaes256ocbtaglen128v1_rv32_decrypt
#define crypto_aead_aeadaes256ocbtaglen128v1_KEYBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_KEYBYTES
#define crypto_aead_aeadaes256ocbtaglen128v1_NSECBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_NSECBYTES
#define crypto_aead_aeadaes256ocbtaglen128v1_NPUBBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_NPUBBYTES
#define crypto_aead_aeadaes256ocbtaglen128v1_ABYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_ABYTES
/* NOTE(review): _rv32_NOOVERLAP is never defined anywhere visible — confirm
   whether any caller actually uses crypto_aead_..._NOOVERLAP. */
#define crypto_aead_aeadaes256ocbtaglen128v1_NOOVERLAP crypto_aead_aeadaes256ocbtaglen128v1_rv32_NOOVERLAP
#define crypto_aead_aeadaes256ocbtaglen128v1_IMPLEMENTATION "crypto_aead/aeadaes256ocbtaglen128v1/dolbeau/aesenc-int"
#ifndef crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION "-"
#endif
#define crypto_aead_aeadaes256ocbtaglen128v1_VERSION crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION
#endif

View file

@ -0,0 +1,6 @@
/* 32-bit unsigned type (assumes 32-bit int, true on the rv32 targets here). */
#ifndef crypto_uint32_h
#define crypto_uint32_h
typedef unsigned int crypto_uint32;
#endif

View file

@ -0,0 +1,6 @@
/* 64-bit unsigned type (unsigned long long is at least 64 bits in C99). */
#ifndef crypto_uint64_h
#define crypto_uint64_h
typedef unsigned long long crypto_uint64;
#endif

View file

@ -0,0 +1,6 @@
/* 8-bit unsigned type used for raw byte buffers. */
#ifndef crypto_uint8_h
#define crypto_uint8_h
typedef unsigned char crypto_uint8;
#endif

View file

@ -0,0 +1,12 @@
/* Generic crypto_verify names mapped onto the 16-byte comparator. */
#ifndef crypto_verify_H
#define crypto_verify_H
#include "crypto_verify_16.h"
#define crypto_verify crypto_verify_16
#define crypto_verify_BYTES crypto_verify_16_BYTES
#define crypto_verify_PRIMITIVE "16"
#define crypto_verify_IMPLEMENTATION crypto_verify_16_IMPLEMENTATION
#define crypto_verify_VERSION crypto_verify_16_VERSION
#endif

View file

@ -0,0 +1,22 @@
/* Constant-time 16-byte comparison (returns 0 on equality). */
#ifndef crypto_verify_16_H
#define crypto_verify_16_H
#define crypto_verify_16_ref_BYTES 16
#ifdef __cplusplus
extern "C" {
#endif
extern int crypto_verify_16_ref(const unsigned char *,const unsigned char *);
#ifdef __cplusplus
}
#endif
#define crypto_verify_16 crypto_verify_16_ref
#define crypto_verify_16_BYTES crypto_verify_16_ref_BYTES
#define crypto_verify_16_IMPLEMENTATION "crypto_verify/16/ref"
#ifndef crypto_verify_16_ref_VERSION
#define crypto_verify_16_ref_VERSION "-"
#endif
#define crypto_verify_16_VERSION crypto_verify_16_ref_VERSION
#endif

View file

@ -0,0 +1,796 @@
/*
// CAESAR OCB v1 somewhat optimised code
// Info: http://www.cs.ucdavis.edu/~rogaway/ocb
//
// Written by Romain Dolbeau (romain@dolbeau.org),
// based on the reference implementation by Ted Krovetz (ted@krovetz.net).
//
// Phillip Rogaway holds patents relevant to OCB. See the following for
// his free patent grant: http://www.cs.ucdavis.edu/~rogaway/ocb/grant.htm
//
// This is free and unencumbered software released into the public domain.
//
// Anyone is free to copy, modify, publish, use, compile, sell, or
// distribute this software, either in source code form or as a compiled
// binary, for any purpose, commercial or non-commercial, and by any
// means.
//
// In jurisdictions that recognize copyright laws, the author or authors
// of this software dedicate any and all copyright interest in the
// software to the public domain. We make this dedication for the benefit
// of the public at large and to the detriment of our heirs and
// successors. We intend this dedication to be an overt act of
// relinquishment in perpetuity of all present and future rights to this
// software under copyright law.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// For more information, please refer to <http://unlicense.org/>
*/
#include <string.h>
#include "api.h"
#include "crypto_aead.h"
/* Local aliases for the SUPERCOP API sizes. */
#define KEYBYTES CRYPTO_KEYBYTES
#define NONCEBYTES CRYPTO_NPUBBYTES
#define TAGBYTES CRYPTO_ABYTES
#define ALIGN16 __attribute__((aligned(16)))
#define ALIGN32 __attribute__((aligned(32)))
#define ALIGN64 __attribute__((aligned(64)))
#define _bswap64(a) __builtin_bswap64(a)
#define _bswap(a) __builtin_bswap32(a)
/* Debug helper: dump a 128-bit value, most significant byte first. */
#define printv16c(p,v) \
{ \
ALIGN16 unsigned char temp[16]; \
_mm_store_si128(temp, v); \
int z; \
printf("%8s:%8s = ",p,#v); \
for (z = 15 ; z >= 0 ; z--) { \
printf("%02hhx", temp[z]); \
if ((z%4)==0) printf(" "); \
} \
printf("\n"); \
}
#include "m128_compat.h"
#include "new_instructions_support_k.h"
/* rotate-right via the RV32B ror instruction */
#define rotr(a,b) _rv32_ror(a,b)
/* AES-256 encryption key schedule using the RV32 Zkn aes32esi* instructions.
 * key: 8 words of key material; aes_edrk: receives the 60 round-key words
 * (14 rounds + initial whitening).
 * aes32esiN(x, y) applies SubWord to byte N of y and XORs it (rotated into
 * place) into x; four calls build the full SubWord(RotWord) step. */
static inline void aes256_Tsetkey_encrypt(const unsigned int key[], unsigned int *aes_edrk) {
unsigned int i = 0;
unsigned int rotl_aes_edrk;
unsigned int tmp8, tmp9, tmp10, tmp11;
unsigned int tmp12, tmp13, tmp14, tmp15;
unsigned int temp_lds;
unsigned int round = 0x00000001; /* rcon, doubled each iteration */
/* first 8 round-key words are the key itself */
tmp8 = (key[0]);
aes_edrk[0] = tmp8;
tmp9 = (key[1]);
aes_edrk[1] = tmp9;
tmp10 = (key[2]);
aes_edrk[2] = tmp10;
tmp11 = (key[3]);
aes_edrk[3] = tmp11;
tmp12 = (key[4]);
aes_edrk[4] = tmp12;
tmp13 = (key[5]);
aes_edrk[5] = tmp13;
tmp14 = (key[6]);
aes_edrk[6] = tmp14;
tmp15 = (key[7]);
aes_edrk[7] = tmp15;
/* each pass produces 8 words: one rcon/SubWord/RotWord expansion for the
   first half, one plain SubWord expansion for the second half */
for( i = 8; i < 56; /* i+=8 */ )
{
tmp8 = tmp8 ^ round;
round = round << 1;
rotl_aes_edrk = rotr(tmp15,8); /* RotWord */
tmp8 = aes32esi0(tmp8, rotl_aes_edrk);
tmp8 = aes32esi1(tmp8, rotl_aes_edrk);
tmp8 = aes32esi2(tmp8, rotl_aes_edrk);
tmp8 = aes32esi3(tmp8, rotl_aes_edrk);
aes_edrk[i++] = tmp8;
tmp9 = tmp9 ^ tmp8;
aes_edrk[i++] = tmp9;
tmp10 = tmp10 ^ tmp9;
aes_edrk[i++] = tmp10;
tmp11 = tmp11 ^ tmp10;
aes_edrk[i++] = tmp11;
tmp12 = aes32esi0(tmp12, tmp11); /* SubWord only (AES-256 middle step) */
tmp12 = aes32esi1(tmp12, tmp11);
tmp12 = aes32esi2(tmp12, tmp11);
tmp12 = aes32esi3(tmp12, tmp11);
aes_edrk[i++] = tmp12;
tmp13 = tmp13 ^ tmp12;
aes_edrk[i++] = tmp13;
tmp14 = tmp14 ^ tmp13;
aes_edrk[i++] = tmp14;
tmp15 = tmp15 ^ tmp14;
aes_edrk[i++] = tmp15;
}
/* final half-expansion: only 4 more words are needed to reach 60 */
tmp8 = tmp8 ^ round;
round = round << 1;
rotl_aes_edrk = rotr(tmp15,8);
tmp8 = aes32esi0(tmp8, rotl_aes_edrk);
tmp8 = aes32esi1(tmp8, rotl_aes_edrk);
tmp8 = aes32esi2(tmp8, rotl_aes_edrk);
tmp8 = aes32esi3(tmp8, rotl_aes_edrk);
aes_edrk[i++] = tmp8;
tmp9 = tmp9 ^ tmp8;
aes_edrk[i++] = tmp9;
tmp10 = tmp10 ^ tmp9;
aes_edrk[i++] = tmp10;
tmp11 = tmp11 ^ tmp10;
aes_edrk[i++] = tmp11;
}
/* Convert an encryption key schedule (erk) into the equal-inverse-cipher
 * decryption schedule (drk): reverse the round order and run each middle
 * round key through InvMixColumns (done as SubBytes via aes32esi* followed
 * by InvSubBytes+InvMixColumns via aes32dsmi*, which composes to just
 * InvMixColumns). */
static void aes256_key_enc2dec(unsigned int *erk, unsigned int *drk)
{
int i, j;
// first and last unchanged (but swapped)
for (i = 0; i < 4; i++) {
drk[i] = erk[i+56];
drk[i+56] = erk[i];
}
// convert & revert order
for (i = 1; i < 14; i++) {
for (j = 0 ; j < 4 ; j++) {
unsigned int ek, dk;
ek = erk[i*4+j];
dk = 0;
dk = aes32esi0(dk, ek);
dk = aes32esi1(dk, ek);
dk = aes32esi2(dk, ek);
dk = aes32esi3(dk, ek);
ek = 0;
ek = aes32dsmi0(ek, dk);
ek = aes32dsmi1(ek, dk);
ek = aes32dsmi2(ek, dk);
ek = aes32dsmi3(ek, dk);
drk[56-4*i+j] = ek;
}
}
}
/* One full AES encryption round (SubBytes+ShiftRows+MixColumns+AddRoundKey)
 * on state words Y0..Y3 into X0..X3, consuming 4 round-key words from
 * TAB[I] and advancing I. The rotated Y operands implement ShiftRows. */
#define AES_ROUND1T(TAB,I,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
{ \
X0 = aes32esmi0(TAB[I++],Y0); \
X0 = aes32esmi1(X0,Y1); \
X0 = aes32esmi2(X0,Y2); \
X0 = aes32esmi3(X0,Y3); \
X1 = aes32esmi0(TAB[I++],Y1); \
X1 = aes32esmi1(X1,Y2); \
X1 = aes32esmi2(X1,Y3); \
X1 = aes32esmi3(X1,Y0); \
X2 = aes32esmi0(TAB[I++],Y2); \
X2 = aes32esmi1(X2,Y3); \
X2 = aes32esmi2(X2,Y0); \
X2 = aes32esmi3(X2,Y1); \
X3 = aes32esmi0(TAB[I++],Y3); \
X3 = aes32esmi1(X3,Y0); \
X3 = aes32esmi2(X3,Y1); \
X3 = aes32esmi3(X3,Y2); \
}
/* using the K + B instructions */
/* using the K + B instructions */
/* Encrypt one 16-byte block (4 words) with a 14-round AES-256 schedule.
 * input/output are word pointers; aes_edrk holds 60 round-key words. */
static inline void aes256_1Tft_encrypt(const uint32_t *aes_edrk, const uint32_t *input, uint32_t *output)
{
unsigned int X0, X1, X2, X3, Y0, Y1, Y2, Y3;
unsigned int i = 0, j = 0;
unsigned int l_aes_nr = 14; /* AES-256 round count */
/* initial AddRoundKey */
X0 = ((input[0]) ^ aes_edrk[j++]);
X1 = ((input[1]) ^ aes_edrk[j++]);
X2 = ((input[2]) ^ aes_edrk[j++]);
X3 = ((input[3]) ^ aes_edrk[j++]);
/* 13 full rounds */
for (i = 4 ; i < (l_aes_nr<<2) ; ) {
AES_ROUND1T(aes_edrk, i, Y0, Y1, Y2, Y3, X0, X1, X2, X3 );
X0=Y0;
X1=Y1;
X2=Y2;
X3=Y3;
}
/* last round */
/* (no MixColumns: aes32esi* instead of aes32esmi*) */
Y0 = aes32esi0(aes_edrk[i], X0);
Y0 = aes32esi1(Y0, X1);
Y0 = aes32esi2(Y0, X2);
Y0 = aes32esi3(Y0, X3);
i++;
Y1 = aes32esi0(aes_edrk[i], X1);
Y1 = aes32esi1(Y1, X2);
Y1 = aes32esi2(Y1, X3);
Y1 = aes32esi3(Y1, X0);
i++;
Y2 = aes32esi0(aes_edrk[i], X2);
Y2 = aes32esi1(Y2, X3);
Y2 = aes32esi2(Y2, X0);
Y2 = aes32esi3(Y2, X1);
i++;
Y3 = aes32esi0(aes_edrk[i], X3);
Y3 = aes32esi1(Y3, X0);
Y3 = aes32esi2(Y3, X1);
Y3 = aes32esi3(Y3, X2);
output[0] = (Y0);
output[1] = (Y1);
output[2] = (Y2);
output[3] = (Y3);
}
/* One full AES decryption round (InvSubBytes+InvShiftRows+InvMixColumns+
 * AddRoundKey); note the reversed Y operand order vs. AES_ROUND1T, which
 * implements InvShiftRows. */
#define AES_ROUND_DKT(TAB,I,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
{ \
X0 = aes32dsmi0(TAB[I+0],Y0); \
X0 = aes32dsmi1(X0,Y3); \
X0 = aes32dsmi2(X0,Y2); \
X0 = aes32dsmi3(X0,Y1); \
X1 = aes32dsmi0(TAB[I+1],Y1); \
X1 = aes32dsmi1(X1,Y0); \
X1 = aes32dsmi2(X1,Y3); \
X1 = aes32dsmi3(X1,Y2); \
X2 = aes32dsmi0(TAB[I+2],Y2); \
X2 = aes32dsmi1(X2,Y1); \
X2 = aes32dsmi2(X2,Y0); \
X2 = aes32dsmi3(X2,Y3); \
X3 = aes32dsmi0(TAB[I+3],Y3); \
X3 = aes32dsmi1(X3,Y2); \
X3 = aes32dsmi2(X3,Y1); \
X3 = aes32dsmi3(X3,Y0); \
}
/* Decrypt one 16-byte block (4 words) with the converted (equal-inverse)
 * decryption schedule produced by aes256_key_enc2dec. */
void aes256_1Tft_decrypt(const unsigned int *aes_drk, const unsigned int *input, unsigned int *output)
{
const unsigned int aes_nr = 14; // aes256
unsigned int X0, X1, X2, X3, Y0, Y1, Y2, Y3;
unsigned int i;
/* initial AddRoundKey */
X0 = input[0]; X0 ^= aes_drk[0];
X1 = input[1]; X1 ^= aes_drk[1];
X2 = input[2]; X2 ^= aes_drk[2];
X3 = input[3]; X3 ^= aes_drk[3];
// for (i=1;i<aes_nr;i++)
i=1;
do
{
AES_ROUND_DKT(aes_drk, (i<<2), Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); /* round 1 */
X0=Y0;
X1=Y1;
X2=Y2;
X3=Y3;
i++;
}
while(i<aes_nr);
i=(i<<2);
/* last round */
/* (no InvMixColumns: aes32dsi* instead of aes32dsmi*) */
Y0 = aes32dsi0(aes_drk[i+0], X0);
Y0 = aes32dsi1(Y0, X3);
Y0 = aes32dsi2(Y0, X2);
Y0 = aes32dsi3(Y0, X1);
Y1 = aes32dsi0(aes_drk[i+1], X1);
Y1 = aes32dsi1(Y1, X0);
Y1 = aes32dsi2(Y1, X3);
Y1 = aes32dsi3(Y1, X2);
Y2 = aes32dsi0(aes_drk[i+2], X2);
Y2 = aes32dsi1(Y2, X1);
Y2 = aes32dsi2(Y2, X0);
Y2 = aes32dsi3(Y2, X3);
Y3 = aes32dsi0(aes_drk[i+3], X3);
Y3 = aes32dsi1(Y3, X2);
Y3 = aes32dsi2(Y3, X1);
Y3 = aes32dsi3(Y3, X0);
output[0] = Y0;
output[1] = Y1;
output[2] = Y2;
output[3] = Y3;
}
/** single, by-the-book AES-256 encryption of one 128-bit block.
 * The word-oriented core functions take uint32_t pointers, so the __m128i
 * views are cast explicitly (the original relied on implicit incompatible
 * pointer conversions, which C compilers reject or warn about). */
static inline __m128i aes256_1Tft__encrypt1_si128(const __m128i nv, const __m128i rkeys[15]) {
__m128i temp;
aes256_1Tft_encrypt((const uint32_t *)rkeys, (const uint32_t *)&nv, (uint32_t *)&temp);
return temp;
}
/** single, by-the-book AES-256 decryption of one 128-bit block. */
static inline __m128i aes256_1Tft__decrypt1_si128(const __m128i nv, const __m128i rkeys[15]) {
__m128i temp;
aes256_1Tft_decrypt((const unsigned int *)rkeys, (const unsigned int *)&nv, (unsigned int *)&temp);
return temp;
}
/* 16-byte block type used throughout the OCB code. */
typedef unsigned char block[16];
/* ------------------------------------------------------------------------- */
#if 0
/* byte-by-byte reference version, kept for documentation */
static inline void xor_block(block d, block s1, block s2) {
unsigned i;
for (i=0; i<16; i++)
d[i] = s1[i] ^ s2[i];
}
#else
/* 128 bits SSE doubling */
/* d = s1 ^ s2, 16 bytes at a time via the _m128i compatibility layer */
static inline void xor_block(unsigned char* d, const unsigned char* s1, const unsigned char* s2) {
__m128i dv = _mm_xor_si128(_mm_loadu_si128((const __m128i*)s1), _mm_loadu_si128((const __m128i*)s2));
_mm_storeu_si128((__m128i*)d,dv);
}
#endif
/* ------------------------------------------------------------------------- */
/* GF(2^128) doubling (the OCB "double" operation): shift left one bit and
 * conditionally XOR the reduction polynomial 0x87 into the low byte. */
#if 0
/* byte-by-byte reference version */
static inline void double_block(block d, block s) {
unsigned i;
unsigned char tmp = s[0];
for (i=0; i<15; i++)
d[i] = (s[i] << 1) | (s[i+1] >> 7);
d[15] = (s[15] << 1) ^ ((tmp >> 7) * 135);
}
#else
#if 0
/* 64 bits little-endian doubling, faster */
static inline void double_block(unsigned long long *d, const unsigned long long* s) {
unsigned long long sl = _bswap64(s[1]), sh = _bswap64(s[0]);
unsigned long long sl1 = sl << 1;
unsigned long long sh1 = sh << 1;
sh1 |= sl>>63;
sl1 ^= (((long long)sh>>63) & 135);
d[1]=_bswap64(sl1);
d[0]=_bswap64(sh1);
}
#else
/* 128 bits SSE, much faster */
/* _norev variant operates on an already byte-reversed (big-endian-in-words)
 * value: per-word shift-left-by-1 with the carries propagated via a one-word
 * rotate, and 135 folded in from the top word's sign. */
static inline __m128i double_block_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(135,1,1,1);
/* __m128i sv31 = _mm_srai_epi32(sv, 31); */
__m128i sv31 = wordsign128(sv);
__m128i sv31m = _mm_and_si128(sv31, mask);
/* __m128i sv31ms = _mm_shuffle_epi32(sv31m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv31ms = wordrotate1l128(sv31m);
__m128i sv1 = _mm_slli_epi32(sv, 1);
__m128i dv = _mm_xor_si128(sv31ms,sv1);
return dv;
}
/* byte-reversing wrapper for values stored in OCB's big-endian byte order */
static inline __m128i double_block_si128(const __m128i svr) {
/* const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); */
/* __m128i sv = _mm_shuffle_epi8(svr, rev); */
__m128i sv = bytereverse128(svr);
__m128i dv = double_block_si128_norev(sv);
/* return _mm_shuffle_epi8(dv, rev); */
return bytereverse128(dv);
}
/* memory-to-memory wrapper */
static inline void double_block(unsigned char *d, const unsigned char* s) {
__m128i sv = _mm_loadu_si128((const __m128i*)s);
__m128i dv = double_block_si128(sv);
_mm_storeu_si128((__m128i*)d,dv);
}
/* 128 bits SSE times 4 */
/* Precomputed reduction constants for multi-bit doubling: lk4[t] is the
 * 16-bit value to fold into the low half when the top (up to 6) bits of the
 * block are t — i.e. the GF(2^128) reduction of t * x^128 truncated to the
 * low bits.  Indexed by the shifted-out top bits in the double_block_N
 * variants below. */
static const unsigned short lk4[64] = {
0x0000, 0x0086, 0x010c, 0x018a, 0x0218, 0x029e, 0x0314, 0x0392,
0x0430, 0x04b6, 0x053c, 0x05ba, 0x0628, 0x06ae, 0x0724, 0x07a2,
0x0860, 0x08e6, 0x096c, 0x09ea, 0x0a78, 0x0afe, 0x0b74, 0x0bf2,
0x0c50, 0x0cd6, 0x0d5c, 0x0dda, 0x0e48, 0x0ece, 0x0f44, 0x0fc2,
0x10c0, 0x1046, 0x11cc, 0x114a, 0x12d8, 0x125e, 0x13d4, 0x1352,
0x14f0, 0x1476, 0x15fc, 0x157a, 0x16e8, 0x166e, 0x17e4, 0x1762,
0x18a0, 0x1826, 0x19ac, 0x192a, 0x1ab8, 0x1a3e, 0x1bb4, 0x1b32,
0x1c90, 0x1c16, 0x1d9c, 0x1d1a, 0x1e88, 0x1e0e, 0x1f84, 0x1f02
};
/* double_block_N_si128_norev: N consecutive GF(2^128) doublings in one step
 * (shift left by N, rotate the N carry bits between words, and fold in the
 * lk4[] reduction for the N bits shifted out of the top).  All operate on
 * byte-reversed values like double_block_si128_norev. */
static inline __m128i double_block_2_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(3,3,3,3);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv30x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xC0)>>6],0); */
__m128i sv30x = halfwordandzero(lk4[(idx&0xC0)>>6]);
__m128i sv30 = _mm_srli_epi32(sv, 30);
__m128i sv30m = _mm_and_si128(sv30, mask);
/* __m128i sv30ms = _mm_shuffle_epi32(sv30m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv30ms = wordrotate1l128(sv30m);
__m128i sv2 = _mm_slli_epi32(sv, 2);
__m128i dv = _mm_xor_si128(sv30ms,sv2);
__m128i final = _mm_xor_si128(dv, sv30x);
return final;
}
static inline __m128i double_block_3_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(7,7,7,7);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv29x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xE0)>>5],0); */
__m128i sv29x = halfwordandzero(lk4[(idx&0xE0)>>5]);
__m128i sv29 = _mm_srli_epi32(sv, 29);
__m128i sv29m = _mm_and_si128(sv29, mask);
/* __m128i sv29ms = _mm_shuffle_epi32(sv29m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv29ms = wordrotate1l128(sv29m);
__m128i sv3 = _mm_slli_epi32(sv, 3);
__m128i dv = _mm_xor_si128(sv29ms,sv3);
__m128i final = _mm_xor_si128(dv, sv29x);
return final;
}
static inline __m128i double_block_4_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(15,15,15,15);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv28x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xF0)>>4],0); */
__m128i sv28x = halfwordandzero(lk4[(idx&0xF0)>>4]);
__m128i sv28 = _mm_srli_epi32(sv, 28);
__m128i sv28m = _mm_and_si128(sv28, mask);
/* __m128i sv28ms = _mm_shuffle_epi32(sv28m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv28ms = wordrotate1l128(sv28m);
__m128i sv4 = _mm_slli_epi32(sv, 4);
__m128i dv = _mm_xor_si128(sv28ms,sv4);
__m128i final = _mm_xor_si128(dv, sv28x);
return final;
}
static inline __m128i double_block_5_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(31,31,31,31);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv27x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xF8)>>3],0); */
__m128i sv27x = halfwordandzero(lk4[(idx&0xF8)>>3]);
__m128i sv27 = _mm_srli_epi32(sv, 27);
__m128i sv27m = _mm_and_si128(sv27, mask);
/* __m128i sv27ms = _mm_shuffle_epi32(sv27m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv27ms = wordrotate1l128(sv27m);
__m128i sv5 = _mm_slli_epi32(sv, 5);
__m128i dv = _mm_xor_si128(sv27ms,sv5);
__m128i final = _mm_xor_si128(dv, sv27x);
return final;
}
static inline __m128i double_block_6_si128_norev(const __m128i sv) {
const __m128i mask = _mm_set_epi32(63,63,63,63);
const int idx = _mm_extract_epi8(sv,15);
/* __m128i sv26x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xFC)>>2],0); */
__m128i sv26x = halfwordandzero(lk4[(idx&0xFC)>>2]);
__m128i sv26 = _mm_srli_epi32(sv, 26);
__m128i sv26m = _mm_and_si128(sv26, mask);
/* __m128i sv26ms = _mm_shuffle_epi32(sv26m, _MM_SHUFFLE(2,1,0,3)); */
__m128i sv26ms = wordrotate1l128(sv26m);
__m128i sv6 = _mm_slli_epi32(sv, 6);
__m128i dv = _mm_xor_si128(sv26ms,sv6);
__m128i final = _mm_xor_si128(dv, sv26x);
return final;
}
#endif
#endif
/* ------------------------------------------------------------------------- */
/* Compute L_{ntz(j)} directly from L_$ (OCB offset table entry for block j):
 * L_0 = double(L_$), L_k = double(L_{k-1}), so L_{ntz} = double^{ntz+1}(L_$).
 * The multi-bit doubling variants cover ntz <= 5 in one step; larger ntz
 * falls back to single doublings after a 6-bit step. */
static inline __m128i calc_L_i_si128(const __m128i ldollarvr, const unsigned j) {
/* const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); */
/* __m128i ldollarv = _mm_shuffle_epi8(ldollarvr, rev); */
__m128i ldollarv = bytereverse128(ldollarvr);
unsigned i;
__m128i lv;
unsigned ntz = __builtin_ctz(j);/* printf("ntz = %u\n", ntz); */
switch(ntz) {
case 0:
lv = double_block_si128_norev(ldollarv);
break;
case 1:
lv = double_block_2_si128_norev(ldollarv);
break;
case 2:
lv = double_block_3_si128_norev(ldollarv);
break;
case 3:
lv = double_block_4_si128_norev(ldollarv);
break;
case 4:
lv = double_block_5_si128_norev(ldollarv);
break;
default:
/* 6 doublings, then one more per remaining trailing zero */
lv = double_block_6_si128_norev(ldollarv);
for (i = 5; i < ntz ; i++)
lv = double_block_si128_norev(lv);
break;
}
/* return _mm_shuffle_epi8(lv, rev); */
return bytereverse128(lv);
}
/* Memory-to-memory wrapper: store L_{ntz(i)} (derived from L_$) into l. */
static inline void calc_L_i(block l, const block ldollar, const unsigned i) {
__m128i dollar_v = _mm_loadu_si128((const __m128i*)ldollar);
__m128i result_v = calc_L_i_si128(dollar_v, i);
_mm_storeu_si128((__m128i*)l, result_v);
}
/* Precompute the OCB offset table prelv[k] = L_k = double^{k+1}(L_$) for
 * k = 0 .. max-1.  Work is done in byte-reversed form so the cheap _norev
 * doubling can be used, reversing back on each store.
 * BUG FIX: the original ended with "return bytereverse128(lv);" — returning
 * a value from a void function and never storing the final entry, leaving
 * prelv[max-1] uninitialized.  It now stores it as intended. */
static inline void precompute_lv(__m128i prelv[32], const __m128i ldollarvr, const unsigned max) {
/* const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); */
/* __m128i ldollarv = _mm_shuffle_epi8(ldollarvr, rev); */
__m128i ldollarv = bytereverse128(ldollarvr);
unsigned i;
__m128i lv = double_block_si128_norev(ldollarv);
for (i = 0 ; i < max-1 ; i++) {
/* prelv[i] = _mm_shuffle_epi8(lv, rev); */
prelv[i] = bytereverse128(lv);
lv = double_block_si128_norev(lv);
}
/* store the last table entry */
prelv[i] = bytereverse128(lv);
}
/* ------------------------------------------------------------------------- */
/* OCB associated-data hash (the HASH(K,A) of the spec): PMAC-style sum of
 * ENCIPHER(K, A_i xor Offset_i) over full blocks, plus a padded final
 * partial block offset by L_*.  prelv must hold the precomputed L_k table. */
static void hash(block result, const unsigned char *k,
unsigned char *a, unsigned abytes,
const __m128i lstar,
const __m128i prelv[32], const __m128i aes_key[15]) {
__m128i offset, sum, tmp;
unsigned i;
/* Process any whole blocks */
/* Sum_0 = zeros(128) */
sum = _mm_setzero_si128();
/* Offset_0 = zeros(128) */
offset = _mm_setzero_si128();
i=1;
for (; i<=abytes/16; i++, a = a + 16) {
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
tmp = prelv[__builtin_ctz(i)];
offset = _mm_xor_si128(offset, tmp);
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)a));
tmp = aes256_1Tft__encrypt1_si128(tmp, aes_key);
sum = _mm_xor_si128(sum, tmp);
}
/* Process any final partial block; compute final hash value */
abytes = abytes % 16; /* Bytes in final block */
if (abytes > 0) {
/* Offset_* = Offset_m xor L_* */
offset = _mm_xor_si128(offset, lstar);
/* tmp = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */
unsigned char pad[16];
memset(pad, 0, 16);
memcpy(pad, a, abytes);
pad[abytes] = 0x80;
tmp = _mm_loadu_si128((const __m128i*)pad);
tmp = _mm_xor_si128(offset, tmp);
/* Sum = Sum_m xor ENCIPHER(K, tmp) */
tmp = aes256_1Tft__encrypt1_si128(tmp, aes_key);
sum = _mm_xor_si128(tmp, sum);
}
_mm_storeu_si128((__m128i*)result,sum);
}
/* ------------------------------------------------------------------------- */
/* OCB core: encrypts (encrypting!=0) or decrypts-and-verifies (encrypting==0)
 * `in` (inbytes) with associated data `a` (abytes) under key k and nonce n.
 * On encrypt the tag is appended to out; on decrypt the tag is expected at
 * the end of `in` and the return value is 0 iff it verifies (-1 otherwise).
 * FIX: Checksum_0 was initialized with `sum = _mm_xor_si128(sum,sum)`,
 * which reads `sum` uninitialized (undefined behavior); it now uses
 * _mm_setzero_si128().  Explicit casts were added where byte pointers feed
 * the word-oriented key-schedule routines. */
static int ocb_crypt(unsigned char *out, unsigned char *k, unsigned char *n,
                     unsigned char *a, unsigned abytes,
                     unsigned char *in, unsigned inbytes, int encrypting) {
    __m128i prelv[32];
    __m128i aes_decrypt_key[15];
    __m128i aes_encrypt_key[15];
    block ad_hash;
    __m128i lstar, ldollar, sum, offset, ktop, pad, nonce, tag, tmp, outv;
    block nonce_b, offset_b;
    unsigned char stretch[24];
    unsigned bottom, byteshift, bitshift, i, max;
    /* Setup AES and strip ciphertext of its tag */
    if ( ! encrypting ) {
        if (inbytes < TAGBYTES) return -1;
        inbytes -= TAGBYTES;
    }
    /* key schedule works on 32-bit words; NOTE(review): assumes k is
       sufficiently aligned for word access — true for the test harness */
    aes256_Tsetkey_encrypt((const unsigned int *)k, (unsigned int *)aes_encrypt_key);
    if ( ! encrypting ) {
        aes256_key_enc2dec((unsigned int *)aes_encrypt_key, (unsigned int *)aes_decrypt_key);
    }
    /* Key-dependent variables */
    /* L_* = ENCIPHER(K, zeros(128)) */
    tmp = _mm_setzero_si128();
    lstar = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key);
    /* L_$ = double(L_*) */
    ldollar = double_block_si128(lstar);
    /* only precompute what's really needed: the largest ntz(i) over all
       block indices i is bounded by log2 of the larger block count */
    max = abytes >= inbytes ? abytes/4 : inbytes/4;
    max = (max < 2 ? 2 : max);
    precompute_lv(prelv, ldollar, 31-__builtin_clz(max));
    /* Nonce-dependent and per-encryption variables */
    /* Nonce = zeros(127-bitlen(N)) || 1 || N */
    memset(nonce_b, 0, 16);
    memcpy(&nonce_b[16-NONCEBYTES], n, NONCEBYTES);
    nonce_b[0] = (unsigned char)(((TAGBYTES * 8) % 128) << 1); /* taglen field */
    nonce_b[16-NONCEBYTES-1] |= 0x01;
    /* bottom = str2num(Nonce[123..128]) */
    bottom = nonce_b[15] & 0x3F;
    /* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */
    nonce_b[15] &= 0xC0;
    nonce = _mm_loadu_si128((const __m128i*)nonce_b);
    ktop = aes256_1Tft__encrypt1_si128(nonce, aes_encrypt_key);
    /* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */
    _mm_storeu_si128((__m128i*)stretch, ktop);
    _mm_storel_epi64((__m128i*)(stretch+16), _mm_xor_si128(_mm_srli_si128(ktop,1), ktop));
    /* Offset_0 = Stretch[1+bottom..128+bottom] */
    byteshift = bottom/8;
    bitshift = bottom%8;
    if (bitshift != 0)
        for (i=0; i<16; i++)
            offset_b[i] = (stretch[i+byteshift] << bitshift) |
                          (stretch[i+byteshift+1] >> (8-bitshift));
    else
        for (i=0; i<16; i++)
            offset_b[i] = stretch[i+byteshift];
    offset = _mm_loadu_si128((const __m128i*)offset_b);
    /* Checksum_0 = zeros(128) (was xor-of-uninitialized-self: UB) */
    sum = _mm_setzero_si128();
    /* Hash associated data */
    hash(ad_hash, k, a, abytes, lstar, prelv, aes_encrypt_key);
    /* Process any whole blocks */
    i=1;
    if (encrypting) {
        for (; i<=inbytes/16; i++, in=in+16, out=out+16) {
            /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
            tmp = prelv[__builtin_ctz(i)];
            offset = _mm_xor_si128(offset, tmp);
            tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)in));
            /* Checksum_i = Checksum_{i-1} xor P_i */
            sum = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), sum);
            /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
            tmp = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key);
            outv = _mm_xor_si128(offset, tmp);
            _mm_storeu_si128((__m128i*)out, outv);
        }
    } else {
        for (; i<=inbytes/16; i++, in=in+16, out=out+16) {
            /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
            tmp = prelv[__builtin_ctz(i)];
            offset = _mm_xor_si128(offset, tmp);
            tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)in));
            /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
            tmp = aes256_1Tft__decrypt1_si128(tmp, aes_decrypt_key);
            outv = _mm_xor_si128(offset, tmp);
            _mm_storeu_si128((__m128i*)out, outv);
            /* Checksum_i = Checksum_{i-1} xor P_i */
            sum = _mm_xor_si128(outv, sum);
        }
    }
    /* Process any final partial block and compute raw tag */
    inbytes = inbytes % 16; /* Bytes in final block */
    if (inbytes > 0) {
        /* Offset_* = Offset_m xor L_* */
        offset = _mm_xor_si128(offset, lstar);
        /* Pad = ENCIPHER(K, Offset_*) */
        pad = aes256_1Tft__encrypt1_si128(offset, aes_encrypt_key);
        if (encrypting) {
            /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
            unsigned char tmp_b[16];
            unsigned char pad_b[16];
            memset(tmp_b, 0, 16);
            memcpy(tmp_b, in, inbytes);
            tmp_b[inbytes] = 0x80;
            tmp = _mm_loadu_si128((const __m128i*)tmp_b);
            sum = _mm_xor_si128(tmp, sum);
            /* C_* = P_* xor Pad[1..bitlen(P_*)] */
            pad = _mm_xor_si128(tmp, pad);
            _mm_storeu_si128((__m128i*)pad_b, pad);
            memcpy(out, pad_b, inbytes);
            out = out + inbytes;
        } else {
            /* P_* = C_* xor Pad[1..bitlen(C_*)] */
            unsigned char tmp_b[16];
            unsigned char pad_b[16];
            _mm_storeu_si128((__m128i*)pad_b, pad);
            memcpy(tmp_b, pad_b, 16);
            memcpy(tmp_b, in, inbytes);
            xor_block(tmp_b, pad_b, tmp_b);
            tmp_b[inbytes] = 0x80;
            memcpy(out, tmp_b, inbytes);
            tmp = _mm_loadu_si128((const __m128i*)tmp_b);
            /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
            sum = _mm_xor_si128(tmp, sum);
            in = in + inbytes;
        }
    }
    /* Tag = ENCIPHER(K, Checksum xor Offset xor L_$) xor HASH(K,A) */
    tmp = _mm_xor_si128(sum, offset);
    tmp = _mm_xor_si128(tmp, ldollar);
    tag = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key);
    tag = _mm_xor_si128(_mm_loadu_si128((const __m128i*)ad_hash), tag);
    if (encrypting) {
        unsigned char tag_b[16];
        _mm_storeu_si128((__m128i*)tag_b, tag);
        memcpy(out, tag_b, TAGBYTES);
        return 0;
    } else {
        unsigned char tag_b[16];
        _mm_storeu_si128((__m128i*)tag_b, tag);
        /* NOTE(review): memcmp is not constant-time; consider crypto_verify_16
           to avoid a timing side channel on tag comparison */
        return (memcmp(in, tag_b, TAGBYTES) ? -1 : 0); /* Check for validity */
    }
}
/* ------------------------------------------------------------------------- */
/* direction flags for ocb_crypt */
#define OCB_ENCRYPT 1
#define OCB_DECRYPT 0
/* Encrypt p (pbytes) into c; the tag is appended to c (always succeeds). */
void ocb_encrypt(unsigned char *c, unsigned char *k, unsigned char *n,
unsigned char *a, unsigned abytes,
unsigned char *p, unsigned pbytes) {
ocb_crypt(c, k, n, a, abytes, p, pbytes, OCB_ENCRYPT);
}
/* ------------------------------------------------------------------------- */
/* Decrypt c (cbytes, tag included) into p; returns 0 iff the tag verifies. */
int ocb_decrypt(unsigned char *p, unsigned char *k, unsigned char *n,
unsigned char *a, unsigned abytes,
unsigned char *c, unsigned cbytes) {
return ocb_crypt(p, k, n, a, abytes, c, cbytes, OCB_DECRYPT);
}
/* ------------------------------------------------------------------------- */
/* SUPERCOP AEAD encrypt entry point: ciphertext = OCB(m) || tag.
 * nsec is unused by this primitive; encryption cannot fail, so 0 is
 * always returned.  The casts drop const to match ocb_crypt's signature;
 * the buffers are not modified. */
int crypto_aead_encrypt(
unsigned char *c,unsigned long long *clen,
const unsigned char *m,unsigned long long mlen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
)
{
unsigned char *key_nc = (unsigned char *)k;
unsigned char *npub_nc = (unsigned char *)npub;
unsigned char *ad_nc = (unsigned char *)ad;
unsigned char *m_nc = (unsigned char *)m;
*clen = mlen + TAGBYTES;
ocb_crypt(c, key_nc, npub_nc, ad_nc, adlen, m_nc, mlen, OCB_ENCRYPT);
return 0;
}
/* SUPERCOP AEAD decrypt entry point: verifies the trailing tag and writes
 * the plaintext to m.  Returns 0 on success, -1 on authentication failure.
 * FIX: the original computed *mlen = clen - TAGBYTES unconditionally, which
 * wraps around (unsigned underflow) for clen < TAGBYTES; reject such
 * ciphertexts up front. */
int crypto_aead_decrypt(
unsigned char *m,unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c,unsigned long long clen,
const unsigned char *ad,unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
)
{
if (clen < TAGBYTES) {
*mlen = 0;
return -1;
}
*mlen = clen - TAGBYTES;
return ocb_crypt(m, (unsigned char *)k, (unsigned char *)npub,
(unsigned char *)ad, adlen, (unsigned char *)c, clen, OCB_DECRYPT);
}

View file

@ -0,0 +1,14 @@
/* Fill a buffer with (pseudo-)random bytes; implemented in random.cpp. */
#ifndef kernelrandombytes_h
#define kernelrandombytes_h
#ifdef __cplusplus
extern "C" {
#endif
extern void kernelrandombytes(unsigned char *,unsigned long long);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,19 @@
#include <random>
#include <functional>

// Deterministic PRNG (default-seeded) standing in for kernel randomness in
// this test harness — NOT cryptographically secure.
// FIX: std::uniform_int_distribution<unsigned char> is undefined behavior
// per the C++ standard (character types are not permitted IntTypes); draw
// unsigned ints in [0,255] and truncate instead.  Also removed an unused
// local variable.
std::default_random_engine generator;
std::uniform_int_distribution<unsigned int> distribution(0, 255);
auto rbyte = std::bind(distribution, generator);

extern "C" {
// Fill x[0..xlen) with pseudo-random bytes.
void kernelrandombytes(unsigned char *x, unsigned long long xlen)
{
  while (xlen > 0) {
    *x = (unsigned char)rbyte();
    x++;
    xlen--;
  }
}
}

View file

@ -0,0 +1,83 @@
/*
cpucycles/riscv.c version 20190803
D. J. Bernstein
Romain Dolbeau
Public domain.
*/
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
/* Read the RISC-V cycle counter as a 64-bit value.  On rv32 the 64-bit
 * counter is split across rdcycle/rdcycleh, so the high half is read twice
 * and the sequence retried if it changed (counter rolled over mid-read).
 * FIX: the retry loop used a named asm label ("start:"), which is emitted
 * verbatim and causes duplicate-symbol assembly errors if the compiler
 * inlines or duplicates this code; a local numeric label (1: / 1b) is the
 * correct idiom for labels inside inline asm. */
long long cpucycles_riscv(void)
{
  long long result;
#if defined(__riscv_xlen)
#if __riscv_xlen == 64
  asm volatile("rdcycle %0" : "=r" (result));
#elif __riscv_xlen == 32
  unsigned int l, h, h2;
  asm volatile( "1:\n"
                "rdcycleh %0\n"
                "rdcycle %1\n"
                "rdcycleh %2\n"
                "bne %0, %2, 1b\n"
                : "=r" (h), "=r" (l), "=r" (h2));
  result = (((unsigned long long)h)<<32) | ((unsigned long long)l);
#else
#error "unknown __riscv_xlen"
#endif
#else // __riscv_xlen
#error "__riscv_xlen required for RISC-V support"
#endif // __riscv_xlen
  return result;
}
/* Wall-clock time in microseconds since the Unix epoch. */
static long long microseconds(void)
{
  struct timeval now;
  gettimeofday(&now, (struct timezone *)0);
  return (long long)1000000 * now.tv_sec + now.tv_usec;
}
/* Estimate the cycle counter frequency (cycles per second) by comparing
 * cycle-counter progress against gettimeofday over a >= 10ms window. */
static double guessfreq(void)
{
long long tb0; long long us0;
long long tb1; long long us1;
tb0 = cpucycles_riscv();
us0 = microseconds();
do {
tb1 = cpucycles_riscv();
us1 = microseconds();
} while (us1 - us0 < 10000 || tb1 - tb0 < 1000);
if (tb1 <= tb0) return 0; /* counter went backwards/wrapped: give up */
tb1 -= tb0;
us1 -= us0;
return ((double) tb1) / (0.000001 * (double) us1);
}
static long long cpufrequency = 0;
/* One-time calibration of the global cpufrequency: accept only when
   two consecutive guesses agree within 1%, retrying up to 100 times.
   May leave cpufrequency at 0 if no stable estimate is found. */
static void init(void)
{
double guess1;
double guess2;
int loop;
for (loop = 0;loop < 100;++loop) {
guess1 = guessfreq();
guess2 = guessfreq();
if (guess1 > 1.01 * guess2) continue;
if (guess2 > 1.01 * guess1) continue;
/* note: double averaged then truncated into the long long global */
cpufrequency = 0.5 * (guess1 + guess2);
break;
}
}
/* Cycle-counter frequency in Hz, calibrated lazily on first call. */
long long cpucycles_riscv_persecond(void)
{
if (cpufrequency == 0) init();
return cpufrequency;
}

View file

@ -0,0 +1,323 @@
/*
* try-anything.c version 20190729
* D. J. Bernstein
* Some portions adapted from TweetNaCl by Bernstein, Janssen, Lange, Schwabe.
* Public domain.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/resource.h>
#include "kernelrandombytes.h"
#include "cpucycles.h"
#include "crypto_uint8.h"
#include "crypto_uint32.h"
#include "crypto_uint64.h"
#include "try.h"
typedef crypto_uint8 u8;
typedef crypto_uint32 u32;
typedef crypto_uint64 u64;
#define FOR(i,n) for (i = 0;i < n;++i)
/* Rotate the 32-bit value x left by c bits (callers use 0 < c < 32). */
static u32 L32(u32 x,int c)
{
u32 hi = x << c;
u32 lo = (x & 0xffffffff) >> (32 - c);
return hi | lo;
}
/* Load a 32-bit little-endian value from the 4 bytes at x. */
static u32 ld32(const u8 *x)
{
int i;
u32 u = x[3];
for (i = 2;i >= 0;--i) u = (u << 8) | x[i];
return u;
}
/* Store u into x[0..3] as little-endian bytes. */
static void st32(u8 *x,u32 u)
{
x[0] = u;
x[1] = u >> 8;
x[2] = u >> 16;
x[3] = u >> 24;
}
static const u8 sigma[17] = "expand 32-byte k";
/* Salsa20 core permutation (TweetNaCl layout): expand the 32-byte key
   k and 16-byte block in (with the "expand 32-byte k" constants in
   sigma) into 64 bytes of keystream at out. */
static void core(u8 *out,const u8 *in,const u8 *k)
{
u32 w[16],x[16],y[16],t[4];
int i,j,m;
/* initial 4x4 state: constants on the diagonal, key words around
   them, nonce+counter words in the middle */
FOR(i,4) {
x[5*i] = ld32(sigma+4*i);
x[1+i] = ld32(k+4*i);
x[6+i] = ld32(in+4*i);
x[11+i] = ld32(k+16+4*i);
}
FOR(i,16) y[i] = x[i];
/* 20 rounds; each iteration quarter-rounds the 4 columns, and the
   w[] write pattern transposes so the next iteration acts on rows */
FOR(i,20) {
FOR(j,4) {
FOR(m,4) t[m] = x[(5*j+4*m)%16];
t[1] ^= L32(t[0]+t[3], 7);
t[2] ^= L32(t[1]+t[0], 9);
t[3] ^= L32(t[2]+t[1],13);
t[0] ^= L32(t[3]+t[2],18);
FOR(m,4) w[4*j+(j+m)%4] = t[m];
}
FOR(m,16) x[m] = w[m];
}
/* feed-forward: add the initial state back in and serialize LE */
FOR(i,16) st32(out + 4 * i,x[i] + y[i]);
}
/* Salsa20 keystream generator: write b bytes to c using the 8-byte
   nonce n and 32-byte key k (used here as a deterministic PRG). */
static void salsa20(u8 *c,u64 b,const u8 *n,const u8 *k)
{
u8 z[16],x[64];
u32 u,i;
if (!b) return;
/* z = nonce in z[0..7], 64-bit little-endian block counter in z[8..15] */
FOR(i,16) z[i] = 0;
FOR(i,8) z[i] = n[i];
while (b >= 64) {
core(x,z,k);
FOR(i,64) c[i] = x[i];
/* increment the block counter with byte-wise carry */
u = 1;
for (i = 8;i < 16;++i) {
u += (u32) z[i];
z[i] = u;
u >>= 8;
}
b -= 64;
c += 64;
}
/* final partial block */
if (b) {
core(x,z,k);
FOR(i,b) c[i] = x[i];
}
}
/* 64-bit little-endian increment of the 8-byte counter n. */
static void increment(u8 *n)
{
int i;
for (i = 0;i < 8;++i) {
if (++n[i]) break;  /* stop once a byte does not wrap to zero */
}
}
/* Deterministic test-input stream: each call returns the next xlen
   bytes of Salsa20 keystream under a fixed key, advancing the static
   nonce so successive calls produce distinct data. */
static void testvector(unsigned char *x,unsigned long long xlen)
{
const static unsigned char testvector_k[33] = "generate inputs for test vectors";
static unsigned char testvector_n[8];
salsa20(x,xlen,testvector_n,testvector_k);
increment(testvector_n);
}
/* Deterministic 64-bit PRNG: the next 8 testvector bytes assembled
   little-endian into an unsigned long long. */
unsigned long long myrandom(void)
{
unsigned char x[8];
unsigned long long result = 0;
int i;
testvector(x,8);
for (i = 7;i >= 0;--i) result = (result << 8) | x[i];
return result;
}
/* Deterministic guard-byte stream (separate key from testvector):
   fills x[0..xlen-1] with pad bytes used to detect buffer overwrites. */
static void canary(unsigned char *x,unsigned long long xlen)
{
const static unsigned char canary_k[33] = "generate pad to catch overwrites";
static unsigned char canary_n[8];
salsa20(x,xlen,canary_n,canary_k);
increment(canary_n);
}
/* Refresh the 16-byte guard zones on both sides of x and mirror them
   into the shadow buffer x2, so a later memcmp flags real writes only. */
void double_canary(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
canary(x - 16,16);
canary(x + xlen,16);
memcpy(x2 - 16,x - 16,16);
memcpy(x2 + xlen,x + xlen,16);
}
/* Fill x[0..xlen-1] with deterministic test data, place 16-byte
   canaries immediately before and after it, and snapshot the whole
   region (data + both canaries) into the shadow buffer x2. */
void input_prepare(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
testvector(x,xlen);
canary(x - 16,16);
canary(x + xlen,16);
memcpy(x2 - 16,x - 16,xlen + 32);
}
/* Abort (exit 111) if an input buffer — including its 16-byte guard
   zones on both sides — no longer matches its shadow copy x2. */
void input_compare(const unsigned char *x2,const unsigned char *x,unsigned long long xlen,const char *fun)
{
if (memcmp(x2 - 16,x - 16,xlen + 32) == 0) return;
fprintf(stderr,"%s overwrites input\n",fun);
exit(111);
}
/* Fill the output region plus 16-byte borders with canary bytes and
   snapshot it into x2; the primitive under test must then only write
   inside x[0..xlen-1]. */
void output_prepare(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
canary(x - 16,xlen + 32);
memcpy(x2 - 16,x - 16,xlen + 32);
}
/* Abort (exit 111) if the 16-byte guard zone before or after the
   output buffer x differs from its shadow copy x2. */
void output_compare(const unsigned char *x2,const unsigned char *x,unsigned long long xlen,const char *fun)
{
/* leading canary first, so the "before" diagnostic wins if both broke */
if (memcmp(x2 - 16,x - 16,16) != 0) {
fprintf(stderr,"%s writes before output\n",fun);
exit(111);
}
if (memcmp(x2 + xlen,x + xlen,16) != 0) {
fprintf(stderr,"%s writes after output\n",fun);
exit(111);
}
}
static unsigned char checksum_state[64];
static char checksum_hex[65];
/* Absorb x[0..xlen-1] into the global 64-byte checksum_state, using
   the Salsa20 core as a compression function in 16-byte blocks. */
void checksum(const unsigned char *x,unsigned long long xlen)
{
u8 block[16];
int i;
while (xlen >= 16) {
core(checksum_state,x,checksum_state);
x += 16;
xlen -= 16;
}
/* pad the final (possibly empty) partial block: data, then a 1 byte,
   zeros to the end; xlen < 16 here so the int loop index is safe */
FOR(i,16) block[i] = 0;
FOR(i,xlen) block[i] = x[i];
block[xlen] = 1;
checksum_state[0] ^= 1;  /* domain-separate the final compression */
core(checksum_state,block,checksum_state);
}
/* Print s followed by a space, mapping whitespace characters to '_';
   an empty string prints as '-'. Keeps the report single-line. */
static void printword(const char *s)
{
if (!*s) putchar('-');
for (;*s;++s) {
switch (*s) {
case ' ':
case '\t':
case '\r':
case '\n':
putchar('_');
break;
default:
putchar(*s);
break;
}
}
putchar(' ');
}
/* Print a number followed by a single space (benchmark report field). */
static void printnum(long long x)
{
printf("%lld ",x);
}
/* Print the reason to stderr and exit with the conventional
   "permanent failure" status 111. Never returns. */
void fail(const char *why)
{
fprintf(stderr,"%s\n",why);
exit(111);
}
/* Allocate len bytes at a 64-byte-aligned address, zeroed; the slack
   around the usable region is filled with junk so stray reads/writes
   are more likely to be caught. Memory is deliberately never freed. */
unsigned char *alignedcalloc(unsigned long long len)
{
unsigned char *p = (unsigned char *) calloc(1,len + 256);
long long i;
if (!p) fail("out of memory");
/* will never deallocate so shifting is ok */
for (i = 0;i < len + 256;++i) p[i] = random();
p += 64;
p += 63 & (-(unsigned long) p);  /* round up to a 64-byte boundary */
memset(p,0,len);
return p;
}
#define TIMINGS 63
static long long cycles[TIMINGS + 1];
/* Drop resource limits (open files, processes, core dumps) to zero so
   the primitive under test cannot open files, fork, or dump core. */
void limits()
{
#ifdef RLIM_INFINITY
struct rlimit r;
r.rlim_cur = 0;
r.rlim_max = 0;
#ifdef RLIMIT_NOFILE
setrlimit(RLIMIT_NOFILE,&r);
#endif
#ifdef RLIMIT_NPROC
setrlimit(RLIMIT_NPROC,&r);
#endif
#ifdef RLIMIT_CORE
setrlimit(RLIMIT_CORE,&r);
#endif
#endif
}
static unsigned char randombyte[1];
/* Benchmark driver: run the correctness tests (accumulating the
   global checksum), time doit() TIMINGS times, and print one report
   line: checksum, median cycles, test cycles, frequency, name. */
int main()
{
long long i;
long long j;
long long abovej;
long long belowj;
long long checksumcycles;
long long cyclespersecond;
/* warm up / fault in the cycle counter before calibrating */
cycles[0] = cpucycles();
cycles[1] = cpucycles();
cyclespersecond = cpucycles_persecond();
kernelrandombytes(randombyte,1);
preallocate();
limits();
allocate();
srandom(getpid());
/* time the full correctness-test run */
cycles[0] = cpucycles();
test();
cycles[1] = cpucycles();
checksumcycles = cycles[1] - cycles[0];
predoit();
/* first pass only samples the counter (warm-up); second pass times doit() */
for (i = 0;i <= TIMINGS;++i) {
cycles[i] = cpucycles();
}
for (i = 0;i <= TIMINGS;++i) {
cycles[i] = cpucycles();
doit();
}
/* convert samples to per-iteration deltas */
for (i = 0;i < TIMINGS;++i) cycles[i] = cycles[i + 1] - cycles[i];
/* pick j = a median element (at most half below, at most half above) */
for (j = 0;j < TIMINGS;++j) {
belowj = 0;
for (i = 0;i < TIMINGS;++i) if (cycles[i] < cycles[j]) ++belowj;
abovej = 0;
for (i = 0;i < TIMINGS;++i) if (cycles[i] > cycles[j]) ++abovej;
if (belowj * 2 < TIMINGS && abovej * 2 < TIMINGS) break;
}
/* hex-encode the first 32 bytes of the checksum state */
for (i = 0;i < 32;++i) {
checksum_hex[2 * i] = "0123456789abcdef"[15 & (checksum_state[i] >> 4)];
checksum_hex[2 * i + 1] = "0123456789abcdef"[15 & checksum_state[i]];
}
checksum_hex[2 * i] = 0;
printword(checksum_hex);
printnum(cycles[j]);
printnum(checksumcycles);
printnum(cyclespersecond);
printword(primitiveimplementation);
printf("\n");
return 0;
}

View file

@ -0,0 +1,242 @@
/*
* crypto_aead/try.c version 20200406
* D. J. Bernstein
* Public domain.
* Auto-generated by trygen.py; do not edit.
*/
#include "crypto_aead.h"
#include "try.h"
const char *primitiveimplementation = crypto_aead_IMPLEMENTATION;
#define TUNE_BYTES 1536
#ifdef SMALL
#define MAXTEST_BYTES 128
#else
#define MAXTEST_BYTES 4096
#endif
#ifdef SMALL
#define LOOPS 64
#else
#define LOOPS 512
#endif
static unsigned char *k;
static unsigned char *s;
static unsigned char *p;
static unsigned char *a;
static unsigned char *m;
static unsigned char *c;
static unsigned char *t;
static unsigned char *r;
static unsigned char *k2;
static unsigned char *s2;
static unsigned char *p2;
static unsigned char *a2;
static unsigned char *m2;
static unsigned char *c2;
static unsigned char *t2;
static unsigned char *r2;
#define klen crypto_aead_KEYBYTES
#define slen crypto_aead_NSECBYTES
#define plen crypto_aead_NPUBBYTES
unsigned long long alen;
unsigned long long mlen;
unsigned long long clen;
unsigned long long tlen;
#define rlen crypto_aead_NSECBYTES
void preallocate(void)
{
}
void allocate(void)
{
unsigned long long alloclen = 0;
if (alloclen < TUNE_BYTES) alloclen = TUNE_BYTES;
if (alloclen < MAXTEST_BYTES + crypto_aead_ABYTES) alloclen = MAXTEST_BYTES + crypto_aead_ABYTES;
if (alloclen < crypto_aead_KEYBYTES) alloclen = crypto_aead_KEYBYTES;
if (alloclen < crypto_aead_NSECBYTES) alloclen = crypto_aead_NSECBYTES;
if (alloclen < crypto_aead_NPUBBYTES) alloclen = crypto_aead_NPUBBYTES;
if (alloclen < crypto_aead_NSECBYTES) alloclen = crypto_aead_NSECBYTES;
k = alignedcalloc(alloclen);
s = alignedcalloc(alloclen);
p = alignedcalloc(alloclen);
a = alignedcalloc(alloclen);
m = alignedcalloc(alloclen);
c = alignedcalloc(alloclen);
t = alignedcalloc(alloclen);
r = alignedcalloc(alloclen);
k2 = alignedcalloc(alloclen);
s2 = alignedcalloc(alloclen);
p2 = alignedcalloc(alloclen);
a2 = alignedcalloc(alloclen);
m2 = alignedcalloc(alloclen);
c2 = alignedcalloc(alloclen);
t2 = alignedcalloc(alloclen);
r2 = alignedcalloc(alloclen);
}
void predoit(void)
{
}
void doit(void)
{
crypto_aead_encrypt(c,&clen,m,TUNE_BYTES,a,TUNE_BYTES,s,p,k);
crypto_aead_decrypt(t,&tlen,r,c,clen,a,TUNE_BYTES,p,k);
}
void test(void)
{
unsigned long long loop;
for (loop = 0;loop < LOOPS;++loop) {
mlen = myrandom() % (MAXTEST_BYTES + 1);
alen = myrandom() % (MAXTEST_BYTES + 1);
clen = mlen + crypto_aead_ABYTES;
output_prepare(c2,c,clen);
input_prepare(m2,m,mlen);
input_prepare(a2,a,alen);
input_prepare(s2,s,slen);
input_prepare(p2,p,plen);
input_prepare(k2,k,klen);
if (crypto_aead_encrypt(c,&clen,m,mlen,a,alen,s,p,k) != 0) fail("crypto_aead_encrypt returns nonzero");
if (clen < mlen) fail("crypto_aead_encrypt returns smaller output than input");
if (clen > mlen + crypto_aead_ABYTES) fail("crypto_aead_encrypt returns more than crypto_aead_ABYTES extra bytes");
checksum(c,clen);
output_compare(c2,c,clen,"crypto_aead_encrypt");
input_compare(m2,m,mlen,"crypto_aead_encrypt");
input_compare(a2,a,alen,"crypto_aead_encrypt");
input_compare(s2,s,slen,"crypto_aead_encrypt");
input_compare(p2,p,plen,"crypto_aead_encrypt");
input_compare(k2,k,klen,"crypto_aead_encrypt");
double_canary(c2,c,clen);
double_canary(m2,m,mlen);
double_canary(a2,a,alen);
double_canary(s2,s,slen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_encrypt(c2,&clen,m2,mlen,a2,alen,s2,p2,k2) != 0) fail("crypto_aead_encrypt returns nonzero");
if (memcmp(c2,c,clen) != 0) fail("crypto_aead_encrypt is nondeterministic");
#if crypto_aead_NOOVERLAP == 1
#else
double_canary(c2,c,clen);
double_canary(m2,m,mlen);
double_canary(a2,a,alen);
double_canary(s2,s,slen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_encrypt(m2,&clen,m2,mlen,a,alen,s,p,k) != 0) fail("crypto_aead_encrypt with m=c overlap returns nonzero");
if (memcmp(m2,c,clen) != 0) fail("crypto_aead_encrypt does not handle m=c overlap");
memcpy(m2,m,mlen);
if (crypto_aead_encrypt(a2,&clen,m,mlen,a2,alen,s,p,k) != 0) fail("crypto_aead_encrypt with a=c overlap returns nonzero");
if (memcmp(a2,c,clen) != 0) fail("crypto_aead_encrypt does not handle a=c overlap");
memcpy(a2,a,alen);
if (crypto_aead_encrypt(s2,&clen,m,mlen,a,alen,s2,p,k) != 0) fail("crypto_aead_encrypt with s=c overlap returns nonzero");
if (memcmp(s2,c,clen) != 0) fail("crypto_aead_encrypt does not handle s=c overlap");
memcpy(s2,s,slen);
if (crypto_aead_encrypt(p2,&clen,m,mlen,a,alen,s,p2,k) != 0) fail("crypto_aead_encrypt with p=c overlap returns nonzero");
if (memcmp(p2,c,clen) != 0) fail("crypto_aead_encrypt does not handle p=c overlap");
memcpy(p2,p,plen);
if (crypto_aead_encrypt(k2,&clen,m,mlen,a,alen,s,p,k2) != 0) fail("crypto_aead_encrypt with k=c overlap returns nonzero");
if (memcmp(k2,c,clen) != 0) fail("crypto_aead_encrypt does not handle k=c overlap");
memcpy(k2,k,klen);
#endif
tlen = clen;
output_prepare(t2,t,tlen);
output_prepare(r2,r,rlen);
memcpy(c2,c,clen);
double_canary(c2,c,clen);
memcpy(a2,a,alen);
double_canary(a2,a,alen);
memcpy(p2,p,plen);
double_canary(p2,p,plen);
memcpy(k2,k,klen);
double_canary(k2,k,klen);
if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt returns nonzero");
if (tlen != mlen) fail("crypto_aead_decrypt does not match mlen");
if (memcmp(t,m,mlen) != 0) fail("crypto_aead_decrypt does not match m");
if (memcmp(r,s,slen) != 0) fail("crypto_aead_decrypt does not match s");
checksum(t,tlen);
checksum(r,rlen);
output_compare(t2,t,clen,"crypto_aead_decrypt");
output_compare(r2,r,rlen,"crypto_aead_decrypt");
input_compare(c2,c,clen,"crypto_aead_decrypt");
input_compare(a2,a,alen,"crypto_aead_decrypt");
input_compare(p2,p,plen,"crypto_aead_decrypt");
input_compare(k2,k,klen,"crypto_aead_decrypt");
double_canary(t2,t,tlen);
double_canary(r2,r,rlen);
double_canary(c2,c,clen);
double_canary(a2,a,alen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_decrypt(t2,&tlen,r2,c2,clen,a2,alen,p2,k2) != 0) fail("crypto_aead_decrypt returns nonzero");
if (memcmp(t2,t,tlen) != 0) fail("crypto_aead_decrypt is nondeterministic");
if (memcmp(r2,r,rlen) != 0) fail("crypto_aead_decrypt is nondeterministic");
#if crypto_aead_NOOVERLAP == 1
#else
double_canary(t2,t,tlen);
double_canary(r2,r,rlen);
double_canary(c2,c,clen);
double_canary(a2,a,alen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_decrypt(c2,&tlen,r,c2,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt with c=t overlap returns nonzero");
if (memcmp(c2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle c=t overlap");
memcpy(c2,c,clen);
if (crypto_aead_decrypt(a2,&tlen,r,c,clen,a2,alen,p,k) != 0) fail("crypto_aead_decrypt with a=t overlap returns nonzero");
if (memcmp(a2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle a=t overlap");
memcpy(a2,a,alen);
if (crypto_aead_decrypt(p2,&tlen,r,c,clen,a,alen,p2,k) != 0) fail("crypto_aead_decrypt with p=t overlap returns nonzero");
if (memcmp(p2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle p=t overlap");
memcpy(p2,p,plen);
if (crypto_aead_decrypt(k2,&tlen,r,c,clen,a,alen,p,k2) != 0) fail("crypto_aead_decrypt with k=t overlap returns nonzero");
if (memcmp(k2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle k=t overlap");
memcpy(k2,k,klen);
#endif
#if crypto_aead_NOOVERLAP == 1
#else
double_canary(t2,t,tlen);
double_canary(r2,r,rlen);
double_canary(c2,c,clen);
double_canary(a2,a,alen);
double_canary(p2,p,plen);
double_canary(k2,k,klen);
if (crypto_aead_decrypt(t,&tlen,c2,c2,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt with c=r overlap returns nonzero");
if (memcmp(c2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle c=r overlap");
memcpy(c2,c,clen);
if (crypto_aead_decrypt(t,&tlen,a2,c,clen,a2,alen,p,k) != 0) fail("crypto_aead_decrypt with a=r overlap returns nonzero");
if (memcmp(a2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle a=r overlap");
memcpy(a2,a,alen);
if (crypto_aead_decrypt(t,&tlen,p2,c,clen,a,alen,p2,k) != 0) fail("crypto_aead_decrypt with p=r overlap returns nonzero");
if (memcmp(p2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle p=r overlap");
memcpy(p2,p,plen);
if (crypto_aead_decrypt(t,&tlen,k2,c,clen,a,alen,p,k2) != 0) fail("crypto_aead_decrypt with k=r overlap returns nonzero");
if (memcmp(k2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle k=r overlap");
memcpy(k2,k,klen);
#endif
c[myrandom() % clen] += 1 + (myrandom() % 255);
if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0)
if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0))
fail("crypto_aead_decrypt allows trivial forgeries");
c[myrandom() % clen] += 1 + (myrandom() % 255);
if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0)
if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0))
fail("crypto_aead_decrypt allows trivial forgeries");
c[myrandom() % clen] += 1 + (myrandom() % 255);
if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0)
if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0))
fail("crypto_aead_decrypt allows trivial forgeries");
}
}

View file

@ -0,0 +1,21 @@
#include <stdlib.h>
#include <string.h>
/* provided by try.c: */
extern const char *primitiveimplementation;
extern void preallocate(void);
extern void allocate(void); /* fix: removed stray ';;' (an empty file-scope declaration is invalid in strict C) */
extern void test(void);
extern void predoit(void);
extern void doit(void);
/* provided by try-anything.c: */
extern void fail(const char *);
extern unsigned char *alignedcalloc(unsigned long long);
extern void checksum(const unsigned char *,unsigned long long);
extern void double_canary(unsigned char *,unsigned char *,unsigned long long);
extern void input_prepare(unsigned char *,unsigned char *,unsigned long long);
extern void output_prepare(unsigned char *,unsigned char *,unsigned long long);
extern void input_compare(const unsigned char *,const unsigned char *,unsigned long long,const char *);
extern void output_compare(const unsigned char *,const unsigned char *,unsigned long long,const char *);
extern unsigned long long myrandom(void);

View file

@ -0,0 +1,24 @@
#include "crypto_verify.h"
/* Constant-time comparison of 16 bytes: returns 0 iff x equals y,
   -1 otherwise, without any data-dependent branch. */
int crypto_verify(const unsigned char *x,const unsigned char *y)
{
unsigned int differentbits = 0;
int i;
for (i = 0;i < 16;++i) differentbits |= x[i] ^ y[i];
/* d == 0   -> (d-1)>>8 has bit 0 set -> returns 0
   1<=d<256 -> (d-1)>>8 has bit 0 clear -> returns -1 */
return (1 & ((differentbits - 1) >> 8)) - 1;
}

View file

@ -1,17 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include "osfreq.c"
long long cpucycles_riscv(void)
{
unsigned long long result;
asm volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax"
: "=a" (result) :: "%rdx");
return result;
}
long long cpucycles_riscv_persecond(void)
{
return osfreq();
}

View file

@ -1,93 +0,0 @@
static double osfreq(void)
{
FILE *f;
char *x;
double result;
int s;
f = fopen("/etc/cpucyclespersecond", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return result;
}
f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return 1000.0 * result;
}
f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return 1000.0 * result;
}
f = fopen("/sys/devices/system/cpu/cpu0/clock_tick", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return result;
}
f = fopen("/proc/cpuinfo","r");
if (f) {
for (;;) {
s = fscanf(f,"cpu MHz : %lf",&result);
if (s > 0) break;
if (s == 0) s = fscanf(f,"%*[^\n]\n");
if (s < 0) { result = 0; break; }
}
fclose(f);
if (result) return 1000000.0 * result;
}
f = fopen("/proc/cpuinfo","r");
if (f) {
for (;;) {
s = fscanf(f,"clock : %lf",&result);
if (s > 0) break;
if (s == 0) s = fscanf(f,"%*[^\n]\n");
if (s < 0) { result = 0; break; }
}
fclose(f);
if (result) return 1000000.0 * result;
}
f = popen("sysctl hw.cpufrequency 2>/dev/null","r");
if (f) {
s = fscanf(f,"hw.cpufrequency: %lf",&result);
pclose(f);
if (s > 0) if (result > 0) return result;
}
f = popen("/usr/sbin/lsattr -E -l proc0 -a frequency 2>/dev/null","r");
if (f) {
s = fscanf(f,"frequency %lf",&result);
pclose(f);
if (s > 0) return result;
}
f = popen("/usr/sbin/psrinfo -v 2>/dev/null","r");
if (f) {
for (;;) {
s = fscanf(f," The %*s processor operates at %lf MHz",&result);
if (s > 0) break;
if (s == 0) s = fscanf(f,"%*[^\n]\n");
if (s < 0) { result = 0; break; }
}
pclose(f);
if (result) return 1000000.0 * result;
}
x = getenv("cpucyclespersecond");
if (x) {
s = sscanf(x,"%lf",&result);
if (s > 0) return result;
}
return 0;
}

View file

@ -249,190 +249,9 @@ static inline int64_t _rv64_clmulh(int64_t rs1, int64_t rs2)
/* this is basically Supercop's crypto_aead/aes256gcmv1/dolbeau/aesenc-int,
but without the unrolling.
So we have a thin compatibility layer to SSE's __m128i data format
and associated instructions to support GHASH & the full algo.
*/
/* ouch */
typedef struct {
uint64_t l;
uint64_t h;
} __m128i;
//#define _mm_loadu_si128(a) (*(const __m128i*)a)
static inline __m128i _mm_loadu_si128(const __m128i *ptr) {
__m128i r;
r.l = ((const uint64_t*)ptr)[0];
r.h = ((const uint64_t*)ptr)[1];
return r;
}
//#define _mm_storeu_si128(x,a) (*(__m128i*)x)=a
static inline void _mm_storeu_si128(__m128i *ptr, const __m128i data) {
((uint64_t*)ptr)[0] = data.l;
((uint64_t*)ptr)[1] = data.h;
}
static inline __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i b, const int x) {
__m128i r;
switch (x) {
case 0x00:
r.l = _rv64_clmul(a.l, b.l);
r.h = _rv64_clmulh(a.l, b.l);
break;
case 0x01:
r.l = _rv64_clmul(a.l, b.h);
r.h = _rv64_clmulh(a.l, b.h);
break;
case 0x10:
r.l = _rv64_clmul(a.h, b.l);
r.h = _rv64_clmulh(a.h, b.l);
break;
case 0x11:
r.l = _rv64_clmul(a.h, b.h);
r.h = _rv64_clmulh(a.h, b.h);
break;
}
return r;
}
/*
static inline __m128i (const __m128i a, const __m128i b) {
__m128i r;
return r;
}
*/
static inline __m128i _mm_xor_si128(const __m128i a, const __m128i b) {
__m128i r;
r.l = a.l ^ b.l;
r.h = a.h ^ b.h;
return r;
}
static inline __m128i _mm_or_si128(const __m128i a, const __m128i b) {
__m128i r;
r.l = a.l | b.l;
r.h = a.h | b.h;
return r;
}
static inline __m128i _mm_and_si128(const __m128i a, const __m128i b) {
__m128i r;
r.l = a.l & b.l;
r.h = a.h & b.h;
return r;
}
static inline __m128i _mm_slli_si128(const __m128i a, const int b) {
__m128i r;
switch (b) {
case 4:
r.l = a.l << 32;
r.h = a.h << 32 | a.l >> 32;
break;
case 8:
r.l = 0;
r.h = a.l;
break;
case 12:
r.l = 0;
r.h = a.l << 32;
break;
}
return r;
}
static inline __m128i _mm_srli_si128(const __m128i a, const int b) {
__m128i r;
switch (b) {
case 4:
r.l = a.l >> 32 | a.h << 32;
r.h = a.h >> 32;
break;
case 8:
r.l = a.h;
r.h = 0;
break;
case 12:
r.l = a.h >> 32;
r.h = 0;
break;
}
return r;
}
static inline __m128i _mm_srli_epi32(const __m128i a, const int b) {
__m128i r;
r.l = ((a.l & 0x00000000FFFFFFFFull) >> b) | (((a.l & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull);
r.h = ((a.h & 0x00000000FFFFFFFFull) >> b) | (((a.h & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull);
return r;
}
static inline __m128i _mm_slli_epi32(const __m128i a, const int b) {
__m128i r;
r.l = (((a.l & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.l & 0xFFFFFFFF00000000ull) << b);
r.h = (((a.h & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.h & 0xFFFFFFFF00000000ull) << b);
return r;
}
static inline __m128i _mm_insert_epi64(const __m128i a, const uint64_t x, const int b) {
__m128i r;
if (b == 0) {
r.l = x;
r.h = a.h;
} else {
r.l = a.l;
r.h = x;
}
return r;
}
static inline __m128i _mm_setzero_si128(void) {
__m128i r;
r.l = 0;
r.h = 0;
return r;
}
static inline __m128i _mm_set1_epi32(const uint32_t x) {
__m128i r;
r.l = x | ((uint64_t)x) << 32;
r.h = x | ((uint64_t)x) << 32;
return r;
}
static inline uint64_t bytereverse64(const uint64_t a) {
uint64_t r;
r = (uint32_t)_rv32_grev((a>>32), 24) | (((uint64_t)_rv32_grev((a&0xFFFFFFFF), 24))<<32);
return r;
}
static inline __m128i bytereverse128(const __m128i a) {
__m128i r;
r.l = bytereverse64(a.h);
r.h = bytereverse64(a.l);
return r;
}
static inline uint64_t bitreverse64(const uint64_t a) {
uint64_t r;
r = (uint32_t)_rv32_grev((a&0xFFFFFFFF), 7) | (((uint64_t)_rv32_grev((a>>32), 7))<<32);
return r;
}
static inline __m128i bitreverse128(const __m128i a) {
__m128i r;
r.l = bitreverse64(a.l);
r.h = bitreverse64(a.h);
return r;
}
static inline uint64_t wordreverse64(const uint64_t a) {
uint64_t r;
r = (a>>32)|(a<<32);
return r;
}
static inline __m128i wordreverse128(const __m128i a) {
__m128i r;
r.l = wordreverse64(a.h);
r.h = wordreverse64(a.l);
return r;
}
static inline __m128i doublewordreverse128(const __m128i a) {
__m128i r;
r.l = a.h;
r.h = a.l;
return r;
}
#include "m128_compat.h"
static inline void addmul_rv(unsigned char *c,
const unsigned char *a, int xlen,

241
m128_compat.h Normal file
View file

@ -0,0 +1,241 @@
/*
* A thin compatibility layer to SSE's __m128i data format
* and associated instructions to support GHASH & the full algo.
*/
#ifndef __M128_COMPAT_H__
#define __M128_COMPAT_H__
#include "new_instructions_support_b.h"
#include <stdio.h>
/* ouch */
/* SSE 128-bit vector emulated as two 64-bit halves:
   l = low quadword (bits 0..63), h = high quadword (bits 64..127). */
typedef struct {
uint64_t l;
uint64_t h;
} __m128i;
/* Unaligned 128-bit load (alignment is irrelevant in this emulation). */
static inline __m128i _mm_loadu_si128(const __m128i *ptr) {
const uint64_t *words = (const uint64_t *)ptr;
__m128i r = { words[0], words[1] };
return r;
}
/* Unaligned 128-bit store. */
static inline void _mm_storeu_si128(__m128i *ptr, const __m128i data) {
uint64_t *words = (uint64_t *)ptr;
words[0] = data.l;
words[1] = data.h;
}
/* Aligned 128-bit store — identical to the unaligned store here. */
static inline void _mm_store_si128(__m128i *ptr, const __m128i data) {
uint64_t *words = (uint64_t *)ptr;
words[0] = data.l;
words[1] = data.h;
}
/* Store only the low 64 bits of data (SSE MOVQ-to-memory). */
static inline void _mm_storel_epi64 (__m128i *ptr, const __m128i data) {
((uint64_t*)ptr)[0] = data.l;
}
/* Emulate PCLMULQDQ: carry-less 64x64->128 multiply of one half of a
   by one half of b, selected by the immediate x; the low/high product
   words come from the RV64B clmul/clmulh pair.
   NOTE(review): the 0x01/0x10 mapping here is swapped relative to the
   Intel definition (Intel 0x01 selects a's HIGH half and b's low).
   This is harmless when the two cross products are XORed together, as
   in GHASH-style code, but verify before any asymmetric use.
   Fix: r is now zero-initialized and unknown selectors are reported,
   instead of returning an uninitialized value (undefined behavior). */
static inline __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i b, const int x) {
__m128i r = { 0, 0 };
switch (x) {
case 0x00:
r.l = _rv64_clmul(a.l, b.l);
r.h = _rv64_clmulh(a.l, b.l);
break;
case 0x01:
r.l = _rv64_clmul(a.l, b.h);
r.h = _rv64_clmulh(a.l, b.h);
break;
case 0x10:
r.l = _rv64_clmul(a.h, b.l);
r.h = _rv64_clmulh(a.h, b.l);
break;
case 0x11:
r.l = _rv64_clmul(a.h, b.h);
r.h = _rv64_clmulh(a.h, b.h);
break;
default:
/* fail loudly, matching the other helpers in this header */
fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, x);
break;
}
return r;
}
/*
static inline __m128i (const __m128i a, const __m128i b) {
__m128i r;
return r;
}
*/
/* Bitwise XOR of two 128-bit values. */
static inline __m128i _mm_xor_si128(const __m128i a, const __m128i b) {
__m128i r = { a.l ^ b.l, a.h ^ b.h };
return r;
}
/* Bitwise OR of two 128-bit values. */
static inline __m128i _mm_or_si128(const __m128i a, const __m128i b) {
__m128i r = { a.l | b.l, a.h | b.h };
return r;
}
/* Bitwise AND of two 128-bit values. */
static inline __m128i _mm_and_si128(const __m128i a, const __m128i b) {
__m128i r = { a.l & b.l, a.h & b.h };
return r;
}
/* Emulate PSLLDQ: shift the 128-bit value left by b BYTES. Only the
   shift counts this codebase uses (4, 8, 12) are implemented; other
   counts are reported on stderr.
   Fix: r is zero-initialized so the unimplemented path returns a
   defined value instead of an uninitialized one (undefined behavior). */
static inline __m128i _mm_slli_si128(const __m128i a, const int b) {
__m128i r = { 0, 0 };
switch (b) {
case 4:
r.l = a.l << 32;
r.h = a.h << 32 | a.l >> 32;
break;
case 8:
r.l = 0;
r.h = a.l;
break;
case 12:
r.l = 0;
r.h = a.l << 32;
break;
default:
fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, b);
break;
}
return r;
}
/* Emulate PSRLDQ: shift the 128-bit value right by b BYTES. Only the
   shift counts this codebase uses (1, 4, 8, 12) are implemented; other
   counts are reported on stderr.
   Fix: r is zero-initialized so the unimplemented path returns a
   defined value instead of an uninitialized one (undefined behavior). */
static inline __m128i _mm_srli_si128(const __m128i a, const int b) {
__m128i r = { 0, 0 };
switch (b) {
case 1:
r.l = a.l >> 8 | a.h << 56;
r.h = a.h >> 8;
break;
case 4:
r.l = a.l >> 32 | a.h << 32;
r.h = a.h >> 32;
break;
case 8:
r.l = a.h;
r.h = 0;
break;
case 12:
r.l = a.h >> 32;
r.h = 0;
break;
default:
fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, b);
break;
}
return r;
}
/* Logical right shift of each of the four 32-bit lanes by b (0..31). */
static inline __m128i _mm_srli_epi32(const __m128i a, const int b) {
const uint32_t l0 = (uint32_t)a.l, l1 = (uint32_t)(a.l >> 32);
const uint32_t h0 = (uint32_t)a.h, h1 = (uint32_t)(a.h >> 32);
__m128i r;
r.l = (uint64_t)(l0 >> b) | ((uint64_t)(l1 >> b) << 32);
r.h = (uint64_t)(h0 >> b) | ((uint64_t)(h1 >> b) << 32);
return r;
}
/* Logical left shift of each of the four 32-bit lanes by b (0..31);
   bits shifted past a lane boundary are discarded. */
static inline __m128i _mm_slli_epi32(const __m128i a, const int b) {
const uint32_t l0 = (uint32_t)a.l, l1 = (uint32_t)(a.l >> 32);
const uint32_t h0 = (uint32_t)a.h, h1 = (uint32_t)(a.h >> 32);
__m128i r;
r.l = (uint64_t)(uint32_t)(l0 << b) | ((uint64_t)(uint32_t)(l1 << b) << 32);
r.h = (uint64_t)(uint32_t)(h0 << b) | ((uint64_t)(uint32_t)(h1 << b) << 32);
return r;
}
/* static inline __m128i _mm_srai_epi32(const __m128i a, const int b) { */
/* __m128i r; */
/* r.l = (((int32_t)(a.l & 0x00000000FFFFFFFFull)) >> b) | ((((int32_t)(a.l & 0xFFFFFFFF00000000ull)) >> b) & 0xFFFFFFFF00000000ull); */
/* r.h = (((int32_t)(a.h & 0x00000000FFFFFFFFull)) >> b) | ((((int32_t)(a.h & 0xFFFFFFFF00000000ull)) >> b) & 0xFFFFFFFF00000000ull); */
/* return r; */
/* } */
/* Return a with 64-bit lane b replaced by x (0 = low, nonzero = high). */
static inline __m128i _mm_insert_epi64(const __m128i a, const uint64_t x, const int b) {
__m128i r = a;
if (b == 0) {
r.l = x;
} else {
r.h = x;
}
return r;
}
/* The all-zero 128-bit value. */
static inline __m128i _mm_setzero_si128(void) {
__m128i r = { 0, 0 };
return r;
}
/* Broadcast x into all four 32-bit lanes. */
static inline __m128i _mm_set1_epi32(const uint32_t x) {
const uint64_t pair = (((uint64_t)x) << 32) | x;
__m128i r = { pair, pair };
return r;
}
/* Pack four 32-bit words into a vector; e0 is the least-significant lane. */
static inline __m128i _mm_set_epi32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
__m128i r = { (((uint64_t)e1) << 32) | e0, (((uint64_t)e3) << 32) | e2 };
return r;
}
/* non-intel stuff, used to replace some common use cases */
/* Full byte-swap of a 64-bit value: swap the two 32-bit words and
   byte-reverse each with grev.
   NOTE(review): assumes _rv32_grev(w, 24) byte-reverses a 32-bit word
   (RV32B rev8 semantics) — confirm against the intrinsics header. */
static inline uint64_t bytereverse64(const uint64_t a) {
uint64_t r;
r = (uint32_t)_rv32_grev((a>>32), 24) | (((uint64_t)_rv32_grev((a&0xFFFFFFFF), 24))<<32);
return r;
}
/* Reverse the byte order of the whole 128-bit value: the halves are
   swapped and each half is byte-swapped (a 16-byte endianness flip). */
static inline __m128i bytereverse128(const __m128i a) {
__m128i r;
r.l = bytereverse64(a.h);
r.h = bytereverse64(a.l);
return r;
}
/* Reverse the bits INSIDE each byte of a, keeping byte positions
   (used for GHASH bit reflection).
   NOTE(review): assumes _rv32_grev(w, 7) bit-reverses within each byte
   of a 32-bit word — confirm against the intrinsics header. */
static inline uint64_t bitreverse64(const uint64_t a) {
uint64_t r;
r = (uint32_t)_rv32_grev((a&0xFFFFFFFF), 7) | (((uint64_t)_rv32_grev((a>>32), 7))<<32);
return r;
}
/* Apply the per-byte bit reversal to both halves; unlike
   bytereverse128, byte and half order are preserved. */
static inline __m128i bitreverse128(const __m128i a) {
__m128i r;
r.l = bitreverse64(a.l);
r.h = bitreverse64(a.h);
return r;
}
/* Swap the upper and lower 32-bit words of a. */
static inline uint64_t wordreverse64(const uint64_t a) {
return (a << 32) | (a >> 32);
}
/* Reverse the order of the four 32-bit words of the 128-bit value. */
static inline __m128i wordreverse128(const __m128i a) {
__m128i r;
r.l = (a.h << 32) | (a.h >> 32);
r.h = (a.l << 32) | (a.l >> 32);
return r;
}
/* Swap the two 64-bit halves of the 128-bit value. */
static inline __m128i doublewordreverse128(const __m128i a) {
__m128i r = { a.h, a.l };
return r;
}
/* Rotate the four 32-bit lanes one position toward the high end:
   from least to most significant, (w0,w1,w2,w3) -> (w3,w0,w1,w2). */
static inline __m128i wordrotate1l128(const __m128i a) {
__m128i r;
/* i.e. epi32 _MM_SHUFFLE(2,1,0,3) */
r.l = (a.h >> 32) | (a.l << 32);
r.h = (a.l >> 32) | (a.h << 32);
return r;
}
/* Zero-extend a 16-bit value into the low lane of a 128-bit vector. */
static inline __m128i halfwordandzero(const uint16_t a) {
__m128i r = { (uint64_t)a, 0 };
return r;
}
/* Per-lane sign mask: each 32-bit lane becomes 0xFFFFFFFF if its own
   sign bit (bit 31 of the lane) is set, else 0 — equivalent to
   _mm_srai_epi32(a, 31). */
static inline __m128i wordsign128(const __m128i a) {
__m128i r;
r.l = (a.l & 0x0000000080000000ull ? 0x00000000FFFFFFFFull : 0) | (a.l & 0x8000000000000000ull ? 0xFFFFFFFF00000000ull : 0);
r.h = (a.h & 0x0000000080000000ull ? 0x00000000FFFFFFFFull : 0) | (a.h & 0x8000000000000000ull ? 0xFFFFFFFF00000000ull : 0);
return r;
}
#endif // __M128_COMPAT_H__