mirror of
https://github.com/rdolbeau/VexRiscvBPluginGenerator.git
synced 2025-04-18 18:44:42 -04:00
add a quick'n'dirty implementation of RV32BK-accelerated AES-OCB, using the _m128i compatibility layer (spun off in its own header)
This commit is contained in:
parent
431fdc5288
commit
8ad11036be
22 changed files with 1943 additions and 292 deletions
47
aeadaes256ocbtaglen128v1-rv32/Makefile
Normal file
47
aeadaes256ocbtaglen128v1-rv32/Makefile
Normal file
|
@ -0,0 +1,47 @@
|
|||
# Build the RV32 AES-256-OCB benchmark in two flavours: the full try
# harness (aeadaes256ocbtaglen128v1) and a reduced one (_small, -DSMALL).

SRCs=encrypt.c try-anything.c verify.c
OBJs=$(SRCs:.c=.o)
SCLIBS=cpucycles.o kernelrandombytes.o

# Buildroot RV32 toolchain for everything except encrypt.c ...
COMPDIR=~dolbeau2/LITEX/buildroot-rv32/output/host
# ... and a bitmanip/crypto-capable bare-metal toolchain for encrypt.c.
ALTCOMPDIR=/opt/riscv64b

CC=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-gcc
ALTCC=$(ALTCOMPDIR)/bin/riscv64-unknown-elf-gcc
CXX=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-g++
STRIP=$(COMPDIR)/bin/riscv32-buildroot-linux-gnu-strip

# encrypt.c alone is built with the 'b' extension enabled and -DRV32B.
NEWOPT=-march=rv32imab -mabi=ilp32 -I. -I.. -O3 -DRV32B #-fno-vectorize #-DUSE_EPI_CUSTOM
OPT=-march=rv32ima -mabi=ilp32 -I. -I.. -O3 #-fno-vectorize #-DUSE_EPI_CUSTOM
#ALTCC=$(CC)
#NEWOPT=$(OPT)

all: aeadaes256ocbtaglen128v1 aeadaes256ocbtaglen128v1_small

clean:
	rm -f $(OBJs) *.S try.o try_small.o encrypt.o aeadaes256ocbtaglen128v1 aeadaes256ocbtaglen128v1_small

%.o: %.c
	$(CC) $(OPT) $< -c -o $@

try.o: try.c
	$(CC) $(OPT) $< -c -o $@

try_small.o: try.c
	$(CC) $(OPT) $< -c -o $@ -DSMALL

# encrypt.c goes through an explicit .S step with the ALT toolchain so the
# generated assembly can be inspected, then is assembled separately.
encrypt.S: encrypt.c
	$(ALTCC) $(NEWOPT) $< -S -o $@

encrypt.o: encrypt.S
	$(ALTCC) $(NEWOPT) $< -c -o $@

aeadaes256ocbtaglen128v1: $(OBJs) encrypt.o try.o $(SCLIBS)
	$(CXX) $(OPT) $^ -o $@

aeadaes256ocbtaglen128v1_small: $(OBJs) encrypt.o try_small.o $(SCLIBS)
	$(CXX) $(OPT) $^ -o $@

kernelrandombytes.o: random.cpp
	$(CXX) $(OPT) $< -c -o $@

cpucycles.o: riscv.c
	$(CC) $< -march=rv32ima -mabi=ilp32 -I. -O1 -c -o $@
|
4
aeadaes256ocbtaglen128v1-rv32/api.h
Normal file
4
aeadaes256ocbtaglen128v1-rv32/api.h
Normal file
|
@ -0,0 +1,4 @@
|
|||
/* SUPERCOP API parameters for AES-256-OCB with a 128-bit tag. */
#define CRYPTO_KEYBYTES 32   /* AES-256 key */
#define CRYPTO_NSECBYTES 0   /* no secret message number */
#define CRYPTO_NPUBBYTES 12  /* 96-bit public nonce */
#define CRYPTO_ABYTES 16     /* 128-bit authentication tag */
|
28
aeadaes256ocbtaglen128v1-rv32/cpucycles.h
Normal file
28
aeadaes256ocbtaglen128v1-rv32/cpucycles.h
Normal file
|
@ -0,0 +1,28 @@
|
|||
/*
cpucycles riscv.h version 20190803
D. J. Bernstein
Romain Dolbeau
Public domain.
*/

#ifndef CPUCYCLES_riscv_h
#define CPUCYCLES_riscv_h

#ifdef __cplusplus
extern "C" {
#endif

/* Current cycle count and the counter's rate (SUPERCOP convention). */
extern long long cpucycles_riscv(void);
extern long long cpucycles_riscv_persecond(void);

#ifdef __cplusplus
}
#endif

/* Map the generic cpucycles_* names to the riscv implementation,
   unless another implementation was already selected. */
#ifndef cpucycles_implementation
#define cpucycles_implementation "riscv"
#define cpucycles cpucycles_riscv
#define cpucycles_persecond cpucycles_riscv_persecond
#endif

#endif
|
17
aeadaes256ocbtaglen128v1-rv32/crypto_aead.h
Normal file
17
aeadaes256ocbtaglen128v1-rv32/crypto_aead.h
Normal file
|
@ -0,0 +1,17 @@
|
|||
#ifndef crypto_aead_H
#define crypto_aead_H

#include "crypto_aead_aeadaes256ocbtaglen128v1.h"

/* Map the generic SUPERCOP crypto_aead_* names onto the selected
   primitive, aeadaes256ocbtaglen128v1. */
#define crypto_aead_encrypt crypto_aead_aeadaes256ocbtaglen128v1_encrypt
#define crypto_aead_decrypt crypto_aead_aeadaes256ocbtaglen128v1_decrypt
#define crypto_aead_KEYBYTES crypto_aead_aeadaes256ocbtaglen128v1_KEYBYTES
#define crypto_aead_NSECBYTES crypto_aead_aeadaes256ocbtaglen128v1_NSECBYTES
#define crypto_aead_NPUBBYTES crypto_aead_aeadaes256ocbtaglen128v1_NPUBBYTES
#define crypto_aead_ABYTES crypto_aead_aeadaes256ocbtaglen128v1_ABYTES
#define crypto_aead_NOOVERLAP crypto_aead_aeadaes256ocbtaglen128v1_NOOVERLAP
#define crypto_aead_PRIMITIVE "aeadaes256ocbtaglen128v1"
#define crypto_aead_IMPLEMENTATION crypto_aead_aeadaes256ocbtaglen128v1_IMPLEMENTATION
#define crypto_aead_VERSION crypto_aead_aeadaes256ocbtaglen128v1_VERSION

#endif
|
|
@ -0,0 +1,31 @@
|
|||
#ifndef crypto_aead_aeadaes256ocbtaglen128v1_H
#define crypto_aead_aeadaes256ocbtaglen128v1_H

/* Implementation-specific sizes for this rv32 build. */
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_KEYBYTES 32
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_NSECBYTES 0
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_NPUBBYTES 12
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_ABYTES 16

#ifdef __cplusplus
extern "C" {
#endif
/* SUPERCOP entry points: (c,clen, m,mlen, ad,adlen, nsec, npub, k) and
   the corresponding decrypt signature. */
extern int crypto_aead_aeadaes256ocbtaglen128v1_rv32_encrypt(unsigned char *,unsigned long long *,const unsigned char *,unsigned long long,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *,const unsigned char *);
extern int crypto_aead_aeadaes256ocbtaglen128v1_rv32_decrypt(unsigned char *,unsigned long long *,unsigned char *,const unsigned char *,unsigned long long,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *);
#ifdef __cplusplus
}
#endif

/* Primitive-level names map to this implementation's names. */
#define crypto_aead_aeadaes256ocbtaglen128v1_encrypt crypto_aead_aeadaes256ocbtaglen128v1_rv32_encrypt
#define crypto_aead_aeadaes256ocbtaglen128v1_decrypt crypto_aead_aeadaes256ocbtaglen128v1_rv32_decrypt
#define crypto_aead_aeadaes256ocbtaglen128v1_KEYBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_KEYBYTES
#define crypto_aead_aeadaes256ocbtaglen128v1_NSECBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_NSECBYTES
#define crypto_aead_aeadaes256ocbtaglen128v1_NPUBBYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_NPUBBYTES
#define crypto_aead_aeadaes256ocbtaglen128v1_ABYTES crypto_aead_aeadaes256ocbtaglen128v1_rv32_ABYTES
/* NOTE(review): ..._rv32_NOOVERLAP is never defined in this header, so the
   next macro expands to an undefined identifier if actually used — confirm
   against the SUPERCOP framework whether a definition is expected here. */
#define crypto_aead_aeadaes256ocbtaglen128v1_NOOVERLAP crypto_aead_aeadaes256ocbtaglen128v1_rv32_NOOVERLAP
#define crypto_aead_aeadaes256ocbtaglen128v1_IMPLEMENTATION "crypto_aead/aeadaes256ocbtaglen128v1/dolbeau/aesenc-int"
#ifndef crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION
#define crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION "-"
#endif
#define crypto_aead_aeadaes256ocbtaglen128v1_VERSION crypto_aead_aeadaes256ocbtaglen128v1_rv32_VERSION

#endif
|
6
aeadaes256ocbtaglen128v1-rv32/crypto_uint32.h
Normal file
6
aeadaes256ocbtaglen128v1-rv32/crypto_uint32.h
Normal file
|
@ -0,0 +1,6 @@
|
|||
#ifndef crypto_uint32_h
#define crypto_uint32_h

/* 32-bit unsigned type used by the SUPERCOP framework. */
typedef unsigned int crypto_uint32;

#endif
|
6
aeadaes256ocbtaglen128v1-rv32/crypto_uint64.h
Normal file
6
aeadaes256ocbtaglen128v1-rv32/crypto_uint64.h
Normal file
|
@ -0,0 +1,6 @@
|
|||
#ifndef crypto_uint64_h
#define crypto_uint64_h

/* 64-bit unsigned type used by the SUPERCOP framework. */
typedef unsigned long long crypto_uint64;

#endif
|
6
aeadaes256ocbtaglen128v1-rv32/crypto_uint8.h
Normal file
6
aeadaes256ocbtaglen128v1-rv32/crypto_uint8.h
Normal file
|
@ -0,0 +1,6 @@
|
|||
#ifndef crypto_uint8_h
#define crypto_uint8_h

/* 8-bit unsigned type used by the SUPERCOP framework. */
typedef unsigned char crypto_uint8;

#endif
|
12
aeadaes256ocbtaglen128v1-rv32/crypto_verify.h
Normal file
12
aeadaes256ocbtaglen128v1-rv32/crypto_verify.h
Normal file
|
@ -0,0 +1,12 @@
|
|||
#ifndef crypto_verify_H
#define crypto_verify_H

#include "crypto_verify_16.h"

/* Map the generic crypto_verify names to the 16-byte variant. */
#define crypto_verify crypto_verify_16
#define crypto_verify_BYTES crypto_verify_16_BYTES
#define crypto_verify_PRIMITIVE "16"
#define crypto_verify_IMPLEMENTATION crypto_verify_16_IMPLEMENTATION
#define crypto_verify_VERSION crypto_verify_16_VERSION

#endif
|
22
aeadaes256ocbtaglen128v1-rv32/crypto_verify_16.h
Normal file
22
aeadaes256ocbtaglen128v1-rv32/crypto_verify_16.h
Normal file
|
@ -0,0 +1,22 @@
|
|||
#ifndef crypto_verify_16_H
#define crypto_verify_16_H

#define crypto_verify_16_ref_BYTES 16

#ifdef __cplusplus
extern "C" {
#endif
/* Constant-time comparison of two 16-byte buffers: 0 if equal, nonzero otherwise. */
extern int crypto_verify_16_ref(const unsigned char *,const unsigned char *);
#ifdef __cplusplus
}
#endif

/* Primitive-level names map to the reference implementation. */
#define crypto_verify_16 crypto_verify_16_ref
#define crypto_verify_16_BYTES crypto_verify_16_ref_BYTES
#define crypto_verify_16_IMPLEMENTATION "crypto_verify/16/ref"
#ifndef crypto_verify_16_ref_VERSION
#define crypto_verify_16_ref_VERSION "-"
#endif
#define crypto_verify_16_VERSION crypto_verify_16_ref_VERSION

#endif
|
796
aeadaes256ocbtaglen128v1-rv32/encrypt.c
Normal file
796
aeadaes256ocbtaglen128v1-rv32/encrypt.c
Normal file
|
@ -0,0 +1,796 @@
|
|||
/*
|
||||
// CAESAR OCB v1 somewhat optimised code
|
||||
// Info: http://www.cs.ucdavis.edu/~rogaway/ocb
|
||||
//
|
||||
// Written by Romain Dolbeau (romain@dolbeau.org),
|
||||
// based on the reference implementation by Ted Krovetz (ted@krovetz.net).
|
||||
//
|
||||
// Phillip Rogaway holds patents relevant to OCB. See the following for
|
||||
// his free patent grant: http://www.cs.ucdavis.edu/~rogaway/ocb/grant.htm
|
||||
//
|
||||
// This is free and unencumbered software released into the public domain.
|
||||
//
|
||||
// Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||
// distribute this software, either in source code form or as a compiled
|
||||
// binary, for any purpose, commercial or non-commercial, and by any
|
||||
// means.
|
||||
//
|
||||
// In jurisdictions that recognize copyright laws, the author or authors
|
||||
// of this software dedicate any and all copyright interest in the
|
||||
// software to the public domain. We make this dedication for the benefit
|
||||
// of the public at large and to the detriment of our heirs and
|
||||
// successors. We intend this dedication to be an overt act of
|
||||
// relinquishment in perpetuity of all present and future rights to this
|
||||
// software under copyright law.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
//
|
||||
// For more information, please refer to <http://unlicense.org/>
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "api.h"
|
||||
#include "crypto_aead.h"
|
||||
#define KEYBYTES CRYPTO_KEYBYTES
|
||||
#define NONCEBYTES CRYPTO_NPUBBYTES
|
||||
#define TAGBYTES CRYPTO_ABYTES
|
||||
|
||||
#define ALIGN16 __attribute__((aligned(16)))
|
||||
#define ALIGN32 __attribute__((aligned(32)))
|
||||
#define ALIGN64 __attribute__((aligned(64)))
|
||||
#define _bswap64(a) __builtin_bswap64(a)
|
||||
#define _bswap(a) __builtin_bswap32(a)
|
||||
|
||||
/*
 * Debug helper: print a 128-bit value v as 16 hex bytes, most significant
 * first, grouped by word, labelled with prefix p and the stringized
 * expression #v.
 *
 * Fix: wrapped in do/while(0) — the original bare-brace body breaks when
 * used as `if (cond) printv16c(...); else ...` (the trailing semicolon
 * terminates the if).
 */
#define printv16c(p,v) \
  do { \
    ALIGN16 unsigned char temp[16]; \
    int z; \
    _mm_store_si128(temp, v); \
    printf("%8s:%8s = ",p,#v); \
    for (z = 15 ; z >= 0 ; z--) { \
      printf("%02hhx", temp[z]); \
      if ((z%4)==0) printf(" "); \
    } \
    printf("\n"); \
  } while (0)
|
||||
|
||||
#include "m128_compat.h"
|
||||
|
||||
#include "new_instructions_support_k.h"
|
||||
|
||||
#define rotr(a,b) _rv32_ror(a,b)
|
||||
|
||||
/*
 * AES-256 encryption key schedule using the RV32 scalar AES instructions
 * (aes32esi0..3).  Expands the eight 32-bit key words into the 60 round-key
 * words (initial whitening + 14 rounds) consumed by aes256_1Tft_encrypt.
 */
static inline void aes256_Tsetkey_encrypt(const unsigned int key[], unsigned int *aes_edrk) {
  unsigned int k0, k1, k2, k3, k4, k5, k6, k7; /* rolling 8-word key state */
  unsigned int sub;                            /* rotated word fed to the S-box steps */
  unsigned int rcon = 0x00000001;              /* AES round constant, doubled per group */
  unsigned int i;

  /* The first eight round-key words are the key itself. */
  k0 = key[0]; aes_edrk[0] = k0;
  k1 = key[1]; aes_edrk[1] = k1;
  k2 = key[2]; aes_edrk[2] = k2;
  k3 = key[3]; aes_edrk[3] = k3;
  k4 = key[4]; aes_edrk[4] = k4;
  k5 = key[5]; aes_edrk[5] = k5;
  k6 = key[6]; aes_edrk[6] = k6;
  k7 = key[7]; aes_edrk[7] = k7;

  /* Six full groups of eight words (indices 8..55). */
  for (i = 8; i < 56; /* i advanced by the stores below */ ) {
    k0 ^= rcon;
    rcon <<= 1;
    sub = rotr(k7, 8);
    /* SubWord on the rotated word, one S-box byte per instruction */
    k0 = aes32esi0(k0, sub);
    k0 = aes32esi1(k0, sub);
    k0 = aes32esi2(k0, sub);
    k0 = aes32esi3(k0, sub);

    aes_edrk[i++] = k0;
    k1 ^= k0; aes_edrk[i++] = k1;
    k2 ^= k1; aes_edrk[i++] = k2;
    k3 ^= k2; aes_edrk[i++] = k3;

    /* AES-256 extra SubWord (no rotation) in the middle of each group */
    k4 = aes32esi0(k4, k3);
    k4 = aes32esi1(k4, k3);
    k4 = aes32esi2(k4, k3);
    k4 = aes32esi3(k4, k3);

    aes_edrk[i++] = k4;
    k5 ^= k4; aes_edrk[i++] = k5;
    k6 ^= k5; aes_edrk[i++] = k6;
    k7 ^= k6; aes_edrk[i++] = k7;
  }

  /* Final partial group: only four more words (56..59) are needed. */
  k0 ^= rcon;
  rcon <<= 1;
  sub = rotr(k7, 8);
  k0 = aes32esi0(k0, sub);
  k0 = aes32esi1(k0, sub);
  k0 = aes32esi2(k0, sub);
  k0 = aes32esi3(k0, sub);

  aes_edrk[i++] = k0;
  k1 ^= k0; aes_edrk[i++] = k1;
  k2 ^= k1; aes_edrk[i++] = k2;
  k3 ^= k2; aes_edrk[i++] = k3;
}
|
||||
|
||||
/*
 * Convert an AES-256 encryption key schedule (erk, 60 words) into the
 * equivalent decryption schedule (drk): reverse the round-key order and
 * apply InvMixColumns to the 13 inner round keys.  The first and last
 * round keys are copied unchanged, only swapped.
 */
static void aes256_key_enc2dec(unsigned int *erk, unsigned int *drk)
{
  int round, word;

  /* first and last round keys: swap positions, no transform */
  for (word = 0; word < 4; word++) {
    drk[word]      = erk[word + 56];
    drk[word + 56] = erk[word];
  }

  /* inner round keys: transform and reverse order */
  for (round = 1; round < 14; round++) {
    for (word = 0; word < 4; word++) {
      unsigned int ek = erk[round * 4 + word];
      unsigned int dk;

      /* forward S-box on each byte ... */
      dk = 0;
      dk = aes32esi0(dk, ek);
      dk = aes32esi1(dk, ek);
      dk = aes32esi2(dk, ek);
      dk = aes32esi3(dk, ek);

      /* ... then inverse S-box + InvMixColumns: the S-boxes cancel,
         leaving InvMixColumns applied to the original word. */
      ek = 0;
      ek = aes32dsmi0(ek, dk);
      ek = aes32dsmi1(ek, dk);
      ek = aes32dsmi2(ek, dk);
      ek = aes32dsmi3(ek, dk);

      drk[56 - 4 * round + word] = ek;
    }
  }
}
|
||||
|
||||
/*
 * One full AES encryption round (SubBytes + ShiftRows + MixColumns +
 * AddRoundKey) on state words Y0..Y3, producing X0..X3, via the
 * aes32esmi* scalar instructions.  TAB[I..I+3] supply the round keys.
 *
 * CAUTION: I is evaluated and incremented four times, so it must be a
 * simple lvalue with no side effects; X0..X3 must not alias Y0..Y3.
 *
 * Fix: wrapped in do/while(0) so the macro is a single statement and is
 * safe in unbraced if/else.
 */
#define AES_ROUND1T(TAB,I,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
  do { \
    X0 = aes32esmi0(TAB[I++],Y0); \
    X0 = aes32esmi1(X0,Y1); \
    X0 = aes32esmi2(X0,Y2); \
    X0 = aes32esmi3(X0,Y3); \
    X1 = aes32esmi0(TAB[I++],Y1); \
    X1 = aes32esmi1(X1,Y2); \
    X1 = aes32esmi2(X1,Y3); \
    X1 = aes32esmi3(X1,Y0); \
    X2 = aes32esmi0(TAB[I++],Y2); \
    X2 = aes32esmi1(X2,Y3); \
    X2 = aes32esmi2(X2,Y0); \
    X2 = aes32esmi3(X2,Y1); \
    X3 = aes32esmi0(TAB[I++],Y3); \
    X3 = aes32esmi1(X3,Y0); \
    X3 = aes32esmi2(X3,Y1); \
    X3 = aes32esmi3(X3,Y2); \
  } while (0)
|
||||
|
||||
/* using the K + B instructions */
|
||||
/* using the K + B instructions */
/*
 * AES-256 single-block encryption (14 rounds) with the scalar AES
 * instructions.  aes_edrk holds the 60-word schedule produced by
 * aes256_Tsetkey_encrypt; input/output are 4-word (128-bit) blocks.
 */
static inline void aes256_1Tft_encrypt(const uint32_t *aes_edrk, const uint32_t *input, uint32_t *output)
{
  unsigned int s0, s1, s2, s3;      /* current state */
  unsigned int t0, t1, t2, t3;      /* next state */
  unsigned int k = 0;               /* round-key word index */
  const unsigned int n_rounds = 14; /* AES-256 */

  /* initial AddRoundKey */
  s0 = input[0] ^ aes_edrk[k++];
  s1 = input[1] ^ aes_edrk[k++];
  s2 = input[2] ^ aes_edrk[k++];
  s3 = input[3] ^ aes_edrk[k++];

  /* rounds 1..13: full rounds with MixColumns; the macro advances k by 4 */
  while (k < (n_rounds << 2)) {
    AES_ROUND1T(aes_edrk, k, t0, t1, t2, t3, s0, s1, s2, s3);
    s0 = t0;
    s1 = t1;
    s2 = t2;
    s3 = t3;
  }

  /* final round: no MixColumns (aes32esi* instead of aes32esmi*) */
  t0 = aes32esi0(aes_edrk[k], s0);
  t0 = aes32esi1(t0, s1);
  t0 = aes32esi2(t0, s2);
  t0 = aes32esi3(t0, s3);
  k++;
  t1 = aes32esi0(aes_edrk[k], s1);
  t1 = aes32esi1(t1, s2);
  t1 = aes32esi2(t1, s3);
  t1 = aes32esi3(t1, s0);
  k++;
  t2 = aes32esi0(aes_edrk[k], s2);
  t2 = aes32esi1(t2, s3);
  t2 = aes32esi2(t2, s0);
  t2 = aes32esi3(t2, s1);
  k++;
  t3 = aes32esi0(aes_edrk[k], s3);
  t3 = aes32esi1(t3, s0);
  t3 = aes32esi2(t3, s1);
  t3 = aes32esi3(t3, s2);

  output[0] = t0;
  output[1] = t1;
  output[2] = t2;
  output[3] = t3;
}
|
||||
|
||||
|
||||
/*
 * One full AES decryption round (InvSubBytes + InvShiftRows +
 * InvMixColumns + AddRoundKey) on state words Y0..Y3, producing X0..X3,
 * via the aes32dsmi* scalar instructions.  TAB[I+0..I+3] supply the round
 * keys; note the reversed (inverse ShiftRows) word order Y0,Y3,Y2,Y1.
 *
 * CAUTION: I is evaluated four times — pass a side-effect-free expression.
 * X0..X3 must not alias Y0..Y3.
 *
 * Fix: wrapped in do/while(0) so the macro is a single statement and is
 * safe in unbraced if/else.
 */
#define AES_ROUND_DKT(TAB,I,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
  do { \
    X0 = aes32dsmi0(TAB[I+0],Y0); \
    X0 = aes32dsmi1(X0,Y3); \
    X0 = aes32dsmi2(X0,Y2); \
    X0 = aes32dsmi3(X0,Y1); \
    X1 = aes32dsmi0(TAB[I+1],Y1); \
    X1 = aes32dsmi1(X1,Y0); \
    X1 = aes32dsmi2(X1,Y3); \
    X1 = aes32dsmi3(X1,Y2); \
    X2 = aes32dsmi0(TAB[I+2],Y2); \
    X2 = aes32dsmi1(X2,Y1); \
    X2 = aes32dsmi2(X2,Y0); \
    X2 = aes32dsmi3(X2,Y3); \
    X3 = aes32dsmi0(TAB[I+3],Y3); \
    X3 = aes32dsmi1(X3,Y2); \
    X3 = aes32dsmi2(X3,Y1); \
    X3 = aes32dsmi3(X3,Y0); \
  } while (0)
|
||||
|
||||
/*
 * AES-256 single-block decryption (14 rounds) with the scalar inverse-AES
 * instructions.  aes_drk holds the 60-word decryption schedule produced by
 * aes256_key_enc2dec; input/output are 4-word (128-bit) blocks.
 */
void aes256_1Tft_decrypt(const unsigned int *aes_drk, const unsigned int *input, unsigned int *output)
{
  const unsigned int aes_nr = 14; /* aes256 */
  unsigned int s0, s1, s2, s3;    /* current state */
  unsigned int t0, t1, t2, t3;    /* next state */
  unsigned int r, k;

  /* initial AddRoundKey */
  s0 = input[0] ^ aes_drk[0];
  s1 = input[1] ^ aes_drk[1];
  s2 = input[2] ^ aes_drk[2];
  s3 = input[3] ^ aes_drk[3];

  /* rounds 1..13: full inverse rounds (with InvMixColumns) */
  for (r = 1; r < aes_nr; r++) {
    AES_ROUND_DKT(aes_drk, (r << 2), t0, t1, t2, t3, s0, s1, s2, s3);
    s0 = t0;
    s1 = t1;
    s2 = t2;
    s3 = t3;
  }
  k = r << 2; /* index of the last round key group */

  /* final round: inverse SubBytes/ShiftRows only (aes32dsi*) */
  t0 = aes32dsi0(aes_drk[k+0], s0);
  t0 = aes32dsi1(t0, s3);
  t0 = aes32dsi2(t0, s2);
  t0 = aes32dsi3(t0, s1);
  t1 = aes32dsi0(aes_drk[k+1], s1);
  t1 = aes32dsi1(t1, s0);
  t1 = aes32dsi2(t1, s3);
  t1 = aes32dsi3(t1, s2);
  t2 = aes32dsi0(aes_drk[k+2], s2);
  t2 = aes32dsi1(t2, s1);
  t2 = aes32dsi2(t2, s0);
  t2 = aes32dsi3(t2, s3);
  t3 = aes32dsi0(aes_drk[k+3], s3);
  t3 = aes32dsi1(t3, s2);
  t3 = aes32dsi2(t3, s1);
  t3 = aes32dsi3(t3, s0);

  output[0] = t0;
  output[1] = t1;
  output[2] = t2;
  output[3] = t3;
}
|
||||
|
||||
|
||||
/** single, by-the-book AES encryption with AES-NI */
|
||||
/* Encrypt one 128-bit block held in an __m128i, via the word-oriented core.
   NOTE(review): relies on the compat-layer __m128i being layout-compatible
   with uint32_t[4] — confirm against m128_compat.h. */
static inline __m128i aes256_1Tft__encrypt1_si128(const __m128i nv, const __m128i rkeys[15]) {
  __m128i out;
  aes256_1Tft_encrypt(rkeys, &nv, &out);
  return out;
}
|
||||
/* Decrypt one 128-bit block held in an __m128i, via the word-oriented core.
   NOTE(review): relies on the compat-layer __m128i being layout-compatible
   with uint32_t[4] — confirm against m128_compat.h. */
static inline __m128i aes256_1Tft__decrypt1_si128(const __m128i nv, const __m128i rkeys[15]) {
  __m128i out;
  aes256_1Tft_decrypt(rkeys, &nv, &out);
  return out;
}
|
||||
|
||||
typedef unsigned char block[16];
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
#if 0
|
||||
static inline void xor_block(block d, block s1, block s2) {
|
||||
unsigned i;
|
||||
for (i=0; i<16; i++)
|
||||
d[i] = s1[i] ^ s2[i];
|
||||
}
|
||||
#else
|
||||
/* 128 bits SSE doubling */
|
||||
static inline void xor_block(unsigned char* d, const unsigned char* s1, const unsigned char* s2) {
|
||||
__m128i dv = _mm_xor_si128(_mm_loadu_si128((const __m128i*)s1), _mm_loadu_si128((const __m128i*)s2));
|
||||
_mm_storeu_si128((__m128i*)d,dv);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
#if 0
|
||||
static inline void double_block(block d, block s) {
|
||||
unsigned i;
|
||||
unsigned char tmp = s[0];
|
||||
for (i=0; i<15; i++)
|
||||
d[i] = (s[i] << 1) | (s[i+1] >> 7);
|
||||
d[15] = (s[15] << 1) ^ ((tmp >> 7) * 135);
|
||||
}
|
||||
#else
|
||||
#if 0
|
||||
/* 64 bits little-endian doubling, faster */
|
||||
static inline void double_block(unsigned long long *d, const unsigned long long* s) {
|
||||
unsigned long long sl = _bswap64(s[1]), sh = _bswap64(s[0]);
|
||||
unsigned long long sl1 = sl << 1;
|
||||
unsigned long long sh1 = sh << 1;
|
||||
sh1 |= sl>>63;
|
||||
sl1 ^= (((long long)sh>>63) & 135);
|
||||
d[1]=_bswap64(sl1);
|
||||
d[0]=_bswap64(sh1);
|
||||
}
|
||||
#else
|
||||
/* 128 bits SSE, much faster */
|
||||
/* GF(2^128) doubling of a block already in byte-reversed (little-endian)
   form: shift every 32-bit word left by one, carry the bit crossing each
   word boundary via a one-word rotate, and fold the bit shifted out of
   the top of the block back in as the polynomial 0x87. */
static inline __m128i double_block_si128_norev(const __m128i sv) {
  const __m128i poly = _mm_set_epi32(135,1,1,1);
  /* __m128i sign = _mm_srai_epi32(sv, 31); */
  __m128i sign      = wordsign128(sv);           /* per-word sign spread */
  __m128i carry     = _mm_and_si128(sign, poly); /* bits to inject */
  /* __m128i carry_rot = _mm_shuffle_epi32(carry, _MM_SHUFFLE(2,1,0,3)); */
  __m128i carry_rot = wordrotate1l128(carry);    /* move carries up one word */
  __m128i shifted   = _mm_slli_epi32(sv, 1);
  return _mm_xor_si128(carry_rot, shifted);
}
|
||||
/* GF(2^128) doubling of a big-endian block: byte-reverse, double in the
   reversed domain, reverse back. */
static inline __m128i double_block_si128(const __m128i svr) {
  /* equivalent to _mm_shuffle_epi8 with a 15..0 reversal mask */
  __m128i le      = bytereverse128(svr);
  __m128i doubled = double_block_si128_norev(le);
  return bytereverse128(doubled);
}
|
||||
static inline void double_block(unsigned char *d, const unsigned char* s) {
|
||||
__m128i sv = _mm_loadu_si128((const __m128i*)s);
|
||||
__m128i dv = double_block_si128(sv);
|
||||
_mm_storeu_si128((__m128i*)d,dv);
|
||||
}
|
||||
/* 128 bits SSE times 4 */
|
||||
static const unsigned short lk4[64] = {
|
||||
0x0000, 0x0086, 0x010c, 0x018a, 0x0218, 0x029e, 0x0314, 0x0392,
|
||||
0x0430, 0x04b6, 0x053c, 0x05ba, 0x0628, 0x06ae, 0x0724, 0x07a2,
|
||||
0x0860, 0x08e6, 0x096c, 0x09ea, 0x0a78, 0x0afe, 0x0b74, 0x0bf2,
|
||||
0x0c50, 0x0cd6, 0x0d5c, 0x0dda, 0x0e48, 0x0ece, 0x0f44, 0x0fc2,
|
||||
0x10c0, 0x1046, 0x11cc, 0x114a, 0x12d8, 0x125e, 0x13d4, 0x1352,
|
||||
0x14f0, 0x1476, 0x15fc, 0x157a, 0x16e8, 0x166e, 0x17e4, 0x1762,
|
||||
0x18a0, 0x1826, 0x19ac, 0x192a, 0x1ab8, 0x1a3e, 0x1bb4, 0x1b32,
|
||||
0x1c90, 0x1c16, 0x1d9c, 0x1d1a, 0x1e88, 0x1e0e, 0x1f84, 0x1f02
|
||||
};
|
||||
static inline __m128i double_block_2_si128_norev(const __m128i sv) {
|
||||
const __m128i mask = _mm_set_epi32(3,3,3,3);
|
||||
const int idx = _mm_extract_epi8(sv,15);
|
||||
/* __m128i sv30x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xC0)>>6],0); */
|
||||
__m128i sv30x = halfwordandzero(lk4[(idx&0xC0)>>6]);
|
||||
|
||||
__m128i sv30 = _mm_srli_epi32(sv, 30);
|
||||
__m128i sv30m = _mm_and_si128(sv30, mask);
|
||||
/* __m128i sv30ms = _mm_shuffle_epi32(sv30m, _MM_SHUFFLE(2,1,0,3)); */
|
||||
__m128i sv30ms = wordrotate1l128(sv30m);
|
||||
__m128i sv2 = _mm_slli_epi32(sv, 2);
|
||||
__m128i dv = _mm_xor_si128(sv30ms,sv2);
|
||||
__m128i final = _mm_xor_si128(dv, sv30x);
|
||||
return final;
|
||||
}
|
||||
static inline __m128i double_block_3_si128_norev(const __m128i sv) {
|
||||
const __m128i mask = _mm_set_epi32(7,7,7,7);
|
||||
const int idx = _mm_extract_epi8(sv,15);
|
||||
/* __m128i sv29x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xE0)>>5],0); */
|
||||
__m128i sv29x = halfwordandzero(lk4[(idx&0xE0)>>5]);
|
||||
|
||||
__m128i sv29 = _mm_srli_epi32(sv, 29);
|
||||
__m128i sv29m = _mm_and_si128(sv29, mask);
|
||||
/* __m128i sv29ms = _mm_shuffle_epi32(sv29m, _MM_SHUFFLE(2,1,0,3)); */
|
||||
__m128i sv29ms = wordrotate1l128(sv29m);
|
||||
__m128i sv3 = _mm_slli_epi32(sv, 3);
|
||||
__m128i dv = _mm_xor_si128(sv29ms,sv3);
|
||||
__m128i final = _mm_xor_si128(dv, sv29x);
|
||||
return final;
|
||||
}
|
||||
static inline __m128i double_block_4_si128_norev(const __m128i sv) {
|
||||
const __m128i mask = _mm_set_epi32(15,15,15,15);
|
||||
const int idx = _mm_extract_epi8(sv,15);
|
||||
/* __m128i sv28x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xF0)>>4],0); */
|
||||
__m128i sv28x = halfwordandzero(lk4[(idx&0xF0)>>4]);
|
||||
|
||||
__m128i sv28 = _mm_srli_epi32(sv, 28);
|
||||
__m128i sv28m = _mm_and_si128(sv28, mask);
|
||||
/* __m128i sv28ms = _mm_shuffle_epi32(sv28m, _MM_SHUFFLE(2,1,0,3)); */
|
||||
__m128i sv28ms = wordrotate1l128(sv28m);
|
||||
__m128i sv4 = _mm_slli_epi32(sv, 4);
|
||||
__m128i dv = _mm_xor_si128(sv28ms,sv4);
|
||||
__m128i final = _mm_xor_si128(dv, sv28x);
|
||||
return final;
|
||||
}
|
||||
static inline __m128i double_block_5_si128_norev(const __m128i sv) {
|
||||
const __m128i mask = _mm_set_epi32(31,31,31,31);
|
||||
const int idx = _mm_extract_epi8(sv,15);
|
||||
/* __m128i sv27x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xF8)>>3],0); */
|
||||
__m128i sv27x = halfwordandzero(lk4[(idx&0xF8)>>3]);
|
||||
|
||||
__m128i sv27 = _mm_srli_epi32(sv, 27);
|
||||
__m128i sv27m = _mm_and_si128(sv27, mask);
|
||||
/* __m128i sv27ms = _mm_shuffle_epi32(sv27m, _MM_SHUFFLE(2,1,0,3)); */
|
||||
__m128i sv27ms = wordrotate1l128(sv27m);
|
||||
__m128i sv5 = _mm_slli_epi32(sv, 5);
|
||||
__m128i dv = _mm_xor_si128(sv27ms,sv5);
|
||||
__m128i final = _mm_xor_si128(dv, sv27x);
|
||||
return final;
|
||||
}
|
||||
static inline __m128i double_block_6_si128_norev(const __m128i sv) {
|
||||
const __m128i mask = _mm_set_epi32(63,63,63,63);
|
||||
const int idx = _mm_extract_epi8(sv,15);
|
||||
/* __m128i sv26x = _mm_insert_epi16(_mm_setzero_si128(),lk4[(idx&0xFC)>>2],0); */
|
||||
__m128i sv26x = halfwordandzero(lk4[(idx&0xFC)>>2]);
|
||||
|
||||
__m128i sv26 = _mm_srli_epi32(sv, 26);
|
||||
__m128i sv26m = _mm_and_si128(sv26, mask);
|
||||
/* __m128i sv26ms = _mm_shuffle_epi32(sv26m, _MM_SHUFFLE(2,1,0,3)); */
|
||||
__m128i sv26ms = wordrotate1l128(sv26m);
|
||||
__m128i sv6 = _mm_slli_epi32(sv, 6);
|
||||
__m128i dv = _mm_xor_si128(sv26ms,sv6);
|
||||
__m128i final = _mm_xor_si128(dv, sv26x);
|
||||
return final;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Compute the OCB offset increment L_{ntz(j)} = double^{ntz(j)+1}(L_$),
   where ldollarvr holds L_$ as a big-endian block.  The first up to six
   doublings are collapsed into a single multi-bit doubling step. */
static inline __m128i calc_L_i_si128(const __m128i ldollarvr, const unsigned j) {
  __m128i acc = bytereverse128(ldollarvr); /* work in byte-reversed form */
  const unsigned ntz = __builtin_ctz(j);   /* trailing zeros of the block index */
  unsigned step;

  switch (ntz) {
  case 0:
    acc = double_block_si128_norev(acc);
    break;
  case 1:
    acc = double_block_2_si128_norev(acc);
    break;
  case 2:
    acc = double_block_3_si128_norev(acc);
    break;
  case 3:
    acc = double_block_4_si128_norev(acc);
    break;
  case 4:
    acc = double_block_5_si128_norev(acc);
    break;
  default:
    /* six doublings at once, then one at a time for the remainder */
    acc = double_block_6_si128_norev(acc);
    for (step = 5; step < ntz; step++)
      acc = double_block_si128_norev(acc);
    break;
  }
  return bytereverse128(acc);
}
|
||||
/* Byte-buffer wrapper: l = L_{ntz(i)} computed from ldollar (= L_$). */
static inline void calc_L_i(block l, const block ldollar, const unsigned i) {
  __m128i ld = _mm_loadu_si128((const __m128i*)ldollar);
  _mm_storeu_si128((__m128i*)l, calc_L_i_si128(ld, i));
}
|
||||
/*
 * Precompute the OCB offset table: prelv[i] = L_i = double^{i+1}(L_$) for
 * i = 0..max-1, where ldollarvr is L_$ as a big-endian block.  Doubling is
 * done in byte-reversed form; each entry is reversed back before storing.
 *
 * Bug fix: the original ended with `return bytereverse128(lv);` inside a
 * void function — a C constraint violation — so the last table entry
 * prelv[max-1] was never written (the adjacent commented-out line shows
 * the intended store).  The final entry is now stored.
 */
static inline void precompute_lv(__m128i prelv[32], const __m128i ldollarvr, const unsigned max) {
  __m128i ldollarv = bytereverse128(ldollarvr);
  unsigned i;
  __m128i lv = double_block_si128_norev(ldollarv);
  for (i = 0 ; i < max-1 ; i++) {
    prelv[i] = bytereverse128(lv);
    lv = double_block_si128_norev(lv);
  }
  prelv[i] = bytereverse128(lv);
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
/*
 * OCB "HASH" of the associated data a (abytes long): XOR of
 * ENCIPHER(K, A_i xor Offset_i) over all whole 16-byte blocks, plus a
 * 10*-padded final partial block masked with L_*.  Result -> `result`.
 * `k` is unused here; the expanded schedule aes_key is used directly.
 * prelv must hold the precomputed L_i table covering ntz of every index.
 */
static void hash(block result, const unsigned char *k,
                 unsigned char *a, unsigned abytes,
                 const __m128i lstar,
                 const __m128i prelv[32], const __m128i aes_key[15]) {
  /* Sum_0 = zeros(128), Offset_0 = zeros(128) */
  __m128i sum    = _mm_setzero_si128();
  __m128i offset = _mm_setzero_si128();
  __m128i blk;
  unsigned i;

  /* whole 16-byte blocks */
  for (i = 1; i <= abytes/16; i++, a += 16) {
    /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
    offset = _mm_xor_si128(offset, prelv[__builtin_ctz(i)]);
    /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
    blk = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)a));
    blk = aes256_1Tft__encrypt1_si128(blk, aes_key);
    sum = _mm_xor_si128(sum, blk);
  }

  /* final partial block, if any */
  abytes = abytes % 16;
  if (abytes > 0) {
    unsigned char pad[16];
    /* Offset_* = Offset_m xor L_* */
    offset = _mm_xor_si128(offset, lstar);
    /* pad = A_* || 1 || zeros(127 - bitlen(A_*)) */
    memset(pad, 0, 16);
    memcpy(pad, a, abytes);
    pad[abytes] = 0x80;
    blk = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)pad));
    /* Sum = Sum_m xor ENCIPHER(K, pad xor Offset_*) */
    sum = _mm_xor_si128(aes256_1Tft__encrypt1_si128(blk, aes_key), sum);
  }

  _mm_storeu_si128((__m128i*)result, sum);
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
static int ocb_crypt(unsigned char *out, unsigned char *k, unsigned char *n,
|
||||
unsigned char *a, unsigned abytes,
|
||||
unsigned char *in, unsigned inbytes, int encrypting) {
|
||||
__m128i prelv[32];
|
||||
__m128i aes_decrypt_key[15];
|
||||
__m128i aes_encrypt_key[15];
|
||||
block ad_hash;
|
||||
__m128i lstar, ldollar, sum, offset, ktop, pad, nonce, tag, tmp, outv;
|
||||
block nonce_b, offset_b;
|
||||
unsigned char stretch[24];
|
||||
unsigned bottom, byteshift, bitshift, i, max;
|
||||
|
||||
/* Setup AES and strip ciphertext of its tag */
|
||||
if ( ! encrypting ) {
|
||||
if (inbytes < TAGBYTES) return -1;
|
||||
inbytes -= TAGBYTES;
|
||||
}
|
||||
aes256_Tsetkey_encrypt(k, aes_encrypt_key);
|
||||
if ( ! encrypting ) {
|
||||
aes256_key_enc2dec(aes_encrypt_key, aes_decrypt_key);
|
||||
}
|
||||
|
||||
/* Key-dependent variables */
|
||||
|
||||
/* L_* = ENCIPHER(K, zeros(128)) */
|
||||
tmp = _mm_setzero_si128();
|
||||
lstar = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key);
|
||||
/* L_$ = double(L_*) */
|
||||
ldollar = double_block_si128(lstar);
|
||||
max = abytes >= inbytes ? abytes/4 : inbytes/4;
|
||||
max = (max < 2 ? 2 : max);
|
||||
/* only precompute what's really needed;
|
||||
look at the number of leading zero (to find the leftmost bit set to one)
|
||||
all trailing zero will be at the right of it so we have an upper bound
|
||||
*/
|
||||
precompute_lv(prelv,ldollar,31-__builtin_clz(max));
|
||||
|
||||
/* Nonce-dependent and per-encryption variables */
|
||||
|
||||
/* Nonce = zeros(127-bitlen(N)) || 1 || N */
|
||||
memset(nonce_b, 0, 16);
|
||||
memcpy(&nonce_b[16-NONCEBYTES],n,NONCEBYTES);
|
||||
nonce_b[0] = (unsigned char)(((TAGBYTES * 8) % 128) << 1);
|
||||
nonce_b[16-NONCEBYTES-1] |= 0x01;
|
||||
/* bottom = str2num(Nonce[123..128]) */
|
||||
bottom = nonce_b[15] & 0x3F;
|
||||
/* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */
|
||||
nonce_b[15] &= 0xC0;
|
||||
nonce = _mm_loadu_si128((const __m128i*)nonce_b);
|
||||
ktop = aes256_1Tft__encrypt1_si128(nonce, aes_encrypt_key);
|
||||
/* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */
|
||||
_mm_storeu_si128((__m128i*)stretch, ktop);
|
||||
_mm_storel_epi64((__m128i*)(stretch+16), _mm_xor_si128(_mm_srli_si128(ktop,1), ktop));
|
||||
/* Offset_0 = Stretch[1+bottom..128+bottom] */
|
||||
byteshift = bottom/8;
|
||||
bitshift = bottom%8;
|
||||
if (bitshift != 0)
|
||||
for (i=0; i<16; i++)
|
||||
offset_b[i] = (stretch[i+byteshift] << bitshift) |
|
||||
(stretch[i+byteshift+1] >> (8-bitshift));
|
||||
else
|
||||
for (i=0; i<16; i++)
|
||||
offset_b[i] = stretch[i+byteshift];
|
||||
offset = _mm_loadu_si128((const __m128i*)offset_b);
|
||||
/* Checksum_0 = zeros(128) */
|
||||
sum = _mm_xor_si128(sum,sum);
|
||||
|
||||
/* Hash associated data */
|
||||
hash(ad_hash, k, a, abytes, lstar, prelv, aes_encrypt_key);
|
||||
|
||||
/* Process any whole blocks */
|
||||
i=1;
|
||||
if (encrypting) {
|
||||
|
||||
for (; i<=inbytes/16; i++, in=in+16, out=out+16) {
|
||||
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
|
||||
tmp = prelv[__builtin_ctz(i)];
|
||||
|
||||
offset = _mm_xor_si128(offset, tmp);
|
||||
tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)in));
|
||||
|
||||
/* Checksum_i = Checksum_{i-1} xor P_i */
|
||||
sum = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), sum);
|
||||
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
|
||||
tmp = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key);
|
||||
outv = _mm_xor_si128(offset, tmp);
|
||||
_mm_storeu_si128((__m128i*)out, outv);
|
||||
}
|
||||
} else {
|
||||
|
||||
for (; i<=inbytes/16; i++, in=in+16, out=out+16) {
|
||||
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
|
||||
tmp= prelv[__builtin_ctz(i)];
|
||||
offset = _mm_xor_si128(offset, tmp);
|
||||
tmp = _mm_xor_si128(offset, _mm_loadu_si128((const __m128i*)in));
|
||||
|
||||
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
|
||||
tmp = aes256_1Tft__decrypt1_si128(tmp, aes_decrypt_key);
|
||||
outv = _mm_xor_si128(offset, tmp);
|
||||
_mm_storeu_si128((__m128i*)out, outv);
|
||||
/* Checksum_i = Checksum_{i-1} xor P_i */
|
||||
sum = _mm_xor_si128(outv, sum);
|
||||
}
|
||||
}
|
||||
|
||||
/* Process any final partial block and compute raw tag */
|
||||
|
||||
inbytes = inbytes % 16; /* Bytes in final block */
|
||||
if (inbytes > 0) {
|
||||
/* Offset_* = Offset_m xor L_* */
|
||||
offset = _mm_xor_si128(offset, lstar);
|
||||
/* Pad = ENCIPHER(K, Offset_*) */
|
||||
pad = aes256_1Tft__encrypt1_si128(offset, aes_encrypt_key);
|
||||
|
||||
if (encrypting) {
|
||||
/* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
|
||||
unsigned char tmp_b[16];
|
||||
unsigned char pad_b[16];
|
||||
memset(tmp_b, 0, 16);
|
||||
memcpy(tmp_b, in, inbytes);
|
||||
tmp_b[inbytes] = 0x80;
|
||||
tmp = _mm_loadu_si128((const __m128i*)tmp_b);
|
||||
sum = _mm_xor_si128(tmp, sum);
|
||||
/* C_* = P_* xor Pad[1..bitlen(P_*)] */
|
||||
pad = _mm_xor_si128(tmp, pad);
|
||||
_mm_storeu_si128((__m128i*)pad_b, pad);
|
||||
memcpy(out, pad_b, inbytes);
|
||||
out = out + inbytes;
|
||||
} else {
|
||||
/* P_* = C_* xor Pad[1..bitlen(C_*)] */
|
||||
unsigned char tmp_b[16];
|
||||
unsigned char pad_b[16];
|
||||
_mm_storeu_si128((__m128i*)pad_b, pad);
|
||||
memcpy(tmp_b, pad_b, 16);
|
||||
memcpy(tmp_b, in, inbytes);
|
||||
xor_block(tmp_b,pad_b,tmp_b);
|
||||
tmp_b[inbytes] = 0x80;
|
||||
memcpy(out, tmp_b, inbytes);
|
||||
tmp = _mm_loadu_si128((const __m128i*)tmp_b);
|
||||
/* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
|
||||
sum = _mm_xor_si128(tmp, sum);
|
||||
in = in + inbytes;
|
||||
}
|
||||
}
|
||||
|
||||
/* Tag = ENCIPHER(K, Checksum xor Offset xor L_$) xor HASH(K,A) */
|
||||
tmp = _mm_xor_si128(sum, offset);
|
||||
tmp = _mm_xor_si128(tmp, ldollar);
|
||||
tag = aes256_1Tft__encrypt1_si128(tmp, aes_encrypt_key);
|
||||
tag = _mm_xor_si128(_mm_loadu_si128((const __m128i*)ad_hash), tag);
|
||||
|
||||
if (encrypting) {
|
||||
unsigned char tag_b[16];
|
||||
_mm_storeu_si128((__m128i*)tag_b, tag);
|
||||
memcpy(out, tag_b, TAGBYTES);
|
||||
return 0;
|
||||
} else {
|
||||
unsigned char tag_b[16];
|
||||
_mm_storeu_si128((__m128i*)tag_b, tag);
|
||||
return (memcmp(in,tag_b,TAGBYTES) ? -1 : 0); /* Check for validity */
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
/* Direction selectors for ocb_crypt(). */
#define OCB_ENCRYPT 1
#define OCB_DECRYPT 0

/*
 * OCB-encrypt pbytes of p under key k and nonce n, authenticating
 * abytes of associated data a. Writes ciphertext followed by the
 * TAGBYTES tag to c. Encryption cannot fail, so the ocb_crypt()
 * status is deliberately discarded.
 */
void ocb_encrypt(unsigned char *c, unsigned char *k, unsigned char *n,
                 unsigned char *a, unsigned abytes,
                 unsigned char *p, unsigned pbytes) {
    (void)ocb_crypt(c, k, n, a, abytes, p, pbytes, OCB_ENCRYPT);
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
int ocb_decrypt(unsigned char *p, unsigned char *k, unsigned char *n,
|
||||
unsigned char *a, unsigned abytes,
|
||||
unsigned char *c, unsigned cbytes) {
|
||||
return ocb_crypt(p, k, n, a, abytes, c, cbytes, OCB_DECRYPT);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
int crypto_aead_encrypt(
|
||||
unsigned char *c,unsigned long long *clen,
|
||||
const unsigned char *m,unsigned long long mlen,
|
||||
const unsigned char *ad,unsigned long long adlen,
|
||||
const unsigned char *nsec,
|
||||
const unsigned char *npub,
|
||||
const unsigned char *k
|
||||
)
|
||||
{
|
||||
*clen = mlen + TAGBYTES;
|
||||
ocb_crypt(c, (unsigned char *)k, (unsigned char *)npub, (unsigned char *)ad,
|
||||
adlen, (unsigned char *)m, mlen, OCB_ENCRYPT);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int crypto_aead_decrypt(
|
||||
unsigned char *m,unsigned long long *mlen,
|
||||
unsigned char *nsec,
|
||||
const unsigned char *c,unsigned long long clen,
|
||||
const unsigned char *ad,unsigned long long adlen,
|
||||
const unsigned char *npub,
|
||||
const unsigned char *k
|
||||
)
|
||||
{
|
||||
*mlen = clen - TAGBYTES;
|
||||
return ocb_crypt(m, (unsigned char *)k, (unsigned char *)npub,
|
||||
(unsigned char *)ad, adlen, (unsigned char *)c, clen, OCB_DECRYPT);
|
||||
}
|
||||
|
14
aeadaes256ocbtaglen128v1-rv32/kernelrandombytes.h
Normal file
14
aeadaes256ocbtaglen128v1-rv32/kernelrandombytes.h
Normal file
|
@ -0,0 +1,14 @@
|
|||
#ifndef kernelrandombytes_h
#define kernelrandombytes_h

#ifdef __cplusplus
extern "C" {
#endif

/* Fill the buffer with the requested number of random bytes.
   Implemented in random.cpp for this target (C++ <random>-based,
   not kernel entropy despite the name). */
extern void kernelrandombytes(unsigned char *,unsigned long long);

#ifdef __cplusplus
}
#endif

#endif
|
19
aeadaes256ocbtaglen128v1-rv32/random.cpp
Normal file
19
aeadaes256ocbtaglen128v1-rv32/random.cpp
Normal file
|
@ -0,0 +1,19 @@
|
|||
#include <random>
#include <functional>

/*
 * Pseudo-random byte source backing kernelrandombytes() for the RV32
 * benchmark build. Uses a default-seeded engine, so the stream is
 * deterministic across runs (fine for benchmarking, NOT for crypto use).
 *
 * BUGFIX: std::uniform_int_distribution<unsigned char> is undefined
 * behavior — the standard only permits short/int/long/long long and
 * their unsigned variants as IntType. Draw an unsigned int in [0,255]
 * and narrow instead. Also dropped an unused local variable.
 */
std::default_random_engine generator;
std::uniform_int_distribution<unsigned int> distribution(0,255);
auto rbyte = std::bind ( distribution, generator );

extern "C" {
/* Fill x[0..xlen-1] with pseudo-random bytes. */
void kernelrandombytes(unsigned char *x,unsigned long long xlen)
{
  while (xlen > 0) {
    *x = (unsigned char)rbyte();
    x++;
    xlen--;
  }
}
}
|
83
aeadaes256ocbtaglen128v1-rv32/riscv.c
Normal file
83
aeadaes256ocbtaglen128v1-rv32/riscv.c
Normal file
|
@ -0,0 +1,83 @@
|
|||
/*
|
||||
cpucycles/riscv.c version 20190803
|
||||
D. J. Bernstein
|
||||
Romain Dolbeau
|
||||
Public domain.
|
||||
*/
|
||||
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
/*
 * Read the RISC-V cycle counter.
 * RV64: single rdcycle. RV32: read high/low/high and retry on carry
 * (the classic split-counter read loop).
 *
 * BUGFIX: the RV32 loop used a *named* asm label "start:", which is a
 * global assembler symbol; if the compiler clones or inlines this
 * function the label is emitted twice and assembly fails with
 * "symbol already defined". Use a GCC local numeric label (0:/0b).
 */
long long cpucycles_riscv(void)
{
  long long result;
#if defined(__riscv_xlen)
#if __riscv_xlen == 64
  asm volatile("rdcycle %0" : "=r" (result));
#elif __riscv_xlen == 32
  unsigned int l, h, h2;
  asm volatile( "0:\n"
                "rdcycleh %0\n"
                "rdcycle %1\n"
                "rdcycleh %2\n"
                "bne %0, %2, 0b\n"
                : "=r" (h), "=r" (l), "=r" (h2));

  result = (((unsigned long long)h)<<32) | ((unsigned long long)l);
#else
#error "unknown __riscv_xlen"
#endif
#else // __riscv_xlen
#error "__riscv_xlen required for RISC-V support"
#endif // __riscv_xlen
  return result;
}
|
||||
|
||||
/* Wall-clock time in microseconds since the epoch. */
static long long microseconds(void)
{
  struct timeval now;
  gettimeofday(&now, (struct timezone *) 0);
  return (long long) 1000000 * now.tv_sec + now.tv_usec;
}
|
||||
|
||||
/* Estimate the cycle-counter frequency (cycles per second) by sampling
   the counter against wall-clock time. Spins until at least 10ms and
   1000 cycles have elapsed so the ratio is meaningful; returns 0 if the
   counter did not advance. */
static double guessfreq(void)
{
  long long tb0; long long us0;
  long long tb1; long long us1;

  tb0 = cpucycles_riscv();
  us0 = microseconds();
  do {
    tb1 = cpucycles_riscv();
    us1 = microseconds();
  } while (us1 - us0 < 10000 || tb1 - tb0 < 1000);
  if (tb1 <= tb0) return 0;
  tb1 -= tb0;
  us1 -= us0;
  /* cycles / seconds, with seconds = us1 * 1e-6 */
  return ((double) tb1) / (0.000001 * (double) us1);
}
|
||||
|
||||
/* Cached cycle frequency; 0 means "not yet measured". */
static long long cpufrequency = 0;

/* Measure the cycle frequency: take pairs of guesses until two agree
   within 1%, then cache their average. Gives up (leaving the cache at
   the last attempt's value) after 100 tries. */
static void init(void)
{
  double guess1;
  double guess2;
  int loop;

  for (loop = 0;loop < 100;++loop) {
    guess1 = guessfreq();
    guess2 = guessfreq();
    if (guess1 > 1.01 * guess2) continue;
    if (guess2 > 1.01 * guess1) continue;
    cpufrequency = 0.5 * (guess1 + guess2);
    break;
  }
}
|
||||
|
||||
/* Cycle-counter frequency in Hz; measured lazily on first call. */
long long cpucycles_riscv_persecond(void)
{
  if (!cpufrequency) init();
  return cpufrequency;
}
}
|
323
aeadaes256ocbtaglen128v1-rv32/try-anything.c
Normal file
323
aeadaes256ocbtaglen128v1-rv32/try-anything.c
Normal file
|
@ -0,0 +1,323 @@
|
|||
/*
|
||||
* try-anything.c version 20190729
|
||||
* D. J. Bernstein
|
||||
* Some portions adapted from TweetNaCl by Bernstein, Janssen, Lange, Schwabe.
|
||||
* Public domain.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/resource.h>
|
||||
#include "kernelrandombytes.h"
|
||||
#include "cpucycles.h"
|
||||
#include "crypto_uint8.h"
|
||||
#include "crypto_uint32.h"
|
||||
#include "crypto_uint64.h"
|
||||
#include "try.h"
|
||||
|
||||
typedef crypto_uint8 u8;
|
||||
typedef crypto_uint32 u32;
|
||||
typedef crypto_uint64 u64;
|
||||
|
||||
#define FOR(i,n) for (i = 0;i < n;++i)
|
||||
|
||||
/* Rotate the 32-bit value x left by c bits (callers use 0 < c < 32). */
static u32 L32(u32 x,int c)
{
  u32 upper = x << c;
  u32 lower = (x & 0xffffffff) >> (32 - c);
  return upper | lower;
}
|
||||
|
||||
static u32 ld32(const u8 *x)
|
||||
{
|
||||
u32 u = x[3];
|
||||
u = (u<<8)|x[2];
|
||||
u = (u<<8)|x[1];
|
||||
return (u<<8)|x[0];
|
||||
}
|
||||
|
||||
/* Store u into x as a 32-bit little-endian word. */
static void st32(u8 *x,u32 u)
{
  x[0] = (u8) u;
  x[1] = (u8) (u >> 8);
  x[2] = (u8) (u >> 16);
  x[3] = (u8) (u >> 24);
}
|
||||
|
||||
static const u8 sigma[17] = "expand 32-byte k";
|
||||
|
||||
/* Salsa20 core: expand the 32-byte key k and 16-byte input in into a
   64-byte output block. The 20 iterations each apply the quarterround
   to the four columns and write the results back permuted, which
   alternates column and row rounds across iterations. */
static void core(u8 *out,const u8 *in,const u8 *k)
{
  u32 w[16],x[16],y[16],t[4];
  int i,j,m;

  /* Lay out the initial 4x4 state: sigma on the diagonal, key halves
     and input interleaved per the Salsa20 specification. */
  FOR(i,4) {
    x[5*i] = ld32(sigma+4*i);
    x[1+i] = ld32(k+4*i);
    x[6+i] = ld32(in+4*i);
    x[11+i] = ld32(k+16+4*i);
  }

  /* Keep a copy of the initial state for the final feed-forward. */
  FOR(i,16) y[i] = x[i];

  FOR(i,20) {
    FOR(j,4) {
      /* Gather one column, apply the quarterround... */
      FOR(m,4) t[m] = x[(5*j+4*m)%16];
      t[1] ^= L32(t[0]+t[3], 7);
      t[2] ^= L32(t[1]+t[0], 9);
      t[3] ^= L32(t[2]+t[1],13);
      t[0] ^= L32(t[3]+t[2],18);
      /* ...and scatter it back transposed. */
      FOR(m,4) w[4*j+(j+m)%4] = t[m];
    }
    FOR(m,16) x[m] = w[m];
  }

  /* Feed-forward: add initial state, serialize little-endian. */
  FOR(i,16) st32(out + 4 * i,x[i] + y[i]);
}
|
||||
|
||||
/* Salsa20 stream generator: write b keystream bytes to c using 8-byte
   nonce n and 32-byte key k. The 16-byte core input is nonce || 64-bit
   little-endian block counter. */
static void salsa20(u8 *c,u64 b,const u8 *n,const u8 *k)
{
  u8 z[16],x[64];
  u32 u,i;
  if (!b) return;
  FOR(i,16) z[i] = 0;
  FOR(i,8) z[i] = n[i];
  while (b >= 64) {
    core(x,z,k);
    FOR(i,64) c[i] = x[i];
    /* Increment the 64-bit block counter in z[8..15] with carry. */
    u = 1;
    for (i = 8;i < 16;++i) {
      u += (u32) z[i];
      z[i] = u;
      u >>= 8;
    }
    b -= 64;
    c += 64;
  }
  /* Final partial block: generate 64 bytes, emit only b of them. */
  if (b) {
    core(x,z,k);
    FOR(i,b) c[i] = x[i];
  }
}
|
||||
|
||||
/* Increment the 8-byte little-endian counter n, propagating the carry
   until a byte does not wrap to zero (wraps silently at 2^64). */
static void increment(u8 *n)
{
  int i;
  for (i = 0;i < 8;++i)
    if (++n[i]) break;
}
|
||||
|
||||
/* Deterministic test-input generator: each call emits the next xlen
   bytes of a Salsa20 stream under a fixed key, advancing a static
   nonce so successive calls produce fresh, reproducible data. */
static void testvector(unsigned char *x,unsigned long long xlen)
{
  const static unsigned char testvector_k[33] = "generate inputs for test vectors";
  static unsigned char testvector_n[8];
  salsa20(x,xlen,testvector_n,testvector_k);
  increment(testvector_n);
}
|
||||
|
||||
/* Deterministic 64-bit value drawn from the testvector stream,
   assembled little-endian from 8 bytes. */
unsigned long long myrandom(void)
{
  unsigned char x[8];
  unsigned long long result = 0;
  int i;
  testvector(x,8);
  for (i = 7;i >= 0;--i)
    result = (result << 8) | x[i];
  return result;
}
|
||||
|
||||
/* Fill x with xlen bytes of a reproducible "canary" pattern (a second,
   independent Salsa20 stream) used to detect out-of-bounds writes. */
static void canary(unsigned char *x,unsigned long long xlen)
{
  const static unsigned char canary_k[33] = "generate pad to catch overwrites";
  static unsigned char canary_n[8];
  salsa20(x,xlen,canary_n,canary_k);
  increment(canary_n);
}
|
||||
|
||||
/* Refresh the 16-byte guard zones immediately before and after the
   xlen-byte region at x, and mirror them into the shadow copy x2 so a
   later memcmp can detect out-of-bounds writes. */
void double_canary(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
  unsigned char *before = x - 16;
  unsigned char *after  = x + xlen;
  canary(before,16);
  canary(after,16);
  memcpy(x2 - 16,before,16);
  memcpy(x2 + xlen,after,16);
}
|
||||
|
||||
/* Fill the xlen-byte input region at x with deterministic test data,
   surround it with 16-byte canaries, and snapshot canaries+data into
   the shadow buffer x2 for later comparison. */
void input_prepare(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
  testvector(x,xlen);
  canary(x - 16,16);
  canary(x + xlen,16);
  /* one copy covers leading canary, data, and trailing canary */
  memcpy(x2 - 16,x - 16,xlen + 32);
}
|
||||
|
||||
/* Verify that the function named fun did not modify its input region
   (including both canaries); abort the test run with exit(111) if the
   region differs from the shadow copy. */
void input_compare(const unsigned char *x2,const unsigned char *x,unsigned long long xlen,const char *fun)
{
  if (memcmp(x2 - 16,x - 16,xlen + 32) == 0) return;
  fprintf(stderr,"%s overwrites input\n",fun);
  exit(111);
}
|
||||
|
||||
/* Fill the output region at x AND its 16-byte guard zones with canary
   bytes, then snapshot the whole span into the shadow buffer x2. */
void output_prepare(unsigned char *x2,unsigned char *x,unsigned long long xlen)
{
  unsigned char *start = x - 16;
  canary(start,xlen + 32);
  memcpy(x2 - 16,start,xlen + 32);
}
|
||||
|
||||
/* Verify that the function named fun wrote only inside its output
   region: the 16-byte canaries before and after must match the shadow
   copy. Aborts with exit(111) on a violation. */
void output_compare(const unsigned char *x2,const unsigned char *x,unsigned long long xlen,const char *fun)
{
  int before_bad = memcmp(x2 - 16,x - 16,16) != 0;
  int after_bad  = memcmp(x2 + xlen,x + xlen,16) != 0;
  if (before_bad) {
    fprintf(stderr,"%s writes before output\n",fun);
    exit(111);
  }
  if (after_bad) {
    fprintf(stderr,"%s writes after output\n",fun);
    exit(111);
  }
}
|
||||
|
||||
static unsigned char checksum_state[64];
|
||||
static char checksum_hex[65];
|
||||
|
||||
/* Fold xlen bytes of x into the global checksum_state by repeatedly
   running the Salsa20 core with the data as input and the state as key.
   The final partial block is 1-padded and domain-separated by flipping
   a state bit, so different message lengths give different checksums. */
void checksum(const unsigned char *x,unsigned long long xlen)
{
  u8 block[16];
  int i;
  while (xlen >= 16) {
    core(checksum_state,x,checksum_state);
    x += 16;
    xlen -= 16;
  }
  /* pad the trailing partial block: data, then a single 1 byte */
  FOR(i,16) block[i] = 0;
  FOR(i,xlen) block[i] = x[i];
  block[xlen] = 1;
  checksum_state[0] ^= 1;
  core(checksum_state,block,checksum_state);
}
|
||||
|
||||
/* Print s followed by a space, replacing whitespace characters with
   underscores so each field stays a single shell-parseable word.
   An empty string prints as "-". */
static void printword(const char *s)
{
  if (!*s) putchar('-');
  for (;*s;++s) {
    switch (*s) {
      case ' ':
      case '\t':
      case '\r':
      case '\n':
        putchar('_');
        break;
      default:
        putchar(*s);
        break;
    }
  }
  putchar(' ');
}
|
||||
|
||||
/* Print a number followed by a space (one output field). */
static void printnum(long long x)
{
  printf("%lld ",x);
}
|
||||
|
||||
/* Report a fatal test failure and exit with the conventional
   SUPERCOP error status 111. */
void fail(const char *why)
{
  fprintf(stderr,"%s\n",why);
  exit(111);
}
|
||||
|
||||
unsigned char *alignedcalloc(unsigned long long len)
|
||||
{
|
||||
unsigned char *x = (unsigned char *) calloc(1,len + 256);
|
||||
long long i;
|
||||
if (!x) fail("out of memory");
|
||||
/* will never deallocate so shifting is ok */
|
||||
for (i = 0;i < len + 256;++i) x[i] = random();
|
||||
x += 64;
|
||||
x += 63 & (-(unsigned long) x);
|
||||
for (i = 0;i < len;++i) x[i] = 0;
|
||||
return x;
|
||||
}
|
||||
|
||||
#define TIMINGS 63
|
||||
static long long cycles[TIMINGS + 1];
|
||||
|
||||
/* Drop resource limits to zero (no new files, no new processes, no core
   dumps) so that the code under test cannot accidentally touch the
   system; each limit is applied only where the platform defines it. */
void limits()
{
#ifdef RLIM_INFINITY
  struct rlimit r;
  r.rlim_cur = 0;
  r.rlim_max = 0;
#ifdef RLIMIT_NOFILE
  setrlimit(RLIMIT_NOFILE,&r);
#endif
#ifdef RLIMIT_NPROC
  setrlimit(RLIMIT_NPROC,&r);
#endif
#ifdef RLIMIT_CORE
  setrlimit(RLIMIT_CORE,&r);
#endif
#endif
}
|
||||
|
||||
static unsigned char randombyte[1];
|
||||
|
||||
/* Test-and-benchmark driver: run the correctness tests (checksummed),
   then time doit() TIMINGS times and report the median cycle count,
   the checksum, and the clock frequency as one output line. */
int main()
{
  long long i;
  long long j;
  long long abovej;
  long long belowj;
  long long checksumcycles;
  long long cyclespersecond;

  /* warm up the cycle counter before anything else */
  cycles[0] = cpucycles();
  cycles[1] = cpucycles();
  cyclespersecond = cpucycles_persecond();

  kernelrandombytes(randombyte,1);
  preallocate();
  limits();   /* lock down rlimits before running untrusted-ish code */

  allocate();
  srandom(getpid());

  /* run the full correctness test, timing it as a whole */
  cycles[0] = cpucycles();
  test();
  cycles[1] = cpucycles();
  checksumcycles = cycles[1] - cycles[0];

  predoit();
  /* first pass warms caches; only the second pass is measured */
  for (i = 0;i <= TIMINGS;++i) {
    cycles[i] = cpucycles();
  }
  for (i = 0;i <= TIMINGS;++i) {
    cycles[i] = cpucycles();
    doit();
  }
  /* convert absolute timestamps to per-iteration deltas */
  for (i = 0;i < TIMINGS;++i) cycles[i] = cycles[i + 1] - cycles[i];
  /* select the median delta: the j with as many entries below as above */
  for (j = 0;j < TIMINGS;++j) {
    belowj = 0;
    for (i = 0;i < TIMINGS;++i) if (cycles[i] < cycles[j]) ++belowj;
    abovej = 0;
    for (i = 0;i < TIMINGS;++i) if (cycles[i] > cycles[j]) ++abovej;
    if (belowj * 2 < TIMINGS && abovej * 2 < TIMINGS) break;
  }

  /* hex-encode the first 32 bytes of the checksum state */
  for (i = 0;i < 32;++i) {
    checksum_hex[2 * i] = "0123456789abcdef"[15 & (checksum_state[i] >> 4)];
    checksum_hex[2 * i + 1] = "0123456789abcdef"[15 & checksum_state[i]];
  }
  checksum_hex[2 * i] = 0;

  printword(checksum_hex);
  printnum(cycles[j]);
  printnum(checksumcycles);
  printnum(cyclespersecond);
  printword(primitiveimplementation);
  printf("\n");
  return 0;
}
|
242
aeadaes256ocbtaglen128v1-rv32/try.c
Normal file
242
aeadaes256ocbtaglen128v1-rv32/try.c
Normal file
|
@ -0,0 +1,242 @@
|
|||
/*
|
||||
* crypto_aead/try.c version 20200406
|
||||
* D. J. Bernstein
|
||||
* Public domain.
|
||||
* Auto-generated by trygen.py; do not edit.
|
||||
*/
|
||||
|
||||
#include "crypto_aead.h"
|
||||
#include "try.h"
|
||||
|
||||
const char *primitiveimplementation = crypto_aead_IMPLEMENTATION;
|
||||
|
||||
#define TUNE_BYTES 1536
|
||||
#ifdef SMALL
|
||||
#define MAXTEST_BYTES 128
|
||||
#else
|
||||
#define MAXTEST_BYTES 4096
|
||||
#endif
|
||||
#ifdef SMALL
|
||||
#define LOOPS 64
|
||||
#else
|
||||
#define LOOPS 512
|
||||
#endif
|
||||
|
||||
static unsigned char *k;
|
||||
static unsigned char *s;
|
||||
static unsigned char *p;
|
||||
static unsigned char *a;
|
||||
static unsigned char *m;
|
||||
static unsigned char *c;
|
||||
static unsigned char *t;
|
||||
static unsigned char *r;
|
||||
static unsigned char *k2;
|
||||
static unsigned char *s2;
|
||||
static unsigned char *p2;
|
||||
static unsigned char *a2;
|
||||
static unsigned char *m2;
|
||||
static unsigned char *c2;
|
||||
static unsigned char *t2;
|
||||
static unsigned char *r2;
|
||||
#define klen crypto_aead_KEYBYTES
|
||||
#define slen crypto_aead_NSECBYTES
|
||||
#define plen crypto_aead_NPUBBYTES
|
||||
unsigned long long alen;
|
||||
unsigned long long mlen;
|
||||
unsigned long long clen;
|
||||
unsigned long long tlen;
|
||||
#define rlen crypto_aead_NSECBYTES
|
||||
|
||||
/* Hook called before limits() are applied; nothing to do here. */
void preallocate(void)
{
}
|
||||
|
||||
/* Allocate every test buffer (and its shadow copy) at a single size
   large enough for any role: tuning data, max-size message plus tag,
   key, secret number, or nonce. */
void allocate(void)
{
  unsigned long long alloclen = 0;
  if (alloclen < TUNE_BYTES) alloclen = TUNE_BYTES;
  if (alloclen < MAXTEST_BYTES + crypto_aead_ABYTES) alloclen = MAXTEST_BYTES + crypto_aead_ABYTES;
  if (alloclen < crypto_aead_KEYBYTES) alloclen = crypto_aead_KEYBYTES;
  if (alloclen < crypto_aead_NSECBYTES) alloclen = crypto_aead_NSECBYTES;
  if (alloclen < crypto_aead_NPUBBYTES) alloclen = crypto_aead_NPUBBYTES;
  if (alloclen < crypto_aead_NSECBYTES) alloclen = crypto_aead_NSECBYTES;
  k = alignedcalloc(alloclen);
  s = alignedcalloc(alloclen);
  p = alignedcalloc(alloclen);
  a = alignedcalloc(alloclen);
  m = alignedcalloc(alloclen);
  c = alignedcalloc(alloclen);
  t = alignedcalloc(alloclen);
  r = alignedcalloc(alloclen);
  k2 = alignedcalloc(alloclen);
  s2 = alignedcalloc(alloclen);
  p2 = alignedcalloc(alloclen);
  a2 = alignedcalloc(alloclen);
  m2 = alignedcalloc(alloclen);
  c2 = alignedcalloc(alloclen);
  t2 = alignedcalloc(alloclen);
  r2 = alignedcalloc(alloclen);
}
|
||||
|
||||
/* Hook called once before the timing loop; nothing to do here. */
void predoit(void)
{
}
|
||||
|
||||
/* One timed benchmark iteration: encrypt then decrypt TUNE_BYTES of
   message and associated data. */
void doit(void)
{
  crypto_aead_encrypt(c,&clen,m,TUNE_BYTES,a,TUNE_BYTES,s,p,k);
  crypto_aead_decrypt(t,&tlen,r,c,clen,a,TUNE_BYTES,p,k);
}
|
||||
|
||||
/* Auto-generated (trygen.py) correctness suite: for LOOPS random
   message/AD sizes, check encrypt/decrypt round-trips, determinism,
   every documented in-place-overlap combination, buffer-canary
   integrity, and rejection of corrupted ciphertexts. */
void test(void)
{
  unsigned long long loop;

  for (loop = 0;loop < LOOPS;++loop) {
    mlen = myrandom() % (MAXTEST_BYTES + 1);
    alen = myrandom() % (MAXTEST_BYTES + 1);

    /* --- basic encrypt: result bounds, checksum, no stray writes --- */
    clen = mlen + crypto_aead_ABYTES;
    output_prepare(c2,c,clen);
    input_prepare(m2,m,mlen);
    input_prepare(a2,a,alen);
    input_prepare(s2,s,slen);
    input_prepare(p2,p,plen);
    input_prepare(k2,k,klen);
    if (crypto_aead_encrypt(c,&clen,m,mlen,a,alen,s,p,k) != 0) fail("crypto_aead_encrypt returns nonzero");
    if (clen < mlen) fail("crypto_aead_encrypt returns smaller output than input");
    if (clen > mlen + crypto_aead_ABYTES) fail("crypto_aead_encrypt returns more than crypto_aead_ABYTES extra bytes");
    checksum(c,clen);
    output_compare(c2,c,clen,"crypto_aead_encrypt");
    input_compare(m2,m,mlen,"crypto_aead_encrypt");
    input_compare(a2,a,alen,"crypto_aead_encrypt");
    input_compare(s2,s,slen,"crypto_aead_encrypt");
    input_compare(p2,p,plen,"crypto_aead_encrypt");
    input_compare(k2,k,klen,"crypto_aead_encrypt");

    /* --- encrypt determinism --- */
    double_canary(c2,c,clen);
    double_canary(m2,m,mlen);
    double_canary(a2,a,alen);
    double_canary(s2,s,slen);
    double_canary(p2,p,plen);
    double_canary(k2,k,klen);
    if (crypto_aead_encrypt(c2,&clen,m2,mlen,a2,alen,s2,p2,k2) != 0) fail("crypto_aead_encrypt returns nonzero");
    if (memcmp(c2,c,clen) != 0) fail("crypto_aead_encrypt is nondeterministic");

    /* --- encrypt with each input aliasing the output --- */
#if crypto_aead_NOOVERLAP == 1
#else
    double_canary(c2,c,clen);
    double_canary(m2,m,mlen);
    double_canary(a2,a,alen);
    double_canary(s2,s,slen);
    double_canary(p2,p,plen);
    double_canary(k2,k,klen);
    if (crypto_aead_encrypt(m2,&clen,m2,mlen,a,alen,s,p,k) != 0) fail("crypto_aead_encrypt with m=c overlap returns nonzero");
    if (memcmp(m2,c,clen) != 0) fail("crypto_aead_encrypt does not handle m=c overlap");
    memcpy(m2,m,mlen);
    if (crypto_aead_encrypt(a2,&clen,m,mlen,a2,alen,s,p,k) != 0) fail("crypto_aead_encrypt with a=c overlap returns nonzero");
    if (memcmp(a2,c,clen) != 0) fail("crypto_aead_encrypt does not handle a=c overlap");
    memcpy(a2,a,alen);
    if (crypto_aead_encrypt(s2,&clen,m,mlen,a,alen,s2,p,k) != 0) fail("crypto_aead_encrypt with s=c overlap returns nonzero");
    if (memcmp(s2,c,clen) != 0) fail("crypto_aead_encrypt does not handle s=c overlap");
    memcpy(s2,s,slen);
    if (crypto_aead_encrypt(p2,&clen,m,mlen,a,alen,s,p2,k) != 0) fail("crypto_aead_encrypt with p=c overlap returns nonzero");
    if (memcmp(p2,c,clen) != 0) fail("crypto_aead_encrypt does not handle p=c overlap");
    memcpy(p2,p,plen);
    if (crypto_aead_encrypt(k2,&clen,m,mlen,a,alen,s,p,k2) != 0) fail("crypto_aead_encrypt with k=c overlap returns nonzero");
    if (memcmp(k2,c,clen) != 0) fail("crypto_aead_encrypt does not handle k=c overlap");
    memcpy(k2,k,klen);
#endif

    /* --- basic decrypt: round-trip, checksum, no stray writes --- */
    tlen = clen;
    output_prepare(t2,t,tlen);
    output_prepare(r2,r,rlen);
    memcpy(c2,c,clen);
    double_canary(c2,c,clen);
    memcpy(a2,a,alen);
    double_canary(a2,a,alen);
    memcpy(p2,p,plen);
    double_canary(p2,p,plen);
    memcpy(k2,k,klen);
    double_canary(k2,k,klen);
    if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt returns nonzero");
    if (tlen != mlen) fail("crypto_aead_decrypt does not match mlen");
    if (memcmp(t,m,mlen) != 0) fail("crypto_aead_decrypt does not match m");
    if (memcmp(r,s,slen) != 0) fail("crypto_aead_decrypt does not match s");
    checksum(t,tlen);
    checksum(r,rlen);
    /* canaries for t were placed at clen during output_prepare */
    output_compare(t2,t,clen,"crypto_aead_decrypt");
    output_compare(r2,r,rlen,"crypto_aead_decrypt");
    input_compare(c2,c,clen,"crypto_aead_decrypt");
    input_compare(a2,a,alen,"crypto_aead_decrypt");
    input_compare(p2,p,plen,"crypto_aead_decrypt");
    input_compare(k2,k,klen,"crypto_aead_decrypt");

    /* --- decrypt determinism --- */
    double_canary(t2,t,tlen);
    double_canary(r2,r,rlen);
    double_canary(c2,c,clen);
    double_canary(a2,a,alen);
    double_canary(p2,p,plen);
    double_canary(k2,k,klen);
    if (crypto_aead_decrypt(t2,&tlen,r2,c2,clen,a2,alen,p2,k2) != 0) fail("crypto_aead_decrypt returns nonzero");
    if (memcmp(t2,t,tlen) != 0) fail("crypto_aead_decrypt is nondeterministic");
    if (memcmp(r2,r,rlen) != 0) fail("crypto_aead_decrypt is nondeterministic");

    /* --- decrypt with each input aliasing the plaintext output --- */
#if crypto_aead_NOOVERLAP == 1
#else
    double_canary(t2,t,tlen);
    double_canary(r2,r,rlen);
    double_canary(c2,c,clen);
    double_canary(a2,a,alen);
    double_canary(p2,p,plen);
    double_canary(k2,k,klen);
    if (crypto_aead_decrypt(c2,&tlen,r,c2,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt with c=t overlap returns nonzero");
    if (memcmp(c2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle c=t overlap");
    memcpy(c2,c,clen);
    if (crypto_aead_decrypt(a2,&tlen,r,c,clen,a2,alen,p,k) != 0) fail("crypto_aead_decrypt with a=t overlap returns nonzero");
    if (memcmp(a2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle a=t overlap");
    memcpy(a2,a,alen);
    if (crypto_aead_decrypt(p2,&tlen,r,c,clen,a,alen,p2,k) != 0) fail("crypto_aead_decrypt with p=t overlap returns nonzero");
    if (memcmp(p2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle p=t overlap");
    memcpy(p2,p,plen);
    if (crypto_aead_decrypt(k2,&tlen,r,c,clen,a,alen,p,k2) != 0) fail("crypto_aead_decrypt with k=t overlap returns nonzero");
    if (memcmp(k2,t,tlen) != 0) fail("crypto_aead_decrypt does not handle k=t overlap");
    memcpy(k2,k,klen);
#endif

    /* --- decrypt with each input aliasing the secret-number output --- */
#if crypto_aead_NOOVERLAP == 1
#else
    double_canary(t2,t,tlen);
    double_canary(r2,r,rlen);
    double_canary(c2,c,clen);
    double_canary(a2,a,alen);
    double_canary(p2,p,plen);
    double_canary(k2,k,klen);
    if (crypto_aead_decrypt(t,&tlen,c2,c2,clen,a,alen,p,k) != 0) fail("crypto_aead_decrypt with c=r overlap returns nonzero");
    if (memcmp(c2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle c=r overlap");
    memcpy(c2,c,clen);
    if (crypto_aead_decrypt(t,&tlen,a2,c,clen,a2,alen,p,k) != 0) fail("crypto_aead_decrypt with a=r overlap returns nonzero");
    if (memcmp(a2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle a=r overlap");
    memcpy(a2,a,alen);
    if (crypto_aead_decrypt(t,&tlen,p2,c,clen,a,alen,p2,k) != 0) fail("crypto_aead_decrypt with p=r overlap returns nonzero");
    if (memcmp(p2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle p=r overlap");
    memcpy(p2,p,plen);
    if (crypto_aead_decrypt(t,&tlen,k2,c,clen,a,alen,p,k2) != 0) fail("crypto_aead_decrypt with k=r overlap returns nonzero");
    if (memcmp(k2,r,rlen) != 0) fail("crypto_aead_decrypt does not handle k=r overlap");
    memcpy(k2,k,klen);
#endif

    /* --- forgery resistance: flip random bytes three times; each
       corrupted ciphertext must either be rejected or (by astronomical
       luck) still decrypt to exactly the original message --- */
    c[myrandom() % clen] += 1 + (myrandom() % 255);
    if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0)
      if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0))
        fail("crypto_aead_decrypt allows trivial forgeries");
    c[myrandom() % clen] += 1 + (myrandom() % 255);
    if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0)
      if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0))
        fail("crypto_aead_decrypt allows trivial forgeries");
    c[myrandom() % clen] += 1 + (myrandom() % 255);
    if (crypto_aead_decrypt(t,&tlen,r,c,clen,a,alen,p,k) == 0)
      if ((tlen != mlen) || (memcmp(t,m,mlen) != 0) || (memcmp(r,s,slen) != 0))
        fail("crypto_aead_decrypt allows trivial forgeries");
  }
}
|
21
aeadaes256ocbtaglen128v1-rv32/try.h
Normal file
21
aeadaes256ocbtaglen128v1-rv32/try.h
Normal file
|
@ -0,0 +1,21 @@
|
|||
#include <stdlib.h>
#include <string.h>

/* Shared interface between the generated test body (try.c) and the
   generic test harness (try-anything.c). */

/* provided by try.c: */
extern const char *primitiveimplementation;  /* implementation name for the report line */
extern void preallocate(void);  /* runs before rlimits are lowered */
extern void allocate(void);     /* allocate all test buffers */
extern void test(void);         /* correctness tests (feed results to checksum()) */
extern void predoit(void);      /* runs once before the timing loop */
extern void doit(void);         /* one timed benchmark iteration */

/* provided by try-anything.c: */
extern void fail(const char *);
extern unsigned char *alignedcalloc(unsigned long long);
extern void checksum(const unsigned char *,unsigned long long);
extern void double_canary(unsigned char *,unsigned char *,unsigned long long);
extern void input_prepare(unsigned char *,unsigned char *,unsigned long long);
extern void output_prepare(unsigned char *,unsigned char *,unsigned long long);
extern void input_compare(const unsigned char *,const unsigned char *,unsigned long long,const char *);
extern void output_compare(const unsigned char *,const unsigned char *,unsigned long long,const char *);
extern unsigned long long myrandom(void);
|
24
aeadaes256ocbtaglen128v1-rv32/verify.c
Normal file
24
aeadaes256ocbtaglen128v1-rv32/verify.c
Normal file
|
@ -0,0 +1,24 @@
|
|||
#include "crypto_verify.h"
|
||||
|
||||
/* Constant-time comparison of 16 bytes.
   Returns 0 when x and y are equal, -1 otherwise; the amount of work
   done never depends on where (or whether) the inputs differ. */
int crypto_verify(const unsigned char *x,const unsigned char *y)
{
  unsigned int diff = 0;
  int i;
  for (i = 0;i < 16;++i)
    diff |= x[i] ^ y[i];
  /* diff == 0  -> (diff-1)>>8 has bit 0 set -> returns 0
     diff != 0  -> bit 0 clear              -> returns -1 */
  return (1 & ((diff - 1) >> 8)) - 1;
}
|
|
@ -1,17 +0,0 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/types.h>
|
||||
#include "osfreq.c"
|
||||
|
||||
/* Read the CPU cycle counter.
   Bug fix: the original inline asm was x86 RDTSC (".byte 15;.byte 49"
   = 0F 31, plus %rdx/%rax shuffling) inside a function named and used
   as the RISC-V backend — it can never assemble for RISC-V.  Use the
   rdcycle CSR instead; on RV32 the 64-bit count must be assembled from
   rdcycleh/rdcycle with a re-read to guard against carry. */
long long cpucycles_riscv(void)
{
#if defined(__riscv)
#if __riscv_xlen == 32
  unsigned int lo, hi, hi2;
  do {
    asm volatile ("rdcycleh %0" : "=r" (hi));
    asm volatile ("rdcycle %0"  : "=r" (lo));
    asm volatile ("rdcycleh %0" : "=r" (hi2));
  } while (hi != hi2); /* high word rolled over mid-read: retry */
  return (long long)(((unsigned long long)hi << 32) | lo);
#else
  unsigned long long result;
  asm volatile ("rdcycle %0" : "=r" (result));
  return (long long)result;
#endif
#else
  /* Not a RISC-V build: no cycle counter available here. */
  return 0;
#endif
}
|
||||
|
||||
/* Cycles per second, estimated from the OS-reported CPU frequency. */
long long cpucycles_riscv_persecond(void)
{
  double freq = osfreq();
  return (long long) freq;
}
|
|
@ -1,93 +0,0 @@
|
|||
/* Read a single "%lf" value from a file and return it times scale;
   returns 0 on any failure (missing file, parse error). */
static double osfreq_from_file(const char *path, double scale)
{
  FILE *f;
  double value;
  int s;

  f = fopen(path, "r");
  if (!f) return 0;
  s = fscanf(f, "%lf", &value);
  fclose(f);
  return (s > 0) ? scale * value : 0;
}

/* Scan a stream line by line for fmt (which must contain one %lf);
   returns the matched value times scale, or 0 if never matched. */
static double osfreq_scan(FILE *f, const char *fmt, double scale)
{
  double value = 0;
  int s;

  for (;;) {
    s = fscanf(f, fmt, &value);
    if (s > 0) break;
    if (s == 0) s = fscanf(f, "%*[^\n]\n");
    if (s < 0) { value = 0; break; }
  }
  return scale * value;
}

/* Best-effort detection of the CPU clock in cycles per second.
   Probes, in order: a local override file, Linux sysfs cpufreq
   entries (kHz), Solaris clock_tick, /proc/cpuinfo (x86 "cpu MHz"
   then PowerPC "clock"), then platform utilities (macOS sysctl,
   AIX lsattr, Solaris psrinfo), and finally the cpucyclespersecond
   environment variable.  Returns 0 if every source fails.
   Note: a source that reports 0 is now treated as a failure and the
   search continues (the original returned the 0 immediately). */
static double osfreq(void)
{
  FILE *f;
  const char *x;
  double result;
  int s;

  result = osfreq_from_file("/etc/cpucyclespersecond", 1.0);
  if (result) return result;

  result = osfreq_from_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed", 1000.0);
  if (result) return result;

  result = osfreq_from_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq", 1000.0);
  if (result) return result;

  result = osfreq_from_file("/sys/devices/system/cpu/cpu0/clock_tick", 1.0);
  if (result) return result;

  f = fopen("/proc/cpuinfo","r");
  if (f) {
    result = osfreq_scan(f, "cpu MHz : %lf", 1000000.0);
    fclose(f);
    if (result) return result;
  }

  f = fopen("/proc/cpuinfo","r");
  if (f) {
    result = osfreq_scan(f, "clock : %lf", 1000000.0);
    fclose(f);
    if (result) return result;
  }

  f = popen("sysctl hw.cpufrequency 2>/dev/null","r");
  if (f) {
    s = fscanf(f,"hw.cpufrequency: %lf",&result);
    pclose(f);
    if (s > 0 && result > 0) return result;
  }

  f = popen("/usr/sbin/lsattr -E -l proc0 -a frequency 2>/dev/null","r");
  if (f) {
    s = fscanf(f,"frequency %lf",&result);
    pclose(f);
    if (s > 0) return result;
  }

  f = popen("/usr/sbin/psrinfo -v 2>/dev/null","r");
  if (f) {
    result = osfreq_scan(f, " The %*s processor operates at %lf MHz", 1000000.0);
    pclose(f);
    if (result) return result;
  }

  x = getenv("cpucyclespersecond");
  if (x) {
    s = sscanf(x,"%lf",&result);
    if (s > 0) return result;
  }

  return 0;
}
|
|
@ -249,190 +249,9 @@ static inline int64_t _rv64_clmulh(int64_t rs1, int64_t rs2)
|
|||
|
||||
/* this is basically Supercop's crypto_aead/aes256gcmv1/dolbeau/aesenc-int,
|
||||
but without the unrolling.
|
||||
So we have a thin compatibility layer to SSE's __m128i data format
|
||||
and associated instructions to support GHASH & the full algo.
|
||||
*/
|
||||
|
||||
/* ouch */
|
||||
/* Poor man's __m128i: two 64-bit halves.  On load/store, l is the
   first 64-bit word in memory and h the second. */
typedef struct {
  uint64_t l; /* low half */
  uint64_t h; /* high half */
} __m128i;
|
||||
|
||||
//#define _mm_loadu_si128(a) (*(const __m128i*)a)
|
||||
/* Unaligned 128-bit load: two consecutive 64-bit words, low first. */
static inline __m128i _mm_loadu_si128(const __m128i *ptr) {
  const uint64_t *words = (const uint64_t *)ptr;
  __m128i out = { words[0], words[1] };
  return out;
}
|
||||
|
||||
//#define _mm_storeu_si128(x,a) (*(__m128i*)x)=a
|
||||
/* Unaligned 128-bit store: low word first, then high. */
static inline void _mm_storeu_si128(__m128i *ptr, const __m128i data) {
  uint64_t *words = (uint64_t *)ptr;
  words[0] = data.l;
  words[1] = data.h;
}
|
||||
|
||||
static inline __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i b, const int x) {
|
||||
__m128i r;
|
||||
switch (x) {
|
||||
case 0x00:
|
||||
r.l = _rv64_clmul(a.l, b.l);
|
||||
r.h = _rv64_clmulh(a.l, b.l);
|
||||
break;
|
||||
case 0x01:
|
||||
r.l = _rv64_clmul(a.l, b.h);
|
||||
r.h = _rv64_clmulh(a.l, b.h);
|
||||
break;
|
||||
case 0x10:
|
||||
r.l = _rv64_clmul(a.h, b.l);
|
||||
r.h = _rv64_clmulh(a.h, b.l);
|
||||
break;
|
||||
case 0x11:
|
||||
r.l = _rv64_clmul(a.h, b.h);
|
||||
r.h = _rv64_clmulh(a.h, b.h);
|
||||
break;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
|
||||
static inline __m128i (const __m128i a, const __m128i b) {
|
||||
__m128i r;
|
||||
return r;
|
||||
}
|
||||
*/
|
||||
static inline __m128i _mm_xor_si128(const __m128i a, const __m128i b) {
|
||||
__m128i r;
|
||||
r.l = a.l ^ b.l;
|
||||
r.h = a.h ^ b.h;
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_or_si128(const __m128i a, const __m128i b) {
|
||||
__m128i r;
|
||||
r.l = a.l | b.l;
|
||||
r.h = a.h | b.h;
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_and_si128(const __m128i a, const __m128i b) {
|
||||
__m128i r;
|
||||
r.l = a.l & b.l;
|
||||
r.h = a.h & b.h;
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_slli_si128(const __m128i a, const int b) {
|
||||
__m128i r;
|
||||
switch (b) {
|
||||
case 4:
|
||||
r.l = a.l << 32;
|
||||
r.h = a.h << 32 | a.l >> 32;
|
||||
break;
|
||||
case 8:
|
||||
r.l = 0;
|
||||
r.h = a.l;
|
||||
break;
|
||||
case 12:
|
||||
r.l = 0;
|
||||
r.h = a.l << 32;
|
||||
break;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_srli_si128(const __m128i a, const int b) {
|
||||
__m128i r;
|
||||
switch (b) {
|
||||
case 4:
|
||||
r.l = a.l >> 32 | a.h << 32;
|
||||
r.h = a.h >> 32;
|
||||
break;
|
||||
case 8:
|
||||
r.l = a.h;
|
||||
r.h = 0;
|
||||
break;
|
||||
case 12:
|
||||
r.l = a.h >> 32;
|
||||
r.h = 0;
|
||||
break;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_srli_epi32(const __m128i a, const int b) {
|
||||
__m128i r;
|
||||
r.l = ((a.l & 0x00000000FFFFFFFFull) >> b) | (((a.l & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull);
|
||||
r.h = ((a.h & 0x00000000FFFFFFFFull) >> b) | (((a.h & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull);
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_slli_epi32(const __m128i a, const int b) {
|
||||
__m128i r;
|
||||
r.l = (((a.l & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.l & 0xFFFFFFFF00000000ull) << b);
|
||||
r.h = (((a.h & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.h & 0xFFFFFFFF00000000ull) << b);
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_insert_epi64(const __m128i a, const uint64_t x, const int b) {
|
||||
__m128i r;
|
||||
if (b == 0) {
|
||||
r.l = x;
|
||||
r.h = a.h;
|
||||
} else {
|
||||
r.l = a.l;
|
||||
r.h = x;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
/* All-zero 128-bit value. */
static inline __m128i _mm_setzero_si128(void) {
  __m128i zero = { 0, 0 };
  return zero;
}
|
||||
/* Broadcast a 32-bit value into all four lanes. */
static inline __m128i _mm_set1_epi32(const uint32_t x) {
  const uint64_t pair = (((uint64_t)x) << 32) | x;
  __m128i out = { pair, pair };
  return out;
}
|
||||
|
||||
/* Full 8-byte reversal of a 64-bit value: the halves are exchanged
   and GREV with shamt 24 byte-swaps each 32-bit half. */
static inline uint64_t bytereverse64(const uint64_t a) {
  const uint64_t lo = (uint32_t)_rv32_grev((a >> 32), 24);
  const uint64_t hi = ((uint64_t)_rv32_grev((a & 0xFFFFFFFF), 24)) << 32;
  return lo | hi;
}
|
||||
static inline __m128i bytereverse128(const __m128i a) {
|
||||
__m128i r;
|
||||
r.l = bytereverse64(a.h);
|
||||
r.h = bytereverse64(a.l);
|
||||
return r;
|
||||
}
|
||||
|
||||
/* GREV shamt 7 on each 32-bit half: reverses the bits within each
   byte; byte and half positions are unchanged. */
static inline uint64_t bitreverse64(const uint64_t a) {
  const uint64_t lo = (uint32_t)_rv32_grev((a & 0xFFFFFFFF), 7);
  const uint64_t hi = ((uint64_t)_rv32_grev((a >> 32), 7)) << 32;
  return lo | hi;
}
|
||||
static inline __m128i bitreverse128(const __m128i a) {
|
||||
__m128i r;
|
||||
r.l = bitreverse64(a.l);
|
||||
r.h = bitreverse64(a.h);
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Swap the two 32-bit words of a 64-bit value (rotate by 32). */
static inline uint64_t wordreverse64(const uint64_t a) {
  return (a >> 32) | (a << 32);
}
|
||||
static inline __m128i wordreverse128(const __m128i a) {
|
||||
__m128i r;
|
||||
r.l = wordreverse64(a.h);
|
||||
r.h = wordreverse64(a.l);
|
||||
return r;
|
||||
}
|
||||
static inline __m128i doublewordreverse128(const __m128i a) {
|
||||
__m128i r;
|
||||
r.l = a.h;
|
||||
r.h = a.l;
|
||||
return r;
|
||||
}
|
||||
#include "m128_compat.h"
|
||||
|
||||
static inline void addmul_rv(unsigned char *c,
|
||||
const unsigned char *a, int xlen,
|
||||
|
|
241
m128_compat.h
Normal file
241
m128_compat.h
Normal file
|
@ -0,0 +1,241 @@
|
|||
/*
|
||||
* A thin compatibility layer to SSE's __m128i data format
|
||||
* and associated instructions to support GHASH & the full algo.
|
||||
*/
|
||||
|
||||
#ifndef __M128_COMPAT_H__
|
||||
#define __M128_COMPAT_H__
|
||||
|
||||
#include "new_instructions_support_b.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
/* ouch */
|
||||
/* Poor man's __m128i: two 64-bit halves.  On load/store, l is the
   first 64-bit word in memory and h the second. */
typedef struct {
  uint64_t l; /* low half */
  uint64_t h; /* high half */
} __m128i;
|
||||
|
||||
//#define _mm_loadu_si128(a) (*(const __m128i*)a)
|
||||
/* Unaligned 128-bit load: two consecutive 64-bit words, low first. */
static inline __m128i _mm_loadu_si128(const __m128i *ptr) {
  const uint64_t *words = (const uint64_t *)ptr;
  __m128i out = { words[0], words[1] };
  return out;
}
|
||||
|
||||
//#define _mm_storeu_si128(x,a) (*(__m128i*)x)=a
|
||||
/* Unaligned 128-bit store: low word first, then high. */
static inline void _mm_storeu_si128(__m128i *ptr, const __m128i data) {
  uint64_t *words = (uint64_t *)ptr;
  words[0] = data.l;
  words[1] = data.h;
}
|
||||
/* Aligned 128-bit store; alignment is irrelevant in this model, so
   it is identical to _mm_storeu_si128. */
static inline void _mm_store_si128(__m128i *ptr, const __m128i data) {
  uint64_t *words = (uint64_t *)ptr;
  words[0] = data.l;
  words[1] = data.h;
}
|
||||
/* Store only the low 64 bits (SSE MOVQ-style store). */
static inline void _mm_storel_epi64 (__m128i *ptr, const __m128i data) {
  *(uint64_t *)ptr = data.l;
}
|
||||
|
||||
static inline __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i b, const int x) {
|
||||
__m128i r;
|
||||
switch (x) {
|
||||
case 0x00:
|
||||
r.l = _rv64_clmul(a.l, b.l);
|
||||
r.h = _rv64_clmulh(a.l, b.l);
|
||||
break;
|
||||
case 0x01:
|
||||
r.l = _rv64_clmul(a.l, b.h);
|
||||
r.h = _rv64_clmulh(a.l, b.h);
|
||||
break;
|
||||
case 0x10:
|
||||
r.l = _rv64_clmul(a.h, b.l);
|
||||
r.h = _rv64_clmulh(a.h, b.l);
|
||||
break;
|
||||
case 0x11:
|
||||
r.l = _rv64_clmul(a.h, b.h);
|
||||
r.h = _rv64_clmulh(a.h, b.h);
|
||||
break;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
|
||||
static inline __m128i (const __m128i a, const __m128i b) {
|
||||
__m128i r;
|
||||
return r;
|
||||
}
|
||||
*/
|
||||
static inline __m128i _mm_xor_si128(const __m128i a, const __m128i b) {
|
||||
__m128i r;
|
||||
r.l = a.l ^ b.l;
|
||||
r.h = a.h ^ b.h;
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_or_si128(const __m128i a, const __m128i b) {
|
||||
__m128i r;
|
||||
r.l = a.l | b.l;
|
||||
r.h = a.h | b.h;
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_and_si128(const __m128i a, const __m128i b) {
|
||||
__m128i r;
|
||||
r.l = a.l & b.l;
|
||||
r.h = a.h & b.h;
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_slli_si128(const __m128i a, const int b) {
|
||||
__m128i r;
|
||||
switch (b) {
|
||||
case 4:
|
||||
r.l = a.l << 32;
|
||||
r.h = a.h << 32 | a.l >> 32;
|
||||
break;
|
||||
case 8:
|
||||
r.l = 0;
|
||||
r.h = a.l;
|
||||
break;
|
||||
case 12:
|
||||
r.l = 0;
|
||||
r.h = a.l << 32;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, b);
|
||||
break;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_srli_si128(const __m128i a, const int b) {
|
||||
__m128i r;
|
||||
switch (b) {
|
||||
case 1:
|
||||
r.l = a.l >> 8 | a.h << 56;
|
||||
r.h = a.h >> 8;
|
||||
break;
|
||||
case 4:
|
||||
r.l = a.l >> 32 | a.h << 32;
|
||||
r.h = a.h >> 32;
|
||||
break;
|
||||
case 8:
|
||||
r.l = a.h;
|
||||
r.h = 0;
|
||||
break;
|
||||
case 12:
|
||||
r.l = a.h >> 32;
|
||||
r.h = 0;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: %d unimplemented\n", __PRETTY_FUNCTION__, b);
|
||||
break;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_srli_epi32(const __m128i a, const int b) {
|
||||
__m128i r;
|
||||
r.l = ((a.l & 0x00000000FFFFFFFFull) >> b) | (((a.l & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull);
|
||||
r.h = ((a.h & 0x00000000FFFFFFFFull) >> b) | (((a.h & 0xFFFFFFFF00000000ull) >> b) & 0xFFFFFFFF00000000ull);
|
||||
return r;
|
||||
}
|
||||
static inline __m128i _mm_slli_epi32(const __m128i a, const int b) {
|
||||
__m128i r;
|
||||
r.l = (((a.l & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.l & 0xFFFFFFFF00000000ull) << b);
|
||||
r.h = (((a.h & 0x00000000FFFFFFFFull) << b) & 0x00000000FFFFFFFFull) | ((a.h & 0xFFFFFFFF00000000ull) << b);
|
||||
return r;
|
||||
}
|
||||
/* static inline __m128i _mm_srai_epi32(const __m128i a, const int b) { */
|
||||
/* __m128i r; */
|
||||
/* r.l = (((int32_t)(a.l & 0x00000000FFFFFFFFull)) >> b) | ((((int32_t)(a.l & 0xFFFFFFFF00000000ull)) >> b) & 0xFFFFFFFF00000000ull); */
|
||||
/* r.h = (((int32_t)(a.h & 0x00000000FFFFFFFFull)) >> b) | ((((int32_t)(a.h & 0xFFFFFFFF00000000ull)) >> b) & 0xFFFFFFFF00000000ull); */
|
||||
/* return r; */
|
||||
/* } */
|
||||
static inline __m128i _mm_insert_epi64(const __m128i a, const uint64_t x, const int b) {
|
||||
__m128i r;
|
||||
if (b == 0) {
|
||||
r.l = x;
|
||||
r.h = a.h;
|
||||
} else {
|
||||
r.l = a.l;
|
||||
r.h = x;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
/* All-zero 128-bit value. */
static inline __m128i _mm_setzero_si128(void) {
  __m128i zero = { 0, 0 };
  return zero;
}
|
||||
/* Broadcast a 32-bit value into all four lanes. */
static inline __m128i _mm_set1_epi32(const uint32_t x) {
  const uint64_t pair = (((uint64_t)x) << 32) | x;
  __m128i out = { pair, pair };
  return out;
}
|
||||
/* Pack four 32-bit words; e3 is the most significant lane, e0 the
   least (same argument order as SSE). */
static inline __m128i _mm_set_epi32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
  __m128i out = { (((uint64_t)e1) << 32) | e0,
                  (((uint64_t)e3) << 32) | e2 };
  return out;
}
|
||||
/* non-intel stuff, used to replace some common use cases */
|
||||
/* Full 8-byte reversal of a 64-bit value: the halves are exchanged
   and GREV with shamt 24 byte-swaps each 32-bit half. */
static inline uint64_t bytereverse64(const uint64_t a) {
  const uint64_t lo = (uint32_t)_rv32_grev((a >> 32), 24);
  const uint64_t hi = ((uint64_t)_rv32_grev((a & 0xFFFFFFFF), 24)) << 32;
  return lo | hi;
}
|
||||
static inline __m128i bytereverse128(const __m128i a) {
|
||||
__m128i r;
|
||||
r.l = bytereverse64(a.h);
|
||||
r.h = bytereverse64(a.l);
|
||||
return r;
|
||||
}
|
||||
|
||||
/* GREV shamt 7 on each 32-bit half: reverses the bits within each
   byte; byte and half positions are unchanged. */
static inline uint64_t bitreverse64(const uint64_t a) {
  const uint64_t lo = (uint32_t)_rv32_grev((a & 0xFFFFFFFF), 7);
  const uint64_t hi = ((uint64_t)_rv32_grev((a >> 32), 7)) << 32;
  return lo | hi;
}
|
||||
static inline __m128i bitreverse128(const __m128i a) {
|
||||
__m128i r;
|
||||
r.l = bitreverse64(a.l);
|
||||
r.h = bitreverse64(a.h);
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Swap the two 32-bit words of a 64-bit value (rotate by 32). */
static inline uint64_t wordreverse64(const uint64_t a) {
  return (a >> 32) | (a << 32);
}
|
||||
static inline __m128i wordreverse128(const __m128i a) {
|
||||
__m128i r;
|
||||
r.l = wordreverse64(a.h);
|
||||
r.h = wordreverse64(a.l);
|
||||
return r;
|
||||
}
|
||||
static inline __m128i doublewordreverse128(const __m128i a) {
|
||||
__m128i r;
|
||||
r.l = a.h;
|
||||
r.h = a.l;
|
||||
return r;
|
||||
}
|
||||
static inline __m128i wordrotate1l128(const __m128i a) {
|
||||
__m128i r;
|
||||
/* i.e. epi32 _MM_SHUFFLE(2,1,0,3) */
|
||||
r.l = (a.h >> 32) | (a.l << 32);
|
||||
r.h = (a.l >> 32) | (a.h << 32);
|
||||
return r;
|
||||
}
|
||||
/* Zero-extend a 16-bit value into the low lane of a 128-bit value. */
static inline __m128i halfwordandzero(const uint16_t a) {
  __m128i out = { a, 0 };
  return out;
}
|
||||
static inline __m128i wordsign128(const __m128i a) {
|
||||
__m128i r;
|
||||
r.l = (a.l & 0x0000000080000000ull ? 0x00000000FFFFFFFFull : 0) | (a.l & 0x8000000000000000ull ? 0xFFFFFFFF00000000ull : 0);
|
||||
r.h = (a.h & 0x0000000080000000ull ? 0x00000000FFFFFFFFull : 0) | (a.h & 0x8000000000000000ull ? 0xFFFFFFFF00000000ull : 0);
|
||||
return r;
|
||||
}
|
||||
#endif // __M128_COMPAT_H__
|
Loading…
Add table
Reference in a new issue