From 29738d20b87fa1a0baa0ffbb0b80755287406774 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau
Date: Sat, 27 Feb 2021 04:44:51 -0500
Subject: [PATCH] Using P opcodes (and double-width/read-rs3-from-rd behavior) to try some custom Chacha-oriented instructions

---
 chacha20standalone-rv32/Makefile | 26 +++++++++++++
 chacha20standalone-rv32/chacha.c | 65 ++++++++++++++++++++++++++++----
 data_Chacha64.txt                | 24 ++++++++++++
 data_xar.txt                     | 11 ++++++
 new_instructions_support.h       | 15 ++++++++
 new_instructions_support_b.h     | 12 ++++++
 6 files changed, 145 insertions(+), 8 deletions(-)
 create mode 100644 data_Chacha64.txt
 create mode 100644 data_xar.txt

diff --git a/chacha20standalone-rv32/Makefile b/chacha20standalone-rv32/Makefile
index d76b63c..7d99cca 100644
--- a/chacha20standalone-rv32/Makefile
+++ b/chacha20standalone-rv32/Makefile
@@ -41,3 +41,29 @@ kernelrandombytes.o: random.cpp
 
 cpucycles.o: riscv.c
 	$(R5IMA_GCC) $< -march=rv32ima -mabi=ilp32 -I. -O1 -c -o $@
+
+# XAR variant: chacha.c compiled with -DENABLE_XAR by $(R5B_GCC), then linked with $(R5IMA_GXX).
+chacha_XAR.S: chacha.c
+	$(R5B_GCC) $(R5B_OPT) -DENABLE_XAR $< -S -o $@
+
+chacha_XAR.o: chacha_XAR.S
+	$(R5B_GCC) $(R5B_OPT) -DENABLE_XAR $< -c -o $@
+
+chacha20_XAR: $(OBJs) chacha_XAR.o try.o $(SCLIBS)
+	$(R5IMA_GXX) $(R5IMA_OPT) $^ -o $@
+
+chacha20_XAR_small: $(OBJs) chacha_XAR.o try_small.o $(SCLIBS)
+	$(R5IMA_GXX) $(R5IMA_OPT) $^ -o $@
+
+# CHACHA variant: same flow with -DENABLE_CHACHA, with -O3 passed in addition to $(R5B_OPT).
+chacha_CHACHA.S: chacha.c
+	$(R5B_GCC) $(R5B_OPT) -O3 -DENABLE_CHACHA $< -S -o $@
+
+chacha_CHACHA.o: chacha_CHACHA.S
+	$(R5B_GCC) $(R5B_OPT) -O3 -DENABLE_CHACHA $< -c -o $@
+
+chacha20_CHACHA: $(OBJs) chacha_CHACHA.o try.o $(SCLIBS)
+	$(R5IMA_GXX) $(R5IMA_OPT) $^ -o $@
+
+chacha20_CHACHA_small: $(OBJs) chacha_CHACHA.o try_small.o $(SCLIBS)
+	$(R5IMA_GXX) $(R5IMA_OPT) $^ -o $@
diff --git a/chacha20standalone-rv32/chacha.c b/chacha20standalone-rv32/chacha.c
index c56e997..bd9e6ec 100644
--- a/chacha20standalone-rv32/chacha.c
+++ b/chacha20standalone-rv32/chacha.c
@@ -21,12 +21,61 @@ Public domain.
 #define PLUS(v,w) (U32V((v) + (w)))
 #define PLUSONE(v) (PLUS((v),1))
 
+#if !defined(ENABLE_XAR) && !defined(ENABLE_CHACHA) /* no custom instruction: keep the plain C quarter-round */
 #define QUARTERROUND(a,b,c,d) \
   x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
   x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \
   x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
   x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);
 
+#define QUARTERROUND1(a,b,c,d) QUARTERROUND(a,b,c,d) /* 1..8 name the eight call sites of the round loop; here they are all the same macro */
+#define QUARTERROUND2(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND3(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND4(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND5(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND6(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND7(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND8(a,b,c,d) QUARTERROUND(a,b,c,d)
+#elif !defined(ENABLE_CHACHA) /* ENABLE_XAR only: each xor-then-rotate pair becomes one __rv__xar(value, amount, value) call */
+#define QUARTERROUND(a,b,c,d) \
+  x[a] = PLUS(x[a],x[b]); x[d] = __rv__xar(x[a],16,x[d]); \
+  x[c] = PLUS(x[c],x[d]); x[b] = __rv__xar(x[c],12,x[b]); \
+  x[a] = PLUS(x[a],x[b]); x[d] = __rv__xar(x[a], 8,x[d]); \
+  x[c] = PLUS(x[c],x[d]); x[b] = __rv__xar(x[c], 7,x[b]);
+
+#define QUARTERROUND1(a,b,c,d) QUARTERROUND(a,b,c,d) /* as above: the eight names all alias the XAR quarter-round */
+#define QUARTERROUND2(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND3(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND4(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND5(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND6(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND7(a,b,c,d) QUARTERROUND(a,b,c,d)
+#define QUARTERROUND8(a,b,c,d) QUARTERROUND(a,b,c,d)
+#else /* ENABLE_CHACHA: add, xor and rotate are fused into the CHACHA16/12/8/7 instructions */
+#define QUARTERROUNDx(a,b,c,d,r0,r1,r2,r3) /* one quarter-round as four CHACHA ops; (r0,r1) carry A,D and (r2,r3) carry C,B */ \
+  { \
+    register uint32_t A asm(""#r0) = x[a]; /* A pinned to r0: destination of CHACHA16 and CHACHA8 */ \
+    register uint32_t D asm(""#r1) = x[d]; /* D pinned to r1, the register paired with r0 */ \
+    register uint32_t C asm(""#r2) = x[c]; /* C pinned to r2: destination of CHACHA12 and CHACHA7 */ \
+    register uint32_t B asm(""#r3) = x[b]; /* B pinned to r3, the register paired with r2 */ \
+    asm("CHACHA16 reg_%0, reg_%1, reg_%3\n" /* A += B; D = rotl(D ^ A, 16) */ \
+        "CHACHA12 reg_%2, reg_%3, reg_%1\n" /* C += D; B = rotl(B ^ C, 12) */ \
+        "CHACHA8 reg_%0, reg_%1, reg_%3\n" /* A += B; D = rotl(D ^ A, 8) */ \
+        "CHACHA7 reg_%2, reg_%3, reg_%1\n" /* C += D; B = rotl(B ^ C, 7) */ \
+        : "+&r" (A), "+&r" (B), "+&r" (C), "+&r" (D)); /* %0=A %1=B %2=C %3=D, every operand is read and written */ \
+    x[a] = A; x[b] = B; x[c] = C; x[d] = D; \
+  }
+#define QUARTERROUND1(a,b,c,d) QUARTERROUNDx(a,b,c,d,t3,t4,t5,t6) /* (A,D) in t3/t4, (C,B) in t5/t6 */
+#define QUARTERROUND2(a,b,c,d) QUARTERROUNDx(a,b,c,d,s8,s9,s10,s11) /* (A,D) in s8/s9, (C,B) in s10/s11 */
+#define QUARTERROUND3(a,b,c,d) QUARTERROUNDx(a,b,c,d,s4,s5,s6,s7) /* (A,D) in s4/s5, (C,B) in s6/s7 */
+#define QUARTERROUND4(a,b,c,d) QUARTERROUNDx(a,b,c,d,a6,a7,s2,s3) /* (A,D) in a6/a7, (C,B) in s2/s3 */
+
+#define QUARTERROUND5(a,b,c,d) QUARTERROUNDx(a,b,c,d,t3,t4,s6,s7) /* (A,D) in t3/t4, (C,B) in s6/s7 */
+#define QUARTERROUND6(a,b,c,d) QUARTERROUNDx(a,b,c,d,s8,s9,s2,s3) /* (A,D) in s8/s9, (C,B) in s2/s3 */
+#define QUARTERROUND7(a,b,c,d) QUARTERROUNDx(a,b,c,d,s4,s5,t5,t6) /* (A,D) in s4/s5, (C,B) in t5/t6 */
+#define QUARTERROUND8(a,b,c,d) QUARTERROUNDx(a,b,c,d,a6,a7,s10,s11) /* (A,D) in a6/a7, (C,B) in s10/s11 */
+#endif
+
 static void salsa20_wordtobyte(u8 output[64],const u32 input[16])
 {
   u32 x[16];
@@ -34,14 +83,14 @@ static void salsa20_wordtobyte(u8 output[64],const u32 input[16])
 
   for (i = 0;i < 16;++i) x[i] = input[i];
   for (i = ROUNDS;i > 0;i -= 2) {
-    QUARTERROUND( 0, 4, 8,12)
-    QUARTERROUND( 1, 5, 9,13)
-    QUARTERROUND( 2, 6,10,14)
-    QUARTERROUND( 3, 7,11,15)
-    QUARTERROUND( 0, 5,10,15)
-    QUARTERROUND( 1, 6,11,12)
-    QUARTERROUND( 2, 7, 8,13)
-    QUARTERROUND( 3, 4, 9,14)
+    QUARTERROUND1( 0, 4, 8,12)
+    QUARTERROUND2( 1, 5, 9,13)
+    QUARTERROUND3( 2, 6,10,14)
+    QUARTERROUND4( 3, 7,11,15)
+    QUARTERROUND5( 0, 5,10,15)
+    QUARTERROUND6( 1, 6,11,12)
+    QUARTERROUND7( 2, 7, 8,13)
+    QUARTERROUND8( 3, 4, 9,14)
   }
   for (i = 0;i < 16;++i) x[i] = PLUS(x[i],input[i]);
   for (i = 0;i < 16;++i) U32TO8_LITTLE(output + 4 * i,x[i]);
diff --git a/data_Chacha64.txt b/data_Chacha64.txt
new file mode 100644
index 0000000..c2d4828
--- /dev/null
+++ b/data_Chacha64.txt
@@ -0,0 +1,24 @@
+//for vX.Y of P
+
+// low-order bit of Rd (7) is 0 to ensure even-numbered Rd
+I CHACHA CHACHA 101--00----------000----01110111 chacha Zchacha
+
+S CHACHA "fun_chacha(input(SRC1), input(SRC2), input(SRC3), input(INSTRUCTION)(28 downto 27))"
+
+P """
+  def fun_chacha(rs1: Bits, rs2: Bits, rs3: Bits, num: Bits) : Bits = {
+    val rotv = (num).mux( // num is instruction bits 28..27 and selects the rotate distance
+      B"2'b00" -> U(16),
+      B"2'b01" -> U(12),
+      B"2'b10" -> U( 8),
+      B"2'b11" -> U( 7)
+    )
+    val a = rs3 // accumulator word
+    val b = rs1 // word added to the accumulator
+    val d = rs2 // word xored with the new sum, then rotated
+    val sum = (a.asUInt + b.asUInt).asBits.resize(32)
+    val xor = sum ^ d
+    val rot = xor.rotateLeft(rotv)
+    rot ## sum // return value: 64 bits, rot concatenated above sum
+  }
+"""
diff --git a/data_xar.txt b/data_xar.txt
new file mode 100644
index 0000000..fd48593
--- /dev/null
+++ b/data_xar.txt
@@ -0,0 +1,11 @@
+I XAR XAR 1100100----------000-----1110111 xar Zbxar
+
+S XAR "fun_xar(input(SRC1), input(SRC2), input(SRC3))"
+
+P """
+  def fun_xar(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = {
+    val in = rs1 ^ rs3 // xor of the first and third source
+    val r = in.rotateLeft(rs2(4 downto 0).asUInt) // rotate left by the low 5 bits of rs2
+    r // return value
+  }
+"""
diff --git a/new_instructions_support.h b/new_instructions_support.h
index 30fb7ea..6751d87 100644
--- a/new_instructions_support.h
+++ b/new_instructions_support.h
@@ -73,6 +73,21 @@ typedef uint32_t uint_xlen_t;
     return ((uint64_t)r0 | (((uint64_t)r1)<<32)); \
   }
 
+// ternary wide (64-bits output in R2n/R2n+1); rs3 is handed over in r0, which also receives the low result word
+#define FUN3Wx(NAME, ASNAME, r0, r1) \
+  static inline uint64_t NAME(uint_xlen_t rs1, uint_xlen_t rs2, uint_xlen_t rs3) { \
+    register uint32_t r0 asm (""#r0), r1 asm (""#r1); /* both result words are pinned to the named registers */ \
+    r0 = rs3; /* third source is passed through the destination register */ \
+    asm (#ASNAME " reg_%0, reg_%2, reg_%3\n" \
+         : "+r" (r0), "=r" (r1) \
+         : "r" (rs1), "r" (rs2)); \
+    return ((uint64_t)r0 | (((uint64_t)r1)<<32)); /* r0 is the low half, r1 the high half */ \
+  }
+#define FUN3Wt5(NAME, ASNAME) FUN3Wx(NAME, ASNAME, t5, t6)
+#define FUN3Wt3(NAME, ASNAME) FUN3Wx(NAME, ASNAME, t3, t4)
+#define FUN3Ws10(NAME, ASNAME) FUN3Wx(NAME, ASNAME, s10, s11)
+#define FUN3Ws8(NAME, ASNAME) FUN3Wx(NAME, ASNAME, s8, s9)
+
 // macro to build assembly macros to generate the proper
 // opcodes as .word macro
 // the translation from name to number is done my the
diff --git a/new_instructions_support_b.h b/new_instructions_support_b.h
index 992d8cb..2e252db 100644
--- a/new_instructions_support_b.h
+++ b/new_instructions_support_b.h
@@ -19,4 +19,16 @@ FUN2(_sh1add,_SH1ADD)
 FUN2(_sh2add,_SH2ADD)
 FUN2(_sh3add,_SH3ADD)
 
+
+#ifdef ENABLE_XAR
+ASM3RMACRO(XAR, 0xc8000077) // base opcode matching the XAR pattern in data_xar.txt
+FUN3R(__rv__xar, XAR)
+#endif
+#ifdef ENABLE_CHACHA
+ASM3RMACRO(CHACHA16,0xa0000077) // bits 28..27 = 00: rotate by 16
+ASM3RMACRO(CHACHA12,0xa8000077) // bits 28..27 = 01: rotate by 12
+ASM3RMACRO(CHACHA8, 0xb0000077) // bits 28..27 = 10: rotate by 8
+ASM3RMACRO(CHACHA7, 0xb8000077) // bits 28..27 = 11: rotate by 7
+#endif
+
 #endif // __NEW_INSTRUCTION_SUPPORT_B_H__