From eb59ac063069bf455be9610b6db47a1961ef005e Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Sat, 13 Feb 2021 08:30:29 -0500 Subject: [PATCH] drop the mvs in FUN2W ; add [us]maqa (require a non-earlyInjection plugin to meet timing...) ; use umaqa in sse8 --- Makefile | 5 ++++- data_Zpn.txt | 28 ++++++++++++++++++++++++++-- new_instructions_support.h | 9 +++------ new_instructions_support_p.h | 6 ++++++ pbsse.c | 8 ++++++++ test_p.c | 32 ++++++++++++++++++++++++++++++++ 6 files changed, 79 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 17b871e..3c9a4ed 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,10 @@ CryptoZknh.scala: gen_plugin data_sha.txt ./gen_plugin -n CryptoZknh -i data_sha.txt -I '*' >| $@ PDataProcess.scala: gen_plugin data_Zpn.txt - ./gen_plugin -n PDataProcess -i data_Zpn.txt -I '*' >| $@ + ./gen_plugin -n PDataProcess -i data_Zpn.txt -I Zpn >| $@ + +PSlowDataProcess.scala: gen_plugin data_Zpn.txt + ./gen_plugin -n PSlowDataProcess -i data_Zpn.txt -I Zpnslow >| $@ P64DataProcess.scala: gen_plugin data_Zp64.txt ./gen_plugin -w -n P64DataProcess -i data_Zp64.txt -I '*' >| $@ diff --git a/data_Zpn.txt b/data_Zpn.txt index 34396bf..1a202fc 100644 --- a/data_Zpn.txt +++ b/data_Zpn.txt @@ -90,11 +90,12 @@ I URSUBW URSUBW 0011001----------001-----1110111 pdpiadd32 Zpn I INSBI INSBI 101011000--------000-----1110111 pdpibit Zpn //I MADDR32 MADDR32 1100010----------001-----1110111 pdpimac Zpn //I MSUBR32 MSUBR32 1100011----------001-----1110111 pdpimac Zpn +// ternary multiply-accumulate; tagged Zpnslow so they are generated into PSlowDataProcess rather than PDataProcess. NOTE(review): SMAQA is in group pdpiumul8 and UMAQA in pdpismul8 (looks swapped), and bit 7 is fixed to 0 which appears to restrict rd to even registers; confirm both are intended +I SMAQA SMAQA 1100100----------000----01110111 pdpiumul8 Zpnslow +I UMAQA UMAQA 1100110----------000----01110111 pdpismul8 Zpnslow // ternary + binary (bit 25) I PBSADx PBSADx 111111-----------000-----1110111 pdpipsad Zpn - - // binary S ADD8 "fun_add8(input(SRC1), input(SRC2))" S ADD16 "fun_add16(input(SRC1), input(SRC2))" @@ -148,6 +149,8 @@ S SWAP16 "fun_swap16(input(SRC1))" // ternary S PBSADx "fun_pbsada(input(SRC1), input(SRC2), (input(INSTRUCTION)(25).asUInt 
=== 0) ? U(0, 32 bits).asBits | input(SRC3))" S INSBI "fun_insb(input(SRC1), input(SRC2), input(SRC3))" +S SMAQA "fun_smaqa(input(SRC1), input(SRC2), input(SRC3))" +S UMAQA "fun_umaqa(input(SRC1), input(SRC2), input(SRC3))" P """ def fun_add8(rs1: Bits, rs2: Bits) : Bits = { @@ -556,4 +559,25 @@ P """ ) r // return value } + + def fun_smaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = { + // each signed 16-bit product is sign-extended to 18 bits (resize as SInt, then asBits; resizing as Bits would zero-fill) so that intermediate sums don't overflow + val h0 = (rs1( 7 downto 0).asSInt * rs2( 7 downto 0).asSInt).resize(18).asBits + val h1 = (rs1(15 downto 8).asSInt * rs2(15 downto 8).asSInt).resize(18).asBits + val h2 = (rs1(23 downto 16).asSInt * rs2(23 downto 16).asSInt).resize(18).asBits + val h3 = (rs1(31 downto 24).asSInt * rs2(31 downto 24).asSInt).resize(18).asBits + val r = rs3.asSInt + (h0.asSInt + h1.asSInt + h2.asSInt + h3.asSInt) + + r.asBits.resize(32) // return value + } + def fun_umaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = { + // 18 bits needed so that intermediate sums don't overflow + val h0 = (rs1( 7 downto 0).asUInt * rs2( 7 downto 0).asUInt).asBits.resize(18) + val h1 = (rs1(15 downto 8).asUInt * rs2(15 downto 8).asUInt).asBits.resize(18) + val h2 = (rs1(23 downto 16).asUInt * rs2(23 downto 16).asUInt).asBits.resize(18) + val h3 = (rs1(31 downto 24).asUInt * rs2(31 downto 24).asUInt).asBits.resize(18) + val r = rs3.asUInt + (h0.asUInt + h1.asUInt + h2.asUInt + h3.asUInt) + + r.asBits.resize(32) // return value + } """ diff --git a/new_instructions_support.h b/new_instructions_support.h index f81aed6..30fb7ea 100644 --- a/new_instructions_support.h +++ b/new_instructions_support.h @@ -66,13 +66,10 @@ typedef uint32_t uint_xlen_t; // binary wide (64-bits output in R2n/R2n+1, e.g. 
smul8 from P) #define FUN2W(NAME, ASNAME) \ static inline uint64_t NAME(uint_xlen_t rs1, uint_xlen_t rs2) { \ - uint32_t r0, r1; \ - asm (#ASNAME " reg_t5, reg_%2, reg_%3\n" \ - "mv %0, t5\n" \ - "mv %1, t6\n" \ + register uint32_t r0 asm ("t5"), r1 asm ("t6"); /* bind both result halves to t5/t6 so no mv is needed after the instruction */ \ + asm (#ASNAME " reg_%0, reg_%2, reg_%3\n" \ : "=r" (r0), "=r" (r1) \ - : "r" (rs1), "r" (rs2) \ - : "t5", "t6"); \ + : "r" (rs1), "r" (rs2)); \ return ((uint64_t)r0 | (((uint64_t)r1)<<32)); \ } diff --git a/new_instructions_support_p.h b/new_instructions_support_p.h index 60ddbbc..99507d5 100644 --- a/new_instructions_support_p.h +++ b/new_instructions_support_p.h @@ -146,4 +146,10 @@ FUN2W(__rv__smulx16,SMULx16) ASM2MACRO(UMULx16,0xb2000077) FUN2W(__rv__umulx16,UMULx16) + +ASM3RMACRO(SMAQA, 0xc8000077) +FUN3R(__rv__smaqa, SMAQA) +ASM3RMACRO(UMAQA, 0xcc000077) +FUN3R(__rv__umaqa, UMAQA) + #endif // __NEW_INSTRUCTION_SUPPORT_P_H__ diff --git a/pbsse.c b/pbsse.c index 9404631..199c492 100644 --- a/pbsse.c +++ b/pbsse.c @@ -45,6 +45,7 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, uint32_t pmin = __rv__umin8(p1, p2); uint32_t pmax = __rv__umax8(p1, p2); uint32_t pd = __rv__sub8(pmax, pmin); +#if 0 /* disabled: umul8 followed by adding the four 16-bit lanes by hand */ uint64_t qs = __rv__umul8(pd, pd); /* uint32_t psl = (uint32_t)qs; */ /* uint32_t psh = (uint32_t)(qs >> 32); */ @@ -56,12 +57,16 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, s += ((qs >> 16) & 0x000000000000FFFFull); s += ((qs >> 32) & 0x000000000000FFFFull); s += ((qs >> 48) & 0x000000000000FFFFull); +#else + s = __rv__umaqa(pd, pd, s); /* accumulate the four byte-wise squares of pd into s */ +#endif p1 = ((uint32_t*)pix1)[1]; p2 = ((uint32_t*)pix2)[1]; pmin = __rv__umin8(p1, p2); pmax = __rv__umax8(p1, p2); pd = __rv__sub8(pmax, pmin); +#if 0 /* disabled: umul8 followed by adding the four 16-bit lanes by hand */ qs = __rv__umul8(pd, pd); /* psl = (uint32_t)qs; */ /* psh = (uint32_t)(qs >> 32); */ @@ -73,6 +78,9 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, s += ((qs >> 16) & 0x000000000000FFFFull); s += ((qs >> 32) & 
0x000000000000FFFFull); s += ((qs >> 48) & 0x000000000000FFFFull); +#else + s = __rv__umaqa(pd, pd, s); +#endif pix1 += stride; pix2 += stride; diff --git a/test_p.c b/test_p.c index 2f7895b..38d0e76 100644 --- a/test_p.c +++ b/test_p.c @@ -710,6 +710,35 @@ uint64_t __rv__umulx16(const uint32_t rs1, const uint32_t rs2) { memcpy(&r, c, 8); return r; } + + +/* Host reference models of SMAQA / UMAQA: rs3 plus the four byte-wise products of rs1 and rs2, kept to 32 bits and zero-extended in the uint64_t return value. */ +uint64_t __rv__smaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) { + int4x8_t a, b; + int4x16_t c; + uint32_t r; /* unsigned accumulator: wraps modulo 2^32 and is not sign-extended on return */ + memcpy(a, &rs1, 4); + memcpy(b, &rs2, 4); + c[0] = a[0] * b[0]; + c[1] = a[1] * b[1]; + c[2] = a[2] * b[2]; + c[3] = a[3] * b[3]; + r = rs3 + (uint32_t)(c[0] + c[1] + c[2] + c[3]); /* signed sum of products, then modular add (no signed-overflow UB) */ + return r; +} +uint64_t __rv__umaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) { + uint4x8_t a, b; + uint4x16_t c; + uint32_t r; + memcpy(a, &rs1, 4); + memcpy(b, &rs2, 4); + c[0] = a[0] * b[0]; + c[1] = a[1] * b[1]; + c[2] = a[2] * b[2]; + c[3] = a[3] * b[3]; + r = rs3 + c[0] + c[1] + c[2] + c[3]; + return r; +} #endif // __riscv unsigned int a = 0x01234567; @@ -866,6 +895,9 @@ int main(int argc, char **argv) { T2W(__rv__smulx16); T2W(__rv__umulx16); + T3(__rv__smaqa); + T3(__rv__umaqa); + b = 0x0100F004 + index; }