Drop the mvs in FUN2W; add [us]maqa (these require a non-earlyInjection plugin to meet timing); use umaqa in sse8

This commit is contained in:
Romain Dolbeau 2021-02-13 08:30:29 -05:00
parent ff1b051f56
commit eb59ac0630
6 changed files with 79 additions and 9 deletions

View file

@ -57,7 +57,10 @@ CryptoZknh.scala: gen_plugin data_sha.txt
./gen_plugin -n CryptoZknh -i data_sha.txt -I '*' >| $@ ./gen_plugin -n CryptoZknh -i data_sha.txt -I '*' >| $@
PDataProcess.scala: gen_plugin data_Zpn.txt PDataProcess.scala: gen_plugin data_Zpn.txt
./gen_plugin -n PDataProcess -i data_Zpn.txt -I '*' >| $@ ./gen_plugin -n PDataProcess -i data_Zpn.txt -I Zpn >| $@
PSlowDataProcess.scala: gen_plugin data_Zpn.txt
./gen_plugin -n PSlowDataProcess -i data_Zpn.txt -I Zpnslow >| $@
P64DataProcess.scala: gen_plugin data_Zp64.txt P64DataProcess.scala: gen_plugin data_Zp64.txt
./gen_plugin -w -n P64DataProcess -i data_Zp64.txt -I '*' >| $@ ./gen_plugin -w -n P64DataProcess -i data_Zp64.txt -I '*' >| $@

View file

@ -90,11 +90,12 @@ I URSUBW URSUBW 0011001----------001-----1110111 pdpiadd32 Zpn
I INSBI INSBI 101011000--------000-----1110111 pdpibit Zpn I INSBI INSBI 101011000--------000-----1110111 pdpibit Zpn
//I MADDR32 MADDR32 1100010----------001-----1110111 pdpimac Zpn //I MADDR32 MADDR32 1100010----------001-----1110111 pdpimac Zpn
//I MSUBR32 MSUBR32 1100011----------001-----1110111 pdpimac Zpn //I MSUBR32 MSUBR32 1100011----------001-----1110111 pdpimac Zpn
// ternary
// NOTE(review): unlike the neighbouring patterns, bit 7 is fixed to 0 in the two
// patterns below (the last wildcard group is "----0" instead of "-----") - confirm
// that this restriction is intended.
// NOTE(review): SMAQA carries the tag pdpiumul8 and UMAQA carries pdpismul8, which
// looks swapped with respect to signedness - confirm.
I SMAQA SMAQA 1100100----------000----01110111 pdpiumul8 Zpnslow
I UMAQA UMAQA 1100110----------000----01110111 pdpismul8 Zpnslow
// ternary + binary (bit 25) // ternary + binary (bit 25)
I PBSADx PBSADx 111111-----------000-----1110111 pdpipsad Zpn I PBSADx PBSADx 111111-----------000-----1110111 pdpipsad Zpn
// binary // binary
S ADD8 "fun_add8(input(SRC1), input(SRC2))" S ADD8 "fun_add8(input(SRC1), input(SRC2))"
S ADD16 "fun_add16(input(SRC1), input(SRC2))" S ADD16 "fun_add16(input(SRC1), input(SRC2))"
@ -148,6 +149,8 @@ S SWAP16 "fun_swap16(input(SRC1))"
// ternary // ternary
S PBSADx "fun_pbsada(input(SRC1), input(SRC2), (input(INSTRUCTION)(25).asUInt === 0) ? U(0, 32 bits).asBits | input(SRC3))" S PBSADx "fun_pbsada(input(SRC1), input(SRC2), (input(INSTRUCTION)(25).asUInt === 0) ? U(0, 32 bits).asBits | input(SRC3))"
S INSBI "fun_insb(input(SRC1), input(SRC2), input(SRC3))" S INSBI "fun_insb(input(SRC1), input(SRC2), input(SRC3))"
S SMAQA "fun_smaqa(input(SRC1), input(SRC2), input(SRC3))"
S UMAQA "fun_umaqa(input(SRC1), input(SRC2), input(SRC3))"
P """ P """
def fun_add8(rs1: Bits, rs2: Bits) : Bits = { def fun_add8(rs1: Bits, rs2: Bits) : Bits = {
@ -556,4 +559,25 @@ P """
) )
r // return value r // return value
} }
def fun_smaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = {
    // Signed multiply-accumulate over the four byte lanes:
    //   result = rs3 + sum(i = 0..3) of (signed byte i of rs1) * (signed byte i of rs2)
    // Each 8x8 signed product is 16 bits wide; 18 bits are enough to hold the
    // sum of four such products without overflow.
    // The widening is done on the SInt so that it sign-extends. Resizing the
    // Bits view instead would zero-fill the new upper bits and turn every
    // negative product into a large positive value.
    val h0 = (rs1( 7 downto  0).asSInt * rs2( 7 downto  0).asSInt).resize(18)
    val h1 = (rs1(15 downto  8).asSInt * rs2(15 downto  8).asSInt).resize(18)
    val h2 = (rs1(23 downto 16).asSInt * rs2(23 downto 16).asSInt).resize(18)
    val h3 = (rs1(31 downto 24).asSInt * rs2(31 downto 24).asSInt).resize(18)
    // accumulate the dot product onto the 32-bit signed view of rs3
    val r = rs3.asSInt + (h0 + h1 + h2 + h3)
    r.asBits.resize(32) // return value
}
def fun_umaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = {
    // Unsigned multiply-accumulate over the four byte lanes:
    //   result = rs3 + sum(i = 0..3) of (byte i of rs1) * (byte i of rs2)
    // Each lane product is zero-extended to 18 bits so that adding the four
    // products together cannot overflow.
    def laneProduct(lane: Int) : UInt = {
        val lo = 8 * lane
        val hi = lo + 7
        (rs1(hi downto lo).asUInt * rs2(hi downto lo).asUInt).resize(18)
    }
    val dot = laneProduct(0) + laneProduct(1) + laneProduct(2) + laneProduct(3)
    val total = rs3.asUInt + dot
    total.asBits.resize(32) // return value
}
""" """

View file

@ -66,13 +66,10 @@ typedef uint32_t uint_xlen_t;
// binary wide (64-bits output in R2n/R2n+1, e.g. smul8 from P) // binary wide (64-bits output in R2n/R2n+1, e.g. smul8 from P)
#define FUN2W(NAME, ASNAME) \ #define FUN2W(NAME, ASNAME) \
static inline uint64_t NAME(uint_xlen_t rs1, uint_xlen_t rs2) { \ static inline uint64_t NAME(uint_xlen_t rs1, uint_xlen_t rs2) { \
uint32_t r0, r1; \ register uint32_t r0 asm ("t5"), r1 asm ("t6"); \
asm (#ASNAME " reg_t5, reg_%2, reg_%3\n" \ asm (#ASNAME " reg_%0, reg_%2, reg_%3\n" \
"mv %0, t5\n" \
"mv %1, t6\n" \
: "=r" (r0), "=r" (r1) \ : "=r" (r0), "=r" (r1) \
: "r" (rs1), "r" (rs2) \ : "r" (rs1), "r" (rs2)); \
: "t5", "t6"); \
return ((uint64_t)r0 | (((uint64_t)r1)<<32)); \ return ((uint64_t)r0 | (((uint64_t)r1)<<32)); \
} }

View file

@ -146,4 +146,10 @@ FUN2W(__rv__smulx16,SMULx16)
ASM2MACRO(UMULx16,0xb2000077) ASM2MACRO(UMULx16,0xb2000077)
FUN2W(__rv__umulx16,UMULx16) FUN2W(__rv__umulx16,UMULx16)
// SMAQA / UMAQA: declare the assembler macro for each instruction and the
// three-operand C wrapper (__rv__smaqa / __rv__umaqa) that issues it.
// NOTE(review): the constants appear to be the instruction encodings with the
// register fields left at zero (low 7 bits 0x77 = 1110111) - confirm against
// the definition of ASM3RMACRO.
ASM3RMACRO(SMAQA, 0xc8000077)
FUN3R(__rv__smaqa, SMAQA)
ASM3RMACRO(UMAQA, 0xcc000077)
FUN3R(__rv__umaqa, UMAQA)
#endif // __NEW_INSTRUCTION_SUPPORT_P_H__ #endif // __NEW_INSTRUCTION_SUPPORT_P_H__

View file

@ -45,6 +45,7 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
uint32_t pmin = __rv__umin8(p1, p2); uint32_t pmin = __rv__umin8(p1, p2);
uint32_t pmax = __rv__umax8(p1, p2); uint32_t pmax = __rv__umax8(p1, p2);
uint32_t pd = __rv__sub8(pmax, pmin); uint32_t pd = __rv__sub8(pmax, pmin);
#if 0
uint64_t qs = __rv__umul8(pd, pd); uint64_t qs = __rv__umul8(pd, pd);
/* uint32_t psl = (uint32_t)qs; */ /* uint32_t psl = (uint32_t)qs; */
/* uint32_t psh = (uint32_t)(qs >> 32); */ /* uint32_t psh = (uint32_t)(qs >> 32); */
@ -56,12 +57,16 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
s += ((qs >> 16) & 0x000000000000FFFFull); s += ((qs >> 16) & 0x000000000000FFFFull);
s += ((qs >> 32) & 0x000000000000FFFFull); s += ((qs >> 32) & 0x000000000000FFFFull);
s += ((qs >> 48) & 0x000000000000FFFFull); s += ((qs >> 48) & 0x000000000000FFFFull);
#else
s = __rv__umaqa(pd, pd, s);
#endif
p1 = ((uint32_t*)pix1)[1]; p1 = ((uint32_t*)pix1)[1];
p2 = ((uint32_t*)pix2)[1]; p2 = ((uint32_t*)pix2)[1];
pmin = __rv__umin8(p1, p2); pmin = __rv__umin8(p1, p2);
pmax = __rv__umax8(p1, p2); pmax = __rv__umax8(p1, p2);
pd = __rv__sub8(pmax, pmin); pd = __rv__sub8(pmax, pmin);
#if 0
qs = __rv__umul8(pd, pd); qs = __rv__umul8(pd, pd);
/* psl = (uint32_t)qs; */ /* psl = (uint32_t)qs; */
/* psh = (uint32_t)(qs >> 32); */ /* psh = (uint32_t)(qs >> 32); */
@ -73,6 +78,9 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
s += ((qs >> 16) & 0x000000000000FFFFull); s += ((qs >> 16) & 0x000000000000FFFFull);
s += ((qs >> 32) & 0x000000000000FFFFull); s += ((qs >> 32) & 0x000000000000FFFFull);
s += ((qs >> 48) & 0x000000000000FFFFull); s += ((qs >> 48) & 0x000000000000FFFFull);
#else
s = __rv__umaqa(pd, pd, s);
#endif
pix1 += stride; pix1 += stride;
pix2 += stride; pix2 += stride;

View file

@ -710,6 +710,35 @@ uint64_t __rv__umulx16(const uint32_t rs1, const uint32_t rs2) {
memcpy(&r, c, 8); memcpy(&r, c, 8);
return r; return r;
} }
/*
 * Software emulation of SMAQA: adds to rs3 the sum of the four products of
 * the corresponding signed bytes of rs1 and rs2.
 *
 * The accumulation is carried out in uint32_t so that it wraps modulo 2^32;
 * doing it in int32_t would be undefined behaviour whenever the sum leaves
 * the int32_t range. The 32-bit result is then reinterpreted as signed and
 * converted to the uint64_t return type (i.e. sign-extended), as before.
 */
uint64_t __rv__smaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) {
    int8_t a[4], b[4];
    uint32_t acc = rs3;
    int32_t r;
    int i;

    /* split both operands into their four signed byte lanes */
    memcpy(a, &rs1, 4);
    memcpy(b, &rs2, 4);

    for (i = 0; i < 4; i++) {
        /* each product fits in 16 bits; converting it to unsigned is well defined */
        acc += (uint32_t)((int32_t)a[i] * (int32_t)b[i]);
    }

    /* reinterpret the wrapped 32-bit sum as a signed value */
    memcpy(&r, &acc, sizeof r);
    return r;
}
/*
 * Software emulation of UMAQA: adds to rs3 the sum of the four products of
 * the corresponding unsigned bytes of rs1 and rs2. The accumulation is
 * unsigned 32-bit arithmetic and therefore wraps modulo 2^32.
 */
uint64_t __rv__umaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) {
    uint4x8_t x, y;
    uint4x16_t prod;
    uint32_t acc = rs3;
    int lane;

    /* split both operands into their four byte lanes */
    memcpy(x, &rs1, 4);
    memcpy(y, &rs2, 4);

    for (lane = 0; lane < 4; lane++) {
        prod[lane] = x[lane] * y[lane];
        acc += prod[lane];
    }
    return acc;
}
#endif // __riscv #endif // __riscv
unsigned int a = 0x01234567; unsigned int a = 0x01234567;
@ -866,6 +895,9 @@ int main(int argc, char **argv) {
T2W(__rv__smulx16); T2W(__rv__smulx16);
T2W(__rv__umulx16); T2W(__rv__umulx16);
T3(__rv__smaqa);
T3(__rv__umaqa);
b = 0x0100F004 + index; b = 0x0100F004 + index;
} }