drop the mvs in FUN2W ; add [us]maqa (require a non-earlyInjection plugin to meet timing...) ; use umaqa in sse8

2025-04-18 18:44:42 -04:00 · 2021-02-13 08:30:29 -05:00 · 2021-02-13 08:30:29 -05:00 · eb59ac0630
commit eb59ac0630
parent ff1b051f56
6 changed files with 79 additions and 9 deletions
--- a/5
+++ b/5
@ -57,7 +57,10 @@ CryptoZknh.scala: gen_plugin data_sha.txt
 	./gen_plugin -n CryptoZknh -i data_sha.txt -I '*' >| $@

 PDataProcess.scala: gen_plugin data_Zpn.txt
-	./gen_plugin -n PDataProcess -i data_Zpn.txt -I '*' >| $@
+	./gen_plugin -n PDataProcess -i data_Zpn.txt -I Zpn >| $@
+
+PSlowDataProcess.scala: gen_plugin data_Zpn.txt
+	./gen_plugin -n PSlowDataProcess -i data_Zpn.txt -I Zpnslow >| $@

 P64DataProcess.scala: gen_plugin data_Zp64.txt
 	./gen_plugin -w -n P64DataProcess -i data_Zp64.txt -I '*' >| $@
--- a/data_Zpn.txt
+++ b/data_Zpn.txt
@ -90,11 +90,12 @@ I	URSUBW	URSUBW	0011001----------001-----1110111	pdpiadd32	Zpn
 I	INSBI	INSBI	101011000--------000-----1110111	pdpibit	Zpn
 //I	MADDR32	MADDR32	1100010----------001-----1110111	pdpimac	Zpn
 //I	MSUBR32	MSUBR32	1100011----------001-----1110111	pdpimac	Zpn
+// ternary
+I	SMAQA   SMAQA	1100100----------000----01110111	pdpiumul8	Zpnslow
+I	UMAQA   UMAQA	1100110----------000----01110111	pdpismul8	Zpnslow
 // ternary + binary (bit 25)
 I	PBSADx	PBSADx	111111-----------000-----1110111	pdpipsad	Zpn

-
-
 // binary
 S	ADD8	"fun_add8(input(SRC1), input(SRC2))"
 S	ADD16	"fun_add16(input(SRC1), input(SRC2))"
@ -148,6 +149,8 @@ S	SWAP16	"fun_swap16(input(SRC1))"
 // ternary
 S	PBSADx	"fun_pbsada(input(SRC1), input(SRC2), (input(INSTRUCTION)(25).asUInt === 0) ? U(0, 32 bits).asBits | input(SRC3))"
 S	INSBI	"fun_insb(input(SRC1), input(SRC2), input(SRC3))"
+S	SMAQA	"fun_smaqa(input(SRC1), input(SRC2), input(SRC3))"
+S	UMAQA	"fun_umaqa(input(SRC1), input(SRC2), input(SRC3))"

 P	"""
 	def fun_add8(rs1: Bits, rs2: Bits) : Bits = {
@ -556,4 +559,25 @@ P	"""
 	    )
 	    r // return value
 	}
+
+	def fun_smaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = {
+	// rs3 + sum of the four signed byte-lane products of rs1 and rs2; 18 bits needed so that intermediate sums don't overflow
+	    val h0 = (rs1( 7 downto  0).asSInt * rs2( 7 downto  0).asSInt).resize(18)
+	    val h1 = (rs1(15 downto  8).asSInt * rs2(15 downto  8).asSInt).resize(18)
+	    val h2 = (rs1(23 downto 16).asSInt * rs2(23 downto 16).asSInt).resize(18)
+	    val h3 = (rs1(31 downto 24).asSInt * rs2(31 downto 24).asSInt).resize(18)
+	    // resize is applied while still SInt so it sign-extends; resizing after asBits zero-fills and turns negative products positive
+	    val r = rs3.asSInt + (h0 + h1 + h2 + h3)
+	    r.asBits.resize(32) // return value
+	}
+	def fun_umaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = {
+	// rs3 + sum of the four unsigned byte-lane products of rs1 and rs2; 18 bits hold the sum of four 16-bit products
+	    val p0 = (rs1( 7 downto  0).asUInt * rs2( 7 downto  0).asUInt).resize(18)
+	    val p1 = (rs1(15 downto  8).asUInt * rs2(15 downto  8).asUInt).resize(18)
+	    val p2 = (rs1(23 downto 16).asUInt * rs2(23 downto 16).asUInt).resize(18)
+	    val p3 = (rs1(31 downto 24).asUInt * rs2(31 downto 24).asUInt).resize(18)
+	    // UInt resize zero-fills the two extra bits of each product
+	    val acc = rs3.asUInt + (p0 + p1 + p2 + p3)
+	    acc.asBits.resize(32) // return value
+	}
 """
--- a/new_instructions_support.h
+++ b/new_instructions_support.h
@ -66,13 +66,10 @@ typedef uint32_t uint_xlen_t;
 // binary wide (64-bits output in R2n/R2n+1, e.g. smul8 from P)
 #define FUN2W(NAME, ASNAME)						\
  static inline uint64_t NAME(uint_xlen_t rs1, uint_xlen_t rs2) {	\
-    uint32_t r0, r1;							\
-    asm (#ASNAME " reg_t5, reg_%2, reg_%3\n"				\
-	 "mv %0, t5\n"							\
-	 "mv %1, t6\n"							\
+    register uint32_t r0 asm ("t5"), r1 asm ("t6");			\
+    asm (#ASNAME " reg_%0, reg_%2, reg_%3\n"				\
 	 : "=r" (r0), "=r" (r1)						\
-	 : "r" (rs1), "r" (rs2)						\
-	 : "t5", "t6");							\
+	 : "r" (rs1), "r" (rs2));					\
    return ((uint64_t)r0 | (((uint64_t)r1)<<32));			\
  }

--- a/new_instructions_support_p.h
+++ b/new_instructions_support_p.h
@ -146,4 +146,10 @@ FUN2W(__rv__smulx16,SMULx16)
 ASM2MACRO(UMULx16,0xb2000077)
 FUN2W(__rv__umulx16,UMULx16)

+  
+ASM3RMACRO(SMAQA, 0xc8000077)
+FUN3R(__rv__smaqa, SMAQA)
+ASM3RMACRO(UMAQA, 0xcc000077)
+FUN3R(__rv__umaqa, UMAQA)
+
 #endif // __NEW_INSTRUCTION_SUPPORT_P_H__
--- a/pbsse.c
+++ b/pbsse.c
@ -45,6 +45,7 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 	uint32_t pmin = __rv__umin8(p1, p2);
 	uint32_t pmax = __rv__umax8(p1, p2);
 	uint32_t pd = __rv__sub8(pmax, pmin);
+#if 0
 	uint64_t qs = __rv__umul8(pd, pd);
 	/* uint32_t psl = (uint32_t)qs; */
 	/* uint32_t psh = (uint32_t)(qs >> 32); */
@ -56,12 +57,16 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 	s += ((qs >> 16) & 0x000000000000FFFFull);
 	s += ((qs >> 32) & 0x000000000000FFFFull);
 	s += ((qs >> 48) & 0x000000000000FFFFull);
+#else
+	s = __rv__umaqa(pd, pd, s);
+#endif
 	
 	p1 = ((uint32_t*)pix1)[1];
 	p2 = ((uint32_t*)pix2)[1];
 	pmin = __rv__umin8(p1, p2);
 	pmax = __rv__umax8(p1, p2);
 	pd = __rv__sub8(pmax, pmin);
+#if 0
 	qs = __rv__umul8(pd, pd);
 	/* psl = (uint32_t)qs; */
 	/* psh = (uint32_t)(qs >> 32); */
@ -73,6 +78,9 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 	s += ((qs >> 16) & 0x000000000000FFFFull);
 	s += ((qs >> 32) & 0x000000000000FFFFull);
 	s += ((qs >> 48) & 0x000000000000FFFFull);
+#else
+	s = __rv__umaqa(pd, pd, s);
+#endif
 	
        pix1 += stride;
        pix2 += stride;
--- a/test_p.c
+++ b/test_p.c
@ -710,6 +710,35 @@ uint64_t __rv__umulx16(const uint32_t rs1, const uint32_t rs2) {
  memcpy(&r, c, 8);
  return r;
 }
+
+
+
+uint64_t __rv__smaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) {
+  /* Reference model: rs3 plus the four signed byte-lane products, modulo 2^32. */
+  int4x8_t a, b;
+  int32_t sum = 0; /* four int-promoted byte products: far too small to overflow */
+  memcpy(a, &rs1, 4);
+  memcpy(b, &rs2, 4);
+  for (int i = 0; i < 4; i++) {
+    sum += a[i] * b[i];
+  }
+  /* Accumulate as unsigned so the add wraps instead of being signed-overflow
+     UB, and return only 32 bits so negatives are not sign-extended to 64. */
+  return (uint32_t)(rs3 + (uint32_t)sum);
+}
+uint64_t __rv__umaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) {
+  /* Reference model: rs3 plus the four unsigned byte-lane products, modulo 2^32. */
+  uint4x8_t x, y;
+  uint4x16_t prod;
+  uint32_t acc = rs3;
+  memcpy(x, &rs1, 4);
+  memcpy(y, &rs2, 4);
+  for (int lane = 0; lane < 4; lane++) {
+    prod[lane] = x[lane] * y[lane];
+    acc += prod[lane];
+  }
+  return acc;
+}
 #endif // __riscv
  
  unsigned int a = 0x01234567;
@ -866,6 +895,9 @@ int main(int argc, char **argv) {
  T2W(__rv__smulx16);
  T2W(__rv__umulx16);
  
+  T3(__rv__smaqa);
+  T3(__rv__umaqa);
+  
  b = 0x0100F004 + index;
  }