From eb59ac063069bf455be9610b6db47a1961ef005e Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain.dolbeau@european-processor-initiative.eu>
Date: Sat, 13 Feb 2021 08:30:29 -0500
Subject: [PATCH] drop the mvs in FUN2W ; add [us]maqa (requires a
 non-earlyInjection plugin to meet timing...) ; use umaqa in sse8

---
 Makefile                     |  5 ++++-
 data_Zpn.txt                 | 28 ++++++++++++++++++++++++++--
 new_instructions_support.h   |  9 +++------
 new_instructions_support_p.h |  6 ++++++
 pbsse.c                      |  8 ++++++++
 test_p.c                     | 32 ++++++++++++++++++++++++++++++++
 6 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index 17b871e..3c9a4ed 100644
--- a/Makefile
+++ b/Makefile
@@ -57,7 +57,10 @@ CryptoZknh.scala: gen_plugin data_sha.txt
 	./gen_plugin -n CryptoZknh -i data_sha.txt -I '*' >| $@
 
 PDataProcess.scala: gen_plugin data_Zpn.txt
-	./gen_plugin -n PDataProcess -i data_Zpn.txt -I '*' >| $@
+	./gen_plugin -n PDataProcess -i data_Zpn.txt -I Zpn >| $@
+
+PSlowDataProcess.scala: gen_plugin data_Zpn.txt
+	./gen_plugin -n PSlowDataProcess -i data_Zpn.txt -I Zpnslow >| $@
 
 P64DataProcess.scala: gen_plugin data_Zp64.txt
 	./gen_plugin -w -n P64DataProcess -i data_Zp64.txt -I '*' >| $@
diff --git a/data_Zpn.txt b/data_Zpn.txt
index 34396bf..1a202fc 100644
--- a/data_Zpn.txt
+++ b/data_Zpn.txt
@@ -90,11 +90,12 @@ I	URSUBW	URSUBW	0011001----------001-----1110111	pdpiadd32	Zpn
 I	INSBI	INSBI	101011000--------000-----1110111	pdpibit	Zpn
 //I	MADDR32	MADDR32	1100010----------001-----1110111	pdpimac	Zpn
 //I	MSUBR32	MSUBR32	1100011----------001-----1110111	pdpimac	Zpn
+// ternary
+I	SMAQA   SMAQA	1100100----------000----01110111	pdpiumul8	Zpnslow
+I	UMAQA   UMAQA	1100110----------000----01110111	pdpismul8	Zpnslow
 // ternary + binary (bit 25)
 I	PBSADx	PBSADx	111111-----------000-----1110111	pdpipsad	Zpn
 
-
-
 // binary
 S	ADD8	"fun_add8(input(SRC1), input(SRC2))"
 S	ADD16	"fun_add16(input(SRC1), input(SRC2))"
@@ -148,6 +149,8 @@ S	SWAP16	"fun_swap16(input(SRC1))"
 // ternary
 S	PBSADx	"fun_pbsada(input(SRC1), input(SRC2), (input(INSTRUCTION)(25).asUInt === 0) ? U(0, 32 bits).asBits | input(SRC3))"
 S	INSBI	"fun_insb(input(SRC1), input(SRC2), input(SRC3))"
+S	SMAQA	"fun_smaqa(input(SRC1), input(SRC2), input(SRC3))"
+S	UMAQA	"fun_umaqa(input(SRC1), input(SRC2), input(SRC3))"
 
 P	"""
 	def fun_add8(rs1: Bits, rs2: Bits) : Bits = {
@@ -556,4 +559,25 @@ P	"""
 	    )
 	    r // return value
 	}
+
+	def fun_smaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = {
+	// Signed multiply-accumulate: returns rs3 + the sum of the four signed byte-wise products of rs1 and rs2.
+	// Products are widened to 18 bits *as SInt* (sign extension) so the sum can't overflow; Bits.resize would zero-pad.
+	    val h0 = (rs1( 7 downto  0).asSInt * rs2( 7 downto  0).asSInt).resize(18)
+	    val h1 = (rs1(15 downto  8).asSInt * rs2(15 downto  8).asSInt).resize(18)
+	    val h2 = (rs1(23 downto 16).asSInt * rs2(23 downto 16).asSInt).resize(18)
+	    val h3 = (rs1(31 downto 24).asSInt * rs2(31 downto 24).asSInt).resize(18)
+	    val r = rs3.asSInt + (h0 + h1 + h2 + h3)
+	    r.asBits.resize(32) // return value
+	}
+	def fun_umaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = {
+	// Unsigned multiply-accumulate: returns rs3 + the sum of the four unsigned byte-wise products of rs1 and rs2.
+	// Each 16-bit product is zero-extended to 18 bits so that adding the four of them cannot overflow.
+	    val p0 = (rs1( 7 downto  0).asUInt * rs2( 7 downto  0).asUInt).resize(18)
+	    val p1 = (rs1(15 downto  8).asUInt * rs2(15 downto  8).asUInt).resize(18)
+	    val p2 = (rs1(23 downto 16).asUInt * rs2(23 downto 16).asUInt).resize(18)
+	    val p3 = (rs1(31 downto 24).asUInt * rs2(31 downto 24).asUInt).resize(18)
+	    val acc = rs3.asUInt + (p0 + p1 + p2 + p3)
+	    acc.asBits.resize(32) // return value
+	}
 """
diff --git a/new_instructions_support.h b/new_instructions_support.h
index f81aed6..30fb7ea 100644
--- a/new_instructions_support.h
+++ b/new_instructions_support.h
@@ -66,13 +66,10 @@ typedef uint32_t uint_xlen_t;
 // binary wide (64-bits output in R2n/R2n+1, e.g. smul8 from P)
 #define FUN2W(NAME, ASNAME)						\
   static inline uint64_t NAME(uint_xlen_t rs1, uint_xlen_t rs2) {	\
-    uint32_t r0, r1;							\
-    asm (#ASNAME " reg_t5, reg_%2, reg_%3\n"				\
-	 "mv %0, t5\n"							\
-	 "mv %1, t6\n"							\
+    register uint32_t r0 asm ("t5"), r1 asm ("t6");			\
+    asm (#ASNAME " reg_%0, reg_%2, reg_%3\n"				\
 	 : "=r" (r0), "=r" (r1)						\
-	 : "r" (rs1), "r" (rs2)						\
-	 : "t5", "t6");							\
+	 : "r" (rs1), "r" (rs2));					\
     return ((uint64_t)r0 | (((uint64_t)r1)<<32));			\
   }
 
diff --git a/new_instructions_support_p.h b/new_instructions_support_p.h
index 60ddbbc..99507d5 100644
--- a/new_instructions_support_p.h
+++ b/new_instructions_support_p.h
@@ -146,4 +146,10 @@ FUN2W(__rv__smulx16,SMULx16)
 ASM2MACRO(UMULx16,0xb2000077)
 FUN2W(__rv__umulx16,UMULx16)
 
+  
+ASM3RMACRO(SMAQA, 0xc8000077)
+FUN3R(__rv__smaqa, SMAQA)
+ASM3RMACRO(UMAQA, 0xcc000077)
+FUN3R(__rv__umaqa, UMAQA)
+
 #endif // __NEW_INSTRUCTION_SUPPORT_P_H__
diff --git a/pbsse.c b/pbsse.c
index 9404631..199c492 100644
--- a/pbsse.c
+++ b/pbsse.c
@@ -45,6 +45,7 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 	uint32_t pmin = __rv__umin8(p1, p2);
 	uint32_t pmax = __rv__umax8(p1, p2);
 	uint32_t pd = __rv__sub8(pmax, pmin);
+#if 0
 	uint64_t qs = __rv__umul8(pd, pd);
 	/* uint32_t psl = (uint32_t)qs; */
 	/* uint32_t psh = (uint32_t)(qs >> 32); */
@@ -56,12 +57,16 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 	s += ((qs >> 16) & 0x000000000000FFFFull);
 	s += ((qs >> 32) & 0x000000000000FFFFull);
 	s += ((qs >> 48) & 0x000000000000FFFFull);
+#else
+	s = __rv__umaqa(pd, pd, s);
+#endif
 	
 	p1 = ((uint32_t*)pix1)[1];
 	p2 = ((uint32_t*)pix2)[1];
 	pmin = __rv__umin8(p1, p2);
 	pmax = __rv__umax8(p1, p2);
 	pd = __rv__sub8(pmax, pmin);
+#if 0
 	qs = __rv__umul8(pd, pd);
 	/* psl = (uint32_t)qs; */
 	/* psh = (uint32_t)(qs >> 32); */
@@ -73,6 +78,9 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 	s += ((qs >> 16) & 0x000000000000FFFFull);
 	s += ((qs >> 32) & 0x000000000000FFFFull);
 	s += ((qs >> 48) & 0x000000000000FFFFull);
+#else
+	s = __rv__umaqa(pd, pd, s);
+#endif
 	
         pix1 += stride;
         pix2 += stride;
diff --git a/test_p.c b/test_p.c
index 2f7895b..38d0e76 100644
--- a/test_p.c
+++ b/test_p.c
@@ -710,6 +710,35 @@ uint64_t __rv__umulx16(const uint32_t rs1, const uint32_t rs2) {
   memcpy(&r, c, 8);
   return r;
 }
+
+
+
+uint64_t __rv__smaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) { /* software model of SMAQA: rs3 + sum of the 4 signed byte products */
+  int4x8_t a, b; /* byte lanes of rs1 / rs2 (presumably int8_t[4] -- typedef is outside this hunk) */
+  int4x16_t c;   /* per-lane products (presumably int16_t[4]) */
+  uint32_t r;    /* unsigned accumulator: wraps modulo 2^32 instead of hitting signed-overflow UB */
+  memcpy(a, &rs1, 4);
+  memcpy(b, &rs2, 4);
+  c[0] = a[0] * b[0];
+  c[1] = a[1] * b[1];
+  c[2] = a[2] * b[2];
+  c[3] = a[3] * b[3];
+  r = rs3 + (uint32_t)c[0] + (uint32_t)c[1] + (uint32_t)c[2] + (uint32_t)c[3]; /* casts keep the two's-complement value of negative products */
+  return r; /* 32-bit result, zero-extended like __rv__umaqa (an int32_t here would sign-extend into the upper half) */
+}
+uint64_t __rv__umaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) {
+  /* Software model of UMAQA: rs3 plus the sum of the four unsigned byte-wise products of rs1 and rs2. */
+  uint4x8_t x, y;
+  uint4x16_t prod;
+  uint32_t acc = rs3;
+  memcpy(x, &rs1, 4);
+  memcpy(y, &rs2, 4);
+  for (int lane = 0; lane < 4; lane++) {
+    prod[lane] = x[lane] * y[lane];
+    acc += prod[lane];
+  }
+  return acc;
+}
 #endif // __riscv
   
   unsigned int a = 0x01234567;
@@ -866,6 +895,9 @@ int main(int argc, char **argv) {
   T2W(__rv__smulx16);
   T2W(__rv__umulx16);
   
+  T3(__rv__smaqa);
+  T3(__rv__umaqa);
+  
   b = 0x0100F004 + index;
   }