From 98f51a963228981f78efead17f2f8ee295e9d5a4 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain.dolbeau@european-processor-initiative.eu>
Date: Sat, 20 Mar 2021 06:31:21 -0400
Subject: [PATCH] add and check some saturating instructions (w/o the CSR
 update)

---
 data_Zpn.txt                 | 57 +++++++++++++++++++++++++++++++++++
 new_instructions_support_p.h | 11 +++++++
 test_p.c                     | 58 +++++++++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/data_Zpn.txt b/data_Zpn.txt
index 4dd301f..f936d62 100644
--- a/data_Zpn.txt
+++ b/data_Zpn.txt
@@ -101,6 +101,12 @@ I	UMAQA   UMAQA	1100110----------000----01110111	pdpismul8	Zpnslow
 // ternary + binary (bit 25)
 I	PBSADx	PBSADx	111111-----------000-----1110111	pdpipsad	Zpn
 
+// Saturating instructions; the CSR update on saturation is not implemented yet
+I  KADD8	KADD8	0001100----------000-----1110111	pdpikadd8	Zpn
+I  UKADD8	UKADD8	0011100----------000-----1110111	pdpikadd8	Zpn
+I  KSUB8	KSUB8	0001101----------000-----1110111	pdpikadd8	Zpn
+I  UKSUB8	UKSUB8	0011101----------000-----1110111	pdpikadd8	Zpn
+
 // binary
 S	ADD8	"fun_add8(input(SRC1), input(SRC2))"
 S	ADD16	"fun_add16(input(SRC1), input(SRC2))"
@@ -159,6 +165,12 @@ S	INSBI	"fun_insb(input(SRC1), input(SRC2), input(SRC3))"
 S	SMAQA	"fun_smaqa(input(SRC1), input(SRC2), input(SRC3))"
 S	UMAQA	"fun_umaqa(input(SRC1), input(SRC2), input(SRC3))"
 
+// Saturating instructions; FIXME: the CSR update is still missing
+S  KADD8	"fun_kadd8(input(SRC1), input(SRC2))"
+S  UKADD8	"fun_ukadd8(input(SRC1), input(SRC2))"
+S  KSUB8	"fun_ksub8(input(SRC1), input(SRC2))"
+S  UKSUB8	"fun_uksub8(input(SRC1), input(SRC2))"
+
 P	"""
 	def fun_add8(rs1: Bits, rs2: Bits) : Bits = {
 	    val b0 = (rs1( 7 downto  0).asUInt + rs2( 7 downto  0).asUInt).asBits.resize(8)
@@ -607,4 +619,49 @@ P	"""
 		)
 		r // return value
 	}
+
+// saturating, the CSR update is missing
+// it seems sat() (and its shortcuts +| and -|) in SpinalHDL don't do what I need
+// for unsigned subtraction (no way to tell the difference between overflow
+// and underflow unless going signed, I think)
+   def fun_satsub8u(a: Bits, b: Bits) : Bits = {
+   	   val s = (B"1'b0" ## a).asSInt -^ (B"1'b0" ## b).asSInt // prepend a 0 bit so both operands are non-negative as SInt; -^ widens, giving 10 bits for 8-bit inputs
+	   // bit 9 (sign) set -> the difference is negative, clamp to 0x00; else bit 8 set -> the value does not fit in 8 bits, clamp to 0xFF; otherwise keep the low byte
+	   val r = ((s(9).asUInt === 1) ? (B"8'x00") | ((s(8).asUInt === 1) ? (B"8'xFF") | (s(7 downto 0).asBits)))
+	   
+	   r // return value: the clamped 8-bit difference
+   }
+
+	def fun_kadd8(rs1: Bits, rs2: Bits) : Bits = { // KADD8: saturating signed add (+|) applied to each of the four bytes of rs1/rs2
+	    val b0 = (rs1( 7 downto  0).asSInt +| rs2( 7 downto  0).asSInt).asBits.resize(8)
+	    val b1 = (rs1(15 downto  8).asSInt +| rs2(15 downto  8).asSInt).asBits.resize(8)
+	    val b2 = (rs1(23 downto 16).asSInt +| rs2(23 downto 16).asSInt).asBits.resize(8)
+	    val b3 = (rs1(31 downto 24).asSInt +| rs2(31 downto 24).asSInt).asBits.resize(8)
+
+	    b3 ## b2 ## b1 ## b0 // return value: the four byte results concatenated, b3 on the left
+	}
+	def fun_ukadd8(rs1: Bits, rs2: Bits) : Bits = { // UKADD8: saturating unsigned add (+|) applied to each of the four bytes of rs1/rs2
+	    val b0 = (rs1( 7 downto  0).asUInt +| rs2( 7 downto  0).asUInt).asBits.resize(8)
+	    val b1 = (rs1(15 downto  8).asUInt +| rs2(15 downto  8).asUInt).asBits.resize(8)
+	    val b2 = (rs1(23 downto 16).asUInt +| rs2(23 downto 16).asUInt).asBits.resize(8)
+	    val b3 = (rs1(31 downto 24).asUInt +| rs2(31 downto 24).asUInt).asBits.resize(8)
+
+	    b3 ## b2 ## b1 ## b0 // return value: the four byte results concatenated, b3 on the left
+	}
+	def fun_ksub8(rs1: Bits, rs2: Bits) : Bits = { // KSUB8: saturating signed subtract (-|) applied to each of the four bytes of rs1/rs2
+	    val b0 = (rs1( 7 downto  0).asSInt -| rs2( 7 downto  0).asSInt).asBits.resize(8)
+	    val b1 = (rs1(15 downto  8).asSInt -| rs2(15 downto  8).asSInt).asBits.resize(8)
+	    val b2 = (rs1(23 downto 16).asSInt -| rs2(23 downto 16).asSInt).asBits.resize(8)
+	    val b3 = (rs1(31 downto 24).asSInt -| rs2(31 downto 24).asSInt).asBits.resize(8)
+
+	    b3 ## b2 ## b1 ## b0 // return value: the four byte results concatenated, b3 on the left
+	}
+	def fun_uksub8(rs1: Bits, rs2: Bits) : Bits = { // UKSUB8: unsigned subtract per byte, clamped by fun_satsub8u instead of the -| operator
+		val b0 = fun_satsub8u(rs1( 7 downto  0), rs2( 7 downto  0)).asBits
+		val b1 = fun_satsub8u(rs1(15 downto  8), rs2(15 downto  8)).asBits
+		val b2 = fun_satsub8u(rs1(23 downto 16), rs2(23 downto 16)).asBits
+		val b3 = fun_satsub8u(rs1(31 downto 24), rs2(31 downto 24)).asBits
+
+	    b3 ## b2 ## b1 ## b0 // return value: the four byte results concatenated, b3 on the left
+	}
 """
diff --git a/new_instructions_support_p.h b/new_instructions_support_p.h
index 7abcdd3..5a78f5b 100644
--- a/new_instructions_support_p.h
+++ b/new_instructions_support_p.h
@@ -179,4 +179,15 @@ FUN1(__rv__sunpkd831, SUNPKD831)
 ASM1MACRO(SUNPKD832, 0xad300077)
 FUN1(__rv__sunpkd832, SUNPKD832)
 
+// 8-bit saturating add/subtract; each constant is the matching data_Zpn.txt bit pattern with the don't-care bits set to 0
+ASM2MACRO(KADD8,0x18000077)
+FUN2(__rv__kadd8,KADD8)
+ASM2MACRO(UKADD8,0x38000077)
+FUN2(__rv__ukadd8,UKADD8)
+ASM2MACRO(KSUB8,0x1a000077)
+FUN2(__rv__ksub8,KSUB8)
+ASM2MACRO(UKSUB8,0x3a000077)
+FUN2(__rv__uksub8,UKSUB8)
+
+
 #endif // __NEW_INSTRUCTION_SUPPORT_P_H__
diff --git a/test_p.c b/test_p.c
index e928333..63efc5d 100644
--- a/test_p.c
+++ b/test_p.c
@@ -783,6 +783,57 @@ GEN_ZUNPKD8(3,0)
 GEN_ZUNPKD8(3,1)
 GEN_ZUNPKD8(3,2)
 
+#define SATs8(x) ((x) > 127 ? 127 : (x) < -128 ? -128 : (x)) // clamp x to the signed 8-bit range [-128, 127]; x is expanded up to three times, so pass side-effect-free expressions
+#define SATu8(x) ((x) > 255 ? 255 : (x) < 0 ? 0 : (x)) // clamp x to the unsigned 8-bit range [0, 255]; x is likewise expanded more than once
+uint32_t __rv__kadd8(const uint32_t rs1, const uint32_t rs2) { // C version of KADD8: adds the four lanes of rs1 and rs2, clamping each sum with SATs8
+	int4x8_t a, b, c; // int4x8_t is declared elsewhere; presumably int8_t[4] -- TODO confirm
+	int32_t r;
+	memcpy(a, &rs1, 4); // copy the 4 operand bytes into the lane arrays
+	memcpy(b, &rs2, 4);
+	c[0] = SATs8((int32_t)a[0] + (int32_t)b[0]); // sum computed as int32_t, then clamped to [-128, 127]
+	c[1] = SATs8((int32_t)a[1] + (int32_t)b[1]);
+	c[2] = SATs8((int32_t)a[2] + (int32_t)b[2]);
+	c[3] = SATs8((int32_t)a[3] + (int32_t)b[3]);
+	memcpy(&r, c, 4); // repack the four result lanes into one 32-bit word
+	return r; // int32_t value converted to the uint32_t return type
+}
+uint32_t __rv__ukadd8(const uint32_t rs1, const uint32_t rs2) { // C version of UKADD8: adds the four lanes of rs1 and rs2, clamping each sum with SATu8
+	uint4x8_t a, b, c; // uint4x8_t is declared elsewhere; presumably uint8_t[4] -- TODO confirm
+	uint32_t r;
+	memcpy(a, &rs1, 4); // copy the 4 operand bytes into the lane arrays
+	memcpy(b, &rs2, 4);
+	c[0] = SATu8((int32_t)a[0] + (int32_t)b[0]); // sum computed as int32_t, then clamped to [0, 255]
+	c[1] = SATu8((int32_t)a[1] + (int32_t)b[1]);
+	c[2] = SATu8((int32_t)a[2] + (int32_t)b[2]);
+	c[3] = SATu8((int32_t)a[3] + (int32_t)b[3]);
+	memcpy(&r, c, 4); // repack the four result lanes into one 32-bit word
+	return r;
+}
+uint32_t __rv__ksub8(const uint32_t rs1, const uint32_t rs2) { // C version of KSUB8: subtracts the lanes of rs2 from those of rs1, clamping each difference with SATs8
+	int4x8_t a, b, c; // int4x8_t is declared elsewhere; presumably int8_t[4] -- TODO confirm
+	int32_t r;
+	memcpy(a, &rs1, 4); // copy the 4 operand bytes into the lane arrays
+	memcpy(b, &rs2, 4);
+	c[0] = SATs8((int32_t)a[0] - (int32_t)b[0]); // difference computed as int32_t, then clamped to [-128, 127]
+	c[1] = SATs8((int32_t)a[1] - (int32_t)b[1]);
+	c[2] = SATs8((int32_t)a[2] - (int32_t)b[2]);
+	c[3] = SATs8((int32_t)a[3] - (int32_t)b[3]);
+	memcpy(&r, c, 4); // repack the four result lanes into one 32-bit word
+	return r; // int32_t value converted to the uint32_t return type
+}
+uint32_t __rv__uksub8(const uint32_t rs1, const uint32_t rs2) { // C version of UKSUB8: subtracts the lanes of rs2 from those of rs1, clamping each difference with SATu8
+	uint4x8_t a, b, c; // uint4x8_t is declared elsewhere; presumably uint8_t[4] -- TODO confirm
+	uint32_t r;
+	memcpy(a, &rs1, 4); // copy the 4 operand bytes into the lane arrays
+	memcpy(b, &rs2, 4);
+	c[0] = SATu8((int32_t)a[0] - (int32_t)b[0]); // difference computed as int32_t so a negative result is visible, then clamped to [0, 255]
+	c[1] = SATu8((int32_t)a[1] - (int32_t)b[1]);
+	c[2] = SATu8((int32_t)a[2] - (int32_t)b[2]);
+	c[3] = SATu8((int32_t)a[3] - (int32_t)b[3]);
+	memcpy(&r, c, 4); // repack the four result lanes into one 32-bit word
+	return r;
+}
+
 #endif // __riscv
   
 unsigned int a = 0x01234567;
@@ -823,7 +874,7 @@ int main(int argc, char **argv) {
 		T1(__rv__swap16);
 		
 		for (index1 = 0 ; index1 < nonrandom_cnt[1] ; index1++) {
-			b = nonrandom_b[index];
+			b = nonrandom_b[index1];
 #if 1
 			T2(__rv__add8);
 			T2(__rv__radd8);
@@ -891,6 +942,11 @@ int main(int argc, char **argv) {
 			T2W(__rv__umulx8);
 			T2W(__rv__smulx16);
 			T2W(__rv__umulx16);
+
+			T2(__rv__kadd8);
+			T2(__rv__ukadd8);
+			T2(__rv__ksub8);
+			T2(__rv__uksub8);
   
 			for (index2 = 0 ; index2 < nonrandom_cnt[2] ; index2++) {
 				d = nonrandom_d[index2];