mirror of
https://github.com/rdolbeau/VexRiscvBPluginGenerator.git
synced 2025-04-18 18:44:42 -04:00
drop the mv instructions in FUN2W; add [us]maqa (requires a non-earlyInjection plugin to meet timing...); use umaqa in sse8
This commit is contained in:
parent
ff1b051f56
commit
eb59ac0630
6 changed files with 79 additions and 9 deletions
5
Makefile
5
Makefile
|
@ -57,7 +57,10 @@ CryptoZknh.scala: gen_plugin data_sha.txt
|
|||
./gen_plugin -n CryptoZknh -i data_sha.txt -I '*' >| $@
|
||||
|
||||
PDataProcess.scala: gen_plugin data_Zpn.txt
|
||||
./gen_plugin -n PDataProcess -i data_Zpn.txt -I '*' >| $@
|
||||
./gen_plugin -n PDataProcess -i data_Zpn.txt -I Zpn >| $@
|
||||
|
||||
PSlowDataProcess.scala: gen_plugin data_Zpn.txt
|
||||
./gen_plugin -n PSlowDataProcess -i data_Zpn.txt -I Zpnslow >| $@
|
||||
|
||||
P64DataProcess.scala: gen_plugin data_Zp64.txt
|
||||
./gen_plugin -w -n P64DataProcess -i data_Zp64.txt -I '*' >| $@
|
||||
|
|
28
data_Zpn.txt
28
data_Zpn.txt
|
@ -90,11 +90,12 @@ I URSUBW URSUBW 0011001----------001-----1110111 pdpiadd32 Zpn
|
|||
I INSBI INSBI 101011000--------000-----1110111 pdpibit Zpn
|
||||
//I MADDR32 MADDR32 1100010----------001-----1110111 pdpimac Zpn
|
||||
//I MSUBR32 MSUBR32 1100011----------001-----1110111 pdpimac Zpn
|
||||
// ternary
|
||||
I SMAQA SMAQA 1100100----------000----01110111 pdpiumul8 Zpnslow
|
||||
I UMAQA UMAQA 1100110----------000----01110111 pdpismul8 Zpnslow
|
||||
// ternary + binary (bit 25)
|
||||
I PBSADx PBSADx 111111-----------000-----1110111 pdpipsad Zpn
|
||||
|
||||
|
||||
|
||||
// binary
|
||||
S ADD8 "fun_add8(input(SRC1), input(SRC2))"
|
||||
S ADD16 "fun_add16(input(SRC1), input(SRC2))"
|
||||
|
@ -148,6 +149,8 @@ S SWAP16 "fun_swap16(input(SRC1))"
|
|||
// ternary
|
||||
S PBSADx "fun_pbsada(input(SRC1), input(SRC2), (input(INSTRUCTION)(25).asUInt === 0) ? U(0, 32 bits).asBits | input(SRC3))"
|
||||
S INSBI "fun_insb(input(SRC1), input(SRC2), input(SRC3))"
|
||||
S SMAQA "fun_smaqa(input(SRC1), input(SRC2), input(SRC3))"
|
||||
S UMAQA "fun_umaqa(input(SRC1), input(SRC2), input(SRC3))"
|
||||
|
||||
P """
|
||||
def fun_add8(rs1: Bits, rs2: Bits) : Bits = {
|
||||
|
@ -556,4 +559,25 @@ P """
|
|||
)
|
||||
r // return value
|
||||
}
|
||||
|
||||
// SMAQA: multiply the four signed byte lanes of rs1 and rs2 pairwise and add
// the four products to rs3. Returns the accumulator resized to 32 bits.
def fun_smaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = {
    // Each signed 8x8 product is widened to 18 bits so that the sum of the
    // four products cannot overflow. The widening must be done while the value
    // is still an SInt: resizing an SInt replicates the sign bit, whereas
    // resizing its Bits view zero-fills and would turn every negative product
    // into a large positive one.
    val h0 = (rs1( 7 downto  0).asSInt * rs2( 7 downto  0).asSInt).resize(18)
    val h1 = (rs1(15 downto  8).asSInt * rs2(15 downto  8).asSInt).resize(18)
    val h2 = (rs1(23 downto 16).asSInt * rs2(23 downto 16).asSInt).resize(18)
    val h3 = (rs1(31 downto 24).asSInt * rs2(31 downto 24).asSInt).resize(18)
    // Signed accumulate: lane products summed first, then added to rs3.
    val r = rs3.asSInt + (h0 + h1 + h2 + h3)

    r.asBits.resize(32) // return value
}
|
||||
// UMAQA: multiply the four unsigned byte lanes of rs1 and rs2 pairwise and add
// the four products to rs3. Returns the accumulator resized to 32 bits.
def fun_umaqa(rs1: Bits, rs2: Bits, rs3: Bits) : Bits = {
    // Unsigned product of one byte lane, zero-extended to 18 bits so that the
    // four lane products can be summed without overflowing.
    def laneProduct(lane: Int) = {
        val hi = 8 * lane + 7
        val lo = 8 * lane
        (rs1(hi downto lo).asUInt * rs2(hi downto lo).asUInt).resize(18)
    }

    val partial = laneProduct(0) + laneProduct(1) + laneProduct(2) + laneProduct(3)
    val acc = rs3.asUInt + partial

    acc.asBits.resize(32) // return value
}
|
||||
"""
|
||||
|
|
|
@ -66,13 +66,10 @@ typedef uint32_t uint_xlen_t;
|
|||
// binary wide (64-bits output in R2n/R2n+1, e.g. smul8 from P)
|
||||
#define FUN2W(NAME, ASNAME) \
|
||||
static inline uint64_t NAME(uint_xlen_t rs1, uint_xlen_t rs2) { \
|
||||
uint32_t r0, r1; \
|
||||
asm (#ASNAME " reg_t5, reg_%2, reg_%3\n" \
|
||||
"mv %0, t5\n" \
|
||||
"mv %1, t6\n" \
|
||||
register uint32_t r0 asm ("t5"), r1 asm ("t6"); \
|
||||
asm (#ASNAME " reg_%0, reg_%2, reg_%3\n" \
|
||||
: "=r" (r0), "=r" (r1) \
|
||||
: "r" (rs1), "r" (rs2) \
|
||||
: "t5", "t6"); \
|
||||
: "r" (rs1), "r" (rs2)); \
|
||||
return ((uint64_t)r0 | (((uint64_t)r1)<<32)); \
|
||||
}
|
||||
|
||||
|
|
|
@ -146,4 +146,10 @@ FUN2W(__rv__smulx16,SMULx16)
|
|||
ASM2MACRO(UMULx16,0xb2000077)
|
||||
FUN2W(__rv__umulx16,UMULx16)
|
||||
|
||||
|
||||
ASM3RMACRO(SMAQA, 0xc8000077)
|
||||
FUN3R(__rv__smaqa, SMAQA)
|
||||
ASM3RMACRO(UMAQA, 0xcc000077)
|
||||
FUN3R(__rv__umaqa, UMAQA)
|
||||
|
||||
#endif // __NEW_INSTRUCTION_SUPPORT_P_H__
|
||||
|
|
8
pbsse.c
8
pbsse.c
|
@ -45,6 +45,7 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
|||
uint32_t pmin = __rv__umin8(p1, p2);
|
||||
uint32_t pmax = __rv__umax8(p1, p2);
|
||||
uint32_t pd = __rv__sub8(pmax, pmin);
|
||||
#if 0
|
||||
uint64_t qs = __rv__umul8(pd, pd);
|
||||
/* uint32_t psl = (uint32_t)qs; */
|
||||
/* uint32_t psh = (uint32_t)(qs >> 32); */
|
||||
|
@ -56,12 +57,16 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
|||
s += ((qs >> 16) & 0x000000000000FFFFull);
|
||||
s += ((qs >> 32) & 0x000000000000FFFFull);
|
||||
s += ((qs >> 48) & 0x000000000000FFFFull);
|
||||
#else
|
||||
s = __rv__umaqa(pd, pd, s);
|
||||
#endif
|
||||
|
||||
p1 = ((uint32_t*)pix1)[1];
|
||||
p2 = ((uint32_t*)pix2)[1];
|
||||
pmin = __rv__umin8(p1, p2);
|
||||
pmax = __rv__umax8(p1, p2);
|
||||
pd = __rv__sub8(pmax, pmin);
|
||||
#if 0
|
||||
qs = __rv__umul8(pd, pd);
|
||||
/* psl = (uint32_t)qs; */
|
||||
/* psh = (uint32_t)(qs >> 32); */
|
||||
|
@ -73,6 +78,9 @@ static inline int sse8_r5vp(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
|||
s += ((qs >> 16) & 0x000000000000FFFFull);
|
||||
s += ((qs >> 32) & 0x000000000000FFFFull);
|
||||
s += ((qs >> 48) & 0x000000000000FFFFull);
|
||||
#else
|
||||
s = __rv__umaqa(pd, pd, s);
|
||||
#endif
|
||||
|
||||
pix1 += stride;
|
||||
pix2 += stride;
|
||||
|
|
32
test_p.c
32
test_p.c
|
@ -710,6 +710,35 @@ uint64_t __rv__umulx16(const uint32_t rs1, const uint32_t rs2) {
|
|||
memcpy(&r, c, 8);
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// C model of SMAQA: multiplies the four signed 8-bit lanes of rs1 and rs2
// pairwise and adds the four products to rs3.
//
// rs1, rs2: packed operands, four signed bytes each.
// rs3:      32-bit accumulator input.
// Returns the 32-bit result, zero-extended into the 64-bit return type.
//
// The accumulation is carried out modulo 2^32 in unsigned arithmetic: adding
// in int32_t would be undefined behaviour when the sum leaves the int32_t
// range, and returning a negative int32_t through uint64_t would sign-extend
// it and set the upper 32 bits of the result.
uint64_t __rv__smaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) {
	int8_t a[4], b[4];
	uint32_t acc = rs3;
	memcpy(a, &rs1, sizeof a);
	memcpy(b, &rs2, sizeof b);
	for (int lane = 0; lane < 4; lane++) {
		// The signed product fits in 16 bits; converting it to uint32_t is a
		// well-defined modular conversion, so negative products subtract.
		acc += (uint32_t)((int32_t)a[lane] * (int32_t)b[lane]);
	}
	return acc;
}
|
||||
// C model of UMAQA: multiplies the four unsigned 8-bit lanes of rs1 and rs2
// pairwise and adds the four products to rs3.
//
// rs1, rs2: packed operands, four unsigned bytes each.
// rs3:      32-bit accumulator input.
// Returns the 32-bit sum (wrapping modulo 2^32), zero-extended to 64 bits.
uint64_t __rv__umaqa(const uint32_t rs1, const uint32_t rs2, const uint32_t rs3) {
	uint8_t lhs[4], rhs[4];
	uint32_t acc = rs3;
	memcpy(lhs, &rs1, sizeof lhs);
	memcpy(rhs, &rs2, sizeof rhs);
	for (int lane = 0; lane < 4; lane++) {
		// Each product is at most 255 * 255, so it fits in 16 bits.
		acc += (uint32_t)lhs[lane] * (uint32_t)rhs[lane];
	}
	return acc;
}
|
||||
#endif // __riscv
|
||||
|
||||
unsigned int a = 0x01234567;
|
||||
|
@ -866,6 +895,9 @@ int main(int argc, char **argv) {
|
|||
T2W(__rv__smulx16);
|
||||
T2W(__rv__umulx16);
|
||||
|
||||
T3(__rv__smaqa);
|
||||
T3(__rv__umaqa);
|
||||
|
||||
b = 0x0100F004 + index;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue