aarch64: Reimplement vabd* intrinsics using builtins

This patch reimplements the vabd* intrinsics using RTL builtins.
It's fairly straightforward with new builtins + arm_neon.h changes.

gcc/
	* config/aarch64/aarch64-simd.md (aarch64_<su>abd<mode>_3):
	Rename to...
	(aarch64_<su>abd<mode>): ... This.
	(<sur>sadv16qi): Adjust callsite of the above.
	* config/aarch64/aarch64-simd-builtins.def (sabd, uabd): Define
	builtins.
	* config/aarch64/arm_neon.h (vabd_s8): Reimplement using
	builtin.
	(vabd_s16): Likewise.
	(vabd_s32): Likewise.
	(vabd_u8): Likewise.
	(vabd_u16): Likewise.
	(vabd_u32): Likewise.
	(vabdq_s8): Likewise.
	(vabdq_s16): Likewise.
	(vabdq_s32): Likewise.
	(vabdq_u8): Likewise.
	(vabdq_u16): Likewise.
	(vabdq_u32): Likewise.
This commit is contained in:
Kyrylo Tkachov 2021-01-07 15:09:43 +00:00
parent cab822d4ea
commit 79db5945ad
3 changed files with 18 additions and 74 deletions

View File

@ -149,6 +149,10 @@
BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, NONE)
BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, NONE)
/* Implemented by aarch64_<su>abd<mode>. */
BUILTIN_VDQ_BHSI (BINOP, sabd, 0, NONE)
BUILTIN_VDQ_BHSI (BINOPU, uabd, 0, NONE)
/* Implemented by aarch64_<su>aba<mode>. */
BUILTIN_VDQ_BHSI (TERNOP, saba, 0, NONE)
BUILTIN_VDQ_BHSI (TERNOPU, uaba, 0, NONE)

View File

@ -766,7 +766,7 @@
;; So (ABS:QI (minus:QI 64 -128)) == (ABS:QI (192 or -64 signed)) == 64.
;; Whereas SABD would return 192 (-64 signed) on the above example.
;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead.
(define_insn "aarch64_<su>abd<mode>_3"
(define_insn "aarch64_<su>abd<mode>"
[(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
(minus:VDQ_BHSI
(USMAX:VDQ_BHSI
@ -842,7 +842,7 @@
{
rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
rtx abd = gen_reg_rtx (V16QImode);
emit_insn (gen_aarch64_<sur>abdv16qi_3 (abd, operands[1], operands[2]));
emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2]));
emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
abd, ones));
DONE;

View File

@ -6849,72 +6849,42 @@ __extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_s8 (int8x8_t __a, int8x8_t __b)
{
int8x8_t __result;
__asm__ ("sabd %0.8b, %1.8b, %2.8b"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_sabdv8qi (__a, __b);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_s16 (int16x4_t __a, int16x4_t __b)
{
int16x4_t __result;
__asm__ ("sabd %0.4h, %1.4h, %2.4h"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_sabdv4hi (__a, __b);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_s32 (int32x2_t __a, int32x2_t __b)
{
int32x2_t __result;
__asm__ ("sabd %0.2s, %1.2s, %2.2s"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_sabdv2si (__a, __b);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_u8 (uint8x8_t __a, uint8x8_t __b)
{
uint8x8_t __result;
__asm__ ("uabd %0.8b, %1.8b, %2.8b"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_uabdv8qi_uuu (__a, __b);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_u16 (uint16x4_t __a, uint16x4_t __b)
{
uint16x4_t __result;
__asm__ ("uabd %0.4h, %1.4h, %2.4h"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_uabdv4hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_u32 (uint32x2_t __a, uint32x2_t __b)
{
uint32x2_t __result;
__asm__ ("uabd %0.2s, %1.2s, %2.2s"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_uabdv2si_uuu (__a, __b);
}
__extension__ extern __inline int16x8_t
@ -7065,72 +7035,42 @@ __extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_s8 (int8x16_t __a, int8x16_t __b)
{
int8x16_t __result;
__asm__ ("sabd %0.16b, %1.16b, %2.16b"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_sabdv16qi (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_s16 (int16x8_t __a, int16x8_t __b)
{
int16x8_t __result;
__asm__ ("sabd %0.8h, %1.8h, %2.8h"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_sabdv8hi (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4_t __result;
__asm__ ("sabd %0.4s, %1.4s, %2.4s"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_sabdv4si (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_u8 (uint8x16_t __a, uint8x16_t __b)
{
uint8x16_t __result;
__asm__ ("uabd %0.16b, %1.16b, %2.16b"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_uabdv16qi_uuu (__a, __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_u16 (uint16x8_t __a, uint16x8_t __b)
{
uint16x8_t __result;
__asm__ ("uabd %0.8h, %1.8h, %2.8h"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_uabdv8hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4_t __result;
__asm__ ("uabd %0.4s, %1.4s, %2.4s"
: "=w"(__result)
: "w"(__a), "w"(__b)
: /* No clobbers */);
return __result;
return __builtin_aarch64_uabdv4si_uuu (__a, __b);
}
__extension__ extern __inline int16_t