Arm: Add NEON and MVE complex mul, mla and mls patterns.
This adds implementation for the optabs for complex operations. With this the following C code: void g (float complex a[restrict N], float complex b[restrict N], float complex c[restrict N]) { for (int i=0; i < N; i++) c[i] = a[i] * b[i]; } generates NEON: g: vmov.f32 q11, #0.0 @ v4sf add r3, r2, #1600 .L2: vmov q8, q11 @ v4sf vld1.32 {q10}, [r1]! vld1.32 {q9}, [r0]! vcmla.f32 q8, q9, q10, #0 vcmla.f32 q8, q9, q10, #90 vst1.32 {q8}, [r2]! cmp r3, r2 bne .L2 bx lr MVE: g: push {lr} mov lr, #100 dls lr, lr .L2: vldrw.32 q1, [r1], #16 vldrw.32 q2, [r0], #16 vcmul.f32 q3, q2, q1, #0 vcmla.f32 q3, q2, q1, #90 vstrw.32 q3, [r2], #16 le lr, .L2 ldr pc, [sp], #4 instead of g: add r3, r2, #1600 .L2: vld2.32 {d20-d23}, [r0]! vld2.32 {d16-d19}, [r1]! vmul.f32 q14, q11, q9 vmul.f32 q15, q11, q8 vneg.f32 q14, q14 vfma.f32 q15, q10, q9 vfma.f32 q14, q10, q8 vmov q13, q15 @ v4sf vmov q12, q14 @ v4sf vst2.32 {d24-d27}, [r2]! cmp r3, r2 bne .L2 bx lr and g: add r3, r2, #1600 .L2: vld2.32 {d20-d23}, [r0]! vld2.32 {d16-d19}, [r1]! vmul.f32 q15, q10, q8 vmul.f32 q14, q10, q9 vmls.f32 q15, q11, q9 vmla.f32 q14, q11, q8 vmov q12, q15 @ v4sf vmov q13, q14 @ v4sf vst2.32 {d24-d27}, [r2]! cmp r3, r2 bne .L2 bx lr respectively. gcc/ChangeLog: * config/arm/iterators.md (rotsplit1, rotsplit2, conj_op, fcmac1, VCMLA_OP, VCMUL_OP): New. * config/arm/mve.md (mve_vcmlaq<mve_rot><mode>): Support vec_dup 0. * config/arm/neon.md (cmul<conj_op><mode>3): New. * config/arm/unspecs.md (UNSPEC_VCMLA_CONJ, UNSPEC_VCMLA180_CONJ, UNSPEC_VCMUL_CONJ): New. * config/arm/vec-common.md (cmul<conj_op><mode>3, arm_vcmla<rot><mode>, cml<fcmac1><conj_op><mode>4): New.
This commit is contained in:
parent
02551aa999
commit
389b67feac
@ -1186,6 +1186,33 @@
|
||||
(UNSPEC_VCMLA180 "180")
|
||||
(UNSPEC_VCMLA270 "270")])
|
||||
|
||||
;; A complete complex operation on a vector of complex numbers takes two
|
||||
;; instructions, each applying a different rotation.  E.g. a complex
|
||||
;; multiplication is emitted as two rotated steps.
|
||||
;;
|
||||
;; The rotation for each step is looked up in rotsplit1 (first instruction)
|
||||
;; and rotsplit2 (second instruction).  As an example, a complex multiply
;; (UNSPEC_VCMUL) uses #0 for the first instruction and #90 for the second.
|
||||
(define_int_attr rotsplit1 [(UNSPEC_VCMLA "0")
|
||||
(UNSPEC_VCMLA_CONJ "0")
|
||||
(UNSPEC_VCMUL "0")
|
||||
(UNSPEC_VCMUL_CONJ "0")
|
||||
(UNSPEC_VCMLA180 "180")
|
||||
(UNSPEC_VCMLA180_CONJ "180")])
|
||||
|
||||
;; Rotation used by the second instruction of each two-instruction complex
;; sequence.  The plain multiply and multiply-accumulate forms use 90 and
;; their conjugate forms 270; the VCMLA180 form uses 270 and its conjugate
;; form 90.
(define_int_attr rotsplit2 [(UNSPEC_VCMLA "90")
|
||||
(UNSPEC_VCMLA_CONJ "270")
|
||||
(UNSPEC_VCMUL "90")
|
||||
(UNSPEC_VCMUL_CONJ "270")
|
||||
(UNSPEC_VCMLA180 "270")
|
||||
(UNSPEC_VCMLA180_CONJ "90")])
|
||||
|
||||
;; Pattern-name suffix that marks the conjugate variants: "_conj" for the
;; *_CONJ unspecs and the empty string for the others.  It is substituted
;; into pattern names such as cmul<conj_op><mode>3.
(define_int_attr conj_op [(UNSPEC_VCMLA180 "")
|
||||
(UNSPEC_VCMLA180_CONJ "_conj")
|
||||
(UNSPEC_VCMLA "")
|
||||
(UNSPEC_VCMLA_CONJ "_conj")
|
||||
(UNSPEC_VCMUL "")
|
||||
(UNSPEC_VCMUL_CONJ "_conj")])
|
||||
|
||||
(define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
|
||||
(UNSPEC_VCADD270 "_rot270")
|
||||
(UNSPEC_VCMLA "")
|
||||
@ -1200,6 +1227,9 @@
|
||||
(define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90
|
||||
UNSPEC_VCMUL180 UNSPEC_VCMUL270])
|
||||
|
||||
;; Letter that completes the pattern name cml<fcmac1><conj_op><mode>4:
;; "a" for the UNSPEC_VCMLA-based unspecs (giving cmla...) and "s" for the
;; UNSPEC_VCMLA180-based ones (giving cmls...).
(define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA_CONJ "a")
|
||||
(UNSPEC_VCMLA180 "s") (UNSPEC_VCMLA180_CONJ "s")])
|
||||
|
||||
(define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8")
|
||||
(UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8")
|
||||
(UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8 "uhsub8")
|
||||
@ -1723,3 +1753,13 @@
|
||||
(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
|
||||
(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
|
||||
(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
|
||||
|
||||
;; Unspecs iterated over by the two-instruction complex multiply-accumulate
;; expander cml<fcmac1><conj_op><mode>4: the plain and VCMLA180 forms, each
;; with and without conjugation.
|
||||
(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
|
||||
UNSPEC_VCMLA_CONJ
|
||||
UNSPEC_VCMLA180
|
||||
UNSPEC_VCMLA180_CONJ])
|
||||
|
||||
;; Unspecs iterated over by the complex multiply expanders
;; cmul<conj_op><mode>3, with and without conjugation.  Those expanders
;; implement the multiply as two VCMLA steps.
|
||||
(define_int_iterator VCMUL_OP [UNSPEC_VCMUL
|
||||
UNSPEC_VCMUL_CONJ])
|
||||
|
@ -4101,15 +4101,16 @@
|
||||
(define_insn "mve_vcmlaq<mve_rot><mode>"
|
||||
[
|
||||
(set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
|
||||
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz")
|
||||
(match_operand:MVE_0 2 "s_register_operand" "w,w")
|
||||
(match_operand:MVE_0 3 "s_register_operand" "w,w")]
|
||||
VCMLA))
|
||||
(plus:MVE_0 (match_operand:MVE_0 1 "reg_or_zero_operand" "Dz,0")
|
||||
(unspec:MVE_0
|
||||
[(match_operand:MVE_0 2 "s_register_operand" "w,w")
|
||||
(match_operand:MVE_0 3 "s_register_operand" "w,w")]
|
||||
VCMLA)))
|
||||
]
|
||||
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
|
||||
"@
|
||||
vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
|
||||
vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
|
||||
vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
|
||||
vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
|
||||
[(set_attr "type" "mve_move")
|
||||
])
|
||||
|
||||
|
@ -2952,6 +2952,25 @@
|
||||
[(set_attr "type" "neon_fcmla")]
|
||||
)
|
||||
|
||||
;; Complex multiply for the VDF modes.  It always expands to two instructions:
|
||||
;; the first accumulates the <rotsplit1>-rotated product onto a zeroed
|
||||
;; register (giving res1), and the second accumulates the <rotsplit2>-rotated
;; product onto res1 and writes operand 0.  Because two instructions are
;; always needed, expand early.
;; Note that the multiplicands are passed to the VCMLA generators in the
;; order (operands[2], operands[1]).
|
||||
(define_expand "cmul<conj_op><mode>3"
|
||||
[(set (match_operand:VDF 0 "register_operand")
|
||||
(unspec:VDF [(match_operand:VDF 1 "register_operand")
|
||||
(match_operand:VDF 2 "register_operand")]
|
||||
VCMUL_OP))]
|
||||
"TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
|
||||
{
|
||||
rtx res1 = gen_reg_rtx (<MODE>mode);
|
||||
rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
|
||||
emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp,
|
||||
operands[2], operands[1]));
|
||||
emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1,
|
||||
operands[2], operands[1]));
|
||||
DONE;
|
||||
})
|
||||
|
||||
|
||||
;; These instructions map to the __builtins for the Dot Product operations.
|
||||
(define_insn "neon_<sup>dot<vsi2qi>"
|
||||
|
@ -510,10 +510,13 @@
|
||||
UNSPEC_VCMLA90
|
||||
UNSPEC_VCMLA180
|
||||
UNSPEC_VCMLA270
|
||||
UNSPEC_VCMLA_CONJ
|
||||
UNSPEC_VCMLA180_CONJ
|
||||
UNSPEC_VCMUL
|
||||
UNSPEC_VCMUL90
|
||||
UNSPEC_VCMUL180
|
||||
UNSPEC_VCMUL270
|
||||
UNSPEC_VCMUL_CONJ
|
||||
UNSPEC_MATMUL_S
|
||||
UNSPEC_MATMUL_U
|
||||
UNSPEC_MATMUL_US
|
||||
|
@ -215,6 +215,63 @@
|
||||
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
|
||||
)
|
||||
|
||||
;; Complex multiply for the VQ_HSF modes.  It always expands to two
|
||||
;; instructions: the first accumulates the <rotsplit1>-rotated product onto
|
||||
;; zero (giving res1), and the second accumulates the <rotsplit2>-rotated
;; product onto res1 and writes operand 0.  Because two instructions are
;; always needed, expand early.
;; When TARGET_COMPLEX holds, the zero vector is first forced into a
;; register; otherwise the zero constant itself is passed as the accumulator.
;; NOTE(review): presumably the constant is passed directly so that an MVE
;; insn alternative accepting zero can match -- confirm against mve.md.
;; The multiplicands are passed on in the order (operands[2], operands[1]).
|
||||
(define_expand "cmul<conj_op><mode>3"
|
||||
[(set (match_operand:VQ_HSF 0 "register_operand")
|
||||
(unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
|
||||
(match_operand:VQ_HSF 2 "register_operand")]
|
||||
VCMUL_OP))]
|
||||
"(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT))
|
||||
&& !BYTES_BIG_ENDIAN"
|
||||
{
|
||||
rtx res1 = gen_reg_rtx (<MODE>mode);
|
||||
if (TARGET_COMPLEX)
|
||||
{
|
||||
rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
|
||||
emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, tmp,
|
||||
operands[2], operands[1]));
|
||||
}
|
||||
else
|
||||
emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, CONST0_RTX (<MODE>mode),
|
||||
operands[2], operands[1]));
|
||||
|
||||
emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], res1,
|
||||
operands[2], operands[1]));
|
||||
DONE;
|
||||
})
|
||||
|
||||
;; Named expander for a single VCMLA step in the VF modes: operand 0 is set
;; to operand 1 plus the VCMLA unspec of operands 2 and 3.  It has no
;; preparation statements; it provides the gen_arm_vcmla<rot><mode> entry
;; point used by the cmul and cml expanders.
;; NOTE(review): presumably the emitted pattern is matched by the NEON or MVE
;; vcmla insn patterns defined elsewhere -- confirm.
(define_expand "arm_vcmla<rot><mode>"
|
||||
[(set (match_operand:VF 0 "register_operand")
|
||||
(plus:VF (match_operand:VF 1 "register_operand")
|
||||
(unspec:VF [(match_operand:VF 2 "register_operand")
|
||||
(match_operand:VF 3 "register_operand")]
|
||||
VCMLA)))]
|
||||
"(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
|
||||
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
|
||||
)
|
||||
|
||||
;; Complex multiply-accumulate (cmla) and multiply-subtract (cmls) for the VF
|
||||
;; modes, with and without conjugation.  Each always expands to two
|
||||
;; instructions: the first accumulates the <rotsplit1>-rotated product onto
;; operand 1 (giving tmp), and the second accumulates the <rotsplit2>-rotated
;; product onto tmp and writes operand 0.  Because two instructions are
;; always needed, expand early.
;; The multiplicands are passed on in the order (operands[3], operands[2]).
|
||||
(define_expand "cml<fcmac1><conj_op><mode>4"
|
||||
[(set (match_operand:VF 0 "register_operand")
|
||||
(plus:VF (match_operand:VF 1 "register_operand")
|
||||
(unspec:VF [(match_operand:VF 2 "register_operand")
|
||||
(match_operand:VF 3 "register_operand")]
|
||||
VCMLA_OP)))]
|
||||
"(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
|
||||
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
|
||||
{
|
||||
rtx tmp = gen_reg_rtx (<MODE>mode);
|
||||
emit_insn (gen_arm_vcmla<rotsplit1><mode> (tmp, operands[1],
|
||||
operands[3], operands[2]));
|
||||
emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], tmp,
|
||||
operands[3], operands[2]));
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_expand "movmisalign<mode>"
|
||||
[(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
|
||||
(unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
|
||||
|
Loading…
Reference in New Issue
Block a user