This adds implementation for the optabs for complex operations. With this the following C code: void g (float complex a[restrict N], float complex b[restrict N], float complex c[restrict N]) { for (int i=0; i < N; i++) c[i] = a[i] * b[i]; } generates NEON: g: vmov.f32 q11, #0.0 @ v4sf add r3, r2, #1600 .L2: vmov q8, q11 @ v4sf vld1.32 {q10}, [r1]! vld1.32 {q9}, [r0]! vcmla.f32 q8, q9, q10, #0 vcmla.f32 q8, q9, q10, #90 vst1.32 {q8}, [r2]! cmp r3, r2 bne .L2 bx lr MVE: g: push {lr} mov lr, #100 dls lr, lr .L2: vldrw.32 q1, [r1], #16 vldrw.32 q2, [r0], #16 vcmul.f32 q3, q2, q1, #0 vcmla.f32 q3, q2, q1, #90 vstrw.32 q3, [r2], #16 le lr, .L2 ldr pc, [sp], #4 instead of g: add r3, r2, #1600 .L2: vld2.32 {d20-d23}, [r0]! vld2.32 {d16-d19}, [r1]! vmul.f32 q14, q11, q9 vmul.f32 q15, q11, q8 vneg.f32 q14, q14 vfma.f32 q15, q10, q9 vfma.f32 q14, q10, q8 vmov q13, q15 @ v4sf vmov q12, q14 @ v4sf vst2.32 {d24-d27}, [r2]! cmp r3, r2 bne .L2 bx lr and g: add r3, r2, #1600 .L2: vld2.32 {d20-d23}, [r0]! vld2.32 {d16-d19}, [r1]! vmul.f32 q15, q10, q8 vmul.f32 q14, q10, q9 vmls.f32 q15, q11, q9 vmla.f32 q14, q11, q8 vmov q12, q15 @ v4sf vmov q13, q14 @ v4sf vst2.32 {d24-d27}, [r2]! cmp r3, r2 bne .L2 bx lr respectively. gcc/ChangeLog: * config/arm/iterators.md (rotsplit1, rotsplit2, conj_op, fcmac1, VCMLA_OP, VCMUL_OP): New. * config/arm/mve.md (mve_vcmlaq<mve_rot><mode>): Support vec_dup 0. * config/arm/neon.md (cmul<conj_op><mode>3): New. * config/arm/unspecs.md (UNSPEC_VCMLA_CONJ, UNSPEC_VCMLA180_CONJ, UNSPEC_VCMUL_CONJ): New. * config/arm/vec-common.md (cmul<conj_op><mode>3, arm_vcmla<rot><mode>, cml<fcmac1><conj_op><mode>4): New.
221 KiB
;; ARM NEON coprocessor Machine Description ;; Copyright (C) 2006-2021 Free Software Foundation, Inc. ;; Written by CodeSourcery. ;; ;; This file is part of GCC. ;; ;; GCC is free software; you can redistribute it and/or modify it ;; under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 3, or (at your option) ;; any later version. ;; ;; GCC is distributed in the hope that it will be useful, but ;; WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with GCC; see the file COPYING3. If not see ;; http://www.gnu.org/licenses/.
;; Attribute used to permit string comparisons against <VQH_mnem> in ;; type attribute definitions. (define_attr "vqh_mnem" "vadd,vmin,vmax" (const_string "vadd"))
(define_insn "unaligned_storev8qi" [(set (match_operand:V8QI 0 "memory_operand" "=Un") (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "w")] UNSPEC_UNALIGNED_STORE))] "TARGET_NEON" "* return output_move_neon (operands); " [(set_attr "type" "neon_store1_1reg")])
(define_insn "*neon_mov" [(set (match_operand:VDXMOV 0 "nonimmediate_operand" "=w,Un,w, w, w, ?r,?w,?r, ?Us,*r") (match_operand:VDXMOV 1 "general_operand" " w,w, Dm,Dn,Uni, w, r, Usi,r,*r"))] "TARGET_NEON && (register_operand (operands[0], mode) || register_operand (operands[1], mode))" { if (which_alternative == 2 || which_alternative == 3) { int width, is_valid; static char templ[40];
is_valid = simd_immediate_valid_for_move (operands[1], <MODE>mode,
&operands[1], &width);
gcc_assert (is_valid != 0);
if (width == 0)
return "vmov.f32\t%P0, %1 @ <mode>";
else
sprintf (templ, "vmov.i%d\t%%P0, %%x1 @ <mode>", width);
return templ;
}
switch (which_alternative)
{
case 0: return "vmov\t%P0, %P1 @ ";
case 1: case 4: return output_move_neon (operands);
case 2: case 3: gcc_unreachable ();
case 5: return "vmov\t%Q0, %R0, %P1 @ ";
case 6: return "vmov\t%P0, %Q1, %R1 @ ";
case 9: return "#";
default: return output_move_double (operands, true, NULL);
}
}
[(set_attr "type" "neon_move,neon_store1_1reg,neon_move
,
neon_move,neon_load1_1reg, neon_to_gp
,
neon_from_gp,neon_load1_2reg, neon_store1_2reg,
multiple")
(set_attr "length" "4,4,4,4,4,4,4,8,8,8")
(set_attr "arm_pool_range" ",,,,1020,,,1020,,")
(set_attr "thumb2_pool_range" ",,,,1018,,,1018,,")
(set_attr "neg_pool_range" ",,,,1004,,,1004,,")])
(define_insn "*neon_mov" [(set (match_operand:VQXMOV 0 "nonimmediate_operand" "=w,Un,w, w, w, ?r,?w,?r,?r, ?Us") (match_operand:VQXMOV 1 "general_operand" " w,w, Dm,DN,Uni, w, r, r, Usi, r"))] "TARGET_NEON && (register_operand (operands[0], mode) || register_operand (operands[1], mode))" { if (which_alternative == 2 || which_alternative == 3) { int width, is_valid; static char templ[40];
is_valid = simd_immediate_valid_for_move (operands[1], <MODE>mode,
&operands[1], &width);
gcc_assert (is_valid != 0);
if (width == 0)
return "vmov.f32\t%q0, %1 @ <mode>";
else
sprintf (templ, "vmov.i%d\t%%q0, %%1 @ <mode>", width);
return templ;
}
switch (which_alternative)
{
case 0: return "vmov\t%q0, %q1 @ ";
case 1: case 4: return output_move_neon (operands);
case 2: case 3: gcc_unreachable ();
case 5: return "vmov\t%Q0, %R0, %e1 @ ;vmov\t%J0, %K0, %f1";
case 6: return "vmov\t%e0, %Q1, %R1 @ ;vmov\t%f0, %J1, %K1";
default: return output_move_quad (operands);
}
}
[(set_attr "type" "neon_move_q,neon_store2_2reg_q,neon_move_q,
neon_move_q,neon_load2_2reg_q,neon_to_gp_q,
neon_from_gp_q,mov_reg,neon_load1_4reg,neon_store1_4reg")
(set_attr "length" "4,8,4,4,8,8,8,16,8,16")
(set_attr "arm_pool_range" ",,,,1020,,,,1020,")
(set_attr "thumb2_pool_range" ",,,,1018,,,,1018,")
(set_attr "neg_pool_range" ",,,,996,,,,996,")])
/* We define these mov expanders to match the standard mov$a optab to prevent the mid-end from trying to do a subreg for these modes which is the most inefficient way to expand the move. Also big-endian subreg's aren't allowed for a subset of modes, See TARGET_CAN_CHANGE_MODE_CLASS. Without these RTL generation patterns the mid-end would attempt to take a sub-reg and may ICE if it can't. */
(define_expand "movti" [(set (match_operand:TI 0 "nonimmediate_operand") (match_operand:TI 1 "general_operand"))] "TARGET_NEON" { gcc_checking_assert (aligned_operand (operands[0], TImode)); gcc_checking_assert (aligned_operand (operands[1], TImode)); if (can_create_pseudo_p ()) { if (!REG_P (operands[0])) operands[1] = force_reg (TImode, operands[1]); } })
(define_expand "mov" [(set (match_operand:VSTRUCT 0 "nonimmediate_operand") (match_operand:VSTRUCT 1 "general_operand"))] "TARGET_NEON || TARGET_HAVE_MVE" { gcc_checking_assert (aligned_operand (operands[0], mode)); gcc_checking_assert (aligned_operand (operands[1], mode)); if (can_create_pseudo_p ()) { if (!REG_P (operands[0])) operands[1] = force_reg (mode, operands[1]); } })
;; The pattern mov where mode is v8hf, v4hf, v4bf and v8bf are split into ;; two groups. The pattern movv8hf is common for MVE and NEON, so it is moved ;; into vec-common.md file. Remaining mov expand patterns with half float and ;; bfloats are implemented below. (define_expand "mov" [(set (match_operand:VHFBF_split 0 "s_register_operand") (match_operand:VHFBF_split 1 "s_register_operand"))] "TARGET_NEON" { gcc_checking_assert (aligned_operand (operands[0], mode)); gcc_checking_assert (aligned_operand (operands[1], mode)); if (can_create_pseudo_p ()) { if (!REG_P (operands[0])) operands[1] = force_reg (mode, operands[1]); } })
(define_insn "*neon_mov" [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "=w,Ut,w") (match_operand:VSTRUCT 1 "general_operand" " w,w, Ut"))] "(TARGET_NEON || TARGET_HAVE_MVE) && (register_operand (operands[0], mode) || register_operand (operands[1], mode))" { switch (which_alternative) { case 0: return "#"; case 1: case 2: return output_move_neon (operands); default: gcc_unreachable (); } } [(set_attr "type" "neon_move_q,neon_store2_2reg_q,neon_load2_2reg_q") (set (attr "length") (symbol_ref "arm_attr_length_move_neon (insn)"))])
(define_split [(set (match_operand:EI 0 "s_register_operand" "") (match_operand:EI 1 "s_register_operand" ""))] "TARGET_NEON && reload_completed" [(set (match_dup 0) (match_dup 1)) (set (match_dup 2) (match_dup 3))] { int rdest = REGNO (operands[0]); int rsrc = REGNO (operands[1]); rtx dest[2], src[2];
dest[0] = gen_rtx_REG (TImode, rdest); src[0] = gen_rtx_REG (TImode, rsrc); dest[1] = gen_rtx_REG (DImode, rdest + 4); src[1] = gen_rtx_REG (DImode, rsrc + 4);
neon_disambiguate_copy (operands, dest, src, 2); })
(define_split [(set (match_operand:OI 0 "s_register_operand" "") (match_operand:OI 1 "s_register_operand" ""))] "(TARGET_NEON || TARGET_HAVE_MVE)&& reload_completed" [(set (match_dup 0) (match_dup 1)) (set (match_dup 2) (match_dup 3))] { int rdest = REGNO (operands[0]); int rsrc = REGNO (operands[1]); rtx dest[2], src[2];
dest[0] = gen_rtx_REG (TImode, rdest); src[0] = gen_rtx_REG (TImode, rsrc); dest[1] = gen_rtx_REG (TImode, rdest + 4); src[1] = gen_rtx_REG (TImode, rsrc + 4);
neon_disambiguate_copy (operands, dest, src, 2); })
(define_split [(set (match_operand:CI 0 "s_register_operand" "") (match_operand:CI 1 "s_register_operand" ""))] "TARGET_NEON && reload_completed" [(set (match_dup 0) (match_dup 1)) (set (match_dup 2) (match_dup 3)) (set (match_dup 4) (match_dup 5))] { int rdest = REGNO (operands[0]); int rsrc = REGNO (operands[1]); rtx dest[3], src[3];
dest[0] = gen_rtx_REG (TImode, rdest); src[0] = gen_rtx_REG (TImode, rsrc); dest[1] = gen_rtx_REG (TImode, rdest + 4); src[1] = gen_rtx_REG (TImode, rsrc + 4); dest[2] = gen_rtx_REG (TImode, rdest + 8); src[2] = gen_rtx_REG (TImode, rsrc + 8);
neon_disambiguate_copy (operands, dest, src, 3); })
(define_split [(set (match_operand:XI 0 "s_register_operand" "") (match_operand:XI 1 "s_register_operand" ""))] "(TARGET_NEON || TARGET_HAVE_MVE) && reload_completed" [(set (match_dup 0) (match_dup 1)) (set (match_dup 2) (match_dup 3)) (set (match_dup 4) (match_dup 5)) (set (match_dup 6) (match_dup 7))] { int rdest = REGNO (operands[0]); int rsrc = REGNO (operands[1]); rtx dest[4], src[4];
dest[0] = gen_rtx_REG (TImode, rdest); src[0] = gen_rtx_REG (TImode, rsrc); dest[1] = gen_rtx_REG (TImode, rdest + 4); src[1] = gen_rtx_REG (TImode, rsrc + 4); dest[2] = gen_rtx_REG (TImode, rdest + 8); src[2] = gen_rtx_REG (TImode, rsrc + 8); dest[3] = gen_rtx_REG (TImode, rdest + 12); src[3] = gen_rtx_REG (TImode, rsrc + 12);
neon_disambiguate_copy (operands, dest, src, 4); })
(define_insn "*movmisalign_neon_store"
[(set (match_operand:VDX 0 "neon_permissive_struct_operand" "=Um")
(unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")]
UNSPEC_MISALIGNED_ACCESS))]
"TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
"vst1.<V_sz_elem>\t{%P1}, %A0"
[(set_attr "type" "neon_store1_1reg")])
(define_insn "*movmisalign_neon_load"
[(set (match_operand:VDX 0 "s_register_operand" "=w")
(unspec:VDX [(match_operand:VDX 1 "neon_permissive_struct_operand"
" Um")]
UNSPEC_MISALIGNED_ACCESS))]
"TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
"vld1.<V_sz_elem>\t{%P0}, %A1"
[(set_attr "type" "neon_load1_1reg")])
(define_insn "*movmisalign_neon_store"
[(set (match_operand:VQX 0 "neon_permissive_struct_operand" "=Um")
(unspec:VQX [(match_operand:VQX 1 "s_register_operand" " w")]
UNSPEC_MISALIGNED_ACCESS))]
"TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
"vst1.<V_sz_elem>\t{%q1}, %A0"
[(set_attr "type" "neon_store1_1reg")])
(define_insn "*movmisalign_neon_load"
[(set (match_operand:VQX 0 "s_register_operand" "=w")
(unspec:VQX [(match_operand:VQX 1 "neon_permissive_struct_operand"
" Um")]
UNSPEC_MISALIGNED_ACCESS))]
"TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
"vld1.<V_sz_elem>\t{%q0}, %A1"
[(set_attr "type" "neon_load1_1reg")])
(define_insn "@vec_set_internal" [(set (match_operand:VD_LANE 0 "s_register_operand" "=w,w") (vec_merge:VD_LANE (vec_duplicate:VD_LANE (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r")) (match_operand:VD_LANE 3 "s_register_operand" "0,0") (match_operand:SI 2 "immediate_operand" "i,i")))] "TARGET_NEON" { int elt = ffs ((int) INTVAL (operands[2])) - 1; if (BYTES_BIG_ENDIAN) elt = GET_MODE_NUNITS (mode) - 1 - elt; operands[2] = GEN_INT (elt);
if (which_alternative == 0)
return "vld1.<V_sz_elem>\t{%P0[%c2]}, %A1";
else
return "vmov.<V_sz_elem>\t%P0[%c2], %1";
}
[(set_attr "type" "neon_load1_all_lanes,neon_from_gp
")])
(define_insn "@vec_set_internal" [(set (match_operand:VQ2 0 "s_register_operand" "=w,w") (vec_merge:VQ2 (vec_duplicate:VQ2 (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r")) (match_operand:VQ2 3 "s_register_operand" "0,0") (match_operand:SI 2 "immediate_operand" "i,i")))] "TARGET_NEON" { HOST_WIDE_INT elem = ffs ((int) INTVAL (operands[2])) - 1; int half_elts = GET_MODE_NUNITS (mode) / 2; int elt = elem % half_elts; int hi = (elem / half_elts) * 2; int regno = REGNO (operands[0]);
if (BYTES_BIG_ENDIAN) elt = half_elts - 1 - elt;
operands[0] = gen_rtx_REG (<V_HALF>mode, regno + hi); operands[2] = GEN_INT (elt);
if (which_alternative == 0)
return "vld1.<V_sz_elem>\t{%P0[%c2]}, %A1";
else
return "vmov.<V_sz_elem>\t%P0[%c2], %1";
}
[(set_attr "type" "neon_load1_all_lanes,neon_from_gp
")]
)
(define_insn "@vec_set_internal" [(set (match_operand:V2DI_ONLY 0 "s_register_operand" "=w,w") (vec_merge:V2DI_ONLY (vec_duplicate:V2DI_ONLY (match_operand:DI 1 "nonimmediate_operand" "Um,r")) (match_operand:V2DI_ONLY 3 "s_register_operand" "0,0") (match_operand:SI 2 "immediate_operand" "i,i")))] "TARGET_NEON" { HOST_WIDE_INT elem = ffs ((int) INTVAL (operands[2])) - 1; int regno = REGNO (operands[0]) + 2 * elem;
operands[0] = gen_rtx_REG (DImode, regno);
if (which_alternative == 0) return "vld1.64\t%P0, %A1"; else return "vmov\t%P0, %Q1, %R1"; } [(set_attr "type" "neon_load1_all_lanes_q,neon_from_gp_q")] )
(define_insn "vec_extract<V_elem_l>" [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r") (vec_select:<V_elem> (match_operand:VD_LANE 1 "s_register_operand" "w,w") (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] "TARGET_NEON" { if (BYTES_BIG_ENDIAN) { int elt = INTVAL (operands[2]); elt = GET_MODE_NUNITS (mode) - 1 - elt; operands[2] = GEN_INT (elt); }
if (which_alternative == 0)
return "vst1.<V_sz_elem>\t{%P1[%c2]}, %A0";
else
return "vmov.<V_uf_sclr>\t%0, %P1[%c2]";
}
[(set_attr "type" "neon_store1_one_lane,neon_to_gp
")]
)
;; This pattern is renamed from "vec_extract<V_elem_l>" to ;; "neon_vec_extract<V_elem_l>" and this pattern is called ;; by define_expand in vec-common.md file. (define_insn "neon_vec_extract<V_elem_l>" [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r") (vec_select:<V_elem> (match_operand:VQ2 1 "s_register_operand" "w,w") (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] "TARGET_NEON" { int half_elts = GET_MODE_NUNITS (mode) / 2; int elt = INTVAL (operands[2]) % half_elts; int hi = (INTVAL (operands[2]) / half_elts) * 2; int regno = REGNO (operands[1]);
if (BYTES_BIG_ENDIAN) elt = half_elts - 1 - elt;
operands[1] = gen_rtx_REG (<V_HALF>mode, regno + hi); operands[2] = GEN_INT (elt);
if (which_alternative == 0)
return "vst1.<V_sz_elem>\t{%P1[%c2]}, %A0";
else
return "vmov.<V_uf_sclr>\t%0, %P1[%c2]";
}
[(set_attr "type" "neon_store1_one_lane,neon_to_gp
")]
)
;; This pattern is renamed from "vec_extractv2didi" to "neon_vec_extractv2didi" ;; and this pattern is called by define_expand in vec-common.md file. (define_insn "neon_vec_extractv2didi" [(set (match_operand:DI 0 "nonimmediate_operand" "=Um,r") (vec_select:DI (match_operand:V2DI 1 "s_register_operand" "w,w") (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] "TARGET_NEON" { int regno = REGNO (operands[1]) + 2 * INTVAL (operands[2]);
operands[1] = gen_rtx_REG (DImode, regno);
if (which_alternative == 0) return "vst1.64\t{%P1}, %A0 @ v2di"; else return "vmov\t%Q0, %R0, %P1 @ v2di"; } [(set_attr "type" "neon_store1_one_lane_q,neon_to_gp_q")] )
(define_expand "vec_init<V_elem_l>" [(match_operand:VDQ 0 "s_register_operand") (match_operand 1 "" "")] "TARGET_NEON || TARGET_HAVE_MVE" { neon_expand_vector_init (operands[0], operands[1]); DONE; })
;; Doubleword and quadword arithmetic.
;; NOTE: some other instructions also support 64-bit integer ;; element size, which we could potentially use for "long long" operations.
(define_insn "*add3_neon"
[(set (match_operand:VDQ 0 "s_register_operand" "=w")
(plus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
(match_operand:VDQ 2 "s_register_operand" "w")))]
"ARM_HAVE_NEON__ARITH"
"vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_addsub_s")
(const_string "neon_add
")))]
)
(define_insn "*sub3_neon"
[(set (match_operand:VDQ 0 "s_register_operand" "=w")
(minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
(match_operand:VDQ 2 "s_register_operand" "w")))]
"ARM_HAVE_NEON__ARITH"
"vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_addsub_s")
(const_string "neon_sub
")))]
)
(define_insn "*mul3_neon"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(mult:VDQW (match_operand:VDQW 1 "s_register_operand" "w")
(match_operand:VDQW 2 "s_register_operand" "w")))]
"ARM_HAVE_NEON_ARITH"
"vmul.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mul_s")
(const_string "neon_mul
<V_elem_ch>")))]
)
/* Perform division using multiply-by-reciprocal. Reciprocal is calculated using Newton-Raphson method. Enabled with -funsafe-math-optimizations -freciprocal-math and disabled for -Os since it increases code size . */
(define_expand "div3" [(set (match_operand:VCVTF 0 "s_register_operand") (div:VCVTF (match_operand:VCVTF 1 "s_register_operand") (match_operand:VCVTF 2 "s_register_operand")))] "TARGET_NEON && !optimize_size && flag_reciprocal_math" { rtx rec = gen_reg_rtx (mode); rtx vrecps_temp = gen_reg_rtx (mode);
/* Reciprocal estimate. */
emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
/* Perform 2 iterations of newton-raphson method. */
for (int i = 0; i < 2; i++)
{
emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
}
/* We now have reciprocal in rec, perform operands[0] = operands[1] * rec. */
emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
DONE;
} )
(define_insn "mul3addneon"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
(match_operand:VDQW 3 "s_register_operand" "w"))
(match_operand:VDQW 1 "s_register_operand" "0")))]
"ARM_HAVE_NEONARITH"
"vmla.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mla_s")
(const_string "neon_mla
<V_elem_ch>")))]
)
(define_insn "mul3addneon"
[(set (match_operand:VH 0 "s_register_operand" "=w")
(plus:VH (mult:VH (match_operand:VH 2 "s_register_operand" "w")
(match_operand:VH 3 "s_register_operand" "w"))
(match_operand:VH 1 "s_register_operand" "0")))]
"ARM_HAVE_NEON_ARITH"
"vmla.f16\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "mul3negaddneon"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(minus:VDQW (match_operand:VDQW 1 "s_register_operand" "0")
(mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
(match_operand:VDQW 3 "s_register_operand" "w"))))]
"ARM_HAVE_NEONARITH"
"vmls.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mla_s")
(const_string "neon_mla
<V_elem_ch>")))]
)
;; Fused multiply-accumulate
;; We define each insn twice here:
;; 1: with flag_unsafe_math_optimizations for the widening multiply phase
;; to be able to use when converting to FMA.
;; 2: without flag_unsafe_math_optimizations for the intrinsics to use.
(define_insn "fmaVCVTF:mode4"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w")
(match_operand:VCVTF 2 "register_operand" "w")
(match_operand:VCVTF 3 "register_operand" "0")))]
"ARM_HAVE_NEON__ARITH && TARGET_FMA"
"vfma.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "fmaVCVTF:mode4_intrinsic"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w")
(match_operand:VCVTF 2 "register_operand" "w")
(match_operand:VCVTF 3 "register_operand" "0")))]
"TARGET_NEON && TARGET_FMA"
"vfma.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "fmaVH:mode4"
[(set (match_operand:VH 0 "register_operand" "=w")
(fma:VH
(match_operand:VH 1 "register_operand" "w")
(match_operand:VH 2 "register_operand" "w")
(match_operand:VH 3 "register_operand" "0")))]
"ARM_HAVE_NEON__ARITH"
"vfma.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "*fmsubVCVTF:mode4"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
(match_operand:VCVTF 2 "register_operand" "w")
(match_operand:VCVTF 3 "register_operand" "0")))]
"ARM_HAVE_NEON__ARITH && TARGET_FMA"
"vfms.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "fmsubVCVTF:mode4_intrinsic"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(fma:VCVTF
(neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
(match_operand:VCVTF 2 "register_operand" "w")
(match_operand:VCVTF 3 "register_operand" "0")))]
"TARGET_NEON && TARGET_FMA"
"vfms.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "fmsubVH:mode4_intrinsic"
[(set (match_operand:VH 0 "register_operand" "=w")
(fma:VH
(neg:VH (match_operand:VH 1 "register_operand" "w"))
(match_operand:VH 2 "register_operand" "w")
(match_operand:VH 3 "register_operand" "0")))]
"TARGET_NEON_FP16INST"
"vfms.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "neon_vrint<NEON_VRINT:nvrint_variant>VCVTF:mode"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1
"s_register_operand" "w")]
NEON_VRINT))]
"TARGET_NEON && TARGET_VFP5"
"vrint<nvrint_variant>.f32\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_round_<V_elem_ch>")]
)
(define_insn "neon_vcvt<NEON_VCVT:nvrint_variant><su_optab>VCVTF:mode<v_cmp_result>"
[(set (match_operand:<V_cmp_result> 0 "register_operand" "=w")
(FIXUORS:<V_cmp_result> (unspec:VCVTF
[(match_operand:VCVTF 1 "register_operand" "w")]
NEON_VCVT)))]
"TARGET_NEON && TARGET_VFP5"
"vcvt<nvrint_variant>.32.f32\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_to_int_<V_elem_ch>")
(set_attr "predicable" "no")]
)
(define_insn "ior3_neon"
[(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
(ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
(match_operand:VDQ 2 "neon_logic_op2" "w,Dl")))]
"TARGET_NEON"
{
switch (which_alternative)
{
case 0: return "vorr\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
case 1: return neon_output_logic_immediate ("vorr", &operands[2],
mode, 0, VALID_NEON_QREG_MODE (mode));
default: gcc_unreachable ();
}
}
[(set_attr "type" "neon_logic")]
)
;; The concrete forms of the Neon immediate-logic instructions are vbic and ;; vorr. We support the pseudo-instruction vand instead, because that ;; corresponds to the canonical form the middle-end expects to use for ;; immediate bitwise-ANDs.
(define_insn "and3_neon"
[(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
(and:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
(match_operand:VDQ 2 "neon_inv_logic_op2" "w,DL")))]
"TARGET_NEON"
{
switch (which_alternative)
{
case 0: return "vand\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
case 1: return neon_output_logic_immediate ("vand", &operands[2],
mode, 1, VALID_NEON_QREG_MODE (mode));
default: gcc_unreachable ();
}
}
[(set_attr "type" "neon_logic")]
)
(define_insn "orn3_neon"
[(set (match_operand:VDQ 0 "s_register_operand" "=w")
(ior:VDQ (not:VDQ (match_operand:VDQ 2 "s_register_operand" "w"))
(match_operand:VDQ 1 "s_register_operand" "w")))]
"TARGET_NEON"
"vorn\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_logic")]
)
(define_insn "bic3_neon"
[(set (match_operand:VDQ 0 "s_register_operand" "=w")
(and:VDQ (not:VDQ (match_operand:VDQ 2 "s_register_operand" "w"))
(match_operand:VDQ 1 "s_register_operand" "w")))]
"TARGET_NEON"
"vbic\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_logic")]
)
(define_insn "xor3_neon"
[(set (match_operand:VDQ 0 "s_register_operand" "=w")
(xor:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
(match_operand:VDQ 2 "s_register_operand" "w")))]
"TARGET_NEON"
"veor\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_logic")]
)
(define_insn "one_cmpl2_neon"
[(set (match_operand:VDQ 0 "s_register_operand" "=w")
(not:VDQ (match_operand:VDQ 1 "s_register_operand" "w")))]
"TARGET_NEON"
"vmvn\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_move")]
)
(define_insn "abs2"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(abs:VDQW (match_operand:VDQW 1 "s_register_operand" "w")))]
"TARGET_NEON"
"vabs.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_abs_s")
(const_string "neon_abs
")))]
)
(define_insn "neon_neg2"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(neg:VDQW (match_operand:VDQW 1 "s_register_operand" "w")))]
"TARGET_NEON"
"vneg.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_neg_s")
(const_string "neon_neg
")))]
)
(define_insn "neon_<absneg_str>2"
[(set (match_operand:VH 0 "s_register_operand" "=w")
(ABSNEG:VH (match_operand:VH 1 "s_register_operand" "w")))]
"TARGET_NEON_FP16INST"
"v<absneg_str>.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_abs")]
)
(define_expand "neon_v<absneg_str>" [(set (match_operand:VH 0 "s_register_operand") (ABSNEG:VH (match_operand:VH 1 "s_register_operand")))] "TARGET_NEON_FP16INST" { emit_insn (gen_neon_<absneg_str>2 (operands[0], operands[1])); DONE; })
(define_insn "neon_v<fp16_rnd_str>"
[(set (match_operand:VH 0 "s_register_operand" "=w")
(unspec:VH
[(match_operand:VH 1 "s_register_operand" "w")]
FP16_RND))]
"TARGET_NEON_FP16INST"
"<fp16_rnd_insn>.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_round_s")]
)
(define_insn "neon_vrsqrte"
[(set (match_operand:VH 0 "s_register_operand" "=w")
(unspec:VH
[(match_operand:VH 1 "s_register_operand" "w")]
UNSPEC_VRSQRTE))]
"TARGET_NEON_FP16INST"
"vrsqrte.f16\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_rsqrte_s")]
)
(define_insn "*umin3_neon"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(umin:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "s_register_operand" "w")))]
"TARGET_NEON"
"vmin.<V_u_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_minmax")]
)
(define_insn "*umax3_neon"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(umax:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "s_register_operand" "w")))]
"TARGET_NEON"
"vmax.<V_u_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_minmax")]
)
(define_insn "*smin3_neon"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(smin:VDQW (match_operand:VDQW 1 "s_register_operand" "w")
(match_operand:VDQW 2 "s_register_operand" "w")))]
"TARGET_NEON"
"vmin.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_minmax_s")
(const_string "neon_minmax
")))]
)
(define_insn "*smax3_neon"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(smax:VDQW (match_operand:VDQW 1 "s_register_operand" "w")
(match_operand:VDQW 2 "s_register_operand" "w")))]
"TARGET_NEON"
"vmax.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_minmax_s")
(const_string "neon_minmax
")))]
)
; TODO: V2DI shifts are current disabled because there are bugs in the ; generic vectorizer code. It ends up creating a V2DI constructor with ; SImode elements.
(define_insn "vashr3_imm"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "imm_for_neon_rshift_operand" "Dm")))]
"TARGET_NEON"
{
return neon_output_shift_immediate ("vshr", 's', &operands[2],
mode, VALID_NEON_QREG_MODE (mode),
false);
}
[(set_attr "type" "neon_shift_imm")]
)
(define_insn "vlshr3_imm"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "imm_for_neon_rshift_operand" "Dm")))]
"TARGET_NEON"
{
return neon_output_shift_immediate ("vshr", 'u', &operands[2],
mode, VALID_NEON_QREG_MODE (mode),
false);
}
[(set_attr "type" "neon_shift_imm")]
)
; Used for implementing logical shift-right, which is a left-shift by a negative ; amount, with signed operands. This is essentially the same as ashl3 ; above, but using an unspec in case GCC tries anything tricky with negative ; shift amounts.
(define_insn "ashl3_signed"
[(set (match_operand:VDQI 0 "s_register_operand" "=w")
(unspec:VDQI [(match_operand:VDQI 1 "s_register_operand" "w")
(match_operand:VDQI 2 "s_register_operand" "w")]
UNSPEC_ASHIFT_SIGNED))]
"TARGET_NEON"
"vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_shift_reg")]
)
; Used for implementing logical shift-right, which is a left-shift by a negative ; amount, with unsigned operands.
(define_insn "ashl3_unsigned"
[(set (match_operand:VDQI 0 "s_register_operand" "=w")
(unspec:VDQI [(match_operand:VDQI 1 "s_register_operand" "w")
(match_operand:VDQI 2 "s_register_operand" "w")]
UNSPEC_ASHIFT_UNSIGNED))]
"TARGET_NEON"
"vshl.<V_u_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_shift_reg")]
)
;; 64-bit shifts
;; This pattern loads a 32-bit shift count into a 64-bit NEON register, ;; leaving the upper half uninitalized. This is OK since the shift ;; instruction only looks at the low 8 bits anyway. To avoid confusing ;; data flow analysis however, we pretend the full register is set ;; using an unspec. (define_insn "neon_load_count" [(set (match_operand:DI 0 "s_register_operand" "=w,w") (unspec:DI [(match_operand:SI 1 "nonimmediate_operand" "Um,r")] UNSPEC_LOAD_COUNT))] "TARGET_NEON" "@ vld1.32\t{%P0[0]}, %A1 vmov.32\t%P0[0], %1" [(set_attr "type" "neon_load1_1reg,neon_from_gp")] )
;; Widening operations
(define_expand "widen_ssum3" [(set (match_operand:<V_double_width> 0 "s_register_operand") (plus:<V_double_width> (sign_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand")) (match_operand:<V_double_width> 2 "s_register_operand")))] "TARGET_NEON" { machine_mode mode = GET_MODE (operands[1]); rtx p1, p2;
p1 = arm_simd_vect_par_cnst_half (mode, false);
p2 = arm_simd_vect_par_cnst_half (mode, true);
if (operands[0] != operands[2])
emit_move_insn (operands[0], operands[2]);
emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0],
operands[1],
p1,
operands[0]));
emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0],
operands[1],
p2,
operands[0]));
DONE;
} )
(define_insn "vec_sel_widen_ssum_lo<V_half>3" [(set (match_operand:<V_double_width> 0 "s_register_operand" "=w") (plus:<V_double_width> (sign_extend:<V_double_width> (vec_select:<V_HALF> (match_operand:VQI 1 "s_register_operand" "%w") (match_operand:VQI 2 "vect_par_constant_low" ""))) (match_operand:<V_double_width> 3 "s_register_operand" "0")))] "TARGET_NEON" { return BYTES_BIG_ENDIAN ? "vaddw.<V_s_elem>\t%q0, %q3, %f1" : "vaddw.<V_s_elem>\t%q0, %q3, %e1"; } [(set_attr "type" "neon_add_widen")])
(define_insn "vec_sel_widen_ssum_hi<V_half>3" [(set (match_operand:<V_double_width> 0 "s_register_operand" "=w") (plus:<V_double_width> (sign_extend:<V_double_width> (vec_select:<V_HALF> (match_operand:VQI 1 "s_register_operand" "%w") (match_operand:VQI 2 "vect_par_constant_high" ""))) (match_operand:<V_double_width> 3 "s_register_operand" "0")))] "TARGET_NEON" { return BYTES_BIG_ENDIAN ? "vaddw.<V_s_elem>\t%q0, %q3, %e1" : "vaddw.<V_s_elem>\t%q0, %q3, %f1"; } [(set_attr "type" "neon_add_widen")])
(define_insn "widen_ssum3" [(set (match_operand:<V_widen> 0 "s_register_operand" "=w") (plus:<V_widen> (sign_extend:<V_widen> (match_operand:VW 1 "s_register_operand" "%w")) (match_operand:<V_widen> 2 "s_register_operand" "w")))] "TARGET_NEON" "vaddw.<V_s_elem>\t%q0, %q2, %P1" [(set_attr "type" "neon_add_widen")] )
(define_expand "widen_usum3" [(set (match_operand:<V_double_width> 0 "s_register_operand") (plus:<V_double_width> (zero_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand")) (match_operand:<V_double_width> 2 "s_register_operand")))] "TARGET_NEON" { machine_mode mode = GET_MODE (operands[1]); rtx p1, p2;
p1 = arm_simd_vect_par_cnst_half (mode, false);
p2 = arm_simd_vect_par_cnst_half (mode, true);
if (operands[0] != operands[2])
emit_move_insn (operands[0], operands[2]);
emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0],
operands[1],
p1,
operands[0]));
emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0],
operands[1],
p2,
operands[0]));
DONE;
} )
(define_insn "vec_sel_widen_usum_lo<V_half>3" [(set (match_operand:<V_double_width> 0 "s_register_operand" "=w") (plus:<V_double_width> (zero_extend:<V_double_width> (vec_select:<V_HALF> (match_operand:VQI 1 "s_register_operand" "%w") (match_operand:VQI 2 "vect_par_constant_low" ""))) (match_operand:<V_double_width> 3 "s_register_operand" "0")))] "TARGET_NEON" { return BYTES_BIG_ENDIAN ? "vaddw.<V_u_elem>\t%q0, %q3, %f1" : "vaddw.<V_u_elem>\t%q0, %q3, %e1"; } [(set_attr "type" "neon_add_widen")])
(define_insn "vec_sel_widen_usum_hi<V_half>3" [(set (match_operand:<V_double_width> 0 "s_register_operand" "=w") (plus:<V_double_width> (zero_extend:<V_double_width> (vec_select:<V_HALF> (match_operand:VQI 1 "s_register_operand" "%w") (match_operand:VQI 2 "vect_par_constant_high" ""))) (match_operand:<V_double_width> 3 "s_register_operand" "0")))] "TARGET_NEON" { return BYTES_BIG_ENDIAN ? "vaddw.<V_u_elem>\t%q0, %q3, %e1" : "vaddw.<V_u_elem>\t%q0, %q3, %f1"; } [(set_attr "type" "neon_add_widen")])
(define_insn "widen_usum3" [(set (match_operand:<V_widen> 0 "s_register_operand" "=w") (plus:<V_widen> (zero_extend:<V_widen> (match_operand:VW 1 "s_register_operand" "%w")) (match_operand:<V_widen> 2 "s_register_operand" "w")))] "TARGET_NEON" "vaddw.<V_u_elem>\t%q0, %q2, %P1" [(set_attr "type" "neon_add_widen")] )
;; Helpers for quad-word reduction operations
; Add (or smin, smax...) the low N/2 elements of the N-element vector ; operand[1] to the high N/2 elements of same. Put the result in operand[0], an ; N/2-element vector.
(define_insn "quad_halves_v4si"
[(set (match_operand:V2SI 0 "s_register_operand" "=w")
(VQH_OPS:V2SI
(vec_select:V2SI (match_operand:V4SI 1 "s_register_operand" "w")
(parallel [(const_int 0) (const_int 1)]))
(vec_select:V2SI (match_dup 1)
(parallel [(const_int 2) (const_int 3)]))))]
"TARGET_NEON"
"<VQH_mnem>.<VQH_sign>32\t%P0, %e1, %f1"
[(set_attr "vqh_mnem" "<VQH_mnem>")
(set_attr "type" "neon_reduc_<VQH_type>_q")]
)
(define_insn "quad_halves_v4sf"
[(set (match_operand:V2SF 0 "s_register_operand" "=w")
(VQHS_OPS:V2SF
(vec_select:V2SF (match_operand:V4SF 1 "s_register_operand" "w")
(parallel [(const_int 0) (const_int 1)]))
(vec_select:V2SF (match_dup 1)
(parallel [(const_int 2) (const_int 3)]))))]
"ARM_HAVE_NEON_V4SF_ARITH"
"<VQH_mnem>.f32\t%P0, %e1, %f1"
[(set_attr "vqh_mnem" "<VQH_mnem>")
(set_attr "type" "neon_fp_reduc_<VQH_type>_s_q")]
)
(define_insn "quad_halves_v8hi"
[(set (match_operand:V4HI 0 "s_register_operand" "+w")
(VQH_OPS:V4HI
(vec_select:V4HI (match_operand:V8HI 1 "s_register_operand" "w")
(parallel [(const_int 0) (const_int 1)
(const_int 2) (const_int 3)]))
(vec_select:V4HI (match_dup 1)
(parallel [(const_int 4) (const_int 5)
(const_int 6) (const_int 7)]))))]
"TARGET_NEON"
"<VQH_mnem>.<VQH_sign>16\t%P0, %e1, %f1"
[(set_attr "vqh_mnem" "<VQH_mnem>")
(set_attr "type" "neon_reduc_<VQH_type>_q")]
)
(define_insn "quad_halves_v16qi"
[(set (match_operand:V8QI 0 "s_register_operand" "+w")
(VQH_OPS:V8QI
(vec_select:V8QI (match_operand:V16QI 1 "s_register_operand" "w")
(parallel [(const_int 0) (const_int 1)
(const_int 2) (const_int 3)
(const_int 4) (const_int 5)
(const_int 6) (const_int 7)]))
(vec_select:V8QI (match_dup 1)
(parallel [(const_int 8) (const_int 9)
(const_int 10) (const_int 11)
(const_int 12) (const_int 13)
(const_int 14) (const_int 15)]))))]
"TARGET_NEON"
"<VQH_mnem>.<VQH_sign>8\t%P0, %e1, %f1"
[(set_attr "vqh_mnem" "<VQH_mnem>")
(set_attr "type" "neon_reduc_<VQH_type>_q")]
)
(define_expand "move_hi_quad_"
[(match_operand:ANY128 0 "s_register_operand")
(match_operand:<V_HALF> 1 "s_register_operand")]
"TARGET_NEON"
{
emit_move_insn (simplify_gen_subreg (<V_HALF>mode, operands[0], mode,
GET_MODE_SIZE (<V_HALF>mode)),
operands[1]);
DONE;
})
(define_expand "move_lo_quad_"
[(match_operand:ANY128 0 "s_register_operand")
(match_operand:<V_HALF> 1 "s_register_operand")]
"TARGET_NEON"
{
emit_move_insn (simplify_gen_subreg (<V_HALF>mode, operands[0],
mode, 0),
operands[1]);
DONE;
})
;; Reduction operations
(define_expand "reduc_plus_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VD 1 "s_register_operand")]
"ARM_HAVE_NEON__ARITH"
{
rtx vec = gen_reg_rtx (mode);
neon_pairwise_reduce (vec, operands[1], mode,
&gen_neon_vpadd_internal);
/* The same result is actually computed into every element. */
emit_insn (gen_vec_extract<V_elem_l> (operands[0], vec, const0_rtx));
DONE;
})
(define_expand "reduc_plus_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VQ 1 "s_register_operand")]
"ARM_HAVE_NEON__ARITH && !BYTES_BIG_ENDIAN"
{
rtx step1 = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_quad_halves_plus (step1, operands[1]));
emit_insn (gen_reduc_plus_scal_<V_half> (operands[0], step1));
DONE;
})
(define_expand "reduc_plus_scal_v2di"
[(match_operand:DI 0 "nonimmediate_operand")
(match_operand:V2DI 1 "s_register_operand")]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
rtx vec = gen_reg_rtx (V2DImode);
emit_insn (gen_arm_reduc_plus_internal_v2di (vec, operands[1]));
emit_insn (gen_vec_extractv2didi (operands[0], vec, const0_rtx));
DONE;
})
(define_insn "arm_reduc_plus_internal_v2di"
[(set (match_operand:V2DI 0 "s_register_operand" "=w")
(unspec:V2DI [(match_operand:V2DI 1 "s_register_operand" "w")]
UNSPEC_VPADD))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
"vadd.i64\t%e0, %e1, %f1"
[(set_attr "type" "neon_add_q")]
)
(define_expand "reduc_smin_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VD 1 "s_register_operand")]
"ARM_HAVE_NEON__ARITH"
{
rtx vec = gen_reg_rtx (mode);
neon_pairwise_reduce (vec, operands[1], mode,
&gen_neon_vpsmin);
/* The result is computed into every element of the vector. */
emit_insn (gen_vec_extract<V_elem_l> (operands[0], vec, const0_rtx));
DONE;
})
(define_expand "reduc_smin_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VQ 1 "s_register_operand")]
"ARM_HAVE_NEON__ARITH && !BYTES_BIG_ENDIAN"
{
rtx step1 = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_quad_halves_smin (step1, operands[1]));
emit_insn (gen_reduc_smin_scal_<V_half> (operands[0], step1));
DONE;
})
(define_expand "reduc_smax_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VD 1 "s_register_operand")]
"ARM_HAVE_NEON__ARITH"
{
rtx vec = gen_reg_rtx (mode);
neon_pairwise_reduce (vec, operands[1], mode,
&gen_neon_vpsmax);
/* The result is computed into every element of the vector. */
emit_insn (gen_vec_extract<V_elem_l> (operands[0], vec, const0_rtx));
DONE;
})
(define_expand "reduc_smax_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VQ 1 "s_register_operand")]
"ARM_HAVE_NEON__ARITH && !BYTES_BIG_ENDIAN"
{
rtx step1 = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_quad_halves_smax (step1, operands[1]));
emit_insn (gen_reduc_smax_scal_<V_half> (operands[0], step1));
DONE;
})
(define_expand "reduc_umin_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VDI 1 "s_register_operand")]
"TARGET_NEON"
{
rtx vec = gen_reg_rtx (mode);
neon_pairwise_reduce (vec, operands[1], mode,
&gen_neon_vpumin);
/* The result is computed into every element of the vector. */
emit_insn (gen_vec_extract<V_elem_l> (operands[0], vec, const0_rtx));
DONE;
})
(define_expand "reduc_umin_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VQI 1 "s_register_operand")]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
rtx step1 = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_quad_halves_umin (step1, operands[1]));
emit_insn (gen_reduc_umin_scal_<V_half> (operands[0], step1));
DONE;
})
(define_expand "reduc_umax_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VDI 1 "s_register_operand")]
"TARGET_NEON"
{
rtx vec = gen_reg_rtx (mode);
neon_pairwise_reduce (vec, operands[1], mode,
&gen_neon_vpumax);
/* The result is computed into every element of the vector. */
emit_insn (gen_vec_extract<V_elem_l> (operands[0], vec, const0_rtx));
DONE;
})
(define_expand "reduc_umax_scal_"
[(match_operand:<V_elem> 0 "nonimmediate_operand")
(match_operand:VQI 1 "s_register_operand")]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
rtx step1 = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_quad_halves_umax (step1, operands[1]));
emit_insn (gen_reduc_umax_scal_<V_half> (operands[0], step1));
DONE;
})
(define_insn "neon_vpadd_internal"
[(set (match_operand:VD 0 "s_register_operand" "=w")
(unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
(match_operand:VD 2 "s_register_operand" "w")]
UNSPEC_VPADD))]
"TARGET_NEON"
"vpadd.<V_if_elem>\t%P0, %P1, %P2"
;; Assume this schedules like vadd.
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_reduc_add_s")
(const_string "neon_reduc_add")))]
)
(define_insn "neon_vpaddv4hf"
[(set
(match_operand:V4HF 0 "s_register_operand" "=w")
(unspec:V4HF [(match_operand:V4HF 1 "s_register_operand" "w")
(match_operand:V4HF 2 "s_register_operand" "w")]
UNSPEC_VPADD))]
"TARGET_NEON_FP16INST"
"vpadd.f16\t%P0, %P1, %P2"
[(set_attr "type" "neon_reduc_add")]
)
(define_insn "neon_vpsmin"
[(set (match_operand:VD 0 "s_register_operand" "=w")
(unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
(match_operand:VD 2 "s_register_operand" "w")]
UNSPEC_VPSMIN))]
"TARGET_NEON"
"vpmin.<V_s_elem>\t%P0, %P1, %P2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_reduc_minmax_s")
(const_string "neon_reduc_minmax")))]
)
(define_insn "neon_vpsmax"
[(set (match_operand:VD 0 "s_register_operand" "=w")
(unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
(match_operand:VD 2 "s_register_operand" "w")]
UNSPEC_VPSMAX))]
"TARGET_NEON"
"vpmax.<V_s_elem>\t%P0, %P1, %P2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_reduc_minmax_s")
(const_string "neon_reduc_minmax")))]
)
(define_insn "neon_vpumin"
[(set (match_operand:VDI 0 "s_register_operand" "=w")
(unspec:VDI [(match_operand:VDI 1 "s_register_operand" "w")
(match_operand:VDI 2 "s_register_operand" "w")]
UNSPEC_VPUMIN))]
"TARGET_NEON"
"vpmin.<V_u_elem>\t%P0, %P1, %P2"
[(set_attr "type" "neon_reduc_minmax")]
)
(define_insn "neon_vpumax"
[(set (match_operand:VDI 0 "s_register_operand" "=w")
(unspec:VDI [(match_operand:VDI 1 "s_register_operand" "w")
(match_operand:VDI 2 "s_register_operand" "w")]
UNSPEC_VPUMAX))]
"TARGET_NEON"
"vpmax.<V_u_elem>\t%P0, %P1, %P2"
[(set_attr "type" "neon_reduc_minmax")]
)
;; Saturating arithmetic
; NOTE: Neon supports many more saturating variants of instructions than the
; following, but these are all GCC currently understands.
; FIXME: Actually, GCC doesn't know how to create saturating add/sub by itself
; yet either, although these patterns may be used by intrinsics when they're
; added.
(define_insn "*ss_add_neon"
[(set (match_operand:VD 0 "s_register_operand" "=w")
(ss_plus:VD (match_operand:VD 1 "s_register_operand" "w")
(match_operand:VD 2 "s_register_operand" "w")))]
"TARGET_NEON"
"vqadd.<V_s_elem>\t%P0, %P1, %P2"
[(set_attr "type" "neon_qadd")]
)
(define_insn "*us_add_neon"
[(set (match_operand:VD 0 "s_register_operand" "=w")
(us_plus:VD (match_operand:VD 1 "s_register_operand" "w")
(match_operand:VD 2 "s_register_operand" "w")))]
"TARGET_NEON"
"vqadd.<V_u_elem>\t%P0, %P1, %P2"
[(set_attr "type" "neon_qadd")]
)
(define_insn "*ss_sub_neon"
[(set (match_operand:VD 0 "s_register_operand" "=w")
(ss_minus:VD (match_operand:VD 1 "s_register_operand" "w")
(match_operand:VD 2 "s_register_operand" "w")))]
"TARGET_NEON"
"vqsub.<V_s_elem>\t%P0, %P1, %P2"
[(set_attr "type" "neon_qsub")]
)
(define_insn "*us_sub_neon"
[(set (match_operand:VD 0 "s_register_operand" "=w")
(us_minus:VD (match_operand:VD 1 "s_register_operand" "w")
(match_operand:VD 2 "s_register_operand" "w")))]
"TARGET_NEON"
"vqsub.<V_u_elem>\t%P0, %P1, %P2"
[(set_attr "type" "neon_qsub")]
)
(define_expand "vec_cmp<v_cmp_result>"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand")
(match_operator:<V_cmp_result> 1 "comparison_operator"
[(match_operand:VDQW 2 "s_register_operand")
(match_operand:VDQW 3 "reg_or_zero_operand")]))]
"TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
{
arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
operands[2], operands[3], false);
DONE;
})
(define_expand "vec_cmpu"
[(set (match_operand:VDQIW 0 "s_register_operand")
(match_operator:VDQIW 1 "comparison_operator"
[(match_operand:VDQIW 2 "s_register_operand")
(match_operand:VDQIW 3 "reg_or_zero_operand")]))]
"TARGET_NEON"
{
arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
operands[2], operands[3], false);
DONE;
})
;; Conditional instructions. These are comparisons with conditional moves for
;; vectors. They perform the assignment:
;;
;; Vop0 = (Vop4 Vop5) ? Vop1 : Vop2;
;;
;; where op3 is <, <=, ==, !=, >= or >. Operations are performed
;; element-wise.
(define_expand "vcond"
[(set (match_operand:VDQW 0 "s_register_operand")
(if_then_else:VDQW
(match_operator 3 "comparison_operator"
[(match_operand:VDQW 4 "s_register_operand")
(match_operand:VDQW 5 "reg_or_zero_operand")])
(match_operand:VDQW 1 "s_register_operand")
(match_operand:VDQW 2 "s_register_operand")))]
"TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
{
arm_expand_vcond (operands, <V_cmp_result>mode);
DONE;
})
(define_expand "vcond<V_cvtto>"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand")
(if_then_else:<V_CVTTO>
(match_operator 3 "comparison_operator"
[(match_operand:V32 4 "s_register_operand")
(match_operand:V32 5 "reg_or_zero_operand")])
(match_operand:<V_CVTTO> 1 "s_register_operand")
(match_operand:<V_CVTTO> 2 "s_register_operand")))]
"TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
{
arm_expand_vcond (operands, <V_cmp_result>mode);
DONE;
})
(define_expand "vcondu<v_cmp_result>"
[(set (match_operand:VDQW 0 "s_register_operand")
(if_then_else:VDQW
(match_operator 3 "arm_comparison_operator"
[(match_operand:<V_cmp_result> 4 "s_register_operand")
(match_operand:<V_cmp_result> 5 "reg_or_zero_operand")])
(match_operand:VDQW 1 "s_register_operand")
(match_operand:VDQW 2 "s_register_operand")))]
"TARGET_NEON"
{
arm_expand_vcond (operands, <V_cmp_result>mode);
DONE;
})
(define_expand "vcond_mask_<v_cmp_result>"
[(set (match_operand:VDQW 0 "s_register_operand")
(if_then_else:VDQW
(match_operand:<V_cmp_result> 3 "s_register_operand")
(match_operand:VDQW 1 "s_register_operand")
(match_operand:VDQW 2 "s_register_operand")))]
"TARGET_NEON"
{
emit_insn (gen_neon_vbsl (operands[0], operands[3], operands[1],
operands[2]));
DONE;
})
;; Patterns for builtins.
; good for plain vadd, vaddq.
(define_expand "neon_vadd"
[(match_operand:VCVTF 0 "s_register_operand")
(match_operand:VCVTF 1 "s_register_operand")
(match_operand:VCVTF 2 "s_register_operand")]
"TARGET_NEON"
{
if (ARM_HAVE_NEON__ARITH)
emit_insn (gen_add3 (operands[0], operands[1], operands[2]));
else
emit_insn (gen_neon_vadd_unspec (operands[0], operands[1],
operands[2]));
DONE;
})
(define_expand "neon_vadd"
[(match_operand:VH 0 "s_register_operand")
(match_operand:VH 1 "s_register_operand")
(match_operand:VH 2 "s_register_operand")]
"TARGET_NEON_FP16INST"
{
emit_insn (gen_add3 (operands[0], operands[1], operands[2]));
DONE;
})
(define_expand "neon_vsub"
[(match_operand:VH 0 "s_register_operand")
(match_operand:VH 1 "s_register_operand")
(match_operand:VH 2 "s_register_operand")]
"TARGET_NEON_FP16INST"
{
emit_insn (gen_sub3 (operands[0], operands[1], operands[2]));
DONE;
})
; Note that NEON operations don't support the full IEEE 754 standard: in
; particular, denormal values are flushed to zero. This means that GCC cannot
; use those instructions for autovectorization, etc. unless
; -funsafe-math-optimizations is in effect (in which case flush-to-zero
; behavior is permissible). Intrinsic operations (provided by the arm_neon.h
; header) must work in either case: if -funsafe-math-optimizations is given,
; intrinsics expand to "canonical" RTL where possible, otherwise intrinsics
; expand to unspecs (which may potentially limit the extent to which they might
; be optimized by generic code).
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vadd_unspec"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
UNSPEC_VADD))]
"TARGET_NEON"
"vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_addsub_s")
(const_string "neon_add")))]
)
(define_insn "neon_vaddl"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VDI 1 "s_register_operand" "w")
(match_operand:VDI 2 "s_register_operand" "w")]
VADDL))]
"TARGET_NEON"
"vaddl.%#<V_sz_elem>\t%q0, %P1, %P2"
[(set_attr "type" "neon_add_long")]
)
(define_insn "neon_vaddw"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "w")
(match_operand:VDI 2 "s_register_operand" "w")]
VADDW))]
"TARGET_NEON"
"vaddw.%#<V_sz_elem>\t%q0, %q1, %P2"
[(set_attr "type" "neon_add_widen")]
)
; vhadd and vrhadd.
(define_insn "neon_vhadd"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "s_register_operand" "w")]
VHADD))]
"TARGET_NEON"
"vhadd.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_add_halve_q")]
)
(define_insn "neon_vqadd"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
(match_operand:VDQIX 2 "s_register_operand" "w")]
VQADD))]
"TARGET_NEON"
"vqadd.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_qadd")]
)
(define_insn "neon_vaddhn"
[(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
(match_operand:VN 2 "s_register_operand" "w")]
VADDHN))]
"TARGET_NEON"
"vaddhn.<V_if_elem>\t%P0, %q1, %q2"
[(set_attr "type" "neon_add_halve_narrow_q")]
)
;; Polynomial and Float multiplication.
(define_insn "neon_vmul"
[(set (match_operand:VPF 0 "s_register_operand" "=w")
(unspec:VPF [(match_operand:VPF 1 "s_register_operand" "w")
(match_operand:VPF 2 "s_register_operand" "w")]
UNSPEC_VMUL))]
"TARGET_NEON"
"vmul.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mul_s")
(const_string "neon_mul_<V_elem_ch>")))]
)
(define_insn "neon_vmulf"
[(set
(match_operand:VH 0 "s_register_operand" "=w")
(mult:VH
(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")))]
"TARGET_NEON_FP16INST"
"vmul.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_mul_<VH_elem_ch>")]
)
(define_expand "neon_vmla"
[(match_operand:VDQW 0 "s_register_operand")
(match_operand:VDQW 1 "s_register_operand")
(match_operand:VDQW 2 "s_register_operand")
(match_operand:VDQW 3 "s_register_operand")]
"TARGET_NEON"
{
if (ARM_HAVE_NEON__ARITH)
emit_insn (gen_mul3add_neon (operands[0], operands[1],
operands[2], operands[3]));
else
emit_insn (gen_neon_vmla_unspec (operands[0], operands[1],
operands[2], operands[3]));
DONE;
})
(define_expand "neon_vfmaVCVTF:mode"
[(match_operand:VCVTF 0 "s_register_operand")
(match_operand:VCVTF 1 "s_register_operand")
(match_operand:VCVTF 2 "s_register_operand")
(match_operand:VCVTF 3 "s_register_operand")]
"TARGET_NEON && TARGET_FMA"
{
emit_insn (gen_fma4_intrinsic (operands[0], operands[2], operands[3],
operands[1]));
DONE;
})
(define_expand "neon_vfmaVH:mode"
[(match_operand:VH 0 "s_register_operand")
(match_operand:VH 1 "s_register_operand")
(match_operand:VH 2 "s_register_operand")
(match_operand:VH 3 "s_register_operand")]
"TARGET_NEON_FP16INST"
{
emit_insn (gen_fma4 (operands[0], operands[2], operands[3],
operands[1]));
DONE;
})
(define_expand "neon_vfmsVCVTF:mode"
[(match_operand:VCVTF 0 "s_register_operand")
(match_operand:VCVTF 1 "s_register_operand")
(match_operand:VCVTF 2 "s_register_operand")
(match_operand:VCVTF 3 "s_register_operand")]
"TARGET_NEON && TARGET_FMA"
{
emit_insn (gen_fmsub4_intrinsic (operands[0], operands[2], operands[3],
operands[1]));
DONE;
})
(define_expand "neon_vfmsVH:mode"
[(match_operand:VH 0 "s_register_operand")
(match_operand:VH 1 "s_register_operand")
(match_operand:VH 2 "s_register_operand")
(match_operand:VH 3 "s_register_operand")]
"TARGET_NEON_FP16INST"
{
emit_insn (gen_fmsub4_intrinsic (operands[0], operands[2], operands[3],
operands[1]));
DONE;
})
;; The expand RTL structure here is not important.
;; We use the gen_* functions anyway.
;; We just need something to wrap the iterators around.
(define_expand "neon_vfm<vfml_op>l_<vfml_half>"
[(set (match_operand:VCVTF 0 "s_register_operand")
(unspec:VCVTF
[(match_operand:VCVTF 1 "s_register_operand")
(PLUSMINUS:
(match_operand: 2 "s_register_operand")
(match_operand: 3 "s_register_operand"))] VFMLHALVES))]
"TARGET_FP16FML"
{
rtx half = arm_simd_vect_par_cnst_half (mode, <vfml_half_selector>);
emit_insn (gen_vfm<vfml_op>l_<vfml_half>_intrinsic (operands[0],
operands[1],
operands[2],
operands[3],
half, half));
DONE;
})
(define_insn "vfmal_low_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_low" "")))
(float_extend:VCVTF
(vec_select:
(match_operand: 3 "s_register_operand" "<VF_constraint>")
(match_operand: 5 "vect_par_constant_low" "")))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
"vfmal.f16\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "vfmsl_high_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_high" ""))))
(float_extend:VCVTF
(vec_select:
(match_operand: 3 "s_register_operand" "<VF_constraint>")
(match_operand: 5 "vect_par_constant_high" "")))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
"vfmsl.f16\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "vfmal_high_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_high" "")))
(float_extend:VCVTF
(vec_select:
(match_operand: 3 "s_register_operand" "<VF_constraint>")
(match_operand: 5 "vect_par_constant_high" "")))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
"vfmal.f16\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "vfmsl_low_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_low" ""))))
(float_extend:VCVTF
(vec_select:
(match_operand: 3 "s_register_operand" "<VF_constraint>")
(match_operand: 5 "vect_par_constant_low" "")))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
"vfmsl.f16\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
[(set_attr "type" "neon_fp_mla_s")]
)
(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half>VCVTF:mode"
[(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
(unspec:VCVTF
[(match_operand:VCVTF 1 "s_register_operand")
(PLUSMINUS:
(match_operand: 2 "s_register_operand")
(match_operand: 3 "s_register_operand"))
(match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
"TARGET_FP16FML"
{
rtx lane = GEN_INT (NEON_ENDIAN_LANE_N (mode, INTVAL (operands[4])));
rtx half = arm_simd_vect_par_cnst_half (mode, <vfml_half_selector>);
emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half>_intrinsic
(operands[0], operands[1],
operands[2], operands[3],
half, lane));
DONE;
})
(define_insn "vfmal_lane_low_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_low" "")))
(float_extend:VCVTF
(vec_duplicate:
(vec_select:HF
(match_operand: 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (mode));
return "vfmal.f16\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmal.f16\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s")]
)
(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half>"
[(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
(unspec:VCVTF
[(match_operand:VCVTF 1 "s_register_operand")
(PLUSMINUS:
(match_operand: 2 "s_register_operand")
(match_operand: 3 "s_register_operand"))
(match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
"TARGET_FP16FML"
{
rtx lane
= GEN_INT (NEON_ENDIAN_LANE_N (mode, INTVAL (operands[4])));
rtx half = arm_simd_vect_par_cnst_half (mode, <vfml_half_selector>);
emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half>_intrinsic
(operands[0], operands[1], operands[2], operands[3],
half, lane));
DONE;
})
;; Used to implement the intrinsics:
;; float32x4_t vfmlalq_lane_low_f16 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlal_laneq_low_f16 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmal_lane_low_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_low" "")))
(float_extend:VCVTF
(vec_duplicate:
(vec_select:HF
(match_operand: 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
because we want the print_operand code to print the appropriate
S or D register prefix. */
operands[3] = gen_rtx_REG (mode, REGNO (operands[3]) + regdiff);
operands[2] = gen_rtx_REG (mode, REGNO (operands[2]));
return "vfmal.f16\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s")]
)
;; Used to implement the intrinsics:
;; float32x4_t vfmlalq_lane_high_f16 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlal_laneq_high_f16 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmal_lane_high_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_high" "")))
(float_extend:VCVTF
(vec_duplicate:
(vec_select:HF
(match_operand: 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[3] in the halved VFMLSEL mode
because we've calculated the correct half-width subreg to extract
the lane from and we want to print that subreg instead. */
operands[3] = gen_rtx_REG (mode, REGNO (operands[3]) + regdiff);
return "vfmal.f16\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "vfmal_lane_high_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_high" "")))
(float_extend:VCVTF
(vec_duplicate:
(vec_select:HF
(match_operand: 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (mode));
return "vfmal.f16\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmal.f16\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "vfmsl_lane_low_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_low" ""))))
(float_extend:VCVTF
(vec_duplicate:
(vec_select:HF
(match_operand: 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (mode));
return "vfmsl.f16\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmsl.f16\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s")]
)
;; Used to implement the intrinsics:
;; float32x4_t vfmlslq_lane_low_f16 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlsl_laneq_low_f16 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmsl_lane_low_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_low" ""))))
(float_extend:VCVTF
(vec_duplicate:
(vec_select:HF
(match_operand: 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
because we want the print_operand code to print the appropriate
S or D register prefix. */
operands[3] = gen_rtx_REG (mode, REGNO (operands[3]) + regdiff);
operands[2] = gen_rtx_REG (mode, REGNO (operands[2]));
return "vfmsl.f16\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s")]
)
;; Used to implement the intrinsics:
;; float32x4_t vfmlslq_lane_high_f16 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlsl_laneq_high_f16 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmsl_lane_high_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_high" ""))))
(float_extend:VCVTF
(vec_duplicate:
(vec_select:HF
(match_operand: 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[3] in the halved VFMLSEL mode
because we've calculated the correct half-width subreg to extract
the lane from and we want to print that subreg instead. */
operands[3] = gen_rtx_REG (mode, REGNO (operands[3]) + regdiff);
return "vfmsl.f16\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s")]
)
(define_insn "vfmsl_lane_high_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:
(vec_select:
(match_operand: 2 "s_register_operand" "<VF_constraint>")
(match_operand: 4 "vect_par_constant_high" ""))))
(float_extend:VCVTF
(vec_duplicate:
(vec_select:HF
(match_operand: 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (mode));
return "vfmsl.f16\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmsl.f16\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s")]
)
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vmlaunspec"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
(match_operand:VDQW 2 "s_register_operand" "w")
(match_operand:VDQW 3 "s_register_operand" "w")]
UNSPEC_VMLA))]
"TARGET_NEON"
"vmla.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mla_s")
(const_string "neon_mla
<V_elem_ch>")))]
)
(define_insn "neon_vmlal"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
(match_operand:VW 2 "s_register_operand" "w")
(match_operand:VW 3 "s_register_operand" "w")]
VMLAL))]
"TARGET_NEON"
"vmlal.%#<V_sz_elem>\t%q0, %P2, %P3"
[(set_attr "type" "neon_mla_<V_elem_ch>_long")]
)
(define_expand "neon_vmls"
[(match_operand:VDQW 0 "s_register_operand")
(match_operand:VDQW 1 "s_register_operand")
(match_operand:VDQW 2 "s_register_operand")
(match_operand:VDQW 3 "s_register_operand")]
"TARGET_NEON"
{
if (ARM_HAVE_NEON__ARITH)
emit_insn (gen_mul3negadd_neon (operands[0],
operands[1], operands[2], operands[3]));
else
emit_insn (gen_neon_vmls_unspec (operands[0], operands[1],
operands[2], operands[3]));
DONE;
})
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vmlsunspec"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
(match_operand:VDQW 2 "s_register_operand" "w")
(match_operand:VDQW 3 "s_register_operand" "w")]
UNSPEC_VMLS))]
"TARGET_NEON"
"vmls.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mla_s")
(const_string "neon_mla
<V_elem_ch>")))]
)
(define_insn "neon_vmlsl"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
(match_operand:VW 2 "s_register_operand" "w")
(match_operand:VW 3 "s_register_operand" "w")]
VMLSL))]
"TARGET_NEON"
"vmlsl.%#<V_sz_elem>\t%q0, %P2, %P3"
[(set_attr "type" "neon_mla_<V_elem_ch>_long")]
)
;; vqdmulh, vqrdmulh
(define_insn "neon_vqdmulh"
[(set (match_operand:VMDQI 0 "s_register_operand" "=w")
(unspec:VMDQI [(match_operand:VMDQI 1 "s_register_operand" "w")
(match_operand:VMDQI 2 "s_register_operand" "w")]
VQDMULH))]
"TARGET_NEON"
"vqdmulh.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_sat_mul_<V_elem_ch>")]
)
;; vqrdmlah, vqrdmlsh
(define_insn "neon_vqrdml<VQRDMLH_AS:neon_rdma_as>h"
[(set (match_operand:VMDQI 0 "s_register_operand" "=w")
(unspec:VMDQI [(match_operand:VMDQI 1 "s_register_operand" "0")
(match_operand:VMDQI 2 "s_register_operand" "w")
(match_operand:VMDQI 3 "s_register_operand" "w")]
VQRDMLH_AS))]
"TARGET_NEON_RDMA"
"vqrdml<VQRDMLH_AS:neon_rdma_as>h.<V_s_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set_attr "type" "neon_sat_mla_<V_elem_ch>_long")]
)
(define_insn "neon_vqdmlal"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
(match_operand:VMDI 2 "s_register_operand" "w")
(match_operand:VMDI 3 "s_register_operand" "w")]
UNSPEC_VQDMLAL))]
"TARGET_NEON"
"vqdmlal.<V_s_elem>\t%q0, %P2, %P3"
[(set_attr "type" "neon_sat_mla_<V_elem_ch>_long")]
)
(define_insn "neon_vqdmlsl"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
(match_operand:VMDI 2 "s_register_operand" "w")
(match_operand:VMDI 3 "s_register_operand" "w")]
UNSPEC_VQDMLSL))]
"TARGET_NEON"
"vqdmlsl.<V_s_elem>\t%q0, %P2, %P3"
[(set_attr "type" "neon_sat_mla_<V_elem_ch>_long")]
)
(define_insn "neon_vmull"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
(match_operand:VW 2 "s_register_operand" "w")]
VMULL))]
"TARGET_NEON"
"vmull.%#<V_sz_elem>\t%q0, %P1, %P2"
[(set_attr "type" "neon_mul_<V_elem_ch>_long")]
)
(define_insn "neon_vqdmull"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
(match_operand:VMDI 2 "s_register_operand" "w")]
UNSPEC_VQDMULL))]
"TARGET_NEON"
"vqdmull.<V_s_elem>\t%q0, %P1, %P2"
[(set_attr "type" "neon_sat_mul_<V_elem_ch>_long")]
)
(define_expand "neon_vsub"
[(match_operand:VCVTF 0 "s_register_operand")
(match_operand:VCVTF 1 "s_register_operand")
(match_operand:VCVTF 2 "s_register_operand")]
"TARGET_NEON"
{
if (ARM_HAVE_NEON__ARITH)
emit_insn (gen_sub3 (operands[0], operands[1], operands[2]));
else
emit_insn (gen_neon_vsub_unspec (operands[0], operands[1],
operands[2]));
DONE;
})
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vsub_unspec"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
UNSPEC_VSUB))]
"TARGET_NEON"
"vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_addsub_s")
(const_string "neon_sub")))]
)
(define_insn "neon_vsubl"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VDI 1 "s_register_operand" "w")
(match_operand:VDI 2 "s_register_operand" "w")]
VSUBL))]
"TARGET_NEON"
"vsubl.%#<V_sz_elem>\t%q0, %P1, %P2"
[(set_attr "type" "neon_sub_long")]
)
(define_insn "neon_vsubw"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "w")
(match_operand:VDI 2 "s_register_operand" "w")]
VSUBW))]
"TARGET_NEON"
"vsubw.%#<V_sz_elem>\t%q0, %q1, %P2"
[(set_attr "type" "neon_sub_widen")]
)
(define_insn "neon_vqsub"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
(match_operand:VDQIX 2 "s_register_operand" "w")]
VQSUB))]
"TARGET_NEON"
"vqsub.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_qsub")]
)
(define_insn "neon_vhsub"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "s_register_operand" "w")]
VHSUB))]
"TARGET_NEON"
"vhsub.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_sub_halve")]
)
(define_insn "neon_vsubhn"
[(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
(match_operand:VN 2 "s_register_operand" "w")]
VSUBHN))]
"TARGET_NEON"
"vsubhn.<V_if_elem>\t%P0, %q1, %q2"
[(set_attr "type" "neon_sub_halve_narrow_q")]
)
;; These may expand to an UNSPEC pattern when a floating point mode is used
;; without unsafe math optimizations.
(define_expand "@neon_vc<cmp_op>"
[(match_operand:<V_cmp_result> 0 "s_register_operand")
(neg:<V_cmp_result>
(COMPARISONS:VDQW (match_operand:VDQW 1 "s_register_operand")
(match_operand:VDQW 2 "reg_or_zero_operand")))]
"TARGET_NEON"
{
/* For FP comparisons use UNSPECS unless -funsafe-math-optimizations
are enabled. /
if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
&& !flag_unsafe_math_optimizations)
{
/ We don't just emit a gen_neon_vc<cmp_op>_insn_unspec because
we define gen_neon_vceq_insn_unspec only for float modes
whereas this expander iterates over the integer modes as well,
but we will never expand to UNSPECs for the integer comparisons. */
switch (mode)
{
case E_V2SFmode:
emit_insn (gen_neon_vc<cmp_op>v2sf_insn_unspec (operands[0],
operands[1],
operands[2]));
break;
case E_V4SFmode:
emit_insn (gen_neon_vc<cmp_op>v4sf_insn_unspec (operands[0],
operands[1],
operands[2]));
break;
default:
gcc_unreachable ();
}
}
else
emit_insn (gen_neon_vc<cmp_op>_insn (operands[0],
operands[1],
operands[2]));
DONE;
}
)
(define_insn "@neon_vc<cmp_op>_insn"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w,w")
(neg:<V_cmp_result>
(COMPARISONS:<V_cmp_result>
(match_operand:VDQW 1 "s_register_operand" "w,w")
(match_operand:VDQW 2 "reg_or_zero_operand" "w,Dz"))))]
"TARGET_NEON && !(GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
&& !flag_unsafe_math_optimizations)"
{
char pattern[100];
sprintf (pattern, "vc<cmp_op>.%s%%#<V_sz_elem>\t%%<V_reg>0,"
" %%<V_reg>1, %s",
GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
? "f" : "<cmp_type>",
which_alternative == 0
? "%<V_reg>2" : "#0");
output_asm_insn (pattern, operands);
return "";
}
[(set (attr "type")
(if_then_else (match_operand 2 "zero_operand")
(const_string "neon_compare_zero")
(const_string "neon_compare")))]
)
(define_insn "neon_vc<cmp_op_unsp>_insn_unspec"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w,w")
(unspec:<V_cmp_result>
[(match_operand:VCVTF 1 "s_register_operand" "w,w")
(match_operand:VCVTF 2 "reg_or_zero_operand" "w,Dz")]
NEON_VCMP))]
"TARGET_NEON"
{
char pattern[100];
sprintf (pattern, "vc<cmp_op_unsp>.f%%#<V_sz_elem>\t%%<V_reg>0,"
" %%<V_reg>1, %s",
which_alternative == 0
? "%<V_reg>2" : "#0");
output_asm_insn (pattern, operands);
return "";
}
[(set_attr "type" "neon_fp_compare_s")]
)
(define_expand "@neon_vc<cmp_op>"
[(match_operand:<V_cmp_result> 0 "s_register_operand")
(neg:<V_cmp_result>
(COMPARISONS:VH
(match_operand:VH 1 "s_register_operand")
(match_operand:VH 2 "reg_or_zero_operand")))]
"TARGET_NEON_FP16INST"
{
/* For FP comparisons use UNSPECS unless -funsafe-math-optimizations
are enabled. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
&& !flag_unsafe_math_optimizations)
emit_insn
(gen_neon_vc<cmp_op>_fp16insn_unspec
(operands[0], operands[1], operands[2]));
else
emit_insn
(gen_neon_vc<cmp_op>_fp16insn
(operands[0], operands[1], operands[2]));
DONE;
})
(define_insn "neon_vc<cmp_op>_fp16insn"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w,w")
(neg:<V_cmp_result>
(COMPARISONS:<V_cmp_result>
(match_operand:VH 1 "s_register_operand" "w,w")
(match_operand:VH 2 "reg_or_zero_operand" "w,Dz"))))]
"TARGET_NEON_FP16INST
&& !(GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
&& !flag_unsafe_math_optimizations)"
{
char pattern[100];
sprintf (pattern, "vc<cmp_op>.%s%%#<V_sz_elem>\t%%<V_reg>0,"
" %%<V_reg>1, %s",
GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
? "f" : "<cmp_type>",
which_alternative == 0
? "%<V_reg>2" : "#0");
output_asm_insn (pattern, operands);
return "";
}
[(set (attr "type")
(if_then_else (match_operand 2 "zero_operand")
(const_string "neon_compare_zero")
(const_string "neon_compare")))])
(define_insn "neon_vc<cmp_op_unsp>_fp16insn_unspec"
[(set
(match_operand:<V_cmp_result> 0 "s_register_operand" "=w,w")
(unspec:<V_cmp_result>
[(match_operand:VH 1 "s_register_operand" "w,w")
(match_operand:VH 2 "reg_or_zero_operand" "w,Dz")]
NEON_VCMP))]
"TARGET_NEON_FP16INST"
{
char pattern[100];
sprintf (pattern, "vc<cmp_op_unsp>.f%%#<V_sz_elem>\t%%<V_reg>0,"
" %%<V_reg>1, %s",
which_alternative == 0
? "%<V_reg>2" : "#0");
output_asm_insn (pattern, operands);
return "";
}
[(set_attr "type" "neon_fp_compare_s")])
(define_insn "@neon_vc"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
(neg:<V_cmp_result>
(GTUGEU:<V_cmp_result>
(match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "s_register_operand" "w"))))]
"TARGET_NEON"
"vc<cmp_op>.u%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_compare")]
)
(define_expand "neon_vca<cmp_op>"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand")
(neg:<V_cmp_result>
(GTGE:<V_cmp_result>
(abs:VCVTF (match_operand:VCVTF 1 "s_register_operand"))
(abs:VCVTF (match_operand:VCVTF 2 "s_register_operand")))))]
"TARGET_NEON"
{
if (flag_unsafe_math_optimizations)
emit_insn (gen_neon_vca<cmp_op>_insn (operands[0], operands[1],
operands[2]));
else
emit_insn (gen_neon_vca<cmp_op>_insn_unspec (operands[0],
operands[1],
operands[2]));
DONE;
}
)
(define_insn "neon_vca<cmp_op>_insn"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
(neg:<V_cmp_result>
(GTGE:<V_cmp_result>
(abs:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w"))
(abs:VCVTF (match_operand:VCVTF 2 "s_register_operand" "w")))))]
"TARGET_NEON && flag_unsafe_math_optimizations"
"vac<cmp_op>.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_compare_s")]
)
(define_insn "neon_vca<cmp_op_unsp>_insn_unspec"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
(unspec:<V_cmp_result> [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
NEON_VACMP))]
"TARGET_NEON"
"vac<cmp_op_unsp>.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_compare_s")]
)
(define_expand "neon_vca<cmp_op>"
[(set
(match_operand:<V_cmp_result> 0 "s_register_operand")
(neg:<V_cmp_result>
(GLTE:<V_cmp_result>
(abs:VH (match_operand:VH 1 "s_register_operand"))
(abs:VH (match_operand:VH 2 "s_register_operand")))))]
"TARGET_NEON_FP16INST"
{
if (flag_unsafe_math_optimizations)
emit_insn (gen_neon_vca<cmp_op>_fp16insn
(operands[0], operands[1], operands[2]));
else
emit_insn (gen_neon_vca<cmp_op>_fp16insn_unspec
(operands[0], operands[1], operands[2]));
DONE;
})
(define_insn "neon_vca<cmp_op>_fp16insn"
[(set
(match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
(neg:<V_cmp_result>
(GLTE:<V_cmp_result>
(abs:VH (match_operand:VH 1 "s_register_operand" "w"))
(abs:VH (match_operand:VH 2 "s_register_operand" "w")))))]
"TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
"vac<cmp_op>.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_compare_s")]
)
(define_insn "neon_vca<cmp_op_unsp>_fp16insn_unspec"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
(unspec:<V_cmp_result>
[(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")]
NEON_VAGLTE))]
"TARGET_NEON"
"vac<cmp_op_unsp>.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_compare_s")]
)
(define_expand "neon_vc<cmp_op>z"
[(set
(match_operand:<V_cmp_result> 0 "s_register_operand")
(COMPARISONS:<V_cmp_result>
(match_operand:VH 1 "s_register_operand")
(const_int 0)))]
"TARGET_NEON_FP16INST"
{
emit_insn (gen_neon_vc<cmp_op> (operands[0], operands[1],
CONST0_RTX (mode)));
DONE;
})
(define_insn "neon_vtst"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "s_register_operand" "w")]
UNSPEC_VTST))]
"TARGET_NEON"
"vtst.<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_tst")]
)
(define_insn "neon_vabd"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "s_register_operand" "w")]
VABD))]
"TARGET_NEON"
"vabd.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_abd")]
)
(define_insn "neon_vabd"
[(set (match_operand:VH 0 "s_register_operand" "=w")
(unspec:VH [(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")]
UNSPEC_VABD_F))]
"TARGET_NEON_FP16INST"
"vabd.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_abd")]
)
(define_insn "neon_vabdf"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
UNSPEC_VABD_F))]
"TARGET_NEON"
"vabd.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_abd_s")]
)
(define_insn "neon_vabdl"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
(match_operand:VW 2 "s_register_operand" "w")]
VABDL))]
"TARGET_NEON"
"vabdl.%#<V_sz_elem>\t%q0, %P1, %P2"
[(set_attr "type" "neon_abd_long")]
)
(define_insn "neon_vaba"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(plus:VDQIW (unspec:VDQIW [(match_operand:VDQIW 2 "s_register_operand" "w")
(match_operand:VDQIW 3 "s_register_operand" "w")]
VABD)
(match_operand:VDQIW 1 "s_register_operand" "0")))]
"TARGET_NEON"
"vaba.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set_attr "type" "neon_arith_acc")]
)
(define_insn "neon_vabal"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(plus:<V_widen> (unspec:<V_widen> [(match_operand:VW 2 "s_register_operand" "w")
(match_operand:VW 3 "s_register_operand" "w")]
VABDL)
(match_operand:<V_widen> 1 "s_register_operand" "0")))]
"TARGET_NEON"
"vabal.%#<V_sz_elem>\t%q0, %P2, %P3"
[(set_attr "type" "neon_arith_acc")]
)
(define_expand "sadv16qi"
[(use (match_operand:V4SI 0 "register_operand"))
(unspec:V16QI [(use (match_operand:V16QI 1 "register_operand"))
(use (match_operand:V16QI 2 "register_operand"))] VABAL)
(use (match_operand:V4SI 3 "register_operand"))]
"TARGET_NEON"
{
rtx reduc = gen_reg_rtx (V8HImode);
rtx op1_highpart = gen_reg_rtx (V8QImode);
rtx op2_highpart = gen_reg_rtx (V8QImode);
emit_insn (gen_neon_vabdl<sup>v8qi (reduc,
gen_lowpart (V8QImode, operands[1]),
gen_lowpart (V8QImode, operands[2])));
emit_insn (gen_neon_vget_highv16qi (op1_highpart, operands[1]));
emit_insn (gen_neon_vget_highv16qi (op2_highpart, operands[2]));
emit_insn (gen_neon_vabal<sup>v8qi (reduc, reduc,
op1_highpart, op2_highpart));
emit_insn (gen_neon_vpadal<sup>v8hi (operands[3], operands[3], reduc));
emit_move_insn (operands[0], operands[3]);
DONE;
}
)
(define_insn "neon_v"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
(match_operand:VDQIW 2 "s_register_operand" "w")]
VMAXMIN))]
"TARGET_NEON"
"v.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_minmax")]
)
(define_insn "neon_vf"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
VMAXMINF))]
"TARGET_NEON"
"v.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_minmax_s")]
)
(define_insn "neon_vf"
[(set (match_operand:VH 0 "s_register_operand" "=w")
(unspec:VH
[(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")]
VMAXMINF))]
"TARGET_NEON_FP16INST"
"v.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_minmax_s")]
)
(define_insn "neon_vpfv4hf"
[(set (match_operand:V4HF 0 "s_register_operand" "=w")
(unspec:V4HF
[(match_operand:V4HF 1 "s_register_operand" "w")
(match_operand:V4HF 2 "s_register_operand" "w")]
VPMAXMINF))]
"TARGET_NEON_FP16INST"
"vp.f16\t%P0, %P1, %P2"
[(set_attr "type" "neon_reduc_minmax")]
)
(define_insn "neon_<fmaxmin_op>"
[(set
(match_operand:VH 0 "s_register_operand" "=w")
(unspec:VH
[(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")]
VMAXMINFNM))]
"TARGET_NEON_FP16INST"
"<fmaxmin_op>.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_minmax_s")]
)
;; vnm intrinsics.
(define_insn "neon_<fmaxmin_op>"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
VMAXMINFNM))]
"TARGET_NEON && TARGET_VFP5"
"<fmaxmin_op>.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_minmax_s")]
)
;; Vector forms for the IEEE-754 fmax()/fmin() functions
(define_insn "3"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
VMAXMINFNM))]
"TARGET_NEON && TARGET_VFP5"
"<fmaxmin_op>.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_minmax_s")]
)
(define_expand "neon_vpadd"
[(match_operand:VD 0 "s_register_operand")
(match_operand:VD 1 "s_register_operand")
(match_operand:VD 2 "s_register_operand")]
"TARGET_NEON"
{
emit_insn (gen_neon_vpadd_internal (operands[0], operands[1],
operands[2]));
DONE;
})
(define_insn "neon_vpaddl"
[(set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
(unspec:<V_double_width> [(match_operand:VDQIW 1 "s_register_operand" "w")]
VPADDL))]
"TARGET_NEON"
"vpaddl.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_reduc_add_long")]
)
(define_insn "neon_vpadal"
[(set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
(unspec:<V_double_width> [(match_operand:<V_double_width> 1 "s_register_operand" "0")
(match_operand:VDQIW 2 "s_register_operand" "w")]
VPADAL))]
"TARGET_NEON"
"vpadal.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
[(set_attr "type" "neon_reduc_add_acc")]
)
(define_insn "neon_vp"
[(set (match_operand:VDI 0 "s_register_operand" "=w")
(unspec:VDI [(match_operand:VDI 1 "s_register_operand" "w")
(match_operand:VDI 2 "s_register_operand" "w")]
VPMAXMIN))]
"TARGET_NEON"
"vp.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_reduc_minmax")]
)
(define_insn "neon_vpf"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
VPMAXMINF))]
"TARGET_NEON"
"vp.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_reduc_minmax_s")]
)
(define_insn "neon_vrecps"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
UNSPEC_VRECPS))]
"TARGET_NEON"
"vrecps.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_recps_s")]
)
(define_insn "neon_vrecps"
[(set
(match_operand:VH 0 "s_register_operand" "=w")
(unspec:VH [(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")]
UNSPEC_VRECPS))]
"TARGET_NEON_FP16INST"
"vrecps.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_recps_s")]
)
(define_insn "neon_vrsqrts"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:VCVTF 2 "s_register_operand" "w")]
UNSPEC_VRSQRTS))]
"TARGET_NEON"
"vrsqrts.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_rsqrts_s")]
)
(define_insn "neon_vrsqrts"
[(set
(match_operand:VH 0 "s_register_operand" "=w")
(unspec:VH [(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")]
UNSPEC_VRSQRTS))]
"TARGET_NEON_FP16INST"
"vrsqrts.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_rsqrts_s")]
)
(define_expand "neon_vabs"
[(match_operand:VDQW 0 "s_register_operand")
(match_operand:VDQW 1 "s_register_operand")]
"TARGET_NEON"
{
emit_insn (gen_abs2 (operands[0], operands[1]));
DONE;
})
(define_insn "neon_vqabs"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")]
UNSPEC_VQABS))]
"TARGET_NEON"
"vqabs.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_qabs")]
)
(define_insn "neon_bswap"
[(set (match_operand:VDQHSD 0 "register_operand" "=w")
(bswap:VDQHSD (match_operand:VDQHSD 1 "register_operand" "w")))]
"TARGET_NEON"
"vrev<V_sz_elem>.8\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_rev")]
)
(define_expand "neon_vneg"
[(match_operand:VDQW 0 "s_register_operand")
(match_operand:VDQW 1 "s_register_operand")]
"TARGET_NEON"
{
emit_insn (gen_neon_neg2 (operands[0], operands[1]));
DONE;
})
;; The vcadd and vcmla patterns are made UNSPEC for the explicitly due to the
;; fact that their usage need to guarantee that the source vectors are
;; contiguous. It would be wrong to describe the operation without being able
;; to describe the permute that is also required, but even if that is done
;; the permute would have been created as a LOAD_LANES which means the values
;; in the registers are in the wrong order.
(define_insn "neon_vcadd"
[(set (match_operand:VF 0 "register_operand" "=w")
(unspec:VF [(match_operand:VF 1 "register_operand" "w")
(match_operand:VF 2 "register_operand" "w")]
VCADD))]
"TARGET_COMPLEX"
"vcadd.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2, #"
[(set_attr "type" "neon_fcadd")]
)
(define_insn "neon_vcmla"
[(set (match_operand:VF 0 "register_operand" "=w")
(plus:VF (match_operand:VF 1 "register_operand" "0")
(unspec:VF [(match_operand:VF 2 "register_operand" "w")
(match_operand:VF 3 "register_operand" "w")]
VCMLA)))]
"TARGET_COMPLEX"
"vcmla.<V_s_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3, #"
[(set_attr "type" "neon_fcmla")]
)
(define_insn "neon_vcmla_lane"
[(set (match_operand:VF 0 "s_register_operand" "=w")
(plus:VF (match_operand:VF 1 "s_register_operand" "0")
(unspec:VF [(match_operand:VF 2 "s_register_operand" "w")
(match_operand:VF 3 "s_register_operand" "<VF_constraint>")
(match_operand:SI 4 "const_int_operand" "n")]
VCMLA)))]
"TARGET_COMPLEX"
{
operands = neon_vcmla_lane_prepare_operands (operands);
return "vcmla.<V_s_elem>\t%<V_reg>0, %<V_reg>2, d%c3[%c4], #";
}
[(set_attr "type" "neon_fcmla")]
)
(define_insn "neon_vcmla_laneq"
[(set (match_operand:VDF 0 "s_register_operand" "=w")
(plus:VDF (match_operand:VDF 1 "s_register_operand" "0")
(unspec:VDF [(match_operand:VDF 2 "s_register_operand" "w")
(match_operand:<V_DOUBLE> 3 "s_register_operand" "<VF_constraint>")
(match_operand:SI 4 "const_int_operand" "n")]
VCMLA)))]
"TARGET_COMPLEX"
{
operands = neon_vcmla_lane_prepare_operands (operands);
return "vcmla.<V_s_elem>\t%<V_reg>0, %<V_reg>2, d%c3[%c4], #";
}
[(set_attr "type" "neon_fcmla")]
)
(define_insn "neon_vcmlaq_lane"
[(set (match_operand:VQ_HSF 0 "s_register_operand" "=w")
(plus:VQ_HSF (match_operand:VQ_HSF 1 "s_register_operand" "0")
(unspec:VQ_HSF [(match_operand:VQ_HSF 2 "s_register_operand" "w")
(match_operand:<V_HALF> 3 "s_register_operand" "<VF_constraint>")
(match_operand:SI 4 "const_int_operand" "n")]
VCMLA)))]
"TARGET_COMPLEX"
{
operands = neon_vcmla_lane_prepare_operands (operands);
return "vcmla.<V_s_elem>\t%<V_reg>0, %<V_reg>2, d%c3[%c4], #";
}
[(set_attr "type" "neon_fcmla")]
)
;; The complex mul operations always need to expand to two instructions.
;; The first operation does half the computation and the second does the
;; remainder. Because of this, expand early.
(define_expand "cmul<conj_op>3"
[(set (match_operand:VDF 0 "register_operand")
(unspec:VDF [(match_operand:VDF 1 "register_operand")
(match_operand:VDF 2 "register_operand")]
VCMUL_OP))]
"TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
{
rtx res1 = gen_reg_rtx (mode);
rtx tmp = force_reg (mode, CONST0_RTX (mode));
emit_insn (gen_neon_vcmla (res1, tmp,
operands[2], operands[1]));
emit_insn (gen_neon_vcmla (operands[0], res1,
operands[2], operands[1]));
DONE;
})
;; These instructions map to the _builtins for the Dot Product operations.
(define_insn "neondot"
[(set (match_operand:VCVTI 0 "register_operand" "=w")
(plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
(unspec:VCVTI [(match_operand: 2
"register_operand" "w")
(match_operand: 3
"register_operand" "w")]
DOTPROD)))]
"TARGET_DOTPROD"
"vdot.\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set_attr "type" "neon_dot")]
)
;; These instructions map to the __builtins for the Dot Product operations.
(define_insn "neon_usdot"
[(set (match_operand:VCVTI 0 "register_operand" "=w")
(plus:VCVTI
(unspec:VCVTI
[(match_operand: 2 "register_operand" "w")
(match_operand: 3 "register_operand" "w")]
UNSPEC_DOT_US)
(match_operand:VCVTI 1 "register_operand" "0")))]
"TARGET_I8MM"
"vusdot.s8\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set_attr "type" "neon_dot")]
)
;; These instructions map to the _builtins for the Dot Product
;; indexed operations.
(define_insn "neondot_lane"
[(set (match_operand:VCVTI 0 "register_operand" "=w")
(plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
(unspec:VCVTI [(match_operand: 2
"register_operand" "w")
(match_operand:V8QI 3 "register_operand" "t")
(match_operand:SI 4 "immediate_operand" "i")]
DOTPROD)))]
"TARGET_DOTPROD"
{
operands[4]
= GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
return "vdot.\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
}
[(set_attr "type" "neon_dot")]
)
;; These instructions map to the _builtins for the Dot Product
;; indexed operations in the v8.6 I8MM extension.
(define_insn "neondot_lane"
[(set (match_operand:VCVTI 0 "register_operand" "=w")
(plus:VCVTI
(unspec:VCVTI
[(match_operand: 2 "register_operand" "w")
(match_operand:V8QI 3 "register_operand" "t")
(match_operand:SI 4 "immediate_operand" "i")]
DOTPROD_I8MM)
(match_operand:VCVTI 1 "register_operand" "0")))]
"TARGET_I8MM"
{
operands[4] = GEN_INT (INTVAL (operands[4]));
return "vdot.\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
}
[(set_attr "type" "neon_dot")]
)
;; These expands map to the Dot Product optab the vectorizer checks for.
;; The auto-vectorizer expects a dot product builtin that also does an
;; accumulation into the provided register.
;; Given the following pattern
;;
;; for (i=0; i<len; i++) {
;; c = a[i] * b[i];
;; r += c;
;; }
;; return result;
;;
;; This can be auto-vectorized to
;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
;;
;; given enough iterations. However the vectorizer can keep unrolling the loop
;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
;; ...
;;
;; and so the vectorizer provides r, in which the result has to be accumulated.
(define_expand "dot_prod"
[(set (match_operand:VCVTI 0 "register_operand")
(plus:VCVTI (unspec:VCVTI [(match_operand: 1
"register_operand")
(match_operand: 2
"register_operand")]
DOTPROD)
(match_operand:VCVTI 3 "register_operand")))]
"TARGET_DOTPROD"
{
emit_insn (
gen_neon_dot (operands[3], operands[3], operands[1],
operands[2]));
emit_insn (gen_rtx_SET (operands[0], operands[3]));
DONE;
})
(define_expand "neon_copysignf"
[(match_operand:VCVTF 0 "register_operand")
(match_operand:VCVTF 1 "register_operand")
(match_operand:VCVTF 2 "register_operand")]
"TARGET_NEON"
"{
rtx v_bitmask_cast;
rtx v_bitmask = gen_reg_rtx (VCVTF:V_cmp_resultmode);
rtx c = gen_int_mode (0x80000000, SImode);
emit_move_insn (v_bitmask,
gen_const_vec_duplicate (<VCVTF:V_cmp_result>mode, c));
emit_move_insn (operands[0], operands[2]);
v_bitmask_cast = simplify_gen_subreg (<MODE>mode, v_bitmask,
<VCVTF:V_cmp_result>mode, 0);
emit_insn (gen_neon_vbsl<mode> (operands[0], v_bitmask_cast, operands[0],
operands[1]));
DONE;
}"
)
(define_insn "neon_vqneg"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")]
UNSPEC_VQNEG))]
"TARGET_NEON"
"vqneg.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_qneg")]
)
(define_insn "neon_vcls"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")]
UNSPEC_VCLS))]
"TARGET_NEON"
"vcls.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_cls")]
)
(define_insn "clz2"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(clz:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")))]
"TARGET_NEON"
"vclz.<V_if_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_cnt")]
)
(define_expand "neon_vclz"
[(match_operand:VDQIW 0 "s_register_operand")
(match_operand:VDQIW 1 "s_register_operand")]
"TARGET_NEON"
{
emit_insn (gen_clz2 (operands[0], operands[1]));
DONE;
})
(define_insn "popcount2"
[(set (match_operand:VE 0 "s_register_operand" "=w")
(popcount:VE (match_operand:VE 1 "s_register_operand" "w")))]
"TARGET_NEON"
"vcnt.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_cnt")]
)
(define_expand "neon_vcnt"
[(match_operand:VE 0 "s_register_operand")
(match_operand:VE 1 "s_register_operand")]
"TARGET_NEON"
{
emit_insn (gen_popcount2 (operands[0], operands[1]));
DONE;
})
(define_insn "neon_vrecpe"
[(set (match_operand:VH 0 "s_register_operand" "=w")
(unspec:VH [(match_operand:VH 1 "s_register_operand" "w")]
UNSPEC_VRECPE))]
"TARGET_NEON_FP16INST"
"vrecpe.f16\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_recpe_s")]
)
(define_insn "neon_vrecpe"
[(set (match_operand:V32 0 "s_register_operand" "=w")
(unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")]
UNSPEC_VRECPE))]
"TARGET_NEON"
"vrecpe.<V_u_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_recpe_s")]
)
(define_insn "neon_vrsqrte"
[(set (match_operand:V32 0 "s_register_operand" "=w")
(unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")]
UNSPEC_VRSQRTE))]
"TARGET_NEON"
"vrsqrte.<V_u_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_rsqrte_s")]
)
(define_expand "neon_vmvn"
[(match_operand:VDQIW 0 "s_register_operand")
(match_operand:VDQIW 1 "s_register_operand")]
"TARGET_NEON"
{
emit_insn (gen_one_cmpl2_neon (operands[0], operands[1]));
DONE;
})
(define_insn "neon_vget_lane_sext_internal"
[(set (match_operand:SI 0 "s_register_operand" "=r")
(sign_extend:SI
(vec_select:<V_elem>
(match_operand:VD 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON"
{
if (BYTES_BIG_ENDIAN)
{
int elt = INTVAL (operands[2]);
elt = GET_MODE_NUNITS (mode) - 1 - elt;
operands[2] = GEN_INT (elt);
}
return "vmov.s<V_sz_elem>\t%0, %P1[%c2]";
}
[(set_attr "type" "neon_to_gp")]
)
(define_insn "neon_vget_lane_zext_internal"
[(set (match_operand:SI 0 "s_register_operand" "=r")
(zero_extend:SI
(vec_select:<V_elem>
(match_operand:VD 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON"
{
if (BYTES_BIG_ENDIAN)
{
int elt = INTVAL (operands[2]);
elt = GET_MODE_NUNITS (mode) - 1 - elt;
operands[2] = GEN_INT (elt);
}
return "vmov.u<V_sz_elem>\t%0, %P1[%c2]";
}
[(set_attr "type" "neon_to_gp")]
)
(define_insn "neon_vget_lane_sext_internal"
[(set (match_operand:SI 0 "s_register_operand" "=r")
(sign_extend:SI
(vec_select:<V_elem>
(match_operand:VQ2 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON"
{
rtx ops[3];
int regno = REGNO (operands[1]);
unsigned int halfelts = GET_MODE_NUNITS (mode) / 2;
unsigned int elt = INTVAL (operands[2]);
unsigned int elt_adj = elt % halfelts;
if (BYTES_BIG_ENDIAN)
elt_adj = halfelts - 1 - elt_adj;
ops[0] = operands[0];
ops[1] = gen_rtx_REG (<V_HALF>mode, regno + 2 * (elt / halfelts));
ops[2] = GEN_INT (elt_adj);
output_asm_insn ("vmov.s<V_sz_elem>\t%0, %P1[%c2]", ops);
return "";
}
[(set_attr "type" "neon_to_gp_q")]
)
(define_insn "neon_vget_lane_zext_internal"
[(set (match_operand:SI 0 "s_register_operand" "=r")
(zero_extend:SI
(vec_select:<V_elem>
(match_operand:VQ2 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON"
{
rtx ops[3];
int regno = REGNO (operands[1]);
unsigned int halfelts = GET_MODE_NUNITS (mode) / 2;
unsigned int elt = INTVAL (operands[2]);
unsigned int elt_adj = elt % halfelts;
if (BYTES_BIG_ENDIAN)
elt_adj = halfelts - 1 - elt_adj;
ops[0] = operands[0];
ops[1] = gen_rtx_REG (<V_HALF>mode, regno + 2 * (elt / halfelts));
ops[2] = GEN_INT (elt_adj);
output_asm_insn ("vmov.u<V_sz_elem>\t%0, %P1[%c2]", ops);
return "";
}
[(set_attr "type" "neon_to_gp_q")]
)
(define_expand "neon_vget_lane"
[(match_operand:<V_ext> 0 "s_register_operand")
(match_operand:VDQW 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON"
{
if (BYTES_BIG_ENDIAN)
{
/* The intrinsics are defined in terms of a model where the
element ordering in memory is vldm order, whereas the generic
RTL is defined in terms of a model where the element ordering
in memory is array order. Convert the lane number to conform
to this model. */
unsigned int elt = INTVAL (operands[2]);
unsigned int reg_nelts
= 64 / GET_MODE_UNIT_BITSIZE (mode);
elt ^= reg_nelts - 1;
operands[2] = GEN_INT (elt);
}
if (GET_MODE_UNIT_BITSIZE (mode) == 32)
emit_insn (gen_vec_extract<V_elem_l> (operands[0], operands[1],
operands[2]));
else
emit_insn (gen_neon_vget_lane_sext_internal (operands[0],
operands[1],
operands[2]));
DONE;
})
(define_expand "neon_vget_laneu"
[(match_operand:<V_ext> 0 "s_register_operand")
(match_operand:VDQIW 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON"
{
if (BYTES_BIG_ENDIAN)
{
/* The intrinsics are defined in terms of a model where the
element ordering in memory is vldm order, whereas the generic
RTL is defined in terms of a model where the element ordering
in memory is array order. Convert the lane number to conform
to this model. */
unsigned int elt = INTVAL (operands[2]);
unsigned int reg_nelts
= 64 / GET_MODE_UNIT_BITSIZE (mode);
elt ^= reg_nelts - 1;
operands[2] = GEN_INT (elt);
}
if (GET_MODE_UNIT_BITSIZE (mode) == 32)
emit_insn (gen_vec_extract<V_elem_l> (operands[0], operands[1],
operands[2]));
else
emit_insn (gen_neon_vget_lane_zext_internal (operands[0],
operands[1],
operands[2]));
DONE;
})
(define_expand "neon_vget_lanedi"
[(match_operand:DI 0 "s_register_operand")
(match_operand:DI 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON"
{
emit_move_insn (operands[0], operands[1]);
DONE;
})
(define_expand "neon_vget_lanev2di"
[(match_operand:DI 0 "s_register_operand")
(match_operand:V2DI 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON"
{
int lane;
if (BYTES_BIG_ENDIAN)
{
/* The intrinsics are defined in terms of a model where the
element ordering in memory is vldm order, whereas the generic
RTL is defined in terms of a model where the element ordering
in memory is array order. Convert the lane number to conform
to this model. */
unsigned int elt = INTVAL (operands[2]);
unsigned int reg_nelts = 2;
elt ^= reg_nelts - 1;
operands[2] = GEN_INT (elt);
}
lane = INTVAL (operands[2]);
gcc_assert ((lane ==0) || (lane == 1));
emit_move_insn (operands[0], lane == 0
? gen_lowpart (DImode, operands[1])
: gen_highpart (DImode, operands[1]));
DONE;
})
(define_expand "neon_vset_lane"
[(match_operand:VDQ 0 "s_register_operand")
(match_operand:<V_elem> 1 "s_register_operand")
(match_operand:VDQ 2 "s_register_operand")
(match_operand:SI 3 "immediate_operand")]
"TARGET_NEON"
{
unsigned int elt = INTVAL (operands[3]);
if (BYTES_BIG_ENDIAN)
{
unsigned int reg_nelts
= 64 / GET_MODE_UNIT_BITSIZE (mode);
elt ^= reg_nelts - 1;
}
emit_insn (gen_vec_set_internal (operands[0], operands[1],
GEN_INT (1 << elt), operands[2]));
DONE;
})
; See neon_vget_lanedi comment for reasons operands 2 & 3 are ignored.
(define_expand "neon_vset_lanedi"
[(match_operand:DI 0 "s_register_operand")
(match_operand:DI 1 "s_register_operand")
(match_operand:DI 2 "s_register_operand")
(match_operand:SI 3 "immediate_operand")]
"TARGET_NEON"
{
emit_move_insn (operands[0], operands[1]);
DONE;
})
(define_expand "neon_vcreate"
[(match_operand:VD_RE 0 "s_register_operand")
(match_operand:DI 1 "general_operand")]
"TARGET_NEON"
{
rtx src = gen_lowpart (mode, operands[1]);
emit_move_insn (operands[0], src);
DONE;
})
(define_insn "neon_vdup_n"
[(set (match_operand:VX 0 "s_register_operand" "=w")
(vec_duplicate:VX (match_operand:<V_elem> 1 "s_register_operand" "r")))]
"TARGET_NEON"
"vdup.<V_sz_elem>\t%<V_reg>0, %1"
[(set_attr "type" "neon_from_gp")]
)
(define_insn "neon_vdup_nv4hf"
[(set (match_operand:V4HF 0 "s_register_operand" "=w")
(vec_duplicate:V4HF (match_operand:HF 1 "s_register_operand" "r")))]
"TARGET_NEON"
"vdup.16\t%P0, %1"
[(set_attr "type" "neon_from_gp")]
)
(define_insn "neon_vdup_nv8hf"
[(set (match_operand:V8HF 0 "s_register_operand" "=w")
(vec_duplicate:V8HF (match_operand:HF 1 "s_register_operand" "r")))]
"TARGET_NEON"
"vdup.16\t%q0, %1"
[(set_attr "type" "neon_from_gp_q")]
)
(define_insn "neon_vdup_nv4bf"
[(set (match_operand:V4BF 0 "s_register_operand" "=w")
(vec_duplicate:V4BF (match_operand:BF 1 "s_register_operand" "r")))]
"TARGET_NEON"
"vdup.16\t%P0, %1"
[(set_attr "type" "neon_from_gp")]
)
(define_insn "neon_vdup_nv8bf"
[(set (match_operand:V8BF 0 "s_register_operand" "=w")
(vec_duplicate:V8BF (match_operand:BF 1 "s_register_operand" "r")))]
"TARGET_NEON"
"vdup.16\t%q0, %1"
[(set_attr "type" "neon_from_gp_q")]
)
(define_insn "neon_vdup_n"
[(set (match_operand:V32 0 "s_register_operand" "=w,w")
(vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
"TARGET_NEON"
"@
vdup.<V_sz_elem>\t%<V_reg>0, %1
vdup.<V_sz_elem>\t%<V_reg>0, %y1"
[(set_attr "type" "neon_from_gp,neon_dup")]
)
(define_expand "neon_vdup_ndi"
[(match_operand:DI 0 "s_register_operand")
(match_operand:DI 1 "s_register_operand")]
"TARGET_NEON"
{
emit_move_insn (operands[0], operands[1]);
DONE;
}
)
(define_insn "neon_vdup_nv2di"
[(set (match_operand:V2DI 0 "s_register_operand" "=w,w")
(vec_duplicate:V2DI (match_operand:DI 1 "s_register_operand" "r,w")))]
"TARGET_NEON"
"@
vmov\t%e0, %Q1, %R1;vmov\t%f0, %Q1, %R1
vmov\t%e0, %P1;vmov\t%f0, %P1"
[(set_attr "length" "8")
(set_attr "type" "multiple")]
)
(define_insn "neon_vdup_lane_internal"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
(vec_duplicate:VDQW
(vec_select:<V_elem>
(match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON"
{
if (BYTES_BIG_ENDIAN)
{
int elt = INTVAL (operands[2]);
elt = GET_MODE_NUNITS (<V_double_vector_mode>mode) - 1 - elt;
operands[2] = GEN_INT (elt);
}
if (<Is_d_reg>)
return "vdup.<V_sz_elem>\t%P0, %P1[%c2]";
else
return "vdup.<V_sz_elem>\t%q0, %P1[%c2]";
}
[(set_attr "type" "neon_dup")]
)
(define_insn "neon_vdup_lane_internal"
[(set (match_operand:VHFBF 0 "s_register_operand" "=w")
(vec_duplicate:VHFBF
(vec_select:<V_elem>
(match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
{
if (BYTES_BIG_ENDIAN)
{
int elt = INTVAL (operands[2]);
elt = GET_MODE_NUNITS (<V_double_vector_mode>mode) - 1 - elt;
operands[2] = GEN_INT (elt);
}
if (<Is_d_reg>)
return "vdup.<V_sz_elem>\t%P0, %P1[%c2]";
else
return "vdup.<V_sz_elem>\t%q0, %P1[%c2]";
}
[(set_attr "type" "neon_dup")]
)
(define_expand "neon_vdup_lane"
[(match_operand:VDQW 0 "s_register_operand")
(match_operand:<V_double_vector_mode> 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON"
{
if (BYTES_BIG_ENDIAN)
{
unsigned int elt = INTVAL (operands[2]);
unsigned int reg_nelts
= 64 / GET_MODE_UNIT_BITSIZE (<V_double_vector_mode>mode);
elt ^= reg_nelts - 1;
operands[2] = GEN_INT (elt);
}
emit_insn (gen_neon_vdup_lane_internal (operands[0], operands[1],
operands[2]));
DONE;
})
(define_expand "neon_vdup_lane"
[(match_operand:VHFBF 0 "s_register_operand")
(match_operand:<V_double_vector_mode> 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
{
if (BYTES_BIG_ENDIAN)
{
unsigned int elt = INTVAL (operands[2]);
unsigned int reg_nelts
= 64 / GET_MODE_UNIT_BITSIZE (<V_double_vector_mode>mode);
elt ^= reg_nelts - 1;
operands[2] = GEN_INT (elt);
}
emit_insn (gen_neon_vdup_lane_internal (operands[0], operands[1],
operands[2]));
DONE;
})
; Scalar index is ignored, since only zero is valid here.
(define_expand "neon_vdup_lanedi"
[(match_operand:DI 0 "s_register_operand")
(match_operand:DI 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON"
{
emit_move_insn (operands[0], operands[1]);
DONE;
})
; Likewise for v2di, as the DImode second operand has only a single element.
(define_expand "neon_vdup_lanev2di"
[(match_operand:V2DI 0 "s_register_operand")
(match_operand:DI 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON"
{
emit_insn (gen_neon_vdup_nv2di (operands[0], operands[1]));
DONE;
})
; Disabled before reload because we don't want combine doing something silly,
; but used by the post-reload expansion of neon_vcombine.
(define_insn "*neon_vswp"
[(set (match_operand:VDQX 0 "s_register_operand" "+w")
(match_operand:VDQX 1 "s_register_operand" "+w"))
(set (match_dup 1) (match_dup 0))]
"TARGET_NEON && reload_completed"
"vswp\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_permute")]
)
;; In this insn, operand 1 should be low, and operand 2 the high part of the
;; dest vector.
;; FIXME: A different implementation of this builtin could make it much
;; more likely that we wouldn't actually need to output anything (we could make
;; it so that the reg allocator puts things in the right places magically
;; instead). Lack of subregs for vectors makes that tricky though, I think.
(define_insn_and_split "neon_vcombine"
[(set (match_operand:<V_DOUBLE> 0 "s_register_operand" "=w")
(vec_concat:<V_DOUBLE>
(match_operand:VDX 1 "s_register_operand" "w")
(match_operand:VDX 2 "s_register_operand" "w")))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
neon_split_vcombine (operands);
DONE;
}
[(set_attr "type" "multiple")]
)
(define_expand "neon_vget_high"
[(match_operand:<V_HALF> 0 "s_register_operand")
(match_operand:VQXBF 1 "s_register_operand")]
"TARGET_NEON"
{
emit_move_insn (operands[0],
simplify_gen_subreg (<V_HALF>mode, operands[1], mode,
GET_MODE_SIZE (<V_HALF>mode)));
DONE;
})
(define_expand "neon_vget_low"
[(match_operand:<V_HALF> 0 "s_register_operand")
(match_operand:VQX 1 "s_register_operand")]
"TARGET_NEON"
{
emit_move_insn (operands[0],
simplify_gen_subreg (<V_HALF>mode, operands[1],
mode, 0));
DONE;
})
(define_insn "float<V_cvtto>2"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(float:<V_CVTTO> (match_operand:VCVTI 1 "s_register_operand" "w")))]
"TARGET_NEON && !flag_rounding_math"
"vcvt.f32.s32\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_int_to_fp_<V_elem_ch>")]
)
(define_insn "floatuns<V_cvtto>2"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(unsigned_float:<V_CVTTO> (match_operand:VCVTI 1 "s_register_operand" "w")))]
"TARGET_NEON && !flag_rounding_math"
"vcvt.f32.u32\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_int_to_fp_<V_elem_ch>")]
)
(define_insn "fix_trunc<V_cvtto>2"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(fix:<V_CVTTO> (match_operand:VCVTF 1 "s_register_operand" "w")))]
"TARGET_NEON"
"vcvt.s32.f32\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_to_int_<V_elem_ch>")]
)
(define_insn "fixuns_trunc<V_cvtto>2"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(unsigned_fix:<V_CVTTO> (match_operand:VCVTF 1 "s_register_operand" "w")))]
"TARGET_NEON"
"vcvt.u32.f32\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_to_int_<V_elem_ch>")]
)
(define_insn "neon_vcvt"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(unspec:<V_CVTTO> [(match_operand:VCVTF 1 "s_register_operand" "w")]
VCVT_US))]
"TARGET_NEON"
"vcvt.%#32.f32\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_to_int_<V_elem_ch>")]
)
(define_insn "neon_vcvt"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(unspec:<V_CVTTO> [(match_operand:VCVTI 1 "s_register_operand" "w")]
VCVT_US))]
"TARGET_NEON"
"vcvt.f32.%#32\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_int_to_fp_<V_elem_ch>")]
)
(define_insn "neon_vcvtv4sfv4hf"
[(set (match_operand:V4SF 0 "s_register_operand" "=w")
(unspec:V4SF [(match_operand:V4HF 1 "s_register_operand" "w")]
UNSPEC_VCVT))]
"TARGET_NEON && TARGET_FP16"
"vcvt.f32.f16\t%q0, %P1"
[(set_attr "type" "neon_fp_cvt_widen_h")]
)
(define_insn "neon_vcvtv4hfv4sf"
[(set (match_operand:V4HF 0 "s_register_operand" "=w")
(unspec:V4HF [(match_operand:V4SF 1 "s_register_operand" "w")]
UNSPEC_VCVT))]
"TARGET_NEON && TARGET_FP16"
"vcvt.f16.f32\t%P0, %q1"
[(set_attr "type" "neon_fp_cvt_narrow_s_q")]
)
(define_insn "neon_vcvt"
[(set
(match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
(unspec:<VH_CVTTO>
[(match_operand:VCVTHI 1 "s_register_operand" "w")]
VCVT_US))]
"TARGET_NEON_FP16INST"
"vcvt.f16.%#16\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_int_to_fp_<VH_elem_ch>")]
)
(define_insn "neon_vcvt"
[(set
(match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
(unspec:<VH_CVTTO>
[(match_operand:VH 1 "s_register_operand" "w")]
VCVT_US))]
"TARGET_NEON_FP16INST"
"vcvt.%#16.f16\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_to_int_<VH_elem_ch>")]
)
(define_insn "neon_vcvtn"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(unspec:<V_CVTTO> [(match_operand:VCVTF 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VCVT_US_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[2], 1, 33);
return "vcvt.%#32.f32\t%<V_reg>0, %<V_reg>1, %2";
}
[(set_attr "type" "neon_fp_to_int<V_elem_ch>")]
)
(define_insn "neon_vcvtn"
[(set (match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
(unspec:<VH_CVTTO>
[(match_operand:VH 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VCVT_US_N))]
"TARGET_NEON_FP16INST"
{
arm_const_bounds (operands[2], 0, 17);
return "vcvt.%#16.f16\t%<V_reg>0, %<V_reg>1, %2";
}
[(set_attr "type" "neon_fp_to_int<VH_elem_ch>")]
)
(define_insn "neon_vcvtn"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(unspec:<V_CVTTO> [(match_operand:VCVTI 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VCVT_US_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[2], 1, 33);
return "vcvt.f32.%#32\t%<V_reg>0, %<V_reg>1, %2";
}
[(set_attr "type" "neon_int_to_fp<V_elem_ch>")]
)
(define_insn "neon_vcvtn"
[(set (match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
(unspec:<VH_CVTTO>
[(match_operand:VCVTHI 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VCVT_US_N))]
"TARGET_NEON_FP16INST"
{
arm_const_bounds (operands[2], 0, 17);
return "vcvt.f16.%#16\t%<V_reg>0, %<V_reg>1, %2";
}
[(set_attr "type" "neon_int_to_fp<VH_elem_ch>")]
)
(define_insn "neon_vcvt<vcvth_op>"
[(set
(match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
(unspec:<VH_CVTTO>
[(match_operand:VH 1 "s_register_operand" "w")]
VCVT_HF_US))]
"TARGET_NEON_FP16INST"
"vcvt<vcvth_op>.%#16.f16\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_fp_to_int_<VH_elem_ch>")]
)
(define_insn "neon_vmovn"
[(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")]
UNSPEC_VMOVN))]
"TARGET_NEON"
"vmovn.<V_if_elem>\t%P0, %q1"
[(set_attr "type" "neon_shift_imm_narrow_q")]
)
(define_insn "neon_vqmovn"
[(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")]
VQMOVN))]
"TARGET_NEON"
"vqmovn.%#<V_sz_elem>\t%P0, %q1"
[(set_attr "type" "neon_sat_shift_imm_narrow_q")]
)
(define_insn "neon_vqmovun"
[(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")]
UNSPEC_VQMOVUN))]
"TARGET_NEON"
"vqmovun.<V_s_elem>\t%P0, %q1"
[(set_attr "type" "neon_sat_shift_imm_narrow_q")]
)
(define_insn "neon_vmovl"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")]
VMOVL))]
"TARGET_NEON"
"vmovl.%#<V_sz_elem>\t%q0, %P1"
[(set_attr "type" "neon_shift_imm_long")]
)
(define_insn "neon_vmul_lane"
[(set (match_operand:VMD 0 "s_register_operand" "=w")
(unspec:VMD [(match_operand:VMD 1 "s_register_operand" "w")
(match_operand:VMD 2 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 3 "immediate_operand" "i")]
UNSPEC_VMUL_LANE))]
"TARGET_NEON"
{
return "vmul.<V_if_elem>\t%P0, %P1, %P2[%c3]";
}
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mul_s_scalar")
(const_string "neon_mul_<V_elem_ch>_scalar")))]
)
(define_insn "neon_vmul_lane"
[(set (match_operand:VMQ 0 "s_register_operand" "=w")
(unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "w")
(match_operand:<V_HALF> 2 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 3 "immediate_operand" "i")]
UNSPEC_VMUL_LANE))]
"TARGET_NEON"
{
return "vmul.<V_if_elem>\t%q0, %q1, %P2[%c3]";
}
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mul_s_scalar")
(const_string "neon_mul_<V_elem_ch>_scalar")))]
)
(define_insn "neon_vmul_lane"
[(set (match_operand:VH 0 "s_register_operand" "=w")
(unspec:VH [(match_operand:VH 1 "s_register_operand" "w")
(match_operand:V4HF 2 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 3 "immediate_operand" "i")]
UNSPEC_VMUL_LANE))]
"TARGET_NEON_FP16INST"
"vmul.f16\t%<V_reg>0, %<V_reg>1, %P2[%c3]"
[(set_attr "type" "neon_fp_mul_s_scalar")]
)
(define_insn "neon_vmulllane"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
(match_operand:VMDI 2 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 3 "immediate_operand" "i")]
VMULL_LANE))]
"TARGET_NEON"
{
return "vmull.%#<V_sz_elem>\t%q0, %P1, %P2[%c3]";
}
[(set_attr "type" "neon_mul<V_elem_ch>_scalar_long")]
)
(define_insn "neon_vqdmull_lane"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
(match_operand:VMDI 2 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 3 "immediate_operand" "i")]
UNSPEC_VQDMULL_LANE))]
"TARGET_NEON"
{
return "vqdmull.<V_s_elem>\t%q0, %P1, %P2[%c3]";
}
[(set_attr "type" "neon_sat_mul_<V_elem_ch>_scalar_long")]
)
(define_insn "neon_vqdmulh_lane"
[(set (match_operand:VMQI 0 "s_register_operand" "=w")
(unspec:VMQI [(match_operand:VMQI 1 "s_register_operand" "w")
(match_operand:<V_HALF> 2 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 3 "immediate_operand" "i")]
VQDMULH_LANE))]
"TARGET_NEON"
{
return "vqdmulh.<V_s_elem>\t%q0, %q1, %P2[%c3]";
}
[(set_attr "type" "neon_sat_mul_<V_elem_ch>_scalar_q")]
)
(define_insn "neon_vqdmulh_lane"
[(set (match_operand:VMDI 0 "s_register_operand" "=w")
(unspec:VMDI [(match_operand:VMDI 1 "s_register_operand" "w")
(match_operand:VMDI 2 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 3 "immediate_operand" "i")]
VQDMULH_LANE))]
"TARGET_NEON"
{
return "vqdmulh.<V_s_elem>\t%P0, %P1, %P2[%c3]";
}
[(set_attr "type" "neon_sat_mul_<V_elem_ch>_scalar_q")]
)
;; vqrdmlah_lane, vqrdmlsh_lane
(define_insn "neon_vqrdml<VQRDMLH_AS:neon_rdma_as>h_lane"
[(set (match_operand:VMQI 0 "s_register_operand" "=w")
(unspec:VMQI [(match_operand:VMQI 1 "s_register_operand" "0")
(match_operand:VMQI 2 "s_register_operand" "w")
(match_operand:<V_HALF> 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
VQRDMLH_AS))]
"TARGET_NEON_RDMA"
{
return
"vqrdml<VQRDMLH_AS:neon_rdma_as>h.<V_s_elem>\t%q0, %q2, %P3[%c4]";
}
[(set_attr "type" "neon_mla_<V_elem_ch>_scalar")]
)
(define_insn "neon_vqrdml<VQRDMLH_AS:neon_rdma_as>h_lane"
[(set (match_operand:VMDI 0 "s_register_operand" "=w")
(unspec:VMDI [(match_operand:VMDI 1 "s_register_operand" "0")
(match_operand:VMDI 2 "s_register_operand" "w")
(match_operand:VMDI 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
VQRDMLH_AS))]
"TARGET_NEON_RDMA"
{
return
"vqrdml<VQRDMLH_AS:neon_rdma_as>h.<V_s_elem>\t%P0, %P2, %P3[%c4]";
}
[(set_attr "type" "neon_mla_<V_elem_ch>_scalar")]
)
(define_insn "neon_vmla_lane"
[(set (match_operand:VMD 0 "s_register_operand" "=w")
(unspec:VMD [(match_operand:VMD 1 "s_register_operand" "0")
(match_operand:VMD 2 "s_register_operand" "w")
(match_operand:VMD 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_VMLA_LANE))]
"TARGET_NEON"
{
return "vmla.<V_if_elem>\t%P0, %P2, %P3[%c4]";
}
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mla_s_scalar")
(const_string "neon_mla_<V_elem_ch>_scalar")))]
)
(define_insn "neon_vmla_lane"
[(set (match_operand:VMQ 0 "s_register_operand" "=w")
(unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "0")
(match_operand:VMQ 2 "s_register_operand" "w")
(match_operand:<V_HALF> 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_VMLA_LANE))]
"TARGET_NEON"
{
return "vmla.<V_if_elem>\t%q0, %q2, %P3[%c4]";
}
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mla_s_scalar")
(const_string "neon_mla_<V_elem_ch>_scalar")))]
)
(define_insn "neon_vmlallane"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
(match_operand:VMDI 2 "s_register_operand" "w")
(match_operand:VMDI 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
VMLAL_LANE))]
"TARGET_NEON"
{
return "vmlal.%#<V_sz_elem>\t%q0, %P2, %P3[%c4]";
}
[(set_attr "type" "neon_mla<V_elem_ch>_scalar_long")]
)
(define_insn "neon_vqdmlal_lane"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
(match_operand:VMDI 2 "s_register_operand" "w")
(match_operand:VMDI 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_VQDMLAL_LANE))]
"TARGET_NEON"
{
return "vqdmlal.<V_s_elem>\t%q0, %P2, %P3[%c4]";
}
[(set_attr "type" "neon_sat_mla_<V_elem_ch>_scalar_long")]
)
(define_insn "neon_vmls_lane"
[(set (match_operand:VMD 0 "s_register_operand" "=w")
(unspec:VMD [(match_operand:VMD 1 "s_register_operand" "0")
(match_operand:VMD 2 "s_register_operand" "w")
(match_operand:VMD 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_VMLS_LANE))]
"TARGET_NEON"
{
return "vmls.<V_if_elem>\t%P0, %P2, %P3[%c4]";
}
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mla_s_scalar")
(const_string "neon_mla_<V_elem_ch>_scalar")))]
)
(define_insn "neon_vmls_lane"
[(set (match_operand:VMQ 0 "s_register_operand" "=w")
(unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "0")
(match_operand:VMQ 2 "s_register_operand" "w")
(match_operand:<V_HALF> 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_VMLS_LANE))]
"TARGET_NEON"
{
return "vmls.<V_if_elem>\t%q0, %q2, %P3[%c4]";
}
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_mla_s_scalar")
(const_string "neon_mla_<V_elem_ch>_scalar")))]
)
(define_insn "neon_vmlsllane"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
(match_operand:VMDI 2 "s_register_operand" "w")
(match_operand:VMDI 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
VMLSL_LANE))]
"TARGET_NEON"
{
return "vmlsl.%#<V_sz_elem>\t%q0, %P2, %P3[%c4]";
}
[(set_attr "type" "neon_mla<V_elem_ch>_scalar_long")]
)
(define_insn "neon_vqdmlsl_lane"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
(match_operand:VMDI 2 "s_register_operand" "w")
(match_operand:VMDI 3 "s_register_operand"
"<scalar_mul_constraint>")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_VQDMLSL_LANE))]
"TARGET_NEON"
{
return "vqdmlsl.<V_s_elem>\t%q0, %P2, %P3[%c4]";
}
[(set_attr "type" "neon_sat_mla_<V_elem_ch>_scalar_long")]
)
; FIXME: For the "_n" multiply/multiply-accumulate insns, we copy a value in a
; core register into a temp register, then use a scalar taken from that. This
; isn't an optimal solution if e.g. the scalar has just been read from memory
; or extracted from another vector. The latter case it's currently better to
; use the "_lane" variant, and the former case can probably be implemented
; using vld1_lane, but that hasn't been done yet.
(define_expand "neon_vmul_n"
[(match_operand:VMD 0 "s_register_operand")
(match_operand:VMD 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vmul_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vmul_n"
[(match_operand:VMQ 0 "s_register_operand")
(match_operand:VMQ 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vmul_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vmul_n"
[(match_operand:VH 0 "s_register_operand")
(match_operand:VH 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON_FP16INST"
{
rtx tmp = gen_reg_rtx (V4HFmode);
emit_insn (gen_neon_vset_lanev4hf (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vmul_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vmulls_n"
[(match_operand:<V_widen> 0 "s_register_operand")
(match_operand:VMDI 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vmulls_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vmullu_n"
[(match_operand:<V_widen> 0 "s_register_operand")
(match_operand:VMDI 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vmullu_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vqdmull_n"
[(match_operand:<V_widen> 0 "s_register_operand")
(match_operand:VMDI 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vqdmull_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vqdmulh_n"
[(match_operand:VMDI 0 "s_register_operand")
(match_operand:VMDI 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vqdmulh_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vqrdmulh_n"
[(match_operand:VMDI 0 "s_register_operand")
(match_operand:VMDI 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vqrdmulh_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vqdmulh_n"
[(match_operand:VMQI 0 "s_register_operand")
(match_operand:VMQI 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vqdmulh_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vqrdmulh_n"
[(match_operand:VMQI 0 "s_register_operand")
(match_operand:VMQI 1 "s_register_operand")
(match_operand:<V_elem> 2 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[2], tmp, const0_rtx));
emit_insn (gen_neon_vqrdmulh_lane (operands[0], operands[1], tmp,
const0_rtx));
DONE;
})
(define_expand "neon_vmla_n"
[(match_operand:VMD 0 "s_register_operand")
(match_operand:VMD 1 "s_register_operand")
(match_operand:VMD 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vmla_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_expand "neon_vmla_n"
[(match_operand:VMQ 0 "s_register_operand")
(match_operand:VMQ 1 "s_register_operand")
(match_operand:VMQ 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vmla_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_expand "neon_vmlals_n"
[(match_operand:<V_widen> 0 "s_register_operand")
(match_operand:<V_widen> 1 "s_register_operand")
(match_operand:VMDI 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vmlals_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_expand "neon_vmlalu_n"
[(match_operand:<V_widen> 0 "s_register_operand")
(match_operand:<V_widen> 1 "s_register_operand")
(match_operand:VMDI 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vmlalu_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_expand "neon_vqdmlal_n"
[(match_operand:<V_widen> 0 "s_register_operand")
(match_operand:<V_widen> 1 "s_register_operand")
(match_operand:VMDI 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vqdmlal_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_expand "neon_vmls_n"
[(match_operand:VMD 0 "s_register_operand")
(match_operand:VMD 1 "s_register_operand")
(match_operand:VMD 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vmls_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_expand "neon_vmls_n"
[(match_operand:VMQ 0 "s_register_operand")
(match_operand:VMQ 1 "s_register_operand")
(match_operand:VMQ 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (<V_HALF>mode);
emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vmls_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_expand "neon_vmlsls_n"
[(match_operand:<V_widen> 0 "s_register_operand")
(match_operand:<V_widen> 1 "s_register_operand")
(match_operand:VMDI 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vmlsls_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_expand "neon_vmlslu_n"
[(match_operand:<V_widen> 0 "s_register_operand")
(match_operand:<V_widen> 1 "s_register_operand")
(match_operand:VMDI 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vmlslu_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_expand "neon_vqdmlsl_n"
[(match_operand:<V_widen> 0 "s_register_operand")
(match_operand:<V_widen> 1 "s_register_operand")
(match_operand:VMDI 2 "s_register_operand")
(match_operand:<V_elem> 3 "s_register_operand")]
"TARGET_NEON"
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx));
emit_insn (gen_neon_vqdmlsl_lane (operands[0], operands[1], operands[2],
tmp, const0_rtx));
DONE;
})
(define_insn "@neon_vext"
[(set (match_operand:VDQX 0 "s_register_operand" "=w")
(unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")
(match_operand:VDQX 2 "s_register_operand" "w")
(match_operand:SI 3 "immediate_operand" "i")]
UNSPEC_VEXT))]
"TARGET_NEON"
{
arm_const_bounds (operands[3], 0, GET_MODE_NUNITS (mode));
return "vext.<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2, %3";
}
[(set_attr "type" "neon_ext")]
)
(define_insn "@neon_vrev64"
[(set (match_operand:VDQ 0 "s_register_operand" "=w")
(unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w")]
UNSPEC_VREV64))]
"TARGET_NEON"
"vrev64.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_rev")]
)
(define_insn "@neon_vrev32"
[(set (match_operand:VX 0 "s_register_operand" "=w")
(unspec:VX [(match_operand:VX 1 "s_register_operand" "w")]
UNSPEC_VREV32))]
"TARGET_NEON"
"vrev32.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_rev")]
)
(define_insn "@neon_vrev16"
[(set (match_operand:VE 0 "s_register_operand" "=w")
(unspec:VE [(match_operand:VE 1 "s_register_operand" "w")]
UNSPEC_VREV16))]
"TARGET_NEON"
"vrev16.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
[(set_attr "type" "neon_rev")]
)
; vbsl_* intrinsics may compile to any of vbsl/vbif/vbit depending on register
; allocation. For an intrinsic of form:
; rD = vbsl_* (rS, rN, rM)
; We can use any of:
; vbsl rS, rN, rM (if D = S)
; vbit rD, rN, rS (if D = M, so 1-bits in rS choose bits from rN, else rM)
; vbif rD, rM, rS (if D = N, so 0-bits in rS choose bits from rM, else rN)
(define_insn "neon_vbsl_internal"
[(set (match_operand:VDQX 0 "s_register_operand" "=w,w,w")
(unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" " 0,w,w")
(match_operand:VDQX 2 "s_register_operand" " w,w,0")
(match_operand:VDQX 3 "s_register_operand" " w,0,w")]
UNSPEC_VBSL))]
"TARGET_NEON"
"@
vbsl\t%<V_reg>0, %<V_reg>2, %<V_reg>3
vbit\t%<V_reg>0, %<V_reg>2, %<V_reg>1
vbif\t%<V_reg>0, %<V_reg>3, %<V_reg>1"
[(set_attr "type" "neon_bsl")]
)
(define_expand "@neon_vbsl"
[(set (match_operand:VDQX 0 "s_register_operand")
(unspec:VDQX [(match_operand:<V_cmp_result> 1 "s_register_operand")
(match_operand:VDQX 2 "s_register_operand")
(match_operand:VDQX 3 "s_register_operand")]
UNSPEC_VBSL))]
"TARGET_NEON"
{
/* We can't alias operands together if they have different modes. */
operands[1] = gen_lowpart (mode, operands[1]);
})
;; vshl, vrshl
(define_insn "neon_v<shift_op>"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
(match_operand:VDQIX 2 "s_register_operand" "w")]
VSHL))]
"TARGET_NEON"
"v<shift_op>.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_shift_imm")]
)
;; vqshl, vqrshl
(define_insn "neon_v<shift_op>"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
(match_operand:VDQIX 2 "s_register_operand" "w")]
VQSHL))]
"TARGET_NEON"
"v<shift_op>.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_sat_shift_imm")]
)
;; vshr_n, vrshr_n
(define_insn "neon_v<shift_op>_n"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VSHR_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[2], 1, neon_element_bits (mode) + 1);
return "v<shift_op>.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %2";
}
[(set_attr "type" "neon_shift_imm")]
)
;; vshrn_n, vrshrn_n
(define_insn "neon_v<shift_op>_n"
[(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VSHRN_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[2], 1, neon_element_bits (mode) / 2 + 1);
return "v<shift_op>.<V_if_elem>\t%P0, %q1, %2";
}
[(set_attr "type" "neon_shift_imm_narrow_q")]
)
;; vqshrn_n, vqrshrn_n
(define_insn "neon_v<shift_op>_n"
[(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VQSHRN_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[2], 1, neon_element_bits (mode) / 2 + 1);
return "v<shift_op>.%#<V_sz_elem>\t%P0, %q1, %2";
}
[(set_attr "type" "neon_sat_shift_imm_narrow_q")]
)
;; vqshrun_n, vqrshrun_n
(define_insn "neon_v<shift_op>_n"
[(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VQSHRUN_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[2], 1, neon_element_bits (mode) / 2 + 1);
return "v<shift_op>.<V_s_elem>\t%P0, %q1, %2";
}
[(set_attr "type" "neon_sat_shift_imm_narrow_q")]
)
(define_insn "neon_vshl_n"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
UNSPEC_VSHL_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[2], 0, neon_element_bits (mode));
return "vshl.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %2";
}
[(set_attr "type" "neon_shift_imm")]
)
(define_insn "neon_vqshl__n"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VQSHL_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[2], 0, neon_element_bits (mode));
return "vqshl.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %2";
}
[(set_attr "type" "neon_sat_shift_imm")]
)
(define_insn "neon_vqshlu_n"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
UNSPEC_VQSHLU_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[2], 0, neon_element_bits (mode));
return "vqshlu.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %2";
}
[(set_attr "type" "neon_sat_shift_imm")]
)
(define_insn "neon_vshll_n"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
VSHLL_N))]
"TARGET_NEON"
{
/* The boundaries are: 0 < imm <= size. */
arm_const_bounds (operands[2], 0, neon_element_bits (mode) + 1);
return "vshll.%#<V_sz_elem>\t%q0, %P1, %2";
}
[(set_attr "type" "neon_shift_imm_long")]
)
;; vsra_n, vrsra_n
(define_insn "neon_v<shift_op>_n"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0")
(match_operand:VDQIX 2 "s_register_operand" "w")
(match_operand:SI 3 "immediate_operand" "i")]
VSRA_N))]
"TARGET_NEON"
{
arm_const_bounds (operands[3], 1, neon_element_bits (mode) + 1);
return "v<shift_op>.%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %3";
}
[(set_attr "type" "neon_shift_acc")]
)
(define_insn "neon_vsri_n"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0")
(match_operand:VDQIX 2 "s_register_operand" "w")
(match_operand:SI 3 "immediate_operand" "i")]
UNSPEC_VSRI))]
"TARGET_NEON"
{
arm_const_bounds (operands[3], 1, neon_element_bits (mode) + 1);
return "vsri.<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %3";
}
[(set_attr "type" "neon_shift_reg")]
)
(define_insn "neon_vsli_n"
[(set (match_operand:VDQIX 0 "s_register_operand" "=w")
(unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0")
(match_operand:VDQIX 2 "s_register_operand" "w")
(match_operand:SI 3 "immediate_operand" "i")]
UNSPEC_VSLI))]
"TARGET_NEON"
{
arm_const_bounds (operands[3], 0, neon_element_bits (mode));
return "vsli.<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %3";
}
[(set_attr "type" "neon_shift_reg")]
)
(define_insn "neon_vtbl1v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "w")
(match_operand:V8QI 2 "s_register_operand" "w")]
UNSPEC_VTBL))]
"TARGET_NEON"
"vtbl.8\t%P0, {%P1}, %P2"
[(set_attr "type" "neon_tbl1")]
)
(define_insn "neon_vtbl2v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:TI 1 "s_register_operand" "w")
(match_operand:V8QI 2 "s_register_operand" "w")]
UNSPEC_VTBL))]
"TARGET_NEON"
{
rtx ops[4];
int tabbase = REGNO (operands[1]);
ops[0] = operands[0];
ops[1] = gen_rtx_REG (V8QImode, tabbase);
ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
ops[3] = operands[2];
output_asm_insn ("vtbl.8\t%P0, {%P1, %P2}, %P3", ops);
return "";
}
[(set_attr "type" "neon_tbl2")]
)
(define_insn "neon_vtbl3v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:EI 1 "s_register_operand" "w")
(match_operand:V8QI 2 "s_register_operand" "w")]
UNSPEC_VTBL))]
"TARGET_NEON"
{
rtx ops[5];
int tabbase = REGNO (operands[1]);
ops[0] = operands[0];
ops[1] = gen_rtx_REG (V8QImode, tabbase);
ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
ops[4] = operands[2];
output_asm_insn ("vtbl.8\t%P0, {%P1, %P2, %P3}, %P4", ops);
return "";
}
[(set_attr "type" "neon_tbl3")]
)
(define_insn "neon_vtbl4v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:OI 1 "s_register_operand" "w")
(match_operand:V8QI 2 "s_register_operand" "w")]
UNSPEC_VTBL))]
"TARGET_NEON"
{
rtx ops[6];
int tabbase = REGNO (operands[1]);
ops[0] = operands[0];
ops[1] = gen_rtx_REG (V8QImode, tabbase);
ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
ops[4] = gen_rtx_REG (V8QImode, tabbase + 6);
ops[5] = operands[2];
output_asm_insn ("vtbl.8\t%P0, {%P1, %P2, %P3, %P4}, %P5", ops);
return "";
}
[(set_attr "type" "neon_tbl4")]
)
;; These three are used by the vec_perm infrastructure for V16QImode.
(define_insn_and_split "neon_vtbl1v16qi"
[(set (match_operand:V16QI 0 "s_register_operand" "=&w")
(unspec:V16QI [(match_operand:V16QI 1 "s_register_operand" "w")
(match_operand:V16QI 2 "s_register_operand" "w")]
UNSPEC_VTBL))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
rtx op0, op1, op2, part0, part2;
unsigned ofs;
op0 = operands[0];
op1 = gen_lowpart (TImode, operands[1]);
op2 = operands[2];
ofs = subreg_lowpart_offset (V8QImode, V16QImode);
part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
ofs = subreg_highpart_offset (V8QImode, V16QImode);
part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
DONE;
}
[(set_attr "type" "multiple")]
)
(define_insn_and_split "neon_vtbl2v16qi"
[(set (match_operand:V16QI 0 "s_register_operand" "=&w")
(unspec:V16QI [(match_operand:OI 1 "s_register_operand" "w")
(match_operand:V16QI 2 "s_register_operand" "w")]
UNSPEC_VTBL))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
rtx op0, op1, op2, part0, part2;
unsigned ofs;
op0 = operands[0];
op1 = operands[1];
op2 = operands[2];
ofs = subreg_lowpart_offset (V8QImode, V16QImode);
part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
ofs = subreg_highpart_offset (V8QImode, V16QImode);
part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
DONE;
}
[(set_attr "type" "multiple")]
)
;; ??? Logically we should extend the regular neon_vcombine pattern to
;; handle quad-word input modes, producing octa-word output modes. But
;; that requires us to add support for octa-word vector modes in moves.
;; That seems overkill for this one use in vec_perm.
(define_insn_and_split "neon_vcombinev16qi"
[(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:V16QI 1 "s_register_operand" "w")
(match_operand:V16QI 2 "s_register_operand" "w")]
UNSPEC_VCONCAT))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
neon_split_vcombine (operands);
DONE;
}
[(set_attr "type" "multiple")]
)
(define_insn "neon_vtbx1v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
(match_operand:V8QI 2 "s_register_operand" "w")
(match_operand:V8QI 3 "s_register_operand" "w")]
UNSPEC_VTBX))]
"TARGET_NEON"
"vtbx.8\t%P0, {%P2}, %P3"
[(set_attr "type" "neon_tbl1")]
)
(define_insn "neon_vtbx2v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
(match_operand:TI 2 "s_register_operand" "w")
(match_operand:V8QI 3 "s_register_operand" "w")]
UNSPEC_VTBX))]
"TARGET_NEON"
{
rtx ops[4];
int tabbase = REGNO (operands[2]);
ops[0] = operands[0];
ops[1] = gen_rtx_REG (V8QImode, tabbase);
ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
ops[3] = operands[3];
output_asm_insn ("vtbx.8\t%P0, {%P1, %P2}, %P3", ops);
return "";
}
[(set_attr "type" "neon_tbl2")]
)
(define_insn "neon_vtbx3v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
(match_operand:EI 2 "s_register_operand" "w")
(match_operand:V8QI 3 "s_register_operand" "w")]
UNSPEC_VTBX))]
"TARGET_NEON"
{
rtx ops[5];
int tabbase = REGNO (operands[2]);
ops[0] = operands[0];
ops[1] = gen_rtx_REG (V8QImode, tabbase);
ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
ops[4] = operands[3];
output_asm_insn ("vtbx.8\t%P0, {%P1, %P2, %P3}, %P4", ops);
return "";
}
[(set_attr "type" "neon_tbl3")]
)
(define_insn "neon_vtbx4v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
(match_operand:OI 2 "s_register_operand" "w")
(match_operand:V8QI 3 "s_register_operand" "w")]
UNSPEC_VTBX))]
"TARGET_NEON"
{
rtx ops[6];
int tabbase = REGNO (operands[2]);
ops[0] = operands[0];
ops[1] = gen_rtx_REG (V8QImode, tabbase);
ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
ops[4] = gen_rtx_REG (V8QImode, tabbase + 6);
ops[5] = operands[3];
output_asm_insn ("vtbx.8\t%P0, {%P1, %P2, %P3, %P4}, %P5", ops);
return "";
}
[(set_attr "type" "neon_tbl4")]
)
(define_expand "@neon_vtrn_internal"
[(parallel
[(set (match_operand:VDQWH 0 "s_register_operand")
(unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand")
(match_operand:VDQWH 2 "s_register_operand")]
UNSPEC_VTRN1))
(set (match_operand:VDQWH 3 "s_register_operand")
(unspec:VDQWH [(match_dup 1) (match_dup 2)] UNSPEC_VTRN2))])]
"TARGET_NEON"
""
)
;; Note: Different operand numbering to handle tied registers correctly.
(define_insn "*neon_vtrn_insn"
[(set (match_operand:VDQWH 0 "s_register_operand" "=&w")
(unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand" "0")
(match_operand:VDQWH 3 "s_register_operand" "2")]
UNSPEC_VTRN1))
(set (match_operand:VDQWH 2 "s_register_operand" "=&w")
(unspec:VDQWH [(match_dup 1) (match_dup 3)]
UNSPEC_VTRN2))]
"TARGET_NEON"
"vtrn.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
[(set_attr "type" "neon_permute")]
)
(define_expand "@neon_vzip_internal"
[(parallel
[(set (match_operand:VDQWH 0 "s_register_operand")
(unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand")
(match_operand:VDQWH 2 "s_register_operand")]
UNSPEC_VZIP1))
(set (match_operand:VDQWH 3 "s_register_operand")
(unspec:VDQWH [(match_dup 1) (match_dup 2)] UNSPEC_VZIP2))])]
"TARGET_NEON"
""
)
;; Note: Different operand numbering to handle tied registers correctly.
(define_insn "*neon_vzip_insn"
[(set (match_operand:VDQWH 0 "s_register_operand" "=&w")
(unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand" "0")
(match_operand:VDQWH 3 "s_register_operand" "2")]
UNSPEC_VZIP1))
(set (match_operand:VDQWH 2 "s_register_operand" "=&w")
(unspec:VDQWH [(match_dup 1) (match_dup 3)]
UNSPEC_VZIP2))]
"TARGET_NEON"
"vzip.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
[(set_attr "type" "neon_zip")]
)
(define_expand "@neon_vuzp_internal"
[(parallel
[(set (match_operand:VDQWH 0 "s_register_operand")
(unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand")
(match_operand:VDQWH 2 "s_register_operand")]
UNSPEC_VUZP1))
(set (match_operand:VDQWH 3 "s_register_operand")
(unspec:VDQWH [(match_dup 1) (match_dup 2)] UNSPEC_VUZP2))])]
"TARGET_NEON"
""
)
;; Note: Different operand numbering to handle tied registers correctly.
(define_insn "*neon_vuzp_insn"
[(set (match_operand:VDQWH 0 "s_register_operand" "=&w")
(unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand" "0")
(match_operand:VDQWH 3 "s_register_operand" "2")]
UNSPEC_VUZP1))
(set (match_operand:VDQWH 2 "s_register_operand" "=&w")
(unspec:VDQWH [(match_dup 1) (match_dup 3)]
UNSPEC_VUZP2))]
"TARGET_NEON"
"vuzp.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
[(set_attr "type" "neon_zip")]
)
(define_expand "vec_load_lanes"
[(set (match_operand:VDQX 0 "s_register_operand")
(unspec:VDQX [(match_operand:VDQX 1 "neon_struct_operand")]
UNSPEC_VLD1))]
"TARGET_NEON")
(define_insn "neon_vld1"
[(set (match_operand:VDQX 0 "s_register_operand" "=w")
(unspec:VDQX [(match_operand:VDQX 1 "neon_struct_operand" "Um")]
UNSPEC_VLD1))]
"TARGET_NEON"
"vld1.<V_sz_elem>\t%h0, %A1"
[(set_attr "type" "neon_load1_1reg")]
)
;; The lane numbers in the RTL are in GCC lane order, having been flipped
;; in arm_expand_neon_args. The lane numbers are restored to architectural
;; lane order here.
(define_insn "neon_vld1_lane"
[(set (match_operand:VDX 0 "s_register_operand" "=w")
(unspec:VDX [(match_operand:<V_elem> 1 "neon_struct_operand" "Um")
(match_operand:VDX 2 "s_register_operand" "0")
(match_operand:SI 3 "immediate_operand" "i")]
UNSPEC_VLD1_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[3]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
operands[3] = GEN_INT (lane);
if (max == 1)
return "vld1.<V_sz_elem>\t%P0, %A1";
else
return "vld1.<V_sz_elem>\t{%P0[%c3]}, %A1";
}
[(set_attr "type" "neon_load1_one_lane")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vld1_lane"
[(set (match_operand:VQX 0 "s_register_operand" "=w")
(unspec:VQX [(match_operand:<V_elem> 1 "neon_struct_operand" "Um")
(match_operand:VQX 2 "s_register_operand" "0")
(match_operand:SI 3 "immediate_operand" "i")]
UNSPEC_VLD1_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[3]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
operands[3] = GEN_INT (lane);
int regno = REGNO (operands[0]);
if (lane >= max / 2)
{
lane -= max / 2;
regno += 2;
operands[3] = GEN_INT (lane);
}
operands[0] = gen_rtx_REG (<V_HALF>mode, regno);
if (max == 2)
return "vld1.<V_sz_elem>\t%P0, %A1";
else
return "vld1.<V_sz_elem>\t{%P0[%c3]}, %A1";
}
[(set_attr "type" "neon_load1_one_lane")]
)
(define_insn "neon_vld1_dup"
[(set (match_operand:VD_LANE 0 "s_register_operand" "=w")
(vec_duplicate:VD_LANE (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
"TARGET_NEON"
"vld1.<V_sz_elem>\t{%P0[]}, %A1"
[(set_attr "type" "neon_load1_all_lanes")]
)
;; Special case for DImode. Treat it exactly like a simple load.
(define_expand "neon_vld1_dupdi"
[(set (match_operand:DI 0 "s_register_operand")
(unspec:DI [(match_operand:DI 1 "neon_struct_operand")]
UNSPEC_VLD1))]
"TARGET_NEON"
""
)
(define_insn "neon_vld1_dup"
[(set (match_operand:VQ2 0 "s_register_operand" "=w")
(vec_duplicate:VQ2 (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
"TARGET_NEON"
{
return "vld1.<V_sz_elem>\t{%e0[], %f0[]}, %A1";
}
[(set_attr "type" "neon_load1_all_lanes")]
)
(define_insn_and_split "neon_vld1_dupv2di"
[(set (match_operand:V2DI 0 "s_register_operand" "=w")
(vec_duplicate:V2DI (match_operand:DI 1 "neon_struct_operand" "Um")))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
rtx tmprtx = gen_lowpart (DImode, operands[0]);
emit_insn (gen_neon_vld1_dupdi (tmprtx, operands[1]));
emit_move_insn (gen_highpart (DImode, operands[0]), tmprtx );
DONE;
}
[(set_attr "length" "8")
(set_attr "type" "neon_load1_all_lanes_q")]
)
(define_expand "vec_store_lanes"
[(set (match_operand:VDQX 0 "neon_struct_operand")
(unspec:VDQX [(match_operand:VDQX 1 "s_register_operand")]
UNSPEC_VST1))]
"TARGET_NEON")
(define_insn "neon_vst1"
[(set (match_operand:VDQX 0 "neon_struct_operand" "=Um")
(unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")]
UNSPEC_VST1))]
"TARGET_NEON"
"vst1.<V_sz_elem>\t%h1, %A0"
[(set_attr "type" "neon_store1_1reg")])
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vst1_lane"
[(set (match_operand:<V_elem> 0 "neon_struct_operand" "=Um")
(unspec:<V_elem>
[(match_operand:VDX 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
UNSPEC_VST1_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[2]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
operands[2] = GEN_INT (lane);
if (max == 1)
return "vst1.<V_sz_elem>\t{%P1}, %A0";
else
return "vst1.<V_sz_elem>\t{%P1[%c2]}, %A0";
}
[(set_attr "type" "neon_store1_one_lane")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vst1_lane"
[(set (match_operand:<V_elem> 0 "neon_struct_operand" "=Um")
(unspec:<V_elem>
[(match_operand:VQX 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
UNSPEC_VST1_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[2]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
int regno = REGNO (operands[1]);
if (lane >= max / 2)
{
lane -= max / 2;
regno += 2;
}
operands[2] = GEN_INT (lane);
operands[1] = gen_rtx_REG (<V_HALF>mode, regno);
if (max == 2)
return "vst1.<V_sz_elem>\t{%P1}, %A0";
else
return "vst1.<V_sz_elem>\t{%P1[%c2]}, %A0";
}
[(set_attr "type" "neon_store1_one_lane")]
)
(define_expand "vec_load_lanesti"
[(set (match_operand:TI 0 "s_register_operand")
(unspec:TI [(match_operand:TI 1 "neon_struct_operand")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2))]
"TARGET_NEON")
(define_insn "neon_vld2"
[(set (match_operand:TI 0 "s_register_operand" "=w")
(unspec:TI [(match_operand:TI 1 "neon_struct_operand" "Um")
(unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2))]
"TARGET_NEON"
{
if (<V_sz_elem> == 64)
return "vld1.64\t%h0, %A1";
else
return "vld2.<V_sz_elem>\t%h0, %A1";
}
[(set (attr "type")
(if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
(const_string "neon_load1_2reg")
(const_string "neon_load2_2reg")))]
)
(define_expand "vec_load_lanesoi"
[(set (match_operand:OI 0 "s_register_operand")
(unspec:OI [(match_operand:OI 1 "neon_struct_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2))]
"TARGET_NEON")
(define_insn "neon_vld2"
[(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2))]
"TARGET_NEON"
"vld2.<V_sz_elem>\t%h0, %A1"
[(set_attr "type" "neon_load2_2reg_q")])
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vld2_lane"
[(set (match_operand:TI 0 "s_register_operand" "=w")
(unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um")
(match_operand:TI 2 "s_register_operand" "0")
(match_operand:SI 3 "immediate_operand" "i")
(unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[3]));
int regno = REGNO (operands[0]);
rtx ops[4];
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 2);
ops[2] = operands[1];
ops[3] = GEN_INT (lane);
output_asm_insn ("vld2.<V_sz_elem>\t{%P0[%c3], %P1[%c3]}, %A2", ops);
return "";
}
[(set_attr "type" "neon_load2_one_lane")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vld2_lane"
[(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um")
(match_operand:OI 2 "s_register_operand" "0")
(match_operand:SI 3 "immediate_operand" "i")
(unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[3]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
int regno = REGNO (operands[0]);
rtx ops[4];
if (lane >= max / 2)
{
lane -= max / 2;
regno += 2;
}
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 4);
ops[2] = operands[1];
ops[3] = GEN_INT (lane);
output_asm_insn ("vld2.<V_sz_elem>\t{%P0[%c3], %P1[%c3]}, %A2", ops);
return "";
}
[(set_attr "type" "neon_load2_one_lane")]
)
(define_insn "neon_vld2_dup"
[(set (match_operand:TI 0 "s_register_operand" "=w")
(unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um")
(unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2_DUP))]
"TARGET_NEON"
{
if (GET_MODE_NUNITS (mode) > 1)
return "vld2.<V_sz_elem>\t{%e0[], %f0[]}, %A1";
else
return "vld1.<V_sz_elem>\t%h0, %A1";
}
[(set (attr "type")
(if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
(const_string "neon_load2_all_lanes")
(const_string "neon_load1_1reg")))]
)
(define_insn "neon_vld2_dupv8bf"
[(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:V2BF 1 "neon_struct_operand" "Um")
(unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2_DUP))]
"TARGET_BF16_SIMD"
{
rtx ops[5];
int tabbase = REGNO (operands[0]);
ops[4] = operands[1];
ops[0] = gen_rtx_REG (V4BFmode, tabbase);
ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2);
ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4);
ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6);
output_asm_insn ("vld2.16\t{%P0, %P1, %P2, %P3}, %A4", ops);
return "";
}
[(set_attr "type" "neon_load2_all_lanes_q")]
)
(define_expand "vec_store_lanesti"
[(set (match_operand:TI 0 "neon_struct_operand")
(unspec:TI [(match_operand:TI 1 "s_register_operand")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST2))]
"TARGET_NEON")
(define_insn "neon_vst2"
[(set (match_operand:TI 0 "neon_struct_operand" "=Um")
(unspec:TI [(match_operand:TI 1 "s_register_operand" "w")
(unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST2))]
"TARGET_NEON"
{
if (<V_sz_elem> == 64)
return "vst1.64\t%h1, %A0";
else
return "vst2.<V_sz_elem>\t%h1, %A0";
}
[(set (attr "type")
(if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
(const_string "neon_store1_2reg")
(const_string "neon_store2_one_lane")))]
)
(define_expand "vec_store_lanesoi"
[(set (match_operand:OI 0 "neon_struct_operand")
(unspec:OI [(match_operand:OI 1 "s_register_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST2))]
"TARGET_NEON")
(define_insn "neon_vst2"
[(set (match_operand:OI 0 "neon_struct_operand" "=Um")
(unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST2))]
"TARGET_NEON"
"vst2.<V_sz_elem>\t%h1, %A0"
[(set_attr "type" "neon_store2_4reg")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vst2_lane"
[(set (match_operand:<V_two_elem> 0 "neon_struct_operand" "=Um")
(unspec:<V_two_elem>
[(match_operand:TI 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")
(unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST2_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[2]));
int regno = REGNO (operands[1]);
rtx ops[4];
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno);
ops[2] = gen_rtx_REG (DImode, regno + 2);
ops[3] = GEN_INT (lane);
output_asm_insn ("vst2.<V_sz_elem>\t{%P1[%c3], %P2[%c3]}, %A0", ops);
return "";
}
[(set_attr "type" "neon_store2_one_lane")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vst2_lane"
[(set (match_operand:<V_two_elem> 0 "neon_struct_operand" "=Um")
(unspec:<V_two_elem>
[(match_operand:OI 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")
(unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST2_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[2]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
int regno = REGNO (operands[1]);
rtx ops[4];
if (lane >= max / 2)
{
lane -= max / 2;
regno += 2;
}
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno);
ops[2] = gen_rtx_REG (DImode, regno + 4);
ops[3] = GEN_INT (lane);
output_asm_insn ("vst2.<V_sz_elem>\t{%P1[%c3], %P2[%c3]}, %A0", ops);
return "";
}
[(set_attr "type" "neon_store2_one_lane")]
)
(define_expand "vec_load_lanesei"
[(set (match_operand:EI 0 "s_register_operand")
(unspec:EI [(match_operand:EI 1 "neon_struct_operand")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3))]
"TARGET_NEON")
(define_insn "neon_vld3"
[(set (match_operand:EI 0 "s_register_operand" "=w")
(unspec:EI [(match_operand:EI 1 "neon_struct_operand" "Um")
(unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3))]
"TARGET_NEON"
{
if (<V_sz_elem> == 64)
return "vld1.64\t%h0, %A1";
else
return "vld3.<V_sz_elem>\t%h0, %A1";
}
[(set (attr "type")
(if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
(const_string "neon_load1_3reg")
(const_string "neon_load3_3reg")))]
)
(define_expand "vec_load_lanesci"
[(match_operand:CI 0 "s_register_operand")
(match_operand:CI 1 "neon_struct_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
emit_insn (gen_neon_vld3 (operands[0], operands[1]));
DONE;
})
(define_expand "neon_vld3"
[(match_operand:CI 0 "s_register_operand")
(match_operand:CI 1 "neon_struct_operand")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
rtx mem;
mem = adjust_address (operands[1], EImode, 0);
emit_insn (gen_neon_vld3qa (operands[0], mem));
mem = adjust_address (mem, EImode, GET_MODE_SIZE (EImode));
emit_insn (gen_neon_vld3qb (operands[0], mem, operands[0]));
DONE;
})
(define_insn "neon_vld3qa"
[(set (match_operand:CI 0 "s_register_operand" "=w")
(unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3A))]
"TARGET_NEON"
{
int regno = REGNO (operands[0]);
rtx ops[4];
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 4);
ops[2] = gen_rtx_REG (DImode, regno + 8);
ops[3] = operands[1];
output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, %A3", ops);
return "";
}
[(set_attr "type" "neon_load3_3reg")]
)
(define_insn "neon_vld3qb"
[(set (match_operand:CI 0 "s_register_operand" "=w")
(unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um")
(match_operand:CI 2 "s_register_operand" "0")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3B))]
"TARGET_NEON"
{
int regno = REGNO (operands[0]);
rtx ops[4];
ops[0] = gen_rtx_REG (DImode, regno + 2);
ops[1] = gen_rtx_REG (DImode, regno + 6);
ops[2] = gen_rtx_REG (DImode, regno + 10);
ops[3] = operands[1];
output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, %A3", ops);
return "";
}
[(set_attr "type" "neon_load3_3reg")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vld3_lane"
[(set (match_operand:EI 0 "s_register_operand" "=w")
(unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um")
(match_operand:EI 2 "s_register_operand" "0")
(match_operand:SI 3 "immediate_operand" "i")
(unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N (mode, INTVAL (operands[3]));
int regno = REGNO (operands[0]);
rtx ops[5];
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 2);
ops[2] = gen_rtx_REG (DImode, regno + 4);
ops[3] = operands[1];
ops[4] = GEN_INT (lane);
output_asm_insn ("vld3.<V_sz_elem>\t{%P0[%c4], %P1[%c4], %P2[%c4]}, %3",
ops);
return "";
}
[(set_attr "type" "neon_load3_one_lane")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vld3_lane"
[(set (match_operand:CI 0 "s_register_operand" "=w")
(unspec:CI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um")
(match_operand:CI 2 "s_register_operand" "0")
(match_operand:SI 3 "immediate_operand" "i")
(unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[3]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
int regno = REGNO (operands[0]);
rtx ops[5];
if (lane >= max / 2)
{
lane -= max / 2;
regno += 2;
}
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 4);
ops[2] = gen_rtx_REG (DImode, regno + 8);
ops[3] = operands[1];
ops[4] = GEN_INT (lane);
output_asm_insn ("vld3.<V_sz_elem>\t{%P0[%c4], %P1[%c4], %P2[%c4]}, %3",
ops);
return "";
}
[(set_attr "type" "neon_load3_one_lane")]
)
(define_insn "neon_vld3_dup"
[(set (match_operand:EI 0 "s_register_operand" "=w")
(unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um")
(unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3_DUP))]
"TARGET_NEON"
{
if (GET_MODE_NUNITS (mode) > 1)
{
int regno = REGNO (operands[0]);
rtx ops[4];
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 2);
ops[2] = gen_rtx_REG (DImode, regno + 4);
ops[3] = operands[1];
output_asm_insn ("vld3.<V_sz_elem>\t{%P0[], %P1[], %P2[]}, %3", ops);
return "";
}
else
return "vld1.<V_sz_elem>\t%h0, %A1";
}
[(set (attr "type")
(if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
(const_string "neon_load3_all_lanes")
(const_string "neon_load1_1reg")))])
(define_insn "neon_vld3_dupv8bf"
[(set (match_operand:CI 0 "s_register_operand" "=w")
(unspec:CI [(match_operand:V2BF 1 "neon_struct_operand" "Um")
(unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2_DUP))]
"TARGET_BF16_SIMD"
{
rtx ops[4];
int tabbase = REGNO (operands[0]);
ops[3] = operands[1];
ops[0] = gen_rtx_REG (V4BFmode, tabbase);
ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2);
ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4);
output_asm_insn ("vld3.16\t{%P0[], %P1[], %P2[]}, %A3", ops);
return "";
}
[(set_attr "type" "neon_load3_all_lanes_q")]
)
(define_expand "vec_store_lanesei"
[(set (match_operand:EI 0 "neon_struct_operand")
(unspec:EI [(match_operand:EI 1 "s_register_operand")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST3))]
"TARGET_NEON")
(define_insn "neon_vst3"
[(set (match_operand:EI 0 "neon_struct_operand" "=Um")
(unspec:EI [(match_operand:EI 1 "s_register_operand" "w")
(unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST3))]
"TARGET_NEON"
{
if (<V_sz_elem> == 64)
return "vst1.64\t%h1, %A0";
else
return "vst3.<V_sz_elem>\t%h1, %A0";
}
[(set (attr "type")
(if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
(const_string "neon_store1_3reg")
(const_string "neon_store3_one_lane")))])
(define_expand "vec_store_lanesci"
[(match_operand:CI 0 "neon_struct_operand")
(match_operand:CI 1 "s_register_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
emit_insn (gen_neon_vst3 (operands[0], operands[1]));
DONE;
})
(define_expand "neon_vst3"
[(match_operand:CI 0 "neon_struct_operand")
(match_operand:CI 1 "s_register_operand")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
rtx mem;
mem = adjust_address (operands[0], EImode, 0);
emit_insn (gen_neon_vst3qa (mem, operands[1]));
mem = adjust_address (mem, EImode, GET_MODE_SIZE (EImode));
emit_insn (gen_neon_vst3qb (mem, operands[1]));
DONE;
})
(define_insn "neon_vst3qa"
[(set (match_operand:EI 0 "neon_struct_operand" "=Um")
(unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST3A))]
"TARGET_NEON"
{
int regno = REGNO (operands[1]);
rtx ops[4];
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno);
ops[2] = gen_rtx_REG (DImode, regno + 4);
ops[3] = gen_rtx_REG (DImode, regno + 8);
output_asm_insn ("vst3.<V_sz_elem>\t{%P1, %P2, %P3}, %A0", ops);
return "";
}
[(set_attr "type" "neon_store3_3reg")]
)
(define_insn "neon_vst3qb"
[(set (match_operand:EI 0 "neon_struct_operand" "=Um")
(unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST3B))]
"TARGET_NEON"
{
int regno = REGNO (operands[1]);
rtx ops[4];
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno + 2);
ops[2] = gen_rtx_REG (DImode, regno + 6);
ops[3] = gen_rtx_REG (DImode, regno + 10);
output_asm_insn ("vst3.<V_sz_elem>\t{%P1, %P2, %P3}, %A0", ops);
return "";
}
[(set_attr "type" "neon_store3_3reg")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vst3_lane"
[(set (match_operand:<V_three_elem> 0 "neon_struct_operand" "=Um")
(unspec:<V_three_elem>
[(match_operand:EI 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")
(unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST3_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[2]));
int regno = REGNO (operands[1]);
rtx ops[5];
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno);
ops[2] = gen_rtx_REG (DImode, regno + 2);
ops[3] = gen_rtx_REG (DImode, regno + 4);
ops[4] = GEN_INT (lane);
output_asm_insn ("vst3.<V_sz_elem>\t{%P1[%c4], %P2[%c4], %P3[%c4]}, %0",
ops);
return "";
}
[(set_attr "type" "neon_store3_one_lane")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vst3_lane"
[(set (match_operand:<V_three_elem> 0 "neon_struct_operand" "=Um")
(unspec:<V_three_elem>
[(match_operand:CI 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")
(unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST3_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[2]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
int regno = REGNO (operands[1]);
rtx ops[5];
if (lane >= max / 2)
{
lane -= max / 2;
regno += 2;
}
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno);
ops[2] = gen_rtx_REG (DImode, regno + 4);
ops[3] = gen_rtx_REG (DImode, regno + 8);
ops[4] = GEN_INT (lane);
output_asm_insn ("vst3.<V_sz_elem>\t{%P1[%c4], %P2[%c4], %P3[%c4]}, %0",
ops);
return "";
}
[(set_attr "type" "neon_store3_one_lane")]
)
(define_expand "vec_load_lanesoi"
[(set (match_operand:OI 0 "s_register_operand")
(unspec:OI [(match_operand:OI 1 "neon_struct_operand")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4))]
"TARGET_NEON")
(define_insn "neon_vld4"
[(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
(unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4))]
"TARGET_NEON"
{
if (<V_sz_elem> == 64)
return "vld1.64\t%h0, %A1";
else
return "vld4.<V_sz_elem>\t%h0, %A1";
}
[(set (attr "type")
(if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
(const_string "neon_load1_4reg")
(const_string "neon_load4_4reg")))]
)
(define_expand "vec_load_lanesxi"
[(match_operand:XI 0 "s_register_operand")
(match_operand:XI 1 "neon_struct_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
emit_insn (gen_neon_vld4 (operands[0], operands[1]));
DONE;
})
(define_expand "neon_vld4"
[(match_operand:XI 0 "s_register_operand")
(match_operand:XI 1 "neon_struct_operand")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
rtx mem;
mem = adjust_address (operands[1], OImode, 0);
emit_insn (gen_neon_vld4qa (operands[0], mem));
mem = adjust_address (mem, OImode, GET_MODE_SIZE (OImode));
emit_insn (gen_neon_vld4qb (operands[0], mem, operands[0]));
DONE;
})
(define_insn "neon_vld4qa"
[(set (match_operand:XI 0 "s_register_operand" "=w")
(unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4A))]
"TARGET_NEON"
{
int regno = REGNO (operands[0]);
rtx ops[5];
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 4);
ops[2] = gen_rtx_REG (DImode, regno + 8);
ops[3] = gen_rtx_REG (DImode, regno + 12);
ops[4] = operands[1];
output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, %A4", ops);
return "";
}
[(set_attr "type" "neon_load4_4reg")]
)
(define_insn "neon_vld4qb"
[(set (match_operand:XI 0 "s_register_operand" "=w")
(unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um")
(match_operand:XI 2 "s_register_operand" "0")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4B))]
"TARGET_NEON"
{
int regno = REGNO (operands[0]);
rtx ops[5];
ops[0] = gen_rtx_REG (DImode, regno + 2);
ops[1] = gen_rtx_REG (DImode, regno + 6);
ops[2] = gen_rtx_REG (DImode, regno + 10);
ops[3] = gen_rtx_REG (DImode, regno + 14);
ops[4] = operands[1];
output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, %A4", ops);
return "";
}
[(set_attr "type" "neon_load4_4reg")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vld4_lane"
[(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um")
(match_operand:OI 2 "s_register_operand" "0")
(match_operand:SI 3 "immediate_operand" "i")
(unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[3]));
int regno = REGNO (operands[0]);
rtx ops[6];
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 2);
ops[2] = gen_rtx_REG (DImode, regno + 4);
ops[3] = gen_rtx_REG (DImode, regno + 6);
ops[4] = operands[1];
ops[5] = GEN_INT (lane);
output_asm_insn ("vld4.<V_sz_elem>\t{%P0[%c5], %P1[%c5], %P2[%c5], %P3[%c5]}, %A4",
ops);
return "";
}
[(set_attr "type" "neon_load4_one_lane")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vld4_lane"
[(set (match_operand:XI 0 "s_register_operand" "=w")
(unspec:XI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um")
(match_operand:XI 2 "s_register_operand" "0")
(match_operand:SI 3 "immediate_operand" "i")
(unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[3]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
int regno = REGNO (operands[0]);
rtx ops[6];
if (lane >= max / 2)
{
lane -= max / 2;
regno += 2;
}
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 4);
ops[2] = gen_rtx_REG (DImode, regno + 8);
ops[3] = gen_rtx_REG (DImode, regno + 12);
ops[4] = operands[1];
ops[5] = GEN_INT (lane);
output_asm_insn ("vld4.<V_sz_elem>\t{%P0[%c5], %P1[%c5], %P2[%c5], %P3[%c5]}, %A4",
ops);
return "";
}
[(set_attr "type" "neon_load4_one_lane")]
)
(define_insn "neon_vld4_dup"
[(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um")
(unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4_DUP))]
"TARGET_NEON"
{
if (GET_MODE_NUNITS (mode) > 1)
{
int regno = REGNO (operands[0]);
rtx ops[5];
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 2);
ops[2] = gen_rtx_REG (DImode, regno + 4);
ops[3] = gen_rtx_REG (DImode, regno + 6);
ops[4] = operands[1];
output_asm_insn ("vld4.<V_sz_elem>\t{%P0[], %P1[], %P2[], %P3[]}, %A4",
ops);
return "";
}
else
return "vld1.<V_sz_elem>\t%h0, %A1";
}
[(set (attr "type")
(if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
(const_string "neon_load4_all_lanes")
(const_string "neon_load1_1reg")))]
)
(define_insn "neon_vld4_dupv8bf"
[(set (match_operand:XI 0 "s_register_operand" "=w")
(unspec:XI [(match_operand:V2BF 1 "neon_struct_operand" "Um")
(unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2_DUP))]
"TARGET_BF16_SIMD"
{
rtx ops[5];
int tabbase = REGNO (operands[0]);
ops[4] = operands[1];
ops[0] = gen_rtx_REG (V4BFmode, tabbase);
ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2);
ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4);
ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6);
output_asm_insn ("vld4.16\t{%P0[], %P1[], %P2[], %P3[]}, %A4", ops);
return "";
}
[(set_attr "type" "neon_load4_all_lanes_q")]
)
(define_expand "vec_store_lanesoi"
[(set (match_operand:OI 0 "neon_struct_operand")
(unspec:OI [(match_operand:OI 1 "s_register_operand")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST4))]
"TARGET_NEON")
(define_insn "neon_vst4"
[(set (match_operand:OI 0 "neon_struct_operand" "=Um")
(unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
(unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST4))]
"TARGET_NEON"
{
if (<V_sz_elem> == 64)
return "vst1.64\t%h1, %A0";
else
return "vst4.<V_sz_elem>\t%h1, %A0";
}
[(set (attr "type")
(if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
(const_string "neon_store1_4reg")
(const_string "neon_store4_4reg")))]
)
(define_expand "vec_store_lanesxi"
[(match_operand:XI 0 "neon_struct_operand")
(match_operand:XI 1 "s_register_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
emit_insn (gen_neon_vst4 (operands[0], operands[1]));
DONE;
})
(define_expand "neon_vst4"
[(match_operand:XI 0 "neon_struct_operand")
(match_operand:XI 1 "s_register_operand")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
rtx mem;
mem = adjust_address (operands[0], OImode, 0);
emit_insn (gen_neon_vst4qa (mem, operands[1]));
mem = adjust_address (mem, OImode, GET_MODE_SIZE (OImode));
emit_insn (gen_neon_vst4qb (mem, operands[1]));
DONE;
})
(define_insn "neon_vst4qa"
[(set (match_operand:OI 0 "neon_struct_operand" "=Um")
(unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST4A))]
"TARGET_NEON"
{
int regno = REGNO (operands[1]);
rtx ops[5];
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno);
ops[2] = gen_rtx_REG (DImode, regno + 4);
ops[3] = gen_rtx_REG (DImode, regno + 8);
ops[4] = gen_rtx_REG (DImode, regno + 12);
output_asm_insn ("vst4.<V_sz_elem>\t{%P1, %P2, %P3, %P4}, %A0", ops);
return "";
}
[(set_attr "type" "neon_store4_4reg")]
)
(define_insn "neon_vst4qb"
[(set (match_operand:OI 0 "neon_struct_operand" "=Um")
(unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
(unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST4B))]
"TARGET_NEON"
{
int regno = REGNO (operands[1]);
rtx ops[5];
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno + 2);
ops[2] = gen_rtx_REG (DImode, regno + 6);
ops[3] = gen_rtx_REG (DImode, regno + 10);
ops[4] = gen_rtx_REG (DImode, regno + 14);
output_asm_insn ("vst4.<V_sz_elem>\t{%P1, %P2, %P3, %P4}, %A0", ops);
return "";
}
[(set_attr "type" "neon_store4_4reg")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vst4_lane"
[(set (match_operand:<V_four_elem> 0 "neon_struct_operand" "=Um")
(unspec:<V_four_elem>
[(match_operand:OI 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")
(unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST4_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[2]));
int regno = REGNO (operands[1]);
rtx ops[6];
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno);
ops[2] = gen_rtx_REG (DImode, regno + 2);
ops[3] = gen_rtx_REG (DImode, regno + 4);
ops[4] = gen_rtx_REG (DImode, regno + 6);
ops[5] = GEN_INT (lane);
output_asm_insn ("vst4.<V_sz_elem>\t{%P1[%c5], %P2[%c5], %P3[%c5], %P4[%c5]}, %A0",
ops);
return "";
}
[(set_attr "type" "neon_store4_one_lane")]
)
;; see comment on neon_vld1_lane for reason why the lane numbers are reversed
;; here on big endian targets.
(define_insn "neon_vst4_lane"
[(set (match_operand:<V_four_elem> 0 "neon_struct_operand" "=Um")
(unspec:<V_four_elem>
[(match_operand:XI 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")
(unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST4_LANE))]
"TARGET_NEON"
{
HOST_WIDE_INT lane = NEON_ENDIAN_LANE_N(mode, INTVAL (operands[2]));
HOST_WIDE_INT max = GET_MODE_NUNITS (mode);
int regno = REGNO (operands[1]);
rtx ops[6];
if (lane >= max / 2)
{
lane -= max / 2;
regno += 2;
}
ops[0] = operands[0];
ops[1] = gen_rtx_REG (DImode, regno);
ops[2] = gen_rtx_REG (DImode, regno + 4);
ops[3] = gen_rtx_REG (DImode, regno + 8);
ops[4] = gen_rtx_REG (DImode, regno + 12);
ops[5] = GEN_INT (lane);
output_asm_insn ("vst4.<V_sz_elem>\t{%P1[%c5], %P2[%c5], %P3[%c5], %P4[%c5]}, %A0",
ops);
return "";
}
[(set_attr "type" "neon_store4_4reg")]
)
(define_insn "neon_vec_unpacklo"
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
(SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 1 "register_operand" "w")
(match_operand:VU 2 "vect_par_constant_low" ""))))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
"vmovl.<V_sz_elem> %q0, %e1"
[(set_attr "type" "neon_shift_imm_long")]
)
(define_insn "neon_vec_unpackhi"
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
(SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 1 "register_operand" "w")
(match_operand:VU 2 "vect_par_constant_high" ""))))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
"vmovl.<V_sz_elem> %q0, %f1"
[(set_attr "type" "neon_shift_imm_long")]
)
(define_expand "vec_unpackhi"
[(match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
rtvec v = rtvec_alloc (<V_mode_nunits>/2) ;
rtx t1;
int i;
for (i = 0; i < (<V_mode_nunits>/2); i++)
RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
t1 = gen_rtx_PARALLEL (mode, v);
emit_insn (gen_neon_vec_unpackhi (operands[0],
operands[1],
t1));
DONE;
}
)
(define_expand "vec_unpacklo"
[(match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
rtvec v = rtvec_alloc (<V_mode_nunits>/2) ;
rtx t1;
int i;
for (i = 0; i < (<V_mode_nunits>/2) ; i++)
RTVEC_ELT (v, i) = GEN_INT (i);
t1 = gen_rtx_PARALLEL (mode, v);
emit_insn (gen_neon_vec_unpacklo (operands[0],
operands[1],
t1));
DONE;
}
)
(define_insn "neon_vec_mult_lo_"
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
(mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 1 "register_operand" "w")
(match_operand:VU 2 "vect_par_constant_low" "")))
(SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 3 "register_operand" "w")
(match_dup 2)))))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
"vmull.<V_sz_elem> %q0, %e1, %e3"
[(set_attr "type" "neon_mul_<V_elem_ch>_long")]
)
(define_expand "vec_widen_mult_lo_"
[(match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))
(SE:<V_unpack> (match_operand:VU 2 "register_operand"))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
rtvec v = rtvec_alloc (<V_mode_nunits>/2) ;
rtx t1;
int i;
for (i = 0; i < (<V_mode_nunits>/2) ; i++)
RTVEC_ELT (v, i) = GEN_INT (i);
t1 = gen_rtx_PARALLEL (mode, v);
emit_insn (gen_neon_vec_mult_lo_ (operands[0],
operands[1],
t1,
operands[2]));
DONE;
}
)
(define_insn "neon_vec_mult_hi_"
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
(mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 1 "register_operand" "w")
(match_operand:VU 2 "vect_par_constant_high" "")))
(SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 3 "register_operand" "w")
(match_dup 2)))))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
"vmull.<V_sz_elem> %q0, %f1, %f3"
[(set_attr "type" "neon_mul_<V_elem_ch>_long")]
)
(define_expand "vec_widen_mult_hi_"
[(match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))
(SE:<V_unpack> (match_operand:VU 2 "register_operand"))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
rtvec v = rtvec_alloc (<V_mode_nunits>/2) ;
rtx t1;
int i;
for (i = 0; i < (<V_mode_nunits>/2) ; i++)
RTVEC_ELT (v, i) = GEN_INT (<V_mode_nunits>/2 + i);
t1 = gen_rtx_PARALLEL (mode, v);
emit_insn (gen_neon_vec_mult_hi_ (operands[0],
operands[1],
t1,
operands[2]));
DONE;
}
)
(define_insn "neon_vec_shiftl_"
[(set (match_operand:<V_widen> 0 "register_operand" "=w")
(SE:<V_widen> (ashift:VW (match_operand:VW 1 "register_operand" "w")
(match_operand:<V_innermode> 2 "const_neon_scalar_shift_amount_operand" ""))))]
"TARGET_NEON"
{
return "vshll.<V_sz_elem> %q0, %P1, %2";
}
[(set_attr "type" "neon_shift_imm_long")]
)
(define_expand "vec_widen_shiftl_lo_"
[(match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
emit_insn (gen_neon_vec_shiftl_<V_half> (operands[0],
simplify_gen_subreg (<V_HALF>mode, operands[1], mode, 0),
operands[2]));
DONE;
}
)
(define_expand "vec_widen_shiftl_hi_"
[(match_operand:<V_unpack> 0 "register_operand")
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
emit_insn (gen_neon_vec_shiftl_<V_half> (operands[0],
simplify_gen_subreg (<V_HALF>mode, operands[1], mode,
GET_MODE_SIZE (<V_HALF>mode)),
operands[2]));
DONE;
}
)
;; Vectorize for non-neon-quad case
(define_insn "neon_unpack_"
[(set (match_operand:<V_widen> 0 "register_operand" "=w")
(SE:<V_widen> (match_operand:VDI 1 "register_operand" "w")))]
"TARGET_NEON"
"vmovl.<V_sz_elem> %q0, %P1"
[(set_attr "type" "neon_move")]
)
(define_expand "vec_unpacklo"
[(match_operand:<V_double_width> 0 "register_operand")
(SE:<V_double_width>(match_operand:VDI 1 "register_operand"))]
"TARGET_NEON"
{
rtx tmpreg = gen_reg_rtx (<V_widen>mode);
emit_insn (gen_neon_unpack_ (tmpreg, operands[1]));
emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], tmpreg));
DONE;
}
)
(define_expand "vec_unpackhi"
[(match_operand:<V_double_width> 0 "register_operand")
(SE:<V_double_width>(match_operand:VDI 1 "register_operand"))]
"TARGET_NEON"
{
rtx tmpreg = gen_reg_rtx (<V_widen>mode);
emit_insn (gen_neon_unpack_ (tmpreg, operands[1]));
emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], tmpreg));
DONE;
}
)
(define_insn "neon_vec_mult_"
[(set (match_operand:<V_widen> 0 "register_operand" "=w")
(mult:<V_widen> (SE:<V_widen>
(match_operand:VDI 1 "register_operand" "w"))
(SE:<V_widen>
(match_operand:VDI 2 "register_operand" "w"))))]
"TARGET_NEON"
"vmull.<V_sz_elem> %q0, %P1, %P2"
[(set_attr "type" "neon_mul_<V_elem_ch>_long")]
)
(define_expand "vec_widen_mult_hi_"
[(match_operand:<V_double_width> 0 "register_operand")
(SE:<V_double_width> (match_operand:VDI 1 "register_operand"))
(SE:<V_double_width> (match_operand:VDI 2 "register_operand"))]
"TARGET_NEON"
{
rtx tmpreg = gen_reg_rtx (<V_widen>mode);
emit_insn (gen_neon_vec_mult_ (tmpreg, operands[1], operands[2]));
emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], tmpreg));
DONE;
}
)
(define_expand "vec_widen_mult_lo_"
[(match_operand:<V_double_width> 0 "register_operand")
(SE:<V_double_width> (match_operand:VDI 1 "register_operand"))
(SE:<V_double_width> (match_operand:VDI 2 "register_operand"))]
"TARGET_NEON"
{
rtx tmpreg = gen_reg_rtx (<V_widen>mode);
emit_insn (gen_neon_vec_mult_ (tmpreg, operands[1], operands[2]));
emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], tmpreg));
DONE;
}
)
(define_expand "vec_widen_shiftl_hi_"
[(match_operand:<V_double_width> 0 "register_operand")
(SE:<V_double_width> (match_operand:VDI 1 "register_operand"))
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON"
{
rtx tmpreg = gen_reg_rtx (<V_widen>mode);
emit_insn (gen_neon_vec_shiftl_ (tmpreg, operands[1], operands[2]));
emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], tmpreg));
DONE;
}
)
(define_expand "vec_widen_shiftl_lo_"
[(match_operand:<V_double_width> 0 "register_operand")
(SE:<V_double_width> (match_operand:VDI 1 "register_operand"))
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON"
{
rtx tmpreg = gen_reg_rtx (<V_widen>mode);
emit_insn (gen_neon_vec_shiftl_ (tmpreg, operands[1], operands[2]));
emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], tmpreg));
DONE;
}
)
; FIXME: These instruction patterns can't be used safely in big-endian mode
; because the ordering of vector elements in Q registers is different from what
; the semantics of the instructions require.
(define_insn "vec_pack_trunc_"
[(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
(vec_concat:<V_narrow_pack>
(truncate:<V_narrow>
(match_operand:VN 1 "register_operand" "w"))
(truncate:<V_narrow>
(match_operand:VN 2 "register_operand" "w"))))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
"vmovn.i<V_sz_elem>\t%e0, %q1;vmovn.i<V_sz_elem>\t%f0, %q2"
[(set_attr "type" "multiple")
(set_attr "length" "8")]
)
;; For the non-quad case.
(define_insn "neon_vec_pack_trunc_"
[(set (match_operand:<V_narrow> 0 "register_operand" "=w")
(truncate:<V_narrow> (match_operand:VN 1 "register_operand" "w")))]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
"vmovn.i<V_sz_elem>\t%P0, %q1"
[(set_attr "type" "neon_move_narrow_q")]
)
(define_expand "vec_pack_trunc_"
[(match_operand:<V_narrow_pack> 0 "register_operand")
(match_operand:VSHFT 1 "register_operand")
(match_operand:VSHFT 2 "register_operand")]
"TARGET_NEON && !BYTES_BIG_ENDIAN"
{
rtx tempreg = gen_reg_rtx (<V_DOUBLE>mode);
emit_insn (gen_move_lo_quad_<V_double> (tempreg, operands[1]));
emit_insn (gen_move_hi_quad_<V_double> (tempreg, operands[2]));
emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg));
DONE;
})
(define_insn "neon_vabd2"
[(set (match_operand:VF 0 "s_register_operand" "=w")
(abs:VF (minus:VF (match_operand:VF 1 "s_register_operand" "w")
(match_operand:VF 2 "s_register_operand" "w"))))]
"ARM_HAVE_NEON_ARITH"
"vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_abd_s")]
)
(define_insn "neon_vabd3"
[(set (match_operand:VF 0 "s_register_operand" "=w")
(abs:VF (unspec:VF [(match_operand:VF 1 "s_register_operand" "w")
(match_operand:VF 2 "s_register_operand" "w")]
UNSPEC_VSUB)))]
"ARM_HAVE_NEON_ARITH"
"vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_abd_s")]
)
(define_insn "neon_mmlav16qi"
[(set (match_operand:V4SI 0 "register_operand" "=w")
(plus:V4SI
(unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
(match_operand:V16QI 3 "register_operand" "w")] MATMUL)
(match_operand:V4SI 1 "register_operand" "0")))]
"TARGET_I8MM"
"vmmla.<mmla_sfx>\t%q0, %q2, %q3"
[(set_attr "type" "neon_mla_s_q")]
)
(define_insn "neon_vbfdotVCVTF:mode"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
(unspec:VCVTF [
(match_operand: 2 "register_operand" "w")
(match_operand: 3 "register_operand" "w")]
UNSPEC_DOT_S)))]
"TARGET_BF16_SIMD"
"vdot.bf16\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set_attr "type" "neon_dot")]
)
(define_insn "neon_vbfdot_lanev4bfVCVTF:mode"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
(unspec:VCVTF [
(match_operand: 2 "register_operand" "w")
(match_operand:V4BF 3 "register_operand" "x")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_DOT_S)))]
"TARGET_BF16_SIMD"
"vdot.bf16\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
[(set_attr "type" "neon_dot")]
)
(define_insn "neon_vbfdot_lanev8bfVCVTF:mode"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
(unspec:VCVTF [
(match_operand: 2 "register_operand" "w")
(match_operand:V8BF 3 "register_operand" "x")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_DOT_S)))]
"TARGET_BF16_SIMD"
{
int lane = INTVAL (operands[4]);
int half = GET_MODE_NUNITS (GET_MODE (operands[3])) / 4;
if (lane < half)
return "vdot.bf16\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
else
{
operands[4] = GEN_INT (lane - half);
return "vdot.bf16\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
}
}
[(set_attr "type" "neon_dot")]
)
(define_insn "neon_vbfcvtv4sfVBFCVT:mode"
[(set (match_operand:VBFCVT 0 "register_operand" "=w")
(unspec:VBFCVT [(match_operand:V4SF 1 "register_operand" "w")]
UNSPEC_BFCVT))]
"TARGET_BF16_SIMD"
"vcvt.bf16.f32\t%<V_bf_low>0, %q1"
[(set_attr "type" "neon_fp_cvt_narrow_s_q")]
)
(define_insn "neon_vbfcvtv4sf_highv8bf"
[(set (match_operand:V8BF 0 "register_operand" "=w")
(unspec:V8BF [(match_operand:V8BF 1 "register_operand" "0")
(match_operand:V4SF 2 "register_operand" "w")]
UNSPEC_BFCVT_HIGH))]
"TARGET_BF16_SIMD"
"vcvt.bf16.f32\t%f0, %q2"
[(set_attr "type" "neon_fp_cvt_narrow_s_q")]
)
(define_insn "neon_vbfcvtsf"
[(set (match_operand:BF 0 "register_operand" "=t")
(unspec:BF [(match_operand:SF 1 "register_operand" "t")]
UNSPEC_BFCVT))]
"TARGET_BF16_FP"
"vcvtb.bf16.f32\t%0, %1"
[(set_attr "type" "f_cvt")]
)
(define_insn "neon_vbfcvtVBFCVT:mode"
[(set (match_operand:V4SF 0 "register_operand" "=w")
(unspec:V4SF [(match_operand:VBFCVT 1 "register_operand" "w")]
UNSPEC_BFCVT))]
"TARGET_BF16_SIMD"
"vshll.u32\t%q0, %<V_bf_low>1, #16"
[(set_attr "type" "neon_shift_imm_q")]
)
(define_insn "neon_vbfcvt_highv8bf"
[(set (match_operand:V4SF 0 "register_operand" "=w")
(unspec:V4SF [(match_operand:V8BF 1 "register_operand" "w")]
UNSPEC_BFCVT_HIGH))]
"TARGET_BF16_SIMD"
"vshll.u32\t%q0, %f1, #16"
[(set_attr "type" "neon_shift_imm_q")]
)
;; Convert a BF scalar operand to SF via VSHL.
;; VSHL doesn't accept 32-bit registers where the BF and SF scalar operands
;; would be allocated, therefore the operands must be converted to intermediate
;; vectors (i.e. V2SI) in order to apply 64-bit registers.
(define_expand "neon_vbfcvtbf"
[(match_operand:SF 0 "register_operand")
(unspec:SF [(match_operand:BF 1 "register_operand")] UNSPEC_BFCVT)]
"TARGET_BF16_FP"
{
rtx op0 = gen_reg_rtx (V2SImode);
rtx op1 = gen_reg_rtx (V2SImode);
emit_insn (gen_neon_vbfcvtbf_cvtmodev2si (op1, operands[1]));
emit_insn (gen_neon_vshl_nv2si (op0, op1, gen_int_mode(16, SImode)));
emit_insn (gen_neon_vbfcvtbf_cvtmodesf (operands[0], op0));
DONE;
})
;; Convert BF mode to V2SI and V2SI to SF.
;; Implement this by allocating a 32-bit operand in the low half of a 64-bit
;; register indexed by a 32-bit sub-register number.
;; This will generate reloads but compiler can optimize out the moves.
;; Use 'x' constraint to guarantee the 32-bit sub-registers in an indexable
;; range so that to avoid extra moves.
(define_insn "neon_vbfcvtbf_cvtmode"
[(set (match_operand:VBFCVTM 0 "register_operand" "=x")
(unspec:VBFCVTM [(match_operand:<V_bf_cvt_m> 1 "register_operand" "0")]
UNSPEC_BFCVT))]
"TARGET_BF16_FP"
""
)
(define_insn "neon_vmmlav8bf"
[(set (match_operand:V4SF 0 "register_operand" "=w")
(plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
(unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
(match_operand:V8BF 3 "register_operand" "w")]
UNSPEC_BFMMLA)))]
"TARGET_BF16_SIMD"
"vmmla.bf16\t%q0, %q2, %q3"
[(set_attr "type" "neon_fp_mla_s_q")]
)
(define_insn "neon_vfmav8bf"
[(set (match_operand:V4SF 0 "register_operand" "=w")
(plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
(unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
(match_operand:V8BF 3 "register_operand" "w")]
BF_MA)))]
"TARGET_BF16_SIMD"
"vfma.bf16\t%q0, %q2, %q3"
[(set_attr "type" "neon_fp_mla_s_q")]
)
(define_insn "neon_vfma_lanev8bf"
[(set (match_operand:V4SF 0 "register_operand" "=w")
(plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
(unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
(match_operand:V4BF 3 "register_operand" "x")
(match_operand:SI 4 "const_int_operand" "n")]
BF_MA)))]
"TARGET_BF16_SIMD"
"vfma.bf16\t%q0, %q2, %P3[%c4]"
[(set_attr "type" "neon_fp_mla_s_scalar_q")]
)
(define_expand "neon_vfma_laneqv8bf"
[(set (match_operand:V4SF 0 "register_operand" "=w")
(plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
(unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
(match_operand:V8BF 3 "register_operand" "x")
(match_operand:SI 4 "const_int_operand" "n")]
BF_MA)))]
"TARGET_BF16_SIMD"
{
int lane = INTVAL (operands[4]);
gcc_assert (IN_RANGE(lane, 0, 7));
if (lane < 4)
{
emit_insn (gen_neon_vfma_lanev8bf (operands[0], operands[1], operands[2], operands[3], operands[4]));
}
else
{
rtx op_highpart = gen_reg_rtx (V4BFmode);
emit_insn (gen_neon_vget_highv8bf (op_highpart, operands[3]));
operands[4] = GEN_INT (lane - 4);
emit_insn (gen_neon_vfma_lanev8bf (operands[0], operands[1], operands[2], op_highpart, operands[4]));
}
DONE;
}
[(set_attr "type" "neon_fp_mla_s_scalar_q")]
)