vect: Restore variable-length SLP permutes [PR97513]

Many of the gcc.target/sve/slp-perm*.c tests started failing
after the introduction of separate SLP permute nodes.
This patch adds variable-length support using a similar
technique to vect_transform_slp_perm_load.

As there, the idea is to detect when every permute mask vector
is the same and can be generated using a regular stepped sequence.
We can easily handle those cases for variable-length, but still
need to restrict the general case to constant-length.

Again copying vect_transform_slp_perm_load, the idea is to distinguish
the two cases regardless of whether the length is variable or not,
partly to increase testing coverage and partly because it avoids
generating redundant trees.

Doing this means that we can also use SLP for the two-vector
permute in pr88834.c, which we couldn't before VEC_PERM_EXPR
nodes were introduced.  The patch therefore makes pr88834.c
check that we don't regress back to not using SLP and adds
pr88834_ld3.c to check for the original problem in the PR.

gcc/
	PR tree-optimization/97513
	* tree-vect-slp.c (vect_add_slp_permutation): New function,
	split out from...
	(vectorizable_slp_permutation): ...here.  Detect cases in which
	all VEC_PERM_EXPRs are guaranteed to have the same stepped
	permute vector and only generate one permute vector for that case.
	Extend that case to handle variable-length vectors.

gcc/testsuite/
	* gcc.target/aarch64/sve/pr88834.c: Expect the vectorizer to use SLP.
	* gcc.target/aarch64/sve/pr88834_ld3.c: New test.
This commit is contained in:
Richard Sandiford 2021-04-07 15:21:56 +01:00
parent 5c5b31975e
commit 2f3d910461
3 changed files with 167 additions and 72 deletions

View File

@ -11,5 +11,6 @@ f (int *restrict x, int *restrict y, int *restrict z, int n)
}
}
/* { dg-final { scan-assembler-times {\tld2w\t{z[0-9]+.s - z[0-9]+.s}, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tst2w\t{z[0-9]+.s - z[0-9]+.s}, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 1 } } */
/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.d, all\n} } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 1 } } */

View File

@ -0,0 +1,16 @@
/* { dg-do compile } */
/* { dg-options "-O3" } */
void
f (int *restrict x, int *restrict y, int *restrict z, int n)
{
for (int i = 0; i < n; i += 3)
{
x[i] = y[i] + z[i];
x[i + 1] = y[i + 1] - z[i + 1];
x[i + 2] = y[i + 2] | z[i + 2];
}
}
/* { dg-final { scan-assembler-times {\tld3w\t{z[0-9]+.s - z[0-9]+.s}, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tst3w\t{z[0-9]+.s - z[0-9]+.s}, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 1 } } */

View File

@ -5829,6 +5829,57 @@ vect_transform_slp_perm_load (vec_info *vinfo,
return true;
}
/* Produce the next vector result for SLP permutation NODE by adding a vector
statement at GSI. If MASK_VEC is nonnull, add:
<new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
otherwise add:
<new SSA name> = FIRST_DEF. */
static void
vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
slp_tree node, tree first_def, tree second_def,
tree mask_vec)
{
tree vectype = SLP_TREE_VECTYPE (node);
/* ??? We SLP match existing vector element extracts but
allow punning which we need to re-instantiate at uses
but have no good way of explicitly representing. */
if (!types_compatible_p (TREE_TYPE (first_def), vectype))
{
gassign *conv_stmt
= gimple_build_assign (make_ssa_name (vectype),
build1 (VIEW_CONVERT_EXPR, vectype, first_def));
vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
first_def = gimple_assign_lhs (conv_stmt);
}
gassign *perm_stmt;
tree perm_dest = make_ssa_name (vectype);
if (mask_vec)
{
if (!types_compatible_p (TREE_TYPE (second_def), vectype))
{
gassign *conv_stmt
= gimple_build_assign (make_ssa_name (vectype),
build1 (VIEW_CONVERT_EXPR,
vectype, second_def));
vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
second_def = gimple_assign_lhs (conv_stmt);
}
perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
first_def, second_def,
mask_vec);
}
else
/* We need a copy here in case the def was external. */
perm_stmt = gimple_build_assign (perm_dest, first_def);
vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
/* Store the vector statement in NODE. */
SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
}
/* Vectorize the SLP permutations in NODE as specified
in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
@ -5852,7 +5903,10 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
arbitrary mismatches. */
slp_tree child;
unsigned i;
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
{
if (!vect_maybe_update_slp_op_vectype (child, vectype)
|| !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
{
@ -5861,6 +5915,9 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
"Unsupported lane permutation\n");
return false;
}
if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
repeating_p = false;
}
vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
gcc_assert (perm.length () == SLP_TREE_LANES (node));
@ -5870,18 +5927,58 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
"vectorizing permutation");
for (unsigned i = 0; i < perm.length (); ++i)
dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
if (repeating_p)
dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
dump_printf (MSG_NOTE, "\n");
}
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
if (!nunits.is_constant ())
/* REPEATING_P is true if every output vector is guaranteed to use the
same permute vector. We can handle that case for both variable-length
and constant-length vectors, but we only handle other cases for
constant-length vectors.
Set:
- NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
mask vector that we want to build.
- NCOPIES to the number of copies of PERM that we need in order
to build the necessary permute mask vectors.
- NOUTPUTS_PER_MASK to the number of output vectors we want to create
for each permute mask vector. This is only relevant when GSI is
nonnull. */
uint64_t npatterns;
unsigned nelts_per_pattern;
uint64_t ncopies;
unsigned noutputs_per_mask;
if (repeating_p)
{
/* We need a single permute mask vector that has the form:
{ X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
In other words, the original n-element permute in PERM is
"unrolled" to fill a full vector. The stepped vector encoding
that we use for permutes requires 3n elements. */
npatterns = SLP_TREE_LANES (node);
nelts_per_pattern = ncopies = 3;
noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
}
else
{
/* Calculate every element of every permute mask vector explicitly,
instead of relying on the pattern described above. */
if (!nunits.is_constant (&npatterns))
return false;
unsigned HOST_WIDE_INT vf = 1;
nelts_per_pattern = ncopies = 1;
if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
return false;
unsigned olanes = vf * SLP_TREE_LANES (node);
gcc_assert (multiple_p (olanes, nunits));
noutputs_per_mask = 1;
}
unsigned olanes = ncopies * SLP_TREE_LANES (node);
gcc_assert (repeating_p || multiple_p (olanes, nunits));
/* Compute the { { SLP operand, vector index}, lane } permutation sequence
from the { SLP operand, scalar lane } permutation as recorded in the
@ -5891,16 +5988,22 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
auto_vec<unsigned> active_lane;
vperm.create (olanes);
active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
for (unsigned i = 0; i < vf; ++i)
for (unsigned i = 0; i < ncopies; ++i)
{
for (unsigned pi = 0; pi < perm.length (); ++pi)
{
std::pair<unsigned, unsigned> p = perm[pi];
tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
if (repeating_p)
vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
else
{
/* We checked above that the vectors are constant-length. */
unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
unsigned vi = (active_lane[p.first] + p.second) / vnunits;
unsigned vl = (active_lane[p.first] + p.second) % vnunits;
vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
vperm.quick_push ({{p.first, vi}, vl});
}
}
/* Advance to the next group. */
for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
@ -5912,7 +6015,10 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
dump_printf_loc (MSG_NOTE, vect_location, "as");
for (unsigned i = 0; i < vperm.length (); ++i)
{
if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
if (i != 0
&& (repeating_p
? multiple_p (i, npatterns)
: multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
dump_printf (MSG_NOTE, ",");
dump_printf (MSG_NOTE, " vops%u[%u][%u]",
vperm[i].first.first, vperm[i].first.second,
@ -5929,11 +6035,10 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
somehow? */
std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
unsigned int const_nunits = nunits.to_constant ();
unsigned int index = 0;
unsigned int mask_element;
poly_uint64 mask_element;
vec_perm_builder mask;
mask.new_vector (const_nunits, const_nunits, 1);
mask.new_vector (nunits, npatterns, nelts_per_pattern);
unsigned int count = mask.encoded_nelts ();
mask.quick_grow (count);
vec_perm_indices indices;
@ -5948,7 +6053,7 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
|| second_vec == vperm[i].first)
{
second_vec = vperm[i].first;
mask_element += const_nunits;
mask_element += nunits;
}
else
{
@ -5964,8 +6069,7 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
if (index == count)
{
indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
const_nunits);
indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
bool identity_p = indices.series_p (0, 1, 0, 1);
if (!identity_p
&& !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
@ -5993,51 +6097,25 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
if (second_vec.first == -1U)
second_vec = first_vec;
/* Generate the permute statement if necessary. */
slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
tree first_def
= vect_get_slp_vect_def (first_node, first_vec.second);
/* ??? We SLP match existing vector element extracts but
allow punning which we need to re-instantiate at uses
but have no good way of explicitely representing. */
if (!types_compatible_p (TREE_TYPE (first_def), vectype))
{
gassign *conv_stmt;
conv_stmt = gimple_build_assign (make_ssa_name (vectype),
build1 (VIEW_CONVERT_EXPR,
vectype, first_def));
vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
first_def = gimple_assign_lhs (conv_stmt);
}
gassign *perm_stmt;
tree perm_dest = make_ssa_name (vectype);
slp_tree
first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
tree mask_vec = NULL_TREE;
if (!identity_p)
mask_vec = vect_gen_perm_mask_checked (vectype, indices);
for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
{
slp_tree second_node
= SLP_TREE_CHILDREN (node)[second_vec.first];
tree first_def
= vect_get_slp_vect_def (first_node,
first_vec.second + vi);
tree second_def
= vect_get_slp_vect_def (second_node, second_vec.second);
if (!types_compatible_p (TREE_TYPE (second_def), vectype))
{
gassign *conv_stmt;
conv_stmt = gimple_build_assign (make_ssa_name (vectype),
build1
(VIEW_CONVERT_EXPR,
vectype, second_def));
vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
second_def = gimple_assign_lhs (conv_stmt);
= vect_get_slp_vect_def (second_node,
second_vec.second + vi);
vect_add_slp_permutation (vinfo, gsi, node, first_def,
second_def, mask_vec);
}
tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
first_def, second_def,
mask_vec);
}
else
/* We need a copy here in case the def was external. */
perm_stmt = gimple_build_assign (perm_dest, first_def);
vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
/* Store the vector statement in NODE. */
SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
}
index = 0;