diff --git a/gcc/testsuite/gcc.dg/vect/slp-21.c b/gcc/testsuite/gcc.dg/vect/slp-21.c index bf8f434dd50..85393975b45 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-21.c +++ b/gcc/testsuite/gcc.dg/vect/slp-21.c @@ -210,7 +210,7 @@ int main (void) Not all vect_perm targets support that, and it's a bit too specific to have its own effective-target selector, so we just test targets directly. */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_strided4 && { ! { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target powerpc64*-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_strided4 && { ! powerpc64*-*-* } } } } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_strided4 } } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c index 81fdb67ce81..addcf60438c 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c +++ b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c @@ -40,4 +40,5 @@ main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { ! vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_load_lanes } xfail { ! 
vect_hw_misalign } } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_1.c b/gcc/testsuite/gcc.target/aarch64/pr99873_1.c new file mode 100644 index 00000000000..bc4d81e3ae5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr99873_1.c @@ -0,0 +1,17 @@ +/* { dg-options "-O3" } */ + +#pragma GCC target "+nosve" + +void +f (int *restrict x, int *restrict y, int *restrict z, int n) +{ + for (int i = 0; i < n; i += 3) + { + x[i] = y[i] + z[i]; + x[i + 1] = y[i + 1] - z[i + 1]; + x[i + 2] = y[i + 2] | z[i + 2]; + } +} + +/* { dg-final { scan-assembler-times {\tld3\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst3\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_2.c b/gcc/testsuite/gcc.target/aarch64/pr99873_2.c new file mode 100644 index 00000000000..b73fbdc0a18 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr99873_2.c @@ -0,0 +1,20 @@ +/* { dg-options "-O3" } */ + +#include <stdint.h> + +#pragma GCC target "+nosve" + +void __attribute ((noipa)) +foo (uint64_t *__restrict x, uint64_t *__restrict y, int n) +{ + for (int i = 0; i < n; i += 4) + { + x[i] += y[i]; + x[i + 1] += y[i + 1]; + x[i + 2] |= y[i + 2]; + x[i + 3] |= y[i + 3]; + } +} + +/* { dg-final { scan-assembler-not {\tld4\t} } } */ +/* { dg-final { scan-assembler-not {\tst4\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_3.c b/gcc/testsuite/gcc.target/aarch64/pr99873_3.c new file mode 100644 index 00000000000..ccbab6d74be --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr99873_3.c @@ -0,0 +1,20 @@ +/* { dg-options "-O3" } */ + +#include <stdint.h> + +#pragma GCC target "+nosve" + +void __attribute ((noipa)) +foo (uint32_t *__restrict x, uint32_t *__restrict y, int n) +{ + for (int i = 0; i < n; i += 4) + { + x[i] += y[i]; + x[i + 1] += y[i + 1]; + x[i + 2] |= y[i + 2]; + x[i + 3] |= y[i + 3]; + } +} + +/* { dg-final { scan-assembler-times {\tld4\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst4\t} 1 } } */ diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c new file mode 100644 index 00000000000..f4b95da2afa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c @@ -0,0 +1,15 @@ +/* { dg-options "-O3" } */ + +void +f (int *restrict x, int *restrict y, int *restrict z, int n) +{ + for (int i = 0; i < n; i += 3) + { + x[i] = y[i] + z[i]; + x[i + 1] = y[i + 1] - z[i + 1]; + x[i + 2] = y[i + 2] | z[i + 2]; + } +} + +/* { dg-final { scan-assembler-times {\tld3w\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst3w\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c new file mode 100644 index 00000000000..03dc4ef930d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c @@ -0,0 +1,18 @@ +/* { dg-options "-O3" } */ + +#include <stdint.h> + +void __attribute ((noipa)) +foo (uint64_t *__restrict x, uint64_t *__restrict y, int n) +{ + for (int i = 0; i < n; i += 4) + { + x[i] += y[i]; + x[i + 1] += y[i + 1]; + x[i + 2] |= y[i + 2]; + x[i + 3] |= y[i + 3]; + } +} + +/* { dg-final { scan-assembler-times {\tld4d\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst4d\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c new file mode 100644 index 00000000000..87a0141e508 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c @@ -0,0 +1,18 @@ +/* { dg-options "-O3" } */ + +#include <stdint.h> + +void __attribute ((noipa)) +foo (uint32_t *__restrict x, uint32_t *__restrict y, int n) +{ + for (int i = 0; i < n; i += 4) + { + x[i] += y[i]; + x[i + 1] += y[i + 1]; + x[i + 2] |= y[i + 2]; + x[i + 3] |= y[i + 3]; + } +} + +/* { dg-final { scan-assembler-times {\tld4w\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst4w\t} 1 } } */ diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 58dedfc35b7..7d5f9d10ca4 100644 --- a/gcc/tree-vect-slp.c +++ 
b/gcc/tree-vect-slp.c @@ -2458,6 +2458,34 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo, return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited); } +/* STMT_INFO is a store group of size GROUP_SIZE that we are considering + splitting into two, with the first split group having size NEW_GROUP_SIZE. + Return true if we could use IFN_STORE_LANES instead and if that appears + to be the better approach. */ + +static bool +vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info, + unsigned int group_size, + unsigned int new_group_size) +{ + tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); + tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + if (!vectype) + return false; + /* Allow the split if one of the two new groups would operate on full + vectors *within* rather than across one scalar loop iteration. + This is purely a heuristic, but it should work well for group + sizes of 3 and 4, where the possible splits are: + + 3->2+1: OK if the vector has exactly two elements + 4->2+2: Likewise + 4->3+1: Less clear-cut. */ + if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype)) + || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype))) + return false; + return vect_store_lanes_supported (vectype, group_size, false); +} + /* Analyze an SLP instance starting from a group of grouped stores. Call vect_build_slp_tree to build a tree of packed stmts if possible. Return FALSE if it's impossible to SLP any stmt in the loop. */ @@ -2693,7 +2721,8 @@ vect_build_slp_instance (vec_info *vinfo, /* For loop vectorization split into arbitrary pieces of size > 1. */ if (is_a <loop_vec_info> (vinfo) - && (i > 1 && i < group_size)) + && (i > 1 && i < group_size) + && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i)) { unsigned group1_size = i;