Generation of adjusted ldp/stp for vector types
Introduce a simple peephole2 optimization that replaces a sequence of
four consecutive load or store (LDR, STR) instructions with two load
pair or store pair (LDP, STP) instructions for the supported 2-element
vector modes (V2SI, V2SF, V2DI, and V2DF).
The offset of each generated load/store pair instruction is adjusted accordingly.
Bootstrapped and tested on aarch64-none-linux-gnu.
Example:
$ cat stp_vec_v2sf.c
typedef float __attribute__((vector_size(8))) vec;
void
store_adjusted(vec *out, vec x, vec y)
{
out[400] = x;
out[401] = y;
out[402] = y;
out[403] = x;
}
Example compiled with:
$ ./aarch64-none-linux-gnu-gcc -S -O2 stp_vec_v2sf.c -dp
Before the patch:
store_adjusted:
str d0, [x0, 3200] // 9 [c=4 l=4] *aarch64_simd_movv2si/2
str d1, [x0, 3208] // 11 [c=4 l=4] *aarch64_simd_movv2si/2
str d1, [x0, 3216] // 13 [c=4 l=4] *aarch64_simd_movv2si/2
str d0, [x0, 3224] // 15 [c=4 l=4] *aarch64_simd_movv2si/2
ret // 26 [c=0 l=4] *do_return
After the patch:
store_adjusted:
add x1, x0, 3200 // 27 [c=4 l=4] *adddi3_aarch64/0
stp d0, d1, [x1] // 28 [c=0 l=4] vec_store_pairv2siv2si
stp d1, d0, [x1, 16] // 29 [c=0 l=4] vec_store_pairv2siv2si
ret // 22 [c=0 l=4] *do_return
gcc/ChangeLog:
* config/aarch64/aarch64-ldpstp.md: Add two peepholes for adjusted vector
V2SI, V2SF, V2DI, V2DF load pair and store pair modes.
* config/aarch64/aarch64-protos.h (aarch64_gen_adjusted_ldpstp):
Change mode parameter to machine_mode.
(aarch64_operands_adjust_ok_for_ldpstp): Change mode parameter to
machine_mode.
* config/aarch64/aarch64.c (aarch64_operands_adjust_ok_for_ldpstp):
Change mode parameter to machine_mode.
(aarch64_gen_adjusted_ldpstp): Change mode parameter to machine_mode.
* config/aarch64/iterators.md (VP_2E): New iterator for 2 element vectors.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ldp_vec_v2sf.c: New test.
* gcc.target/aarch64/ldp_vec_v2si.c: New test.
* gcc.target/aarch64/stp_vec_v2df.c: New test.
* gcc.target/aarch64/stp_vec_v2di.c: New test.
* gcc.target/aarch64/stp_vec_v2sf.c: New test.
* gcc.target/aarch64/stp_vec_v2si.c: New test.
This commit is contained in:
parent
e93676fb53
commit
cd91a08487
@ -294,3 +294,45 @@
|
||||
else
|
||||
FAIL;
|
||||
})
|
||||
|
||||
;; Peephole for four consecutive stores in a 2-element vector mode (VP_2E).
;; Operands 0/2/4/6 are the memory destinations and operands 1/3/5/7 the
;; stored values (a register or zero); operand 8 is a DI scratch register
;; (presumably used for the adjusted base address -- confirm in
;; aarch64_gen_adjusted_ldpstp).
;; The pattern only applies when TARGET_SIMD holds and
;; aarch64_operands_adjust_ok_for_ldpstp accepts the operands with
;; load == false.  The replacement is emitted by
;; aarch64_gen_adjusted_ldpstp; the peephole FAILs when that returns false.
(define_peephole2
|
||||
[(match_scratch:DI 8 "r")
|
||||
(set (match_operand:VP_2E 0 "memory_operand" "")
|
||||
(match_operand:VP_2E 1 "aarch64_reg_or_zero" ""))
|
||||
(set (match_operand:VP_2E 2 "memory_operand" "")
|
||||
(match_operand:VP_2E 3 "aarch64_reg_or_zero" ""))
|
||||
(set (match_operand:VP_2E 4 "memory_operand" "")
|
||||
(match_operand:VP_2E 5 "aarch64_reg_or_zero" ""))
|
||||
(set (match_operand:VP_2E 6 "memory_operand" "")
|
||||
(match_operand:VP_2E 7 "aarch64_reg_or_zero" ""))
|
||||
(match_dup 8)]
|
||||
"TARGET_SIMD
|
||||
&& aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode)"
|
||||
[(const_int 0)]
|
||||
{
|
||||
if (aarch64_gen_adjusted_ldpstp (operands, false, <MODE>mode, UNKNOWN))
|
||||
DONE;
|
||||
else
|
||||
FAIL;
|
||||
})
|
||||
|
||||
;; Peephole for four consecutive loads in a 2-element vector mode (VP_2E).
;; Operands 0/2/4/6 are the destination registers and operands 1/3/5/7 the
;; memory sources; operand 8 is a DI scratch register (presumably used for
;; the adjusted base address -- confirm in aarch64_gen_adjusted_ldpstp).
;; The pattern only applies when TARGET_SIMD holds and
;; aarch64_operands_adjust_ok_for_ldpstp accepts the operands with
;; load == true.  The replacement is emitted by
;; aarch64_gen_adjusted_ldpstp; the peephole FAILs when that returns false.
(define_peephole2
|
||||
[(match_scratch:DI 8 "r")
|
||||
(set (match_operand:VP_2E 0 "register_operand" "")
|
||||
(match_operand:VP_2E 1 "memory_operand" ""))
|
||||
(set (match_operand:VP_2E 2 "register_operand" "")
|
||||
(match_operand:VP_2E 3 "memory_operand" ""))
|
||||
(set (match_operand:VP_2E 4 "register_operand" "")
|
||||
(match_operand:VP_2E 5 "memory_operand" ""))
|
||||
(set (match_operand:VP_2E 6 "register_operand" "")
|
||||
(match_operand:VP_2E 7 "memory_operand" ""))
|
||||
(match_dup 8)]
|
||||
"TARGET_SIMD
|
||||
&& aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)"
|
||||
[(const_int 0)]
|
||||
{
|
||||
if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, UNKNOWN))
|
||||
DONE;
|
||||
else
|
||||
FAIL;
|
||||
})
|
||||
|
||||
@ -682,7 +682,7 @@ void aarch64_split_compare_and_swap (rtx op[]);
|
||||
|
||||
void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
|
||||
|
||||
bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
|
||||
bool aarch64_gen_adjusted_ldpstp (rtx *, bool, machine_mode, RTX_CODE);
|
||||
|
||||
void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
|
||||
bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
|
||||
@ -733,7 +733,7 @@ int aarch64_ccmp_mode_to_code (machine_mode mode);
|
||||
|
||||
bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset);
|
||||
bool aarch64_operands_ok_for_ldpstp (rtx *, bool, machine_mode);
|
||||
bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, scalar_mode);
|
||||
bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, machine_mode);
|
||||
void aarch64_swap_ldrstr_operands (rtx *, bool);
|
||||
|
||||
extern void aarch64_asm_output_pool_epilogue (FILE *, const char *,
|
||||
|
||||
@ -22158,7 +22158,7 @@ aarch64_ldrstr_offset_compare (const void *x, const void *y)
|
||||
|
||||
bool
|
||||
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
|
||||
scalar_mode mode)
|
||||
machine_mode mode)
|
||||
{
|
||||
const int num_insns = 4;
|
||||
enum reg_class rclass;
|
||||
@ -22235,7 +22235,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
|
||||
for (int i = 0; i < num_insns; i++)
|
||||
offvals[i] = INTVAL (offset[i]);
|
||||
|
||||
msize = GET_MODE_SIZE (mode);
|
||||
msize = GET_MODE_SIZE (mode).to_constant ();
|
||||
|
||||
/* Check if the offsets can be put in the right order to do a ldp/stp. */
|
||||
qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
|
||||
@ -22275,7 +22275,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
|
||||
|
||||
bool
|
||||
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
|
||||
scalar_mode mode, RTX_CODE code)
|
||||
machine_mode mode, RTX_CODE code)
|
||||
{
|
||||
rtx base, offset_1, offset_3, t1, t2;
|
||||
rtx mem_1, mem_2, mem_3, mem_4;
|
||||
@ -22314,7 +22314,7 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
|
||||
&& offset_3 != NULL_RTX);
|
||||
|
||||
/* Adjust offset so it can fit in LDP/STP instruction. */
|
||||
msize = GET_MODE_SIZE (mode);
|
||||
msize = GET_MODE_SIZE (mode).to_constant();
|
||||
stp_off_upper_limit = msize * (0x40 - 1);
|
||||
stp_off_lower_limit = - msize * 0x40;
|
||||
|
||||
|
||||
@ -98,6 +98,9 @@
|
||||
;; Copy of the above.
|
||||
(define_mode_iterator DREG2 [V8QI V4HI V4HF V2SI V2SF DF])
|
||||
|
||||
;; All 2-element vector modes suitable for load/store pair using LDP/STP.
|
||||
(define_mode_iterator VP_2E [V2SI V2SF V2DI V2DF])
|
||||
|
||||
;; Advanced SIMD, 64-bit container, all integer modes.
|
||||
(define_mode_iterator VD_BHSI [V8QI V4HI V2SI])
|
||||
|
||||
|
||||
14
gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
Normal file
14
gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
Normal file
@ -0,0 +1,14 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2" } */
|
||||
|
||||
typedef float __attribute__((vector_size(8))) vec;
|
||||
|
||||
/* Sum four adjacent vectors, v[110] .. v[113].  With 8-byte vectors the
   first access is at byte offset 880; the dg-final directives in this file
   expect an explicit add of 880 followed by two ldp instructions and no
   remaining ldr.  */
vec
|
||||
load_long(vec *v) {
|
||||
return v[110] + v[111] + v[112] + v[113];
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 880} } } */
|
||||
/* { dg-final { scan-assembler {ldp\td[0-9]+, d[0-9]+, \[x[0-9]+\]} } } */
|
||||
/* { dg-final { scan-assembler {ldp\td[0-9]+, d[0-9]+, \[x[0-9]+, 16\]} } } */
|
||||
/* { dg-final { scan-assembler-not "ldr\t" } } */
|
||||
14
gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
Normal file
14
gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
Normal file
@ -0,0 +1,14 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2" } */
|
||||
|
||||
typedef int __attribute__((vector_size(8))) vec;
|
||||
|
||||
/* Sum four adjacent vectors, v[110] .. v[113].  With 8-byte vectors the
   first access is at byte offset 880; the dg-final directives in this file
   expect an explicit add of 880 followed by two ldp instructions and no
   remaining ldr.  */
vec
|
||||
load_long(vec *v) {
|
||||
return v[110] + v[111] + v[112] + v[113];
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 880} } } */
|
||||
/* { dg-final { scan-assembler {ldp\td[0-9]+, d[0-9]+, \[x[0-9]+\]} } } */
|
||||
/* { dg-final { scan-assembler {ldp\td[0-9]+, d[0-9]+, \[x[0-9]+, 16\]} } } */
|
||||
/* { dg-final { scan-assembler-not "ldr\t" } } */
|
||||
18
gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c
Normal file
18
gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c
Normal file
@ -0,0 +1,18 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2" } */
|
||||
|
||||
typedef double __attribute__((vector_size(16))) vec;
|
||||
|
||||
/* Store x, y, y, x into four adjacent slots, out[100] .. out[103].  With
   16-byte vectors the first slot is at byte offset 1600; the dg-final
   directives in this file expect an explicit add of 1600 followed by two
   stp instructions and no remaining str.  */
void
|
||||
store_adjusted(vec *out, vec x, vec y)
|
||||
{
|
||||
out[100] = x;
|
||||
out[101] = y;
|
||||
out[102] = y;
|
||||
out[103] = x;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 1600} } } */
|
||||
/* { dg-final { scan-assembler {stp\tq[0-9]+, q[0-9]+, \[x[0-9]+\]} } } */
|
||||
/* { dg-final { scan-assembler {stp\tq[0-9]+, q[0-9]+, \[x[0-9]+, 32\]} } } */
|
||||
/* { dg-final { scan-assembler-not "str\t" } } */
|
||||
18
gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c
Normal file
18
gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c
Normal file
@ -0,0 +1,18 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2" } */
|
||||
|
||||
typedef long long __attribute__((vector_size(16))) vec;
|
||||
|
||||
/* Store x, y, y, x into four adjacent slots, out[100] .. out[103].  With
   16-byte vectors the first slot is at byte offset 1600; the dg-final
   directives in this file expect an explicit add of 1600 followed by two
   stp instructions and no remaining str.  */
void
|
||||
store_adjusted(vec *out, vec x, vec y)
|
||||
{
|
||||
out[100] = x;
|
||||
out[101] = y;
|
||||
out[102] = y;
|
||||
out[103] = x;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 1600} } } */
|
||||
/* { dg-final { scan-assembler {stp\tq[0-9]+, q[0-9]+, \[x[0-9]+\]} } } */
|
||||
/* { dg-final { scan-assembler {stp\tq[0-9]+, q[0-9]+, \[x[0-9]+, 32\]} } } */
|
||||
/* { dg-final { scan-assembler-not "str\t" } } */
|
||||
18
gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c
Normal file
18
gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c
Normal file
@ -0,0 +1,18 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2" } */
|
||||
|
||||
typedef float __attribute__((vector_size(8))) vec;
|
||||
|
||||
/* Store x, y, y, x into four adjacent slots, out[400] .. out[403].  With
   8-byte vectors the first slot is at byte offset 3200; the dg-final
   directives in this file expect an explicit add of 3200 followed by two
   stp instructions and no remaining str.  */
void
|
||||
store_adjusted(vec *out, vec x, vec y)
|
||||
{
|
||||
out[400] = x;
|
||||
out[401] = y;
|
||||
out[402] = y;
|
||||
out[403] = x;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 3200} } } */
|
||||
/* { dg-final { scan-assembler {stp\td[0-9]+, d[0-9]+, \[x[0-9]+\]} } } */
|
||||
/* { dg-final { scan-assembler {stp\td[0-9]+, d[0-9]+, \[x[0-9]+, 16\]} } } */
|
||||
/* { dg-final { scan-assembler-not "str\t" } } */
|
||||
18
gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c
Normal file
18
gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c
Normal file
@ -0,0 +1,18 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2" } */
|
||||
|
||||
typedef int __attribute__((vector_size(8))) vec;
|
||||
|
||||
/* Store x, y, y, x into four adjacent slots, out[400] .. out[403].  With
   8-byte vectors the first slot is at byte offset 3200; the dg-final
   directives in this file expect an explicit add of 3200 followed by two
   stp instructions and no remaining str.  */
void
|
||||
store_adjusted(vec *out, vec x, vec y)
|
||||
{
|
||||
out[400] = x;
|
||||
out[401] = y;
|
||||
out[402] = y;
|
||||
out[403] = x;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 3200} } } */
|
||||
/* { dg-final { scan-assembler {stp\td[0-9]+, d[0-9]+, \[x[0-9]+\]} } } */
|
||||
/* { dg-final { scan-assembler {stp\td[0-9]+, d[0-9]+, \[x[0-9]+, 16\]} } } */
|
||||
/* { dg-final { scan-assembler-not "str\t" } } */
|
||||
Loading…
Reference in New Issue
Block a user