x86: Update memcpy/memset inline strategies for Ice Lake
Simplify memcpy and memset inline strategies to avoid branches for
-mtune=icelake:
1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
load and store for up to 16 * 16 (256) bytes when the data size is
fixed and known.
2. Inline only if data size is known to be <= 256.
a. Use "rep movsb/stosb" with simple code sequence if the data size
is a constant.
b. Use loop if data size is not a constant.
3. Use memcpy/memset library function if data size is unknown or > 256.
On Ice Lake processor with -march=native -Ofast -flto,
1. Performance impacts of SPEC CPU 2017 rate are:
500.perlbench_r -0.93%
502.gcc_r 0.36%
505.mcf_r 0.31%
520.omnetpp_r -0.07%
523.xalancbmk_r -0.53%
525.x264_r -0.09%
531.deepsjeng_r -0.19%
541.leela_r 0.16%
548.exchange2_r 0.22%
557.xz_r -1.64%
Geomean -0.24%
503.bwaves_r -0.01%
507.cactuBSSN_r 0.00%
508.namd_r 0.12%
510.parest_r 0.07%
511.povray_r 0.29%
519.lbm_r 0.00%
521.wrf_r -0.38%
526.blender_r 0.16%
527.cam4_r 0.18%
538.imagick_r 0.76%
544.nab_r -0.84%
549.fotonik3d_r -0.07%
554.roms_r -0.01%
Geomean 0.02%
2. Significant impacts on eembc benchmarks are:
eembc/nnet_test 9.90%
eembc/mp2decoddata2 16.42%
eembc/textv2data3 -4.86%
eembc/qos 12.90%
gcc/
* config/i386/i386-expand.c (expand_set_or_cpymem_via_rep):
For TARGET_PREFER_KNOWN_REP_MOVSB_STOSB, don't convert QImode
to SImode.
(decide_alg): For TARGET_PREFER_KNOWN_REP_MOVSB_STOSB, use
"rep movsb/stosb" only for known sizes.
* config/i386/i386-options.c (processor_cost_table): Use Ice
Lake cost for Cannon Lake, Ice Lake, Tiger Lake, Sapphire
Rapids and Alder Lake.
* config/i386/i386.h (TARGET_PREFER_KNOWN_REP_MOVSB_STOSB): New.
* config/i386/x86-tune-costs.h (icelake_memcpy): New.
(icelake_memset): Likewise.
(icelake_cost): Likewise.
* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
New.
gcc/testsuite/
* gcc.target/i386/memcpy-strategy-5.c: New test.
* gcc.target/i386/memcpy-strategy-6.c: Likewise.
* gcc.target/i386/memcpy-strategy-7.c: Likewise.
* gcc.target/i386/memcpy-strategy-8.c: Likewise.
* gcc.target/i386/memset-strategy-3.c: Likewise.
* gcc.target/i386/memset-strategy-4.c: Likewise.
* gcc.target/i386/memset-strategy-5.c: Likewise.
* gcc.target/i386/memset-strategy-6.c: Likewise.
This commit is contained in:
parent
1393938e4c
commit
bf24f4ec73
@ -5976,6 +5976,7 @@ expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
|
||||
/* If possible, it is shorter to use rep movs.
|
||||
TODO: Maybe it is better to move this logic to decide_alg. */
|
||||
if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
|
||||
&& !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
|
||||
&& (!issetmem || orig_value == const0_rtx))
|
||||
mode = SImode;
|
||||
|
||||
@ -6984,7 +6985,12 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
|
||||
else if (!any_alg_usable_p)
|
||||
break;
|
||||
}
|
||||
else if (alg_usable_p (candidate, memset, have_as))
|
||||
else if (alg_usable_p (candidate, memset, have_as)
|
||||
&& !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
|
||||
&& candidate == rep_prefix_1_byte
|
||||
/* NB: If min_size != max_size, size is
|
||||
unknown. */
|
||||
&& min_size != max_size))
|
||||
{
|
||||
*noalign = algs->size[i].noalign;
|
||||
return candidate;
|
||||
|
||||
@ -721,14 +721,14 @@ static const struct processor_costs *processor_cost_table[] =
|
||||
&slm_cost,
|
||||
&skylake_cost,
|
||||
&skylake_cost,
|
||||
&icelake_cost,
|
||||
&icelake_cost,
|
||||
&icelake_cost,
|
||||
&skylake_cost,
|
||||
&icelake_cost,
|
||||
&skylake_cost,
|
||||
&skylake_cost,
|
||||
&skylake_cost,
|
||||
&skylake_cost,
|
||||
&skylake_cost,
|
||||
&skylake_cost,
|
||||
&skylake_cost,
|
||||
&icelake_cost,
|
||||
&icelake_cost,
|
||||
&intel_cost,
|
||||
&geode_cost,
|
||||
&k6_cost,
|
||||
|
||||
@ -523,6 +523,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
|
||||
#define TARGET_PROMOTE_QImode ix86_tune_features[X86_TUNE_PROMOTE_QIMODE]
|
||||
#define TARGET_FAST_PREFIX ix86_tune_features[X86_TUNE_FAST_PREFIX]
|
||||
#define TARGET_SINGLE_STRINGOP ix86_tune_features[X86_TUNE_SINGLE_STRINGOP]
|
||||
/* Nonzero when the X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB tuning is
   enabled for the processor being tuned for; read from
   ix86_tune_features like the neighbouring TARGET_* tuning macros.  */
#define TARGET_PREFER_KNOWN_REP_MOVSB_STOSB \
|
||||
ix86_tune_features[X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB]
|
||||
#define TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES \
|
||||
ix86_tune_features[X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES]
|
||||
#define TARGET_QIMODE_MATH ix86_tune_features[X86_TUNE_QIMODE_MATH]
|
||||
|
||||
@ -1936,6 +1936,133 @@ struct processor_costs skylake_cost = {
|
||||
"0:0:8", /* Label alignment. */
|
||||
"16", /* Func alignment. */
|
||||
};
|
||||
|
||||
/* icelake_cost should produce code tuned for Icelake family of CPUs.
|
||||
NB: rep_prefix_1_byte is used only for known size. */
|
||||
|
||||
/* memcpy expansion strategies referenced by icelake_cost.  Both array
   elements are identical.  For sizes up to 256 bytes rep_prefix_1_byte
   is listed first and loop second; every larger size uses libcall.
   NOTE(review): the leading libcall is presumably the choice for an
   unknown size, and the two elements presumably cover 32-bit and
   64-bit code -- confirm against the stringop_algs definition.  */
static stringop_algs icelake_memcpy[2] = {
|
||||
{libcall,
|
||||
{{256, rep_prefix_1_byte, true},
|
||||
{256, loop, false},
|
||||
{-1, libcall, false}}},
|
||||
{libcall,
|
||||
{{256, rep_prefix_1_byte, true},
|
||||
{256, loop, false},
|
||||
{-1, libcall, false}}}};
|
||||
|
||||
/* memset expansion strategies referenced by icelake_cost.  Both array
   elements are identical.  For sizes up to 256 bytes rep_prefix_1_byte
   is listed first and loop second; every larger size uses libcall.
   NOTE(review): the leading libcall is presumably the choice for an
   unknown size, and the two elements presumably cover 32-bit and
   64-bit code -- confirm against the stringop_algs definition.  */
static stringop_algs icelake_memset[2] = {
|
||||
{libcall,
|
||||
{{256, rep_prefix_1_byte, true},
|
||||
{256, loop, false},
|
||||
{-1, libcall, false}}},
|
||||
{libcall,
|
||||
{{256, rep_prefix_1_byte, true},
|
||||
{256, loop, false},
|
||||
{-1, libcall, false}}}};
|
||||
|
||||
/* Cost table for the Ice Lake family.  MOVE_RATIO and CLEAR_RATIO are
   both 17, and the string-operation strategies are icelake_memcpy and
   icelake_memset.  */
static const
|
||||
struct processor_costs icelake_cost = {
|
||||
{
|
||||
/* Start of register allocator costs. integer->integer move cost is 2. */
|
||||
6, /* cost for loading QImode using movzbl */
|
||||
{4, 4, 4}, /* cost of loading integer registers
|
||||
in QImode, HImode and SImode.
|
||||
Relative to reg-reg move (2). */
|
||||
{6, 6, 6}, /* cost of storing integer registers */
|
||||
2, /* cost of reg,reg fld/fst */
|
||||
{6, 6, 8}, /* cost of loading fp registers
|
||||
in SFmode, DFmode and XFmode */
|
||||
{6, 6, 10}, /* cost of storing fp registers
|
||||
in SFmode, DFmode and XFmode */
|
||||
2, /* cost of moving MMX register */
|
||||
{6, 6}, /* cost of loading MMX registers
|
||||
in SImode and DImode */
|
||||
{6, 6}, /* cost of storing MMX registers
|
||||
in SImode and DImode */
|
||||
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
|
||||
{6, 6, 6, 10, 20}, /* cost of loading SSE registers
|
||||
in 32,64,128,256 and 512-bit */
|
||||
{8, 8, 8, 12, 24}, /* cost of storing SSE registers
|
||||
in 32,64,128,256 and 512-bit */
|
||||
6, 6, /* SSE->integer and integer->SSE moves */
|
||||
5, 5, /* mask->integer and integer->mask moves */
|
||||
{8, 8, 8}, /* cost of loading mask register
|
||||
in QImode, HImode, SImode. */
|
||||
{6, 6, 6}, /* cost of storing mask register
|
||||
in QImode, HImode, SImode. */
|
||||
3, /* cost of moving mask register. */
|
||||
/* End of register allocator costs. */
|
||||
},
|
||||
|
||||
COSTS_N_INSNS (1), /* cost of an add instruction */
|
||||
COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
|
||||
COSTS_N_INSNS (1), /* variable shift costs */
|
||||
COSTS_N_INSNS (1), /* constant shift costs */
|
||||
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
|
||||
COSTS_N_INSNS (4), /* HI */
|
||||
COSTS_N_INSNS (3), /* SI */
|
||||
COSTS_N_INSNS (3), /* DI */
|
||||
COSTS_N_INSNS (3)}, /* other */
|
||||
0, /* cost of multiply per each bit set */
|
||||
/* Expanding div/mod currently doesn't consider parallelism. So the cost
|
||||
model is not realistic. We compensate by increasing the latencies a bit. */
|
||||
{COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
|
||||
COSTS_N_INSNS (11), /* HI */
|
||||
COSTS_N_INSNS (14), /* SI */
|
||||
COSTS_N_INSNS (76), /* DI */
|
||||
COSTS_N_INSNS (76)}, /* other */
|
||||
COSTS_N_INSNS (1), /* cost of movsx */
|
||||
COSTS_N_INSNS (0), /* cost of movzx */
|
||||
8, /* "large" insn */
|
||||
17, /* MOVE_RATIO */
|
||||
17, /* CLEAR_RATIO */
|
||||
{4, 4, 4}, /* cost of loading integer registers
|
||||
in QImode, HImode and SImode.
|
||||
Relative to reg-reg move (2). */
|
||||
{6, 6, 6}, /* cost of storing integer registers */
|
||||
{6, 6, 6, 10, 20}, /* cost of loading SSE register
|
||||
in 32bit, 64bit, 128bit, 256bit and 512bit */
|
||||
{8, 8, 8, 12, 24}, /* cost of storing SSE register
|
||||
in 32bit, 64bit, 128bit, 256bit and 512bit */
|
||||
{6, 6, 6, 10, 20}, /* cost of unaligned loads. */
|
||||
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
|
||||
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
|
||||
6, /* cost of moving SSE register to integer. */
|
||||
20, 8, /* Gather load static, per_elt. */
|
||||
22, 10, /* Gather store static, per_elt. */
|
||||
64, /* size of l1 cache. */
|
||||
512, /* size of l2 cache. */
|
||||
64, /* size of prefetch block */
|
||||
6, /* number of parallel prefetches */
|
||||
3, /* Branch cost */
|
||||
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
|
||||
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
|
||||
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
|
||||
COSTS_N_INSNS (1), /* cost of FABS instruction. */
|
||||
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
|
||||
COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
|
||||
|
||||
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
|
||||
COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
|
||||
COSTS_N_INSNS (4), /* cost of MULSS instruction. */
|
||||
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
|
||||
COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
|
||||
COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
|
||||
COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
|
||||
COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
|
||||
COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
|
||||
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
|
||||
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
|
||||
icelake_memcpy,
|
||||
icelake_memset,
|
||||
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
|
||||
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
|
||||
"16:11:8", /* Loop alignment. */
|
||||
"16:11:8", /* Jump alignment. */
|
||||
"0:0:8", /* Label alignment. */
|
||||
"16", /* Func alignment. */
|
||||
};
|
||||
|
||||
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
|
||||
very small blocks it is better to use loop. For large blocks, libcall can
|
||||
do nontemporary accesses and beat inline considerably. */
|
||||
|
||||
@ -269,6 +269,13 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
|
||||
as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */
|
||||
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
|
||||
|
||||
/* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to
|
||||
move/set sequences of bytes with known size.  When this tuning is on,
   decide_alg does not select rep_prefix_1_byte if the size is unknown
   (min_size != max_size), and expand_set_or_cpymem_via_rep does not
   convert QImode to SImode.  */
|
||||
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
|
||||
"prefer_known_rep_movsb_stosb",
|
||||
m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE
|
||||
| m_ALDERLAKE | m_SAPPHIRERAPIDS)
|
||||
|
||||
/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
|
||||
compact prologues and epilogues by issuing a misaligned moves. This
|
||||
requires target to handle misaligned moves and partial memory stalls
|
||||
|
||||
11
gcc/testsuite/gcc.target/i386/memcpy-strategy-5.c
Normal file
11
gcc/testsuite/gcc.target/i386/memcpy-strategy-5.c
Normal file
@ -0,0 +1,11 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
|
||||
/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep movsb" } } */
|
||||
|
||||
/* Copy a constant 257 bytes from SRC to DEST.  The dg-final directives
   of this test require a memcpy library call (jmp or call) and no
   "rep movsb" in the generated assembly.  */
void
|
||||
foo (char *dest, char *src)
|
||||
{
|
||||
__builtin_memcpy (dest, src, 257);
|
||||
}
|
||||
18
gcc/testsuite/gcc.target/i386/memcpy-strategy-6.c
Normal file
18
gcc/testsuite/gcc.target/i386/memcpy-strategy-6.c
Normal file
@ -0,0 +1,18 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mtune=generic" } */
|
||||
/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep movsb" } } */
|
||||
|
||||
typedef unsigned char e_u8;
|
||||
|
||||
#define MAXBC 8
|
||||
|
||||
/* Copy the first BC bytes of each of the four rows of the local array
   B into A.  BC is a run-time argument, so the per-row size is not a
   compile-time constant.  The dg-final directives of this test require
   that the output contains neither a memcpy call nor "rep movsb".
   NOTE(review): B is read without being initialized; presumably
   deliberate, since this is a compile-only test that just scans the
   assembly.  */
void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
|
||||
{
|
||||
e_u8 b[4][MAXBC];
|
||||
int i, j;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < BC; j++) a[i][j] = b[i][j];
|
||||
}
|
||||
9
gcc/testsuite/gcc.target/i386/memcpy-strategy-7.c
Normal file
9
gcc/testsuite/gcc.target/i386/memcpy-strategy-7.c
Normal file
@ -0,0 +1,9 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
|
||||
/* { dg-final { scan-assembler "rep movsb" } } */
|
||||
|
||||
/* Copy a constant 256 bytes from SRC to DEST.  The dg-final directive
   of this test requires "rep movsb" in the generated assembly.  */
void
|
||||
foo (char *dest, char *src)
|
||||
{
|
||||
__builtin_memcpy (dest, src, 256);
|
||||
}
|
||||
18
gcc/testsuite/gcc.target/i386/memcpy-strategy-8.c
Normal file
18
gcc/testsuite/gcc.target/i386/memcpy-strategy-8.c
Normal file
@ -0,0 +1,18 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=tigerlake" } */
|
||||
/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep movsb" } } */
|
||||
|
||||
typedef unsigned char e_u8;
|
||||
|
||||
#define MAXBC 8
|
||||
|
||||
/* Copy the first BC bytes of each of the four rows of the local array
   B into A.  BC is a run-time argument, so the per-row size is not a
   compile-time constant.  The dg-final directives of this test require
   that the output contains neither a memcpy call nor "rep movsb".
   NOTE(review): B is read without being initialized; presumably
   deliberate, since this is a compile-only test that just scans the
   assembly.  */
void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
|
||||
{
|
||||
e_u8 b[4][MAXBC];
|
||||
int i, j;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < BC; j++) a[i][j] = b[i][j];
|
||||
}
|
||||
17
gcc/testsuite/gcc.target/i386/memset-strategy-3.c
Normal file
17
gcc/testsuite/gcc.target/i386/memset-strategy-3.c
Normal file
@ -0,0 +1,17 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mtune=generic" } */
|
||||
/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep stosb" } } */
|
||||
|
||||
typedef unsigned char e_u8;
|
||||
|
||||
#define MAXBC 8
|
||||
|
||||
/* Store 1 into the first BC bytes of each of the four rows of A.  BC is
   a run-time argument, so the per-row size is not a compile-time
   constant.  The dg-final directives of this test require that the
   output contains neither a memset call nor "rep stosb".  */
void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < BC; j++) a[i][j] = 1;
|
||||
}
|
||||
17
gcc/testsuite/gcc.target/i386/memset-strategy-4.c
Normal file
17
gcc/testsuite/gcc.target/i386/memset-strategy-4.c
Normal file
@ -0,0 +1,17 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=tigerlake" } */
|
||||
/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep stosb" } } */
|
||||
|
||||
typedef unsigned char e_u8;
|
||||
|
||||
#define MAXBC 8
|
||||
|
||||
/* Store 1 into the first BC bytes of each of the four rows of A.  BC is
   a run-time argument, so the per-row size is not a compile-time
   constant.  The dg-final directives of this test require that the
   output contains neither a memset call nor "rep stosb".  */
void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < BC; j++) a[i][j] = 1;
|
||||
}
|
||||
11
gcc/testsuite/gcc.target/i386/memset-strategy-5.c
Normal file
11
gcc/testsuite/gcc.target/i386/memset-strategy-5.c
Normal file
@ -0,0 +1,11 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
|
||||
/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep stosb" } } */
|
||||
|
||||
/* Zero a constant 257 bytes at DEST.  The dg-final directives of this
   test require a memset library call (jmp or call) and no "rep stosb"
   in the generated assembly.  */
void
|
||||
foo (char *dest)
|
||||
{
|
||||
__builtin_memset (dest, 0, 257);
|
||||
}
|
||||
9
gcc/testsuite/gcc.target/i386/memset-strategy-6.c
Normal file
9
gcc/testsuite/gcc.target/i386/memset-strategy-6.c
Normal file
@ -0,0 +1,9 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
|
||||
/* { dg-final { scan-assembler "rep stosb" } } */
|
||||
|
||||
/* Zero a constant 256 bytes at DEST.  The dg-final directive of this
   test requires "rep stosb" in the generated assembly.  */
void
|
||||
foo (char *dest)
|
||||
{
|
||||
__builtin_memset (dest, 0, 256);
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user