x86: Update memcpy/memset inline strategies for Skylake family CPUs
Simplify memcpy and memset inline strategies to avoid branches for
Skylake family CPUs:
1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
load and store for up to 16 * 16 (256) bytes when the data size is
fixed and known.
2. Inline only if data size is known to be <= 256.
a. Use "rep movsb/stosb" with simple code sequence if the data size
is a constant.
b. Use loop if data size is not a constant.
3. Use memcpy/memset library function if data size is unknown or > 256.
On Cascadelake processor with -march=native -Ofast -flto,
1. Performance impacts of SPEC CPU 2017 rate are:
500.perlbench_r 0.17%
502.gcc_r -0.36%
505.mcf_r 0.00%
520.omnetpp_r 0.08%
523.xalancbmk_r -0.62%
525.x264_r 1.04%
531.deepsjeng_r 0.11%
541.leela_r -1.09%
548.exchange2_r -0.25%
557.xz_r 0.17%
Geomean -0.08%
503.bwaves_r 0.00%
507.cactuBSSN_r 0.69%
508.namd_r -0.07%
510.parest_r 1.12%
511.povray_r 1.82%
519.lbm_r 0.00%
521.wrf_r -1.32%
526.blender_r -0.47%
527.cam4_r 0.23%
538.imagick_r -1.72%
544.nab_r -0.56%
549.fotonik3d_r 0.12%
554.roms_r 0.43%
Geomean 0.02%
2. Significant impacts on eembc benchmarks are:
eembc/idctrn01 9.23%
eembc/nnet_test 29.26%
gcc/
* config/i386/x86-tune-costs.h (skylake_memcpy): Updated.
(skylake_memset): Likewise.
(skylake_cost): Change CLEAR_RATIO to 17.
* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER,
m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512.
gcc/testsuite/
* gcc.target/i386/memcpy-strategy-9.c: New test.
* gcc.target/i386/memcpy-strategy-10.c: Likewise.
* gcc.target/i386/memcpy-strategy-11.c: Likewise.
* gcc.target/i386/memset-strategy-7.c: Likewise.
* gcc.target/i386/memset-strategy-8.c: Likewise.
* gcc.target/i386/memset-strategy-9.c: Likewise.
This commit is contained in:
parent
e5c170e080
commit
a32452a544
@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = {
|
||||
|
||||
/* skylake_cost should produce code tuned for Skylake family of CPUs. */
|
||||
static stringop_algs skylake_memcpy[2] = {
|
||||
{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
|
||||
{libcall, {{16, loop, false}, {512, unrolled_loop, false},
|
||||
{-1, libcall, false}}}};
|
||||
{libcall,
|
||||
{{256, rep_prefix_1_byte, true},
|
||||
{256, loop, false},
|
||||
{-1, libcall, false}}},
|
||||
{libcall,
|
||||
{{256, rep_prefix_1_byte, true},
|
||||
{256, loop, false},
|
||||
{-1, libcall, false}}}};
|
||||
|
||||
static stringop_algs skylake_memset[2] = {
|
||||
{libcall, {{6, loop_1_byte, true},
|
||||
{24, loop, true},
|
||||
{8192, rep_prefix_4_byte, true},
|
||||
{-1, libcall, false}}},
|
||||
{libcall, {{24, loop, true}, {512, unrolled_loop, false},
|
||||
{-1, libcall, false}}}};
|
||||
{libcall,
|
||||
{{256, rep_prefix_1_byte, true},
|
||||
{256, loop, false},
|
||||
{-1, libcall, false}}},
|
||||
{libcall,
|
||||
{{256, rep_prefix_1_byte, true},
|
||||
{256, loop, false},
|
||||
{-1, libcall, false}}}};
|
||||
|
||||
static const
|
||||
struct processor_costs skylake_cost = {
|
||||
@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = {
|
||||
COSTS_N_INSNS (0), /* cost of movzx */
|
||||
8, /* "large" insn */
|
||||
17, /* MOVE_RATIO */
|
||||
6, /* CLEAR_RATIO */
|
||||
17, /* CLEAR_RATIO */
|
||||
{4, 4, 4}, /* cost of loading integer registers
|
||||
in QImode, HImode and SImode.
|
||||
Relative to reg-reg move (2). */
|
||||
|
||||
@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
|
||||
move/set sequences of bytes with known size. */
|
||||
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
|
||||
"prefer_known_rep_movsb_stosb",
|
||||
m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE
|
||||
| m_ALDERLAKE | m_SAPPHIRERAPIDS)
|
||||
m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
|
||||
|
||||
/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
|
||||
compact prologues and epilogues by issuing misaligned moves. This
|
||||
|
||||
11
gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
Normal file
11
gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
Normal file
@ -0,0 +1,11 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=skylake -mno-sse" } */
|
||||
/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep movsb" } } */
|
||||
|
||||
void
|
||||
foo (char *dest, char *src)
|
||||
{
|
||||
__builtin_memcpy (dest, src, 257);
|
||||
}
|
||||
18
gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
Normal file
18
gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
Normal file
@ -0,0 +1,18 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=skylake" } */
|
||||
/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep movsb" } } */
|
||||
|
||||
typedef unsigned char e_u8;
|
||||
|
||||
#define MAXBC 8
|
||||
|
||||
void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
|
||||
{
|
||||
e_u8 b[4][MAXBC];
|
||||
int i, j;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < BC; j++) a[i][j] = b[i][j];
|
||||
}
|
||||
9
gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
Normal file
9
gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
Normal file
@ -0,0 +1,9 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=skylake -mno-sse" } */
|
||||
/* { dg-final { scan-assembler "rep movsb" } } */
|
||||
|
||||
void
|
||||
foo (char *dest, char *src)
|
||||
{
|
||||
__builtin_memcpy (dest, src, 256);
|
||||
}
|
||||
11
gcc/testsuite/gcc.target/i386/memset-strategy-7.c
Normal file
11
gcc/testsuite/gcc.target/i386/memset-strategy-7.c
Normal file
@ -0,0 +1,11 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=skylake -mno-sse" } */
|
||||
/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep stosb" } } */
|
||||
|
||||
void
|
||||
foo (char *dest)
|
||||
{
|
||||
__builtin_memset (dest, 0, 257);
|
||||
}
|
||||
9
gcc/testsuite/gcc.target/i386/memset-strategy-8.c
Normal file
9
gcc/testsuite/gcc.target/i386/memset-strategy-8.c
Normal file
@ -0,0 +1,9 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=skylake -mno-sse" } */
|
||||
/* { dg-final { scan-assembler "rep stosb" } } */
|
||||
|
||||
void
|
||||
foo (char *dest)
|
||||
{
|
||||
__builtin_memset (dest, 0, 256);
|
||||
}
|
||||
17
gcc/testsuite/gcc.target/i386/memset-strategy-9.c
Normal file
17
gcc/testsuite/gcc.target/i386/memset-strategy-9.c
Normal file
@ -0,0 +1,17 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=skylake" } */
|
||||
/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */
|
||||
/* { dg-final { scan-assembler-not "rep stosb" } } */
|
||||
|
||||
typedef unsigned char e_u8;
|
||||
|
||||
#define MAXBC 8
|
||||
|
||||
void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < BC; j++) a[i][j] = 1;
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user