aarch64: Add reduction costs to simd_vec_costs

This patch is part of a series that makes opt-in tweaks to the
AArch64 vector cost model.

At the moment, all reductions are costed as vec_to_scalar, which
also includes things like extracting a single element from a vector.
This is a bit too coarse in practice, since the cost of a reduction
depends very much on the type of value that it's processing.
This patch therefore adds separate costs for each case.  To start with,
all the new costs are copied from the associated vec_to_scalar ones.

Due to the extreme lateness of this patch in the GCC 11 cycle, I've added
a new tuning flag (use_new_vector_costs) that selects the new behaviour.
This should help to ensure that the risk of the new code is only borne
by the CPUs that need it.  Generic tuning is not affected.

gcc/
	* config/aarch64/aarch64-tuning-flags.def (use_new_vector_costs):
	New tuning flag.
	* config/aarch64/aarch64-protos.h (simd_vec_cost): Put comments
	above the fields rather than to the right.
	(simd_vec_cost::reduc_i8_cost): New member variable.
	(simd_vec_cost::reduc_i16_cost): Likewise.
	(simd_vec_cost::reduc_i32_cost): Likewise.
	(simd_vec_cost::reduc_i64_cost): Likewise.
	(simd_vec_cost::reduc_f16_cost): Likewise.
	(simd_vec_cost::reduc_f32_cost): Likewise.
	(simd_vec_cost::reduc_f64_cost): Likewise.
	* config/aarch64/aarch64.c (generic_advsimd_vector_cost): Update
	accordingly, using the vec_to_scalar_cost for the new fields.
	(generic_sve_vector_cost, a64fx_advsimd_vector_cost): Likewise.
	(a64fx_sve_vector_cost, qdf24xx_advsimd_vector_cost): Likewise.
	(thunderx_advsimd_vector_cost, tsv110_advsimd_vector_cost): Likewise.
	(cortexa57_advsimd_vector_cost, exynosm1_advsimd_vector_cost)
	(xgene1_advsimd_vector_cost, thunderx2t99_advsimd_vector_cost)
	(thunderx3t110_advsimd_vector_cost): Likewise.
	(aarch64_use_new_vector_costs_p): New function.
	(aarch64_simd_vec_costs): New function, split out from...
	(aarch64_builtin_vectorization_cost): ...here.
	(aarch64_is_reduction): New function.
	(aarch64_detect_vector_stmt_subtype): Likewise.
	(aarch64_add_stmt_cost): Call aarch64_detect_vector_stmt_subtype if
	using the new vector costs.
This commit is contained in:
Richard Sandiford 2021-03-26 16:08:29 +00:00
parent fdfcb5353c
commit e253bb8b79
3 changed files with 216 additions and 22 deletions

View File

@ -194,22 +194,46 @@ struct cpu_regmove_cost
/* Per-operation costs of vector statements.  Every field is a constant
   that is filled in by the per-CPU cost tables.  */
struct simd_vec_cost
{
const int int_stmt_cost; /* Cost of any int vector operation,
excluding load, store, permute,
vector-to-scalar and
scalar-to-vector operation. */
const int fp_stmt_cost; /* Cost of any fp vector operation,
excluding load, store, permute,
vector-to-scalar and
scalar-to-vector operation. */
const int permute_cost; /* Cost of permute operation. */
const int vec_to_scalar_cost; /* Cost of vec-to-scalar operation. */
const int scalar_to_vec_cost; /* Cost of scalar-to-vector
operation. */
const int align_load_cost; /* Cost of aligned vector load. */
const int unalign_load_cost; /* Cost of unaligned vector load. */
const int unalign_store_cost; /* Cost of unaligned vector store. */
const int store_cost; /* Cost of vector store. */
/* Cost of any integer vector operation, excluding the ones handled
specially below. */
const int int_stmt_cost;
/* Cost of any fp vector operation, excluding the ones handled
specially below. */
const int fp_stmt_cost;
/* Cost of a permute operation. */
const int permute_cost;
/* Cost of reductions for various vector types: iN is for N-bit
integer elements and fN is for N-bit floating-point elements.
We need to single out the element type because it affects the
depth of the reduction. */
const int reduc_i8_cost;
const int reduc_i16_cost;
const int reduc_i32_cost;
const int reduc_i64_cost;
const int reduc_f16_cost;
const int reduc_f32_cost;
const int reduc_f64_cost;
/* Cost of a vector-to-scalar operation, for example extracting a
single element from a vector. */
const int vec_to_scalar_cost;
/* Cost of a scalar-to-vector operation. */
const int scalar_to_vec_cost;
/* Cost of an aligned vector load. */
const int align_load_cost;
/* Cost of an unaligned vector load. */
const int unalign_load_cost;
/* Cost of an unaligned vector store. */
const int unalign_store_cost;
/* Cost of a vector store. */
const int store_cost;
};
typedef struct simd_vec_cost advsimd_vec_cost;

View File

@ -48,4 +48,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
#undef AARCH64_EXTRA_TUNING_OPTION

View File

@ -591,6 +591,13 @@ static const advsimd_vec_cost generic_advsimd_vector_cost =
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
2, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
2, /* reduc_f16_cost */
2, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
@ -605,6 +612,13 @@ static const sve_vec_cost generic_sve_vector_cost =
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
2, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
2, /* reduc_f16_cost */
2, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
@ -631,6 +645,13 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost =
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
3, /* permute_cost */
13, /* reduc_i8_cost */
13, /* reduc_i16_cost */
13, /* reduc_i32_cost */
13, /* reduc_i64_cost */
13, /* reduc_f16_cost */
13, /* reduc_f32_cost */
13, /* reduc_f64_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
@ -644,6 +665,13 @@ static const sve_vec_cost a64fx_sve_vector_cost =
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
3, /* permute_cost */
13, /* reduc_i8_cost */
13, /* reduc_i16_cost */
13, /* reduc_i32_cost */
13, /* reduc_i64_cost */
13, /* reduc_f16_cost */
13, /* reduc_f32_cost */
13, /* reduc_f64_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
@ -669,6 +697,13 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
1, /* int_stmt_cost */
3, /* fp_stmt_cost */
2, /* permute_cost */
1, /* reduc_i8_cost */
1, /* reduc_i16_cost */
1, /* reduc_i32_cost */
1, /* reduc_i64_cost */
1, /* reduc_f16_cost */
1, /* reduc_f32_cost */
1, /* reduc_f64_cost */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
@ -696,6 +731,13 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost =
4, /* int_stmt_cost */
1, /* fp_stmt_cost */
4, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
2, /* reduc_f16_cost */
2, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
3, /* align_load_cost */
@ -722,6 +764,13 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost =
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
2, /* permute_cost */
3, /* reduc_i8_cost */
3, /* reduc_i16_cost */
3, /* reduc_i32_cost */
3, /* reduc_i64_cost */
3, /* reduc_f16_cost */
3, /* reduc_f32_cost */
3, /* reduc_f64_cost */
3, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
5, /* align_load_cost */
@ -747,6 +796,13 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
3, /* permute_cost */
8, /* reduc_i8_cost */
8, /* reduc_i16_cost */
8, /* reduc_i32_cost */
8, /* reduc_i64_cost */
8, /* reduc_f16_cost */
8, /* reduc_f32_cost */
8, /* reduc_f64_cost */
8, /* vec_to_scalar_cost */
8, /* scalar_to_vec_cost */
4, /* align_load_cost */
@ -773,6 +829,13 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
3, /* int_stmt_cost */
3, /* fp_stmt_cost */
3, /* permute_cost */
3, /* reduc_i8_cost */
3, /* reduc_i16_cost */
3, /* reduc_i32_cost */
3, /* reduc_i64_cost */
3, /* reduc_f16_cost */
3, /* reduc_f32_cost */
3, /* reduc_f64_cost */
3, /* vec_to_scalar_cost */
3, /* scalar_to_vec_cost */
5, /* align_load_cost */
@ -798,6 +861,13 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost =
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
2, /* permute_cost */
4, /* reduc_i8_cost */
4, /* reduc_i16_cost */
4, /* reduc_i32_cost */
4, /* reduc_i64_cost */
4, /* reduc_f16_cost */
4, /* reduc_f32_cost */
4, /* reduc_f64_cost */
4, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
10, /* align_load_cost */
@ -824,6 +894,13 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
4, /* int_stmt_cost */
5, /* fp_stmt_cost */
10, /* permute_cost */
6, /* reduc_i8_cost */
6, /* reduc_i16_cost */
6, /* reduc_i32_cost */
6, /* reduc_i64_cost */
6, /* reduc_f16_cost */
6, /* reduc_f32_cost */
6, /* reduc_f64_cost */
6, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
@ -850,6 +927,13 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
5, /* int_stmt_cost */
5, /* fp_stmt_cost */
10, /* permute_cost */
5, /* reduc_i8_cost */
5, /* reduc_i16_cost */
5, /* reduc_i32_cost */
5, /* reduc_i64_cost */
5, /* reduc_f16_cost */
5, /* reduc_f32_cost */
5, /* reduc_f64_cost */
5, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
@ -13874,6 +13958,28 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
/* Vectorizer cost model target hooks. */
/* Tell whether the tuning parameters of the current CPU opt in to the
   vector costs introduced in GCC 11, i.e. whether the
   use_new_vector_costs tuning flag is set.  This check is a GCC 11
   stopgap: for GCC 12 and above it should be removed so that the costs
   apply to every CPU.  */
static bool
aarch64_use_new_vector_costs_p ()
{
  return (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS) != 0;
}
/* Pick the SIMD cost table that applies to vectors of type VECTYPE.
   The SVE table is chosen only when VECTYPE is non-null, has an SVE
   mode and the current tuning provides SVE costs; in every other case
   the Advanced SIMD table is returned.  */
static const simd_vec_cost *
aarch64_simd_vec_costs (tree vectype)
{
  const cpu_vector_cost *tuning = aarch64_tune_params.vec_costs;

  /* Without a vector type there is no mode to inspect.  */
  if (vectype == NULL)
    return tuning->advsimd;

  if (aarch64_sve_mode_p (TYPE_MODE (vectype)) && tuning->sve != NULL)
    return tuning->sve;

  return tuning->advsimd;
}
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
@ -13887,12 +13993,7 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
if (vectype != NULL)
fp = FLOAT_TYPE_P (vectype);
const simd_vec_cost *simd_costs;
if (vectype != NULL && aarch64_sve_mode_p (TYPE_MODE (vectype))
&& costs->sve != NULL)
simd_costs = costs->sve;
else
simd_costs = costs->advsimd;
const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
switch (type_of_cost)
{
@ -13951,6 +14052,14 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
/* Return true if STMT_INFO takes part in a reduction: either it has an
   associated reduction definition, or its definition type is one of the
   vectorizable cycle kinds.  */
static bool
aarch64_is_reduction (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_REDUC_DEF (stmt_info))
    return true;

  return VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info));
}
/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
vectors would produce a series of LDP or STP operations. KIND is the
kind of statement that STMT_INFO represents. */
@ -14014,6 +14123,57 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
&& TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
}
/* Refine the cost of a vectorized statement beyond what the
   target-independent cost kind can express.

   KIND is the cost kind of the vectorized form of STMT_INFO, VECTYPE is
   the vector type it would operate on, WHERE says where the cost
   associated with KIND occurs, and STMT_COST is the cost that
   aarch64_builtin_vectorization_cost calculated for it.

   Return a more specific cost when one is known, otherwise STMT_COST.  */
static unsigned int
aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind,
                                    stmt_vec_info stmt_info, tree vectype,
                                    enum vect_cost_model_location where,
                                    unsigned int stmt_cost)
{
  /* The only case refined here is a vec_to_scalar in the epilogue that
     represents a single reduction instruction like FADDP or MAXV.
     Anything else keeps its original categorization.  */
  if (kind != vec_to_scalar
      || where != vect_epilogue
      || !aarch64_is_reduction (stmt_info))
    return stmt_cost;

  const simd_vec_cost *costs = aarch64_simd_vec_costs (vectype);

  /* The reduction cost is selected by the element mode of the vector.  */
  switch (GET_MODE_INNER (TYPE_MODE (vectype)))
    {
    case E_QImode:
      return costs->reduc_i8_cost;

    case E_HImode:
      return costs->reduc_i16_cost;

    case E_SImode:
      return costs->reduc_i32_cost;

    case E_DImode:
      return costs->reduc_i64_cost;

    /* Both 16-bit floating-point formats share the f16 cost.  */
    case E_HFmode:
    case E_BFmode:
      return costs->reduc_f16_cost;

    case E_SFmode:
      return costs->reduc_f32_cost;

    case E_DFmode:
      return costs->reduc_f64_cost;

    default:
      /* No dedicated reduction cost for this element mode.  */
      return stmt_cost;
    }
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
for STMT_INFO, which has cost kind KIND and which when vectorized would
operate on vector type VECTYPE. Adjust the cost as necessary for SVE
@ -14097,6 +14257,14 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
int stmt_cost =
aarch64_builtin_vectorization_cost (kind, vectype, misalign);
/* Try to get a more accurate cost by looking at STMT_INFO instead
of just looking at KIND. */
if (stmt_info && vectype && aarch64_use_new_vector_costs_p ())
stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info,
vectype, where,
stmt_cost);
/* Do any SVE-specific adjustments to the cost. */
if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
vectype, stmt_cost);