aarch64: Add costs for one element of a scatter store

Currently each element in a gather load is costed as a scalar_load
and each element in a scatter store is costed as a scalar_store.
The load side seems to work pretty well in practice, since many
CPU-specific costs give loads quite a high cost relative to
arithmetic operations.  However, stores usually have a cost
of just 1, which means that scatters tend to appear too cheap.

This patch adds a separate cost for one element in a scatter store.

As with the previous patches, this one only becomes active if
a CPU selects use_new_vector_costs.  It should therefore have
a very low impact on other CPUs.

gcc/
	* config/aarch64/aarch64-protos.h
	(sve_vec_cost::scatter_store_elt_cost): New member variable.
	* config/aarch64/aarch64.c (generic_sve_vector_cost): Update
	accordingly, taking the cost from the cost of a scalar_store.
	(a64fx_sve_vector_cost): Likewise.
	(aarch64_detect_vector_stmt_subtype): Detect scatter stores.
This commit is contained in:
Richard Sandiford 2021-03-26 16:08:32 +00:00
parent d1ff0847b2
commit 7c679969ba
2 changed files with 18 additions and 4 deletions

View File

@ -256,12 +256,14 @@ struct sve_vec_cost : simd_vec_cost
unsigned int clast_cost,
unsigned int fadda_f16_cost,
unsigned int fadda_f32_cost,
unsigned int fadda_f64_cost)
unsigned int fadda_f64_cost,
unsigned int scatter_store_elt_cost)
: simd_vec_cost (base),
clast_cost (clast_cost),
fadda_f16_cost (fadda_f16_cost),
fadda_f32_cost (fadda_f32_cost),
fadda_f64_cost (fadda_f64_cost)
fadda_f64_cost (fadda_f64_cost),
scatter_store_elt_cost (scatter_store_elt_cost)
{}
/* The cost of a vector-to-scalar CLASTA or CLASTB instruction,
@ -274,6 +276,9 @@ struct sve_vec_cost : simd_vec_cost
const int fadda_f16_cost;
const int fadda_f32_cost;
const int fadda_f64_cost;
/* The per-element cost of a scatter store. */
const int scatter_store_elt_cost;
};
/* Cost for vector insn classes. */

View File

@ -638,7 +638,8 @@ static const sve_vec_cost generic_sve_vector_cost =
2, /* clast_cost */
2, /* fadda_f16_cost */
2, /* fadda_f32_cost */
2 /* fadda_f64_cost */
2, /* fadda_f64_cost */
1 /* scatter_store_elt_cost */
};
/* Generic costs for vector insn classes. */
@ -705,7 +706,8 @@ static const sve_vec_cost a64fx_sve_vector_cost =
13, /* clast_cost */
13, /* fadda_f16_cost */
13, /* fadda_f32_cost */
13 /* fadda_f64_cost */
13, /* fadda_f64_cost */
1 /* scatter_store_elt_cost */
};
static const struct cpu_vector_cost a64fx_vector_cost =
@ -14279,6 +14281,13 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
&& DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
return simd_costs->store_elt_extra_cost;
/* Detect cases in which a scalar_store is really storing one element
in a scatter operation. */
if (kind == scalar_store
&& sve_costs
&& STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
return sve_costs->scatter_store_elt_cost;
/* Detect cases in which vec_to_scalar represents an in-loop reduction. */
if (kind == vec_to_scalar
&& where == vect_body