aarch64: Add reduction costs to simd_vec_costs

This patch is part of a series that makes opt-in tweaks to the AArch64 vector cost model. At the moment, all reductions are costed as vec_to_scalar, which also includes things like extracting a single element from a vector. This is a bit too coarse in practice, since the cost of a reduction depends very much on the type of value that it's processing. This patch therefore adds separate costs for each case. To start with, all the new costs are copied from the associated vec_to_scalar ones. Due to the extreme lateness of this patch in the GCC 11 cycle, I've added a new tuning flag (use_new_vector_costs) that selects the new behaviour. This should help to ensure that the risk of the new code is only borne by the CPUs that need it. Generic tuning is not affected. gcc/ * config/aarch64/aarch64-tuning-flags.def (use_new_vector_costs): New tuning flag. * config/aarch64/aarch64-protos.h (simd_vec_cost): Put comments above the fields rather than to the right. (simd_vec_cost::reduc_i8_cost): New member variable. (simd_vec_cost::reduc_i16_cost): Likewise. (simd_vec_cost::reduc_i32_cost): Likewise. (simd_vec_cost::reduc_i64_cost): Likewise. (simd_vec_cost::reduc_f16_cost): Likewise. (simd_vec_cost::reduc_f32_cost): Likewise. (simd_vec_cost::reduc_f64_cost): Likewise. * config/aarch64/aarch64.c (generic_advsimd_vector_cost): Update accordingly, using the vec_to_scalar_cost for the new fields. (generic_sve_vector_cost, a64fx_advsimd_vector_cost): Likewise. (a64fx_sve_vector_cost, qdf24xx_advsimd_vector_cost): Likewise. (thunderx_advsimd_vector_cost, tsv110_advsimd_vector_cost): Likewise. (cortexa57_advsimd_vector_cost, exynosm1_advsimd_vector_cost) (xgene1_advsimd_vector_cost, thunderx2t99_advsimd_vector_cost) (thunderx3t110_advsimd_vector_cost): Likewise. (aarch64_use_new_vector_costs_p): New function. (aarch64_simd_vec_costs): New function, split out from... (aarch64_builtin_vectorization_cost): ...here. (aarch64_is_reduction): New function. 
(aarch64_detect_vector_stmt_subtype): Likewise. (aarch64_add_stmt_cost): Call aarch64_detect_vector_stmt_subtype if using the new vector costs.
2021-03-26 16:08:29 +00:00 · 2021-03-26 16:08:29 +00:00 · e253bb8b79
commit e253bb8b79
parent fdfcb5353c
3 changed files with 216 additions and 22 deletions
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@ -194,22 +194,46 @@ struct cpu_regmove_cost

 struct simd_vec_cost
 {
-  const int int_stmt_cost;		/* Cost of any int vector operation,
-					   excluding load, store, permute,
-					   vector-to-scalar and
-					   scalar-to-vector operation.  */
-  const int fp_stmt_cost;		 /* Cost of any fp vector operation,
-					    excluding load, store, permute,
-					    vector-to-scalar and
-					    scalar-to-vector operation.  */
-  const int permute_cost;		 /* Cost of permute operation.  */
-  const int vec_to_scalar_cost;		 /* Cost of vec-to-scalar operation.  */
-  const int scalar_to_vec_cost;		 /* Cost of scalar-to-vector
-					    operation.  */
-  const int align_load_cost;	 /* Cost of aligned vector load.  */
-  const int unalign_load_cost;	 /* Cost of unaligned vector load.  */
-  const int unalign_store_cost;	 /* Cost of unaligned vector store.  */
-  const int store_cost;		 /* Cost of vector store.  */
+  /* Cost of any integer vector operation, excluding the ones handled
+     specially below.  */
+  const int int_stmt_cost;
+
+  /* Cost of any fp vector operation, excluding the ones handled
+     specially below.  */
+  const int fp_stmt_cost;
+
+  /* Cost of a permute operation.  */
+  const int permute_cost;
+
+  /* Cost of reductions for various vector types: iN is for N-bit
+     integer elements and fN is for N-bit floating-point elements.
+     We need to single out the element type because it affects the
+     depth of the reduction.  */
+  const int reduc_i8_cost;
+  const int reduc_i16_cost;
+  const int reduc_i32_cost;
+  const int reduc_i64_cost;
+  const int reduc_f16_cost;
+  const int reduc_f32_cost;
+  const int reduc_f64_cost;
+
+  /* Cost of a vector-to-scalar operation.  */
+  const int vec_to_scalar_cost;
+
+  /* Cost of a scalar-to-vector operation.  */
+  const int scalar_to_vec_cost;
+
+  /* Cost of an aligned vector load.  */
+  const int align_load_cost;
+
+  /* Cost of an unaligned vector load.  */
+  const int unalign_load_cost;
+
+  /* Cost of an unaligned vector store.  */
+  const int unalign_store_cost;
+
+  /* Cost of a vector store.  */
+  const int store_cost;
 };

 typedef struct simd_vec_cost advsimd_vec_cost;
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@ -48,4 +48,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)

 AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)

+AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@ -591,6 +591,13 @@ static const advsimd_vec_cost generic_advsimd_vector_cost =
  1, /* int_stmt_cost  */
  1, /* fp_stmt_cost  */
  2, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
  2, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* align_load_cost  */
@ -605,6 +612,13 @@ static const sve_vec_cost generic_sve_vector_cost =
  1, /* int_stmt_cost  */
  1, /* fp_stmt_cost  */
  2, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
  2, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* align_load_cost  */
@ -631,6 +645,13 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost =
  2, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  3, /* permute_cost  */
+  13, /* reduc_i8_cost  */
+  13, /* reduc_i16_cost  */
+  13, /* reduc_i32_cost  */
+  13, /* reduc_i64_cost  */
+  13, /* reduc_f16_cost  */
+  13, /* reduc_f32_cost  */
+  13, /* reduc_f64_cost  */
  13, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  6, /* align_load_cost  */
@ -644,6 +665,13 @@ static const sve_vec_cost a64fx_sve_vector_cost =
  2, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  3, /* permute_cost  */
+  13, /* reduc_i8_cost  */
+  13, /* reduc_i16_cost  */
+  13, /* reduc_i32_cost  */
+  13, /* reduc_i64_cost  */
+  13, /* reduc_f16_cost  */
+  13, /* reduc_f32_cost  */
+  13, /* reduc_f64_cost  */
  13, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  6, /* align_load_cost  */
@ -669,6 +697,13 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
  1, /* int_stmt_cost  */
  3, /* fp_stmt_cost  */
  2, /* permute_cost  */
+  1, /* reduc_i8_cost  */
+  1, /* reduc_i16_cost  */
+  1, /* reduc_i32_cost  */
+  1, /* reduc_i64_cost  */
+  1, /* reduc_f16_cost  */
+  1, /* reduc_f32_cost  */
+  1, /* reduc_f64_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* align_load_cost  */
@ -696,6 +731,13 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost =
  4, /* int_stmt_cost  */
  1, /* fp_stmt_cost  */
  4, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* align_load_cost  */
@ -722,6 +764,13 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost =
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  2, /* permute_cost  */
+  3, /* reduc_i8_cost  */
+  3, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  3, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  3, /* reduc_f64_cost  */
  3, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  5, /* align_load_cost  */
@ -747,6 +796,13 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  3, /* permute_cost  */
+  8, /* reduc_i8_cost  */
+  8, /* reduc_i16_cost  */
+  8, /* reduc_i32_cost  */
+  8, /* reduc_i64_cost  */
+  8, /* reduc_f16_cost  */
+  8, /* reduc_f32_cost  */
+  8, /* reduc_f64_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
@ -773,6 +829,13 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
  3, /* int_stmt_cost  */
  3, /* fp_stmt_cost  */
  3, /* permute_cost  */
+  3, /* reduc_i8_cost  */
+  3, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  3, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  3, /* reduc_f64_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* align_load_cost  */
@ -798,6 +861,13 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost =
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  2, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  4, /* reduc_i32_cost  */
+  4, /* reduc_i64_cost  */
+  4, /* reduc_f16_cost  */
+  4, /* reduc_f32_cost  */
+  4, /* reduc_f64_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* align_load_cost  */
@ -824,6 +894,13 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
  4, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  10, /* permute_cost  */
+  6, /* reduc_i8_cost  */
+  6, /* reduc_i16_cost  */
+  6, /* reduc_i32_cost  */
+  6, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  6, /* reduc_f32_cost  */
+  6, /* reduc_f64_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
@ -850,6 +927,13 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
  5, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  10, /* permute_cost  */
+  5, /* reduc_i8_cost  */
+  5, /* reduc_i16_cost  */
+  5, /* reduc_i32_cost  */
+  5, /* reduc_i64_cost  */
+  5, /* reduc_f16_cost  */
+  5, /* reduc_f32_cost  */
+  5, /* reduc_f64_cost  */
  5, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
@ -13874,6 +13958,28 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,

 /* Vectorizer cost model target hooks.  */

+/* Return true if the current CPU should use the new costs defined
+   in GCC 11.  This should be removed for GCC 12 and above, with the
+   costs applying to all CPUs instead.
+
+   The decision is made by testing whether the
+   AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS bit is set in
+   aarch64_tune_params.extra_tuning_flags.  */
+static bool
+aarch64_use_new_vector_costs_p ()
+{
+  return (aarch64_tune_params.extra_tuning_flags
+	  & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
+}
+
+/* Return the appropriate SIMD costs for vectors of type VECTYPE.
+
+   The SVE cost table of the current tuning target is returned when
+   VECTYPE is nonnull, its mode is an SVE mode, and the tuning target
+   provides SVE costs.  In every other case (including a null VECTYPE)
+   the Advanced SIMD cost table is returned.  */
+static const simd_vec_cost *
+aarch64_simd_vec_costs (tree vectype)
+{
+  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
+  if (vectype != NULL
+      && aarch64_sve_mode_p (TYPE_MODE (vectype))
+      && costs->sve != NULL)
+    return costs->sve;
+  return costs->advsimd;
+}
+
 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
 static int
 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
@ -13887,12 +13993,7 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

-  const simd_vec_cost *simd_costs;
-  if (vectype != NULL && aarch64_sve_mode_p (TYPE_MODE (vectype))
-      && costs->sve != NULL)
-    simd_costs = costs->sve;
-  else
-    simd_costs = costs->advsimd;
+  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);

  switch (type_of_cost)
    {
@ -13951,6 +14052,14 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
    }
 }

+/* Return true if STMT_INFO represents part of a reduction.
+
+   This is the case when STMT_VINFO_REDUC_DEF is set for STMT_INFO or
+   when VECTORIZABLE_CYCLE_DEF accepts STMT_INFO's definition type.  */
+static bool
+aarch64_is_reduction (stmt_vec_info stmt_info)
+{
+  return (STMT_VINFO_REDUC_DEF (stmt_info)
+	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
+}
+
 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
   vectors would produce a series of LDP or STP operations.  KIND is the
   kind of statement that STMT_INFO represents.  */
@ -14014,6 +14123,57 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
 	  && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
 }

+/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
+   for the vectorized form of STMT_INFO, which has cost kind KIND and which
+   when vectorized would operate on vector type VECTYPE.  Try to subdivide
+   the target-independent categorization provided by KIND to get a more
+   accurate cost.  WHERE specifies where the cost associated with KIND
+   occurs.
+
+   Return the refined cost, or STMT_COST unchanged if no more specific
+   category applies.  The only refinement made here is for reductions:
+   a vec_to_scalar operation in the epilogue that belongs to a reduction
+   is costed using the reduc_*_cost field that matches the element mode
+   of VECTYPE.  */
+static unsigned int
+aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind,
+				    stmt_vec_info stmt_info, tree vectype,
+				    enum vect_cost_model_location where,
+				    unsigned int stmt_cost)
+{
+  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
+
+  /* Detect cases in which vec_to_scalar represents a single reduction
+     instruction like FADDP or MAXV.  The element mode of VECTYPE selects
+     the cost field; HFmode and BFmode share reduc_f16_cost, and element
+     modes not listed below fall through to STMT_COST.  */
+  if (kind == vec_to_scalar
+      && where == vect_epilogue
+      && aarch64_is_reduction (stmt_info))
+    switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+      {
+      case E_QImode:
+	return simd_costs->reduc_i8_cost;
+
+      case E_HImode:
+	return simd_costs->reduc_i16_cost;
+
+      case E_SImode:
+	return simd_costs->reduc_i32_cost;
+
+      case E_DImode:
+	return simd_costs->reduc_i64_cost;
+
+      case E_HFmode:
+      case E_BFmode:
+	return simd_costs->reduc_f16_cost;
+
+      case E_SFmode:
+	return simd_costs->reduc_f32_cost;
+
+      case E_DFmode:
+	return simd_costs->reduc_f64_cost;
+
+      default:
+	break;
+      }
+
+  /* Otherwise stick with the original categorization.  */
+  return stmt_cost;
+}
+
 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND and which when vectorized would
   operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
@ -14097,6 +14257,14 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
      int stmt_cost =
 	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);

+      /* Try to get a more accurate cost by looking at STMT_INFO instead
+	 of just looking at KIND.  */
+      if (stmt_info && vectype && aarch64_use_new_vector_costs_p ())
+	stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info,
+							vectype, where,
+							stmt_cost);
+
+      /* Do any SVE-specific adjustments to the cost.  */
      if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
 	stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
 						  vectype, stmt_cost);