diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ff87ced2a34..e4eeb2ce142 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -194,22 +194,46 @@ struct cpu_regmove_cost
 
 struct simd_vec_cost
 {
-  const int int_stmt_cost;		/* Cost of any int vector operation,
-					   excluding load, store, permute,
-					   vector-to-scalar and
-					   scalar-to-vector operation.  */
-  const int fp_stmt_cost;		 /* Cost of any fp vector operation,
-					    excluding load, store, permute,
-					    vector-to-scalar and
-					    scalar-to-vector operation.  */
-  const int permute_cost;		 /* Cost of permute operation.  */
-  const int vec_to_scalar_cost;		 /* Cost of vec-to-scalar operation.  */
-  const int scalar_to_vec_cost;		 /* Cost of scalar-to-vector
-					    operation.  */
-  const int align_load_cost;	 /* Cost of aligned vector load.  */
-  const int unalign_load_cost;	 /* Cost of unaligned vector load.  */
-  const int unalign_store_cost;	 /* Cost of unaligned vector store.  */
-  const int store_cost;		 /* Cost of vector store.  */
+  /* Cost of any integer vector operation, excluding the ones handled
+     specially below.  */
+  const int int_stmt_cost;
+
+  /* Cost of any fp vector operation, excluding the ones handled
+     specially below.  */
+  const int fp_stmt_cost;
+
+  /* Cost of a permute operation.  */
+  const int permute_cost;
+
+  /* Cost of reductions for various vector types: iN is for N-bit
+     integer elements and fN is for N-bit floating-point elements.
+     We need to single out the element type because it affects the
+     depth of the reduction.  */
+  const int reduc_i8_cost;
+  const int reduc_i16_cost;
+  const int reduc_i32_cost;
+  const int reduc_i64_cost;
+  const int reduc_f16_cost;
+  const int reduc_f32_cost;
+  const int reduc_f64_cost;
+
+  /* Cost of a vector-to-scalar operation.  */
+  const int vec_to_scalar_cost;
+
+  /* Cost of a scalar-to-vector operation.  */
+  const int scalar_to_vec_cost;
+
+  /* Cost of an aligned vector load.  */
+  const int align_load_cost;
+
+  /* Cost of an unaligned vector load.  */
+  const int unalign_load_cost;
+
+  /* Cost of an unaligned vector store.  */
+  const int unalign_store_cost;
+
+  /* Cost of a vector store.  */
+  const int store_cost;
 };
 
 typedef struct simd_vec_cost advsimd_vec_cost;
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 588edf4d923..a61fcf94916 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -48,4 +48,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
 
 AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
 
+AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c8a87fe858a..b44dcdc6a6e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -591,6 +591,13 @@ static const advsimd_vec_cost generic_advsimd_vector_cost =
   1, /* int_stmt_cost  */
   1, /* fp_stmt_cost  */
   2, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
   2, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
   1, /* align_load_cost  */
@@ -605,6 +612,13 @@ static const sve_vec_cost generic_sve_vector_cost =
   1, /* int_stmt_cost  */
   1, /* fp_stmt_cost  */
   2, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
   2, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
   1, /* align_load_cost  */
@@ -631,6 +645,13 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost =
   2, /* int_stmt_cost  */
   5, /* fp_stmt_cost  */
   3, /* permute_cost  */
+  13, /* reduc_i8_cost  */
+  13, /* reduc_i16_cost  */
+  13, /* reduc_i32_cost  */
+  13, /* reduc_i64_cost  */
+  13, /* reduc_f16_cost  */
+  13, /* reduc_f32_cost  */
+  13, /* reduc_f64_cost  */
   13, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
   6, /* align_load_cost  */
@@ -644,6 +665,13 @@ static const sve_vec_cost a64fx_sve_vector_cost =
   2, /* int_stmt_cost  */
   5, /* fp_stmt_cost  */
   3, /* permute_cost  */
+  13, /* reduc_i8_cost  */
+  13, /* reduc_i16_cost  */
+  13, /* reduc_i32_cost  */
+  13, /* reduc_i64_cost  */
+  13, /* reduc_f16_cost  */
+  13, /* reduc_f32_cost  */
+  13, /* reduc_f64_cost  */
   13, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
   6, /* align_load_cost  */
@@ -669,6 +697,13 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
   1, /* int_stmt_cost  */
   3, /* fp_stmt_cost  */
   2, /* permute_cost  */
+  1, /* reduc_i8_cost  */
+  1, /* reduc_i16_cost  */
+  1, /* reduc_i32_cost  */
+  1, /* reduc_i64_cost  */
+  1, /* reduc_f16_cost  */
+  1, /* reduc_f32_cost  */
+  1, /* reduc_f64_cost  */
   1, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
   1, /* align_load_cost  */
@@ -696,6 +731,13 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost =
   4, /* int_stmt_cost  */
   1, /* fp_stmt_cost  */
   4, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
   2, /* vec_to_scalar_cost  */
   2, /* scalar_to_vec_cost  */
   3, /* align_load_cost  */
@@ -722,6 +764,13 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost =
   2, /* int_stmt_cost  */
   2, /* fp_stmt_cost  */
   2, /* permute_cost  */
+  3, /* reduc_i8_cost  */
+  3, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  3, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  3, /* reduc_f64_cost  */
   3, /* vec_to_scalar_cost  */
   2, /* scalar_to_vec_cost  */
   5, /* align_load_cost  */
@@ -747,6 +796,13 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
   2, /* int_stmt_cost  */
   2, /* fp_stmt_cost  */
   3, /* permute_cost  */
+  8, /* reduc_i8_cost  */
+  8, /* reduc_i16_cost  */
+  8, /* reduc_i32_cost  */
+  8, /* reduc_i64_cost  */
+  8, /* reduc_f16_cost  */
+  8, /* reduc_f32_cost  */
+  8, /* reduc_f64_cost  */
   8, /* vec_to_scalar_cost  */
   8, /* scalar_to_vec_cost  */
   4, /* align_load_cost  */
@@ -773,6 +829,13 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
   3, /* int_stmt_cost  */
   3, /* fp_stmt_cost  */
   3, /* permute_cost  */
+  3, /* reduc_i8_cost  */
+  3, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  3, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  3, /* reduc_f64_cost  */
   3, /* vec_to_scalar_cost  */
   3, /* scalar_to_vec_cost  */
   5, /* align_load_cost  */
@@ -798,6 +861,13 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost =
   2, /* int_stmt_cost  */
   2, /* fp_stmt_cost  */
   2, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  4, /* reduc_i32_cost  */
+  4, /* reduc_i64_cost  */
+  4, /* reduc_f16_cost  */
+  4, /* reduc_f32_cost  */
+  4, /* reduc_f64_cost  */
   4, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
   10, /* align_load_cost  */
@@ -824,6 +894,13 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
   4, /* int_stmt_cost  */
   5, /* fp_stmt_cost  */
   10, /* permute_cost  */
+  6, /* reduc_i8_cost  */
+  6, /* reduc_i16_cost  */
+  6, /* reduc_i32_cost  */
+  6, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  6, /* reduc_f32_cost  */
+  6, /* reduc_f64_cost  */
   6, /* vec_to_scalar_cost  */
   5, /* scalar_to_vec_cost  */
   4, /* align_load_cost  */
@@ -850,6 +927,13 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
   5, /* int_stmt_cost  */
   5, /* fp_stmt_cost  */
   10, /* permute_cost  */
+  5, /* reduc_i8_cost  */
+  5, /* reduc_i16_cost  */
+  5, /* reduc_i32_cost  */
+  5, /* reduc_i64_cost  */
+  5, /* reduc_f16_cost  */
+  5, /* reduc_f32_cost  */
+  5, /* reduc_f64_cost  */
   5, /* vec_to_scalar_cost  */
   5, /* scalar_to_vec_cost  */
   4, /* align_load_cost  */
@@ -13874,6 +13958,28 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
 
 /* Vectorizer cost model target hooks.  */
 
+/* Return true if the current CPU should use the new costs defined
+   in GCC 11.  This should be removed for GCC 12 and above, with the
+   costs applying to all CPUs instead.  */
+static bool
+aarch64_use_new_vector_costs_p ()
+{
+  return (aarch64_tune_params.extra_tuning_flags
+	  & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
+}
+
+/* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
+static const simd_vec_cost *
+aarch64_simd_vec_costs (tree vectype)
+{
+  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
+  if (vectype != NULL
+      && aarch64_sve_mode_p (TYPE_MODE (vectype))
+      && costs->sve != NULL)
+    return costs->sve;
+  return costs->advsimd;
+}
+
 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
 static int
 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
@@ -13887,12 +13993,7 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
   if (vectype != NULL)
     fp = FLOAT_TYPE_P (vectype);
 
-  const simd_vec_cost *simd_costs;
-  if (vectype != NULL && aarch64_sve_mode_p (TYPE_MODE (vectype))
-      && costs->sve != NULL)
-    simd_costs = costs->sve;
-  else
-    simd_costs = costs->advsimd;
+  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
 
   switch (type_of_cost)
     {
@@ -13951,6 +14052,14 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
     }
 }
 
+/* Return true if STMT_INFO represents part of a reduction.  */
+static bool
+aarch64_is_reduction (stmt_vec_info stmt_info)
+{
+  return (STMT_VINFO_REDUC_DEF (stmt_info)
+	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
+}
+
 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
    vectors would produce a series of LDP or STP operations.  KIND is the
    kind of statement that STMT_INFO represents.  */
@@ -14014,6 +14123,57 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
 	  && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
 }
 
+/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
+   for the vectorized form of STMT_INFO, which has cost kind KIND and which
+   when vectorized would operate on vector type VECTYPE.  Try to subdivide
+   the target-independent categorization provided by KIND to get a more
+   accurate cost.  WHERE specifies where the cost associated with KIND
+   occurs.  */
+static unsigned int
+aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind,
+				    stmt_vec_info stmt_info, tree vectype,
+				    enum vect_cost_model_location where,
+				    unsigned int stmt_cost)
+{
+  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
+
+  /* Detect cases in which vec_to_scalar represents a single reduction
+     instruction like FADDP or MAXV.  */
+  if (kind == vec_to_scalar
+      && where == vect_epilogue
+      && aarch64_is_reduction (stmt_info))
+    switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+      {
+      case E_QImode:
+	return simd_costs->reduc_i8_cost;
+
+      case E_HImode:
+	return simd_costs->reduc_i16_cost;
+
+      case E_SImode:
+	return simd_costs->reduc_i32_cost;
+
+      case E_DImode:
+	return simd_costs->reduc_i64_cost;
+
+      case E_HFmode:
+      case E_BFmode:
+	return simd_costs->reduc_f16_cost;
+
+      case E_SFmode:
+	return simd_costs->reduc_f32_cost;
+
+      case E_DFmode:
+	return simd_costs->reduc_f64_cost;
+
+      default:
+	break;
+      }
+
+  /* Otherwise stick with the original categorization.  */
+  return stmt_cost;
+}
+
 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
    for STMT_INFO, which has cost kind KIND and which when vectorized would
    operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
@@ -14097,6 +14257,14 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
       int stmt_cost =
 	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);
 
+      /* Try to get a more accurate cost by looking at STMT_INFO instead
+	 of just looking at KIND.  */
+      if (stmt_info && vectype && aarch64_use_new_vector_costs_p ())
+	stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info,
+							vectype, where,
+							stmt_cost);
+
+      /* Do any SVE-specific adjustments to the cost.  */
       if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
 	stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
 						  vectype, stmt_cost);