diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e4eeb2ce142..bfcab72b122 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -237,7 +237,33 @@ struct simd_vec_cost
 };
 
 typedef struct simd_vec_cost advsimd_vec_cost;
-typedef struct simd_vec_cost sve_vec_cost;
+
+/* SVE-specific extensions to the information provided by simd_vec_cost. */
+struct sve_vec_cost : simd_vec_cost
+{
+  constexpr sve_vec_cost (const simd_vec_cost &base,
+                          unsigned int clast_cost,
+                          unsigned int fadda_f16_cost,
+                          unsigned int fadda_f32_cost,
+                          unsigned int fadda_f64_cost)
+    : simd_vec_cost (base),
+      clast_cost (clast_cost),
+      fadda_f16_cost (fadda_f16_cost),
+      fadda_f32_cost (fadda_f32_cost),
+      fadda_f64_cost (fadda_f64_cost)
+  {}
+
+  /* The cost of a vector-to-scalar CLASTA or CLASTB instruction,
+     with the scalar being stored in FP registers. This cost is
+     assumed to be a cycle latency. */
+  const int clast_cost;
+
+  /* The costs of FADDA for the three data types that it supports.
+     These costs are assumed to be cycle latencies. */
+  const int fadda_f16_cost;
+  const int fadda_f32_cost;
+  const int fadda_f64_cost;
+};
 
 /* Cost for vector insn classes. */
 struct cpu_vector_cost
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b44dcdc6a6e..b62169a267a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -609,22 +609,28 @@ static const advsimd_vec_cost generic_advsimd_vector_cost =
 /* Generic costs for SVE vector operations. */
 static const sve_vec_cost generic_sve_vector_cost =
 {
-  1, /* int_stmt_cost */
-  1, /* fp_stmt_cost */
-  2, /* permute_cost */
-  2, /* reduc_i8_cost */
-  2, /* reduc_i16_cost */
-  2, /* reduc_i32_cost */
-  2, /* reduc_i64_cost */
-  2, /* reduc_f16_cost */
-  2, /* reduc_f32_cost */
-  2, /* reduc_f64_cost */
-  2, /* vec_to_scalar_cost */
-  1, /* scalar_to_vec_cost */
-  1, /* align_load_cost */
-  1, /* unalign_load_cost */
-  1, /* unalign_store_cost */
-  1 /* store_cost */
+  {
+    1, /* int_stmt_cost */
+    1, /* fp_stmt_cost */
+    2, /* permute_cost */
+    2, /* reduc_i8_cost */
+    2, /* reduc_i16_cost */
+    2, /* reduc_i32_cost */
+    2, /* reduc_i64_cost */
+    2, /* reduc_f16_cost */
+    2, /* reduc_f32_cost */
+    2, /* reduc_f64_cost */
+    2, /* vec_to_scalar_cost */
+    1, /* scalar_to_vec_cost */
+    1, /* align_load_cost */
+    1, /* unalign_load_cost */
+    1, /* unalign_store_cost */
+    1 /* store_cost */
+  },
+  2, /* clast_cost */
+  2, /* fadda_f16_cost */
+  2, /* fadda_f32_cost */
+  2 /* fadda_f64_cost */
 };
 
 /* Generic costs for vector insn classes. */
@@ -662,22 +668,28 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost =
 
 static const sve_vec_cost a64fx_sve_vector_cost =
 {
-  2, /* int_stmt_cost */
-  5, /* fp_stmt_cost */
-  3, /* permute_cost */
-  13, /* reduc_i8_cost */
-  13, /* reduc_i16_cost */
-  13, /* reduc_i32_cost */
-  13, /* reduc_i64_cost */
-  13, /* reduc_f16_cost */
-  13, /* reduc_f32_cost */
-  13, /* reduc_f64_cost */
-  13, /* vec_to_scalar_cost */
-  4, /* scalar_to_vec_cost */
-  6, /* align_load_cost */
-  6, /* unalign_load_cost */
-  1, /* unalign_store_cost */
-  1 /* store_cost */
+  {
+    2, /* int_stmt_cost */
+    5, /* fp_stmt_cost */
+    3, /* permute_cost */
+    13, /* reduc_i8_cost */
+    13, /* reduc_i16_cost */
+    13, /* reduc_i32_cost */
+    13, /* reduc_i64_cost */
+    13, /* reduc_f16_cost */
+    13, /* reduc_f32_cost */
+    13, /* reduc_f64_cost */
+    13, /* vec_to_scalar_cost */
+    4, /* scalar_to_vec_cost */
+    6, /* align_load_cost */
+    6, /* unalign_load_cost */
+    1, /* unalign_store_cost */
+    1 /* store_cost */
+  },
+  13, /* clast_cost */
+  13, /* fadda_f16_cost */
+  13, /* fadda_f32_cost */
+  13 /* fadda_f64_cost */
 };
 
 static const struct cpu_vector_cost a64fx_vector_cost =
@@ -14060,6 +14072,20 @@ aarch64_is_reduction (stmt_vec_info stmt_info)
           || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
 }
 
+/* If STMT_INFO describes a reduction, return the type of reduction
+   it describes, otherwise return -1. */
+static int
+aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
+{
+  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
+    if (STMT_VINFO_REDUC_DEF (stmt_info))
+      {
+        stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
+        return int (STMT_VINFO_REDUC_TYPE (reduc_info));
+      }
+  return -1;
+}
+
 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
    vectors would produce a series of LDP or STP operations. KIND is the
    kind of statement that STMT_INFO represents. */
@@ -14123,6 +14149,43 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
           && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
 }
 
+/* We are considering implementing STMT_INFO using SVE vector type VECTYPE.
+   If STMT_INFO is an in-loop reduction that SVE supports directly, return
+   its latency in cycles, otherwise return zero. SVE_COSTS specifies the
+   latencies of the relevant instructions. */
+static unsigned int
+aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
+                                       stmt_vec_info stmt_info,
+                                       tree vectype,
+                                       const sve_vec_cost *sve_costs)
+{
+  switch (aarch64_reduc_type (vinfo, stmt_info))
+    {
+    case EXTRACT_LAST_REDUCTION:
+      return sve_costs->clast_cost;
+
+    case FOLD_LEFT_REDUCTION:
+      switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+        {
+        case E_HFmode:
+        case E_BFmode:
+          return sve_costs->fadda_f16_cost;
+
+        case E_SFmode:
+          return sve_costs->fadda_f32_cost;
+
+        case E_DFmode:
+          return sve_costs->fadda_f64_cost;
+
+        default:
+          break;
+        }
+      break;
+    }
+
+  return 0;
+}
+
 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
    for the vectorized form of STMT_INFO, which has cost kind KIND and which
    when vectorized would operate on vector type VECTYPE. Try to subdivide
@@ -14130,12 +14193,27 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
    accurate cost. WHERE specifies where the cost associated with KIND
    occurs. */
 static unsigned int
-aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind,
+aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
                                     stmt_vec_info stmt_info, tree vectype,
                                     enum vect_cost_model_location where,
                                     unsigned int stmt_cost)
 {
   const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
+  const sve_vec_cost *sve_costs = nullptr;
+  if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
+    sve_costs = aarch64_tune_params.vec_costs->sve;
+
+  /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
+  if (kind == vec_to_scalar
+      && where == vect_body
+      && sve_costs)
+    {
+      unsigned int latency
+        = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype,
+                                                 sve_costs);
+      if (latency)
+        return latency;
+    }
 
   /* Detect cases in which vec_to_scalar represents a single reduction
      instruction like FADDP or MAXV. */
@@ -14260,9 +14338,9 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
       /* Try to get a more accurate cost by looking at STMT_INFO instead
          of just looking at KIND. */
       if (stmt_info && vectype && aarch64_use_new_vector_costs_p ())
-        stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info,
-                                                        vectype, where,
-                                                        stmt_cost);
+        stmt_cost = aarch64_detect_vector_stmt_subtype (vinfo, kind,
+                                                        stmt_info, vectype,
+                                                        where, stmt_cost);
 
       /* Do any SVE-specific adjustments to the cost. */
       if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))