diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index bfcab72b122..3d152754981 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -202,6 +202,13 @@ struct simd_vec_cost specially below. */ const int fp_stmt_cost; + /* Per-vector cost of permuting vectors after an LD2, LD3 or LD4, + as well as the per-vector cost of permuting vectors before + an ST2, ST3 or ST4. */ + const int ld2_st2_permute_cost; + const int ld3_st3_permute_cost; + const int ld4_st4_permute_cost; + /* Cost of a permute operation. */ const int permute_cost; diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index b62169a267a..8fb723dabd2 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -590,6 +590,9 @@ static const advsimd_vec_cost generic_advsimd_vector_cost = { 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 2, /* reduc_i8_cost */ 2, /* reduc_i16_cost */ @@ -612,6 +615,9 @@ static const sve_vec_cost generic_sve_vector_cost = { 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 2, /* reduc_i8_cost */ 2, /* reduc_i16_cost */ @@ -650,6 +656,9 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost = { 2, /* int_stmt_cost */ 5, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 3, /* permute_cost */ 13, /* reduc_i8_cost */ 13, /* reduc_i16_cost */ @@ -671,6 +680,9 @@ static const sve_vec_cost a64fx_sve_vector_cost = { 2, /* int_stmt_cost */ 5, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 3, /* permute_cost */ 13, /* reduc_i8_cost */ 13, /* reduc_i16_cost */ @@ -708,6 +720,9 @@ static const advsimd_vec_cost 
qdf24xx_advsimd_vector_cost = { 1, /* int_stmt_cost */ 3, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 1, /* reduc_i8_cost */ 1, /* reduc_i16_cost */ @@ -742,6 +757,9 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost = { 4, /* int_stmt_cost */ 1, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 4, /* permute_cost */ 2, /* reduc_i8_cost */ 2, /* reduc_i16_cost */ @@ -775,6 +793,9 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost = { 2, /* int_stmt_cost */ 2, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 3, /* reduc_i8_cost */ 3, /* reduc_i16_cost */ @@ -807,6 +828,9 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost = { 2, /* int_stmt_cost */ 2, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 3, /* permute_cost */ 8, /* reduc_i8_cost */ 8, /* reduc_i16_cost */ @@ -840,6 +864,9 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost = { 3, /* int_stmt_cost */ 3, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 3, /* permute_cost */ 3, /* reduc_i8_cost */ 3, /* reduc_i16_cost */ @@ -872,6 +899,9 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost = { 2, /* int_stmt_cost */ 2, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 2, /* permute_cost */ 4, /* reduc_i8_cost */ 4, /* reduc_i16_cost */ @@ -905,6 +935,9 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = { 4, /* int_stmt_cost */ 5, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 10, /* permute_cost */ 6, /* reduc_i8_cost */ 6, /* reduc_i16_cost 
*/ @@ -938,6 +971,9 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = { 5, /* int_stmt_cost */ 5, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ 10, /* permute_cost */ 5, /* reduc_i8_cost */ 5, /* reduc_i16_cost */ @@ -14086,6 +14122,26 @@ aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info) return -1; } +/* Check whether an access of kind KIND for STMT_INFO represents one + vector of an LD[234] or ST[234] operation. Return the total number of + vectors (2, 3 or 4) if so, otherwise return a value outside that range. */ +static int +aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info) +{ + if ((kind == vector_load + || kind == unaligned_load + || kind == vector_store + || kind == unaligned_store) + && STMT_VINFO_DATA_REF (stmt_info)) + { + stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + if (stmt_info + && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES) + return DR_GROUP_SIZE (stmt_info); + } + return 0; +} + /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14320,6 +14376,38 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, return stmt_cost; } +/* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND + and which when vectorized would operate on vector type VECTYPE. Add the + cost of any embedded operations and return the adjusted total. */ +static unsigned int +aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, + tree vectype, unsigned int stmt_cost) +{ + if (vectype) + { + const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype); + + /* Detect cases in which a vector load or store represents an + LD[234] or ST[234] instruction. 
*/ + switch (aarch64_ld234_st234_vectors (kind, stmt_info)) + { + case 2: + stmt_cost += simd_costs->ld2_st2_permute_cost; + break; + + case 3: + stmt_cost += simd_costs->ld3_st3_permute_cost; + break; + + case 4: + stmt_cost += simd_costs->ld4_st4_permute_cost; + break; + } + } + + return stmt_cost; +} + /* Implement targetm.vectorize.add_stmt_cost. */ static unsigned aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, @@ -14347,6 +14435,12 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info, vectype, stmt_cost); + if (stmt_info && aarch64_use_new_vector_costs_p ()) + /* Account for any extra "embedded" costs that apply additively + to the base cost calculated above. */ + stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, + stmt_cost); + /* Statements in an inner loop relative to the loop being vectorized are weighted more heavily. The value here is arbitrary and could potentially be improved with analysis. */