From dbc38dd9e96a9995298da2478041bdbbf247c479 Mon Sep 17 00:00:00 2001
From: Tamar Christina
Date: Thu, 21 Nov 2024 12:49:35 +0000
Subject: [PATCH] middle-end: Pass along SLP node when costing vector
 loads/stores

With the move to SLP only we now pass the VMAT through the SLP node;
however, the majority of the costing calls inside vectorizable_load and
vectorizable_store do not pass the SLP node along.  Because of this the
backend costing never sees the VMAT for these cases anymore.

Additionally, the helper around record_stmt_cost that takes both an SLP
node and a stmt_vinfo would only pass the SLP node along.  However, the
SLP node doesn't contain all the information available in the stmt_vinfo,
and we'd have to go through SLP_TREE_REPRESENTATIVE anyway.  As such, I
changed the function to always pass both along.

Unlike the VMAT changes, I don't believe there is a correctness issue
here, but passing both minimizes the amount of churn in the backend
costing until vectorizer costing as a whole is revisited in GCC 16.

These changes re-enable the cost model on AArch64 and also correctly find
the VMATs on loads and stores, fixing testcases such as sve_iters_low_2.c.

gcc/ChangeLog:

	* tree-vect-data-refs.cc (vect_get_data_access_cost): Pass NULL for
	SLP node.
	* tree-vect-stmts.cc (record_stmt_cost): Expose.
	(vect_get_store_cost, vect_get_load_cost): Extend with SLP node.
	(vectorizable_store, vectorizable_load): Pass SLP node to all costing.
	* tree-vectorizer.h (record_stmt_cost): Always pass both SLP node
	and stmt_vinfo to costing.
	(vect_get_load_cost, vect_get_store_cost): Extend with SLP node.
---
 gcc/tree-vect-data-refs.cc |  12 ++--
 gcc/tree-vect-stmts.cc     | 109 +++++++++++++++++++++----------------
 gcc/tree-vectorizer.h      |  16 +++---
 3 files changed, 76 insertions(+), 61 deletions(-)

diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index a32343c0022..35c946ab2d4 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -1729,12 +1729,14 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
 
   if (DR_IS_READ (dr_info->dr))
-    vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
-                        misalignment, true, inside_cost,
-                        outside_cost, prologue_cost_vec, body_cost_vec, false);
+    vect_get_load_cost (vinfo, stmt_info, NULL, ncopies,
+                        alignment_support_scheme, misalignment, true,
+                        inside_cost, outside_cost, prologue_cost_vec,
+                        body_cost_vec, false);
   else
-    vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
-                         misalignment, inside_cost, body_cost_vec);
+    vect_get_store_cost (vinfo,stmt_info, NULL, ncopies,
+                         alignment_support_scheme, misalignment, inside_cost,
+                         body_cost_vec);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 75973c77236..e500902a8be 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -93,7 +93,7 @@ stmt_in_inner_loop_p (vec_info *vinfo, class _stmt_vec_info *stmt_info)
    target model or by saving it in a vector for later processing.
    Return a preliminary estimate of the statement's cost.  */
 
-static unsigned
+unsigned
 record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
                   enum vect_cost_for_stmt kind,
                   stmt_vec_info stmt_info, slp_tree node,
@@ -1008,8 +1008,8 @@ cfun_returns (tree decl)
 
 /* Calculate cost of DR's memory access.  */
 void
-vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
-                     dr_alignment_support alignment_support_scheme,
+vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
+                     int ncopies, dr_alignment_support alignment_support_scheme,
                      int misalignment,
                      unsigned int *inside_cost,
                      stmt_vector_for_cost *body_cost_vec)
@@ -1019,7 +1019,7 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
     case dr_aligned:
       {
         *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-                                          vector_store, stmt_info, 0,
+                                          vector_store, stmt_info, slp_node, 0,
                                           vect_body);
 
         if (dump_enabled_p ())
@@ -1032,7 +1032,7 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
       {
         /* Here, we assign an additional cost for the unaligned store.  */
         *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-                                          unaligned_store, stmt_info,
+                                          unaligned_store, stmt_info, slp_node,
                                           misalignment, vect_body);
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -1058,8 +1058,8 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
 
 /* Calculate cost of DR's memory access.  */
 void
-vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
-                    dr_alignment_support alignment_support_scheme,
+vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
+                    int ncopies, dr_alignment_support alignment_support_scheme,
                     int misalignment, bool add_realign_cost,
                     unsigned int *inside_cost,
                     unsigned int *prologue_cost,
@@ -1072,7 +1072,7 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
     case dr_aligned:
       {
         *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
-                                          stmt_info, 0, vect_body);
+                                          stmt_info, slp_node, 0, vect_body);
 
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -1084,7 +1084,7 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
     {
       /* Here, we assign an additional cost for the unaligned load.  */
       *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-                                        unaligned_load, stmt_info,
+                                        unaligned_load, stmt_info, slp_node,
                                         misalignment, vect_body);
 
       if (dump_enabled_p ())
@@ -1097,16 +1097,18 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
     case dr_explicit_realign:
       {
         *inside_cost += record_stmt_cost (body_cost_vec, ncopies * 2,
-                                          vector_load, stmt_info, 0, vect_body);
+                                          vector_load, stmt_info, slp_node, 0,
+                                          vect_body);
         *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-                                          vec_perm, stmt_info, 0, vect_body);
+                                          vec_perm, stmt_info, slp_node, 0,
+                                          vect_body);
 
         /* FIXME: If the misalignment remains fixed across the iterations of
            the containing loop, the following cost should be added to the
            prologue costs.  */
         if (targetm.vectorize.builtin_mask_for_load)
           *inside_cost += record_stmt_cost (body_cost_vec, 1, vector_stmt,
-                                            stmt_info, 0, vect_body);
+                                            stmt_info, slp_node, 0, vect_body);
 
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -1132,17 +1134,17 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
           {
             *prologue_cost += record_stmt_cost (prologue_cost_vec, 2,
                                                 vector_stmt, stmt_info,
-                                                0, vect_prologue);
+                                                slp_node, 0, vect_prologue);
             if (targetm.vectorize.builtin_mask_for_load)
               *prologue_cost += record_stmt_cost (prologue_cost_vec, 1,
                                                   vector_stmt, stmt_info,
-                                                  0, vect_prologue);
+                                                  slp_node, 0, vect_prologue);
           }
 
         *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
-                                          stmt_info, 0, vect_body);
+                                          stmt_info, slp_node, 0, vect_body);
         *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_perm,
-                                          stmt_info, 0, vect_body);
+                                          stmt_info, slp_node, 0, vect_body);
 
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -8540,7 +8542,7 @@ vectorizable_store (vec_info *vinfo,
       if (vls_type == VLS_STORE_INVARIANT)
         prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
                                            stmt_info, 0, vect_prologue);
-      vect_get_store_cost (vinfo, stmt_info, ncopies,
+      vect_get_store_cost (vinfo, stmt_info, slp_node, ncopies,
                            alignment_support_scheme, misalignment,
                            &inside_cost, cost_vec);
 
@@ -8612,7 +8614,7 @@ vectorizable_store (vec_info *vinfo,
       else if (vls_type != VLS_STORE_INVARIANT)
         return;
       *prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
-                                          0, vect_prologue);
+                                          slp_node, 0, vect_prologue);
     };
 
   if (memory_access_type == VMAT_ELEMENTWISE
@@ -8879,7 +8881,7 @@ vectorizable_store (vec_info *vinfo,
       if (costing_p)
         {
           if (n_adjacent_stores > 0)
-            vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+            vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
                                  alignment_support_scheme, misalignment,
                                  &inside_cost, cost_vec);
           if (dump_enabled_p ())
@@ -9191,7 +9193,7 @@ vectorizable_store (vec_info *vinfo,
       if (costing_p)
         {
           if (n_adjacent_stores > 0)
-            vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+            vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
                                  alignment_support_scheme, misalignment,
                                  &inside_cost, cost_vec);
           if (dump_enabled_p ())
@@ -9216,7 +9218,8 @@ vectorizable_store (vec_info *vinfo,
     {
       if (costing_p && vls_type == VLS_STORE_INVARIANT)
         prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
-                                           stmt_info, 0, vect_prologue);
+                                           stmt_info, slp_node, 0,
+                                           vect_prologue);
       else if (!costing_p)
         {
           /* Since the store is not grouped, DR_GROUP_SIZE is 1, and
@@ -9293,7 +9296,8 @@ vectorizable_store (vec_info *vinfo,
                   unsigned int cnunits = vect_nunits_for_cost (vectype);
                   inside_cost += record_stmt_cost (cost_vec, cnunits, scalar_store,
-                                                   stmt_info, 0, vect_body);
+                                                   stmt_info, slp_node, 0,
+                                                   vect_body);
                   continue;
                 }
@@ -9360,7 +9364,7 @@ vectorizable_store (vec_info *vinfo,
                   unsigned int cnunits = vect_nunits_for_cost (vectype);
                   inside_cost += record_stmt_cost (cost_vec, cnunits, scalar_store,
-                                                   stmt_info, 0, vect_body);
+                                                   stmt_info, slp_node, 0, vect_body);
                   continue;
                 }
               poly_uint64 offset_nunits
@@ -9467,14 +9471,14 @@ vectorizable_store (vec_info *vinfo,
                      consumed by the load).  */
                   inside_cost += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
-                                                   stmt_info, 0, vect_body);
+                                                   stmt_info, slp_node, 0, vect_body);
                   /* N scalar stores plus extracting the elements.  */
                   inside_cost += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
-                                                   stmt_info, 0, vect_body);
+                                                   stmt_info, slp_node, 0, vect_body);
                   inside_cost += record_stmt_cost (cost_vec, cnunits, scalar_store,
-                                                   stmt_info, 0, vect_body);
+                                                   stmt_info, slp_node, 0, vect_body);
                   continue;
                 }
@@ -9668,7 +9672,8 @@ vectorizable_store (vec_info *vinfo,
               int group_size = DR_GROUP_SIZE (first_stmt_info);
               int nstmts = ceil_log2 (group_size) * group_size;
               inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
-                                               stmt_info, 0, vect_body);
+                                               stmt_info, slp_node, 0,
+                                               vect_body);
               if (dump_enabled_p ())
                 dump_printf_loc (MSG_NOTE, vect_location,
                                  "vect_model_store_cost: "
@@ -9697,7 +9702,8 @@ vectorizable_store (vec_info *vinfo,
             {
               if (costing_p)
                 inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
-                                                 stmt_info, 0, vect_body);
+                                                 stmt_info, slp_node, 0,
+                                                 vect_body);
               else
                 {
                   tree perm_mask = perm_mask_for_reverse (vectype);
@@ -9890,7 +9896,7 @@ vectorizable_store (vec_info *vinfo,
   if (costing_p)
     {
       if (n_adjacent_stores > 0)
-        vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+        vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
                              alignment_support_scheme, misalignment,
                              &inside_cost, cost_vec);
 
@@ -9916,11 +9922,11 @@ vectorizable_store (vec_info *vinfo,
               /* Spill.  */
               prologue_cost += record_stmt_cost (cost_vec, ncopies, vector_store,
-                                                 stmt_info, 0, vect_epilogue);
+                                                 stmt_info, slp_node, 0, vect_epilogue);
               /* Loads.  */
               prologue_cost += record_stmt_cost (cost_vec, ncopies * nregs, scalar_load,
-                                                 stmt_info, 0, vect_epilogue);
+                                                 stmt_info, slp_node, 0, vect_epilogue);
             }
         }
     }
@@ -10493,9 +10499,10 @@ vectorizable_load (vec_info *vinfo,
       enum vect_cost_model_location cost_loc
         = hoist_p ? vect_prologue : vect_body;
       unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
-                                            stmt_info, 0, cost_loc);
-      cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info, 0,
-                                cost_loc);
+                                            stmt_info, slp_node, 0,
+                                            cost_loc);
+      cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
+                                slp_node, 0, cost_loc);
       unsigned int prologue_cost = hoist_p ? cost : 0;
       unsigned int inside_cost = hoist_p ? 0 : cost;
       if (dump_enabled_p ())
@@ -10726,7 +10733,8 @@ vectorizable_load (vec_info *vinfo,
                     n_adjacent_loads++;
                   else
                     inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
-                                                     stmt_info, 0, vect_body);
+                                                     stmt_info, slp_node, 0,
+                                                     vect_body);
                   continue;
                 }
               tree this_off = build_int_cst (TREE_TYPE (alias_off),
@@ -10764,7 +10772,8 @@ vectorizable_load (vec_info *vinfo,
             {
               if (costing_p)
                 inside_cost += record_stmt_cost (cost_vec, 1, vec_construct,
-                                                 stmt_info, 0, vect_body);
+                                                 stmt_info, slp_node, 0,
+                                                 vect_body);
               else
                 {
                   tree vec_inv = build_constructor (lvectype, v);
@@ -10810,7 +10819,8 @@ vectorizable_load (vec_info *vinfo,
               vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
                                             vf, true, &n_perms, &n_loads);
               inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
-                                               first_stmt_info, 0, vect_body);
+                                               first_stmt_info, slp_node, 0,
+                                               vect_body);
             }
           else
             vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
@@ -10820,7 +10830,7 @@ vectorizable_load (vec_info *vinfo,
       if (costing_p)
         {
           if (n_adjacent_loads > 0)
-            vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+            vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
                                 alignment_support_scheme, misalignment, false,
                                 &inside_cost, nullptr, cost_vec, cost_vec,
                                 true);
@@ -11174,7 +11184,7 @@ vectorizable_load (vec_info *vinfo,
                                  "vect_model_load_cost: %d "
                                  "unused vectors.\n",
                                  gaps);
-              vect_get_load_cost (vinfo, stmt_info, gaps,
+              vect_get_load_cost (vinfo, stmt_info, slp_node, gaps,
                                   alignment_support_scheme, misalignment,
                                   false, &inside_cost, &prologue_cost,
                                   cost_vec, cost_vec,
@@ -11303,7 +11313,7 @@ vectorizable_load (vec_info *vinfo,
   if (costing_p)
     {
       if (n_adjacent_loads > 0)
-        vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+        vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
                             alignment_support_scheme, misalignment, false,
                             &inside_cost, &prologue_cost, cost_vec,
                             cost_vec, true);
@@ -11380,7 +11390,7 @@ vectorizable_load (vec_info *vinfo,
               unsigned int cnunits = vect_nunits_for_cost (vectype);
               inside_cost = record_stmt_cost (cost_vec, cnunits, scalar_load,
-                                              stmt_info, 0, vect_body);
+                                              stmt_info, slp_node, 0, vect_body);
               continue;
             }
           if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
@@ -11456,7 +11466,7 @@ vectorizable_load (vec_info *vinfo,
                   unsigned int cnunits = vect_nunits_for_cost (vectype);
                   inside_cost = record_stmt_cost (cost_vec, cnunits, scalar_load,
-                                                  stmt_info, 0, vect_body);
+                                                  stmt_info, slp_node, 0, vect_body);
                   continue;
                 }
               poly_uint64 offset_nunits
@@ -11591,7 +11601,7 @@ vectorizable_load (vec_info *vinfo,
                  vector.  */
               inside_cost = record_stmt_cost (cost_vec, const_nunits, scalar_load,
-                                              stmt_info, 0, vect_body);
+                                              stmt_info, slp_node, 0, vect_body);
               inside_cost = record_stmt_cost (cost_vec, 1, vec_construct,
                                               stmt_info, slp_node, 0, vect_body);
@@ -12178,7 +12188,7 @@ vectorizable_load (vec_info *vinfo,
               /* Leave realign cases alone to keep them simple.  */
               if (alignment_support_scheme == dr_explicit_realign_optimized
                   || alignment_support_scheme == dr_explicit_realign)
-                vect_get_load_cost (vinfo, stmt_info, 1,
+                vect_get_load_cost (vinfo, stmt_info, slp_node, 1,
                                     alignment_support_scheme, misalignment,
                                     add_realign_cost, &inside_cost,
                                     &prologue_cost, cost_vec, cost_vec,
@@ -12251,7 +12261,8 @@ vectorizable_load (vec_info *vinfo,
         {
           if (costing_p)
             inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
-                                            stmt_info, 0, vect_body);
+                                            stmt_info, slp_node, 0,
+                                            vect_body);
           else
             {
               tree perm_mask = perm_mask_for_reverse (vectype);
@@ -12320,7 +12331,8 @@ vectorizable_load (vec_info *vinfo,
           vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf,
                                         true, &n_perms, nullptr);
           inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
-                                          stmt_info, 0, vect_body);
+                                          stmt_info, slp_node, 0,
+                                          vect_body);
         }
       else
         {
@@ -12347,7 +12359,8 @@ vectorizable_load (vec_info *vinfo,
               int group_size = DR_GROUP_SIZE (first_stmt_info);
               int nstmts = ceil_log2 (group_size) * group_size;
               inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
-                                               stmt_info, 0, vect_body);
+                                               stmt_info, slp_node, 0,
+                                               vect_body);
               if (dump_enabled_p ())
                 dump_printf_loc (MSG_NOTE, vect_location,
@@ -12376,7 +12389,7 @@ vectorizable_load (vec_info *vinfo,
                   || memory_access_type == VMAT_CONTIGUOUS_REVERSE
                   || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
       if (n_adjacent_loads > 0)
-        vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+        vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
                             alignment_support_scheme, misalignment, false,
                             &inside_cost, &prologue_cost, cost_vec,
                             cost_vec, true);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index dcad41dcf18..7f69a3f57b4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2354,6 +2354,10 @@ extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
 extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
                                   enum vect_cost_for_stmt,
                                   enum vect_cost_model_location);
+extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
+                                  enum vect_cost_for_stmt, stmt_vec_info,
+                                  slp_tree, tree, int,
+                                  enum vect_cost_model_location);
 
 /* Overload of record_stmt_cost with VECTYPE derived from STMT_INFO.  */
@@ -2375,12 +2379,8 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
                   slp_tree node,
                   int misalign, enum vect_cost_model_location where)
 {
-  if (node)
-    return record_stmt_cost (body_cost_vec, count, kind, node,
-                             STMT_VINFO_VECTYPE (stmt_info), misalign, where);
-  else
-    return record_stmt_cost (body_cost_vec, count, kind, stmt_info,
-                             STMT_VINFO_VECTYPE (stmt_info), misalign, where);
+  return record_stmt_cost (body_cost_vec, count, kind, stmt_info, node,
+                           STMT_VINFO_VECTYPE (stmt_info), misalign, where);
 }
 
 extern void vect_finish_replace_stmt (vec_info *, stmt_vec_info, gimple *);
@@ -2411,12 +2411,12 @@ extern bool vect_nop_conversion_p (stmt_vec_info);
 extern opt_result vect_analyze_stmt (vec_info *, stmt_vec_info, bool *,
                                      slp_tree, slp_instance,
                                      stmt_vector_for_cost *);
-extern void vect_get_load_cost (vec_info *, stmt_vec_info, int,
+extern void vect_get_load_cost (vec_info *, stmt_vec_info, slp_tree, int,
                                 dr_alignment_support, int, bool,
                                 unsigned int *, unsigned int *,
                                 stmt_vector_for_cost *,
                                 stmt_vector_for_cost *, bool);
-extern void vect_get_store_cost (vec_info *, stmt_vec_info, int,
+extern void vect_get_store_cost (vec_info *, stmt_vec_info, slp_tree, int,
                                  dr_alignment_support, int, unsigned int *,
                                  stmt_vector_for_cost *);
 extern bool vect_supportable_shift (vec_info *, enum tree_code, tree);
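---

For reviewers, a minimal standalone sketch of the dispatch change in the
tree-vectorizer.h helper may help.  Before the patch the convenience
overload of record_stmt_cost chose one channel: a non-NULL SLP node meant
the stmt_vinfo was dropped before reaching the backend cost hook.  After
the patch both are forwarded, so a hook can see the VMAT stored on the
node as well as the full statement info.  The sketch below uses
hypothetical stand-in types (stmt_rec, slp_rec, cost_entry, record_cost);
none of these are the real vectorizer types or APIs.

#include <cstdio>
#include <vector>

/* Hypothetical stand-ins for GCC's stmt_vec_info, slp_tree and the
   cost vector; these are illustrations only, not the real types.  */
struct stmt_rec { const char *name; };
struct slp_rec  { int vmat; };          /* the node carries the VMAT  */
struct cost_entry { stmt_rec *stmt; slp_rec *node; int count; };

/* Full overload: records both handles, so a backend cost hook can
   later inspect the statement info *and* the SLP node's VMAT.  */
static unsigned
record_cost (std::vector<cost_entry> &sink, int count,
             stmt_rec *stmt, slp_rec *node)
{
  sink.push_back (cost_entry { stmt, node, count });
  return count;
}

/* Convenience helper.  The pre-patch shape dispatched:
     if (node) record only the node; else record only the stmt.
   The post-patch shape simply forwards both unconditionally.  */
static unsigned
record_cost_helper (std::vector<cost_entry> &sink, int count,
                    stmt_rec *stmt, slp_rec *node)
{
  return record_cost (sink, count, stmt, node);
}

int
main ()
{
  std::vector<cost_entry> sink;
  stmt_rec load = { "load" };
  slp_rec node = { 2 };                 /* pretend VMAT tag  */
  record_cost_helper (sink, 1, &load, &node);
  /* The consumer now sees both handles; with the old dispatch a
     non-null node meant the stmt handle never reached this point.  */
  printf ("%s: vmat=%d\n", sink[0].stmt->name, sink[0].node->vmat);
  return 0;
}

Forwarding both sides removes the asymmetry that hid the stmt_vinfo (or,
in the costing calls above, the VMAT) from the backend, which is what
re-enables the AArch64 cost-model checks mentioned in the commit message.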