tree-optimization/98235 - limit SLP discovery

With SLP discovery now following backedges and the discovery cache
not being permute aware, we have to put some discovery limits in
place again.  That's also the opportunity to ditch the separate
limit on the number of permutes we try: the patch instead limits the
overall work done (measured as vect_build_slp_tree cache misses) to
max_tree_size, which is based on the number of scalar stmts in the
vectorized region.
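
A rough stand-alone sketch of that scheme (not the GCC code itself;
discover () and the recursion shape are invented for the example):
every cache miss of the recursive discovery costs one unit of a
shared budget, and discovery bails out once the budget is exhausted.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for recursive SLP discovery: each node not found in
   the cache costs one unit of the shared budget; once the budget is
   gone we fail instead of doing more work.  */
static bool
discover (int depth, unsigned *limit)
{
  if (*limit == 0)
    {
      printf ("discovery limit exceeded at depth %d\n", depth);
      return false;
    }
  --*limit;   /* account for this cache miss */
  if (depth == 0)
    return true;
  /* Recurse into two operands, passing the same budget down.  */
  return discover (depth - 1, limit) && discover (depth - 1, limit);
}

int
main (void)
{
  unsigned limit = 20;   /* stands in for max_tree_size */
  bool ok = discover (6, &limit);
  printf ("discovery %s, budget left: %u\n",
          ok ? "succeeded" : "failed", limit);
  return 0;
}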

Note the limit is global and there's no attempt to divide the
allowed work evenly amongst the discovery opportunities, so one
degenerate opportunity can eat it all up.  That's probably only
relevant for BB vectorization, where the limit can be as large as
the size of the whole function.
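
A minimal sketch of that global sharing, again with invented names
rather than the real vectorizer entry points: the driver initializes
one counter for the whole region (roughly what vect_analyze_slp does
in the patch below) and hands the same pointer to every opportunity,
so a degenerate first opportunity can starve the later ones.

#include <stdbool.h>
#include <stdio.h>

/* Pretend each opportunity needs a fixed amount of work; it succeeds
   only while the shared budget still covers that work.  */
static bool
analyze_opportunity (unsigned work_needed, unsigned *limit)
{
  if (*limit < work_needed)
    return false;
  *limit -= work_needed;
  return true;
}

int
main (void)
{
  unsigned limit = 100;                 /* one global budget */
  unsigned work[] = { 90, 30, 10 };     /* a degenerate first opportunity */
  for (unsigned i = 0; i < 3; i++)
    printf ("opportunity %u: %s (budget left %u)\n", i,
            analyze_opportunity (work[i], &limit) ? "ok" : "skipped", limit);
  return 0;
}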

2020-12-11  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/98235
	* tree-vect-slp.c (vect_build_slp_tree): Exchange npermutes
	for limit.  Decrement that for each cache miss and fail
	discovery when it reaches zero.
	(vect_build_slp_tree_2): Remove npermutes handling and
	simply pass down limit.
	(vect_build_slp_instance): Pass down limit.
	(vect_analyze_slp_instance): Likewise.
	(vect_analyze_slp): Base the SLP discovery limit on
	max_tree_size and pass it down.

	* gcc.dg/torture/pr98235.c: New testcase.
Richard Biener 2020-12-11 10:52:58 +01:00
parent 3e60ddeb82
commit fc7b424817
2 changed files with 77 additions and 31 deletions

gcc/testsuite/gcc.dg/torture/pr98235.c

@@ -0,0 +1,34 @@
/* { dg-do compile } */
/* { dg-additional-options "-fallow-store-data-races" } */
char tcube[3][9];
int cur_move;
void perm_cube(void) {
  int i, j, k, tmp;
  for (; i < cur_move; i++)
    while (k-- >= 0)
      switch (j) {
      case 0:
        tmp = tcube[0][6];
        tcube[2][8] = tcube[0][8];
        tcube[0][8] = tmp;
        tmp = tcube[0][5];
        tcube[0][5] = tcube[1][8];
        tcube[1][8] = tcube[2][5];
        tcube[2][5] = tcube[1][2];
        tcube[1][2] = tcube[2][1];
        tcube[2][1] = tcube[1][0];
        tcube[0][6] = tmp;
        tmp = tcube[0][3];
        tcube[0][3] = tcube[1][0];
        tcube[1][0] = tcube[2][3];
        tcube[2][3] = tcube[1][6];
        tcube[1][6] = tmp;
        break;
      case 5:
        tmp = tcube[2][0];
        tcube[2][0] = tcube[2][2];
        tcube[2][2] = tcube[2][8];
        tcube[2][3] = tmp;
      }
}

gcc/tree-vect-slp.c

@@ -1375,14 +1375,14 @@ static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits,
bool *matches, unsigned *npermutes, unsigned *tree_size,
bool *matches, unsigned *limit, unsigned *tree_size,
scalar_stmts_to_slp_tree_map_t *bst_map);
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits,
bool *matches, unsigned *npermutes, unsigned *tree_size,
bool *matches, unsigned *limit, unsigned *tree_size,
scalar_stmts_to_slp_tree_map_t *bst_map)
{
if (slp_tree *leader = bst_map->get (stmts))
@@ -1405,10 +1405,26 @@ vect_build_slp_tree (vec_info *vinfo,
SLP_TREE_SCALAR_STMTS (res) = stmts;
bst_map->put (stmts.copy (), res);
  if (*limit == 0)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "SLP discovery limit exceeded\n");
      bool existed_p = bst_map->put (stmts, NULL);
      gcc_assert (existed_p);
      /* Mark the node invalid so we can detect those when still in use
         as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      vect_free_slp_tree (res);
      return NULL;
    }
  --*limit;
poly_uint64 this_max_nunits = 1;
slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
&this_max_nunits,
matches, npermutes, tree_size, bst_map);
matches, limit, tree_size, bst_map);
if (!res_)
{
bool existed_p = bst_map->put (stmts, NULL);
@@ -1441,7 +1457,7 @@ static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits,
bool *matches, unsigned *npermutes, unsigned *tree_size,
bool *matches, unsigned *limit, unsigned *tree_size,
scalar_stmts_to_slp_tree_map_t *bst_map)
{
unsigned nops, i, this_tree_size = 0;
@@ -1687,7 +1703,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
group_size, &this_max_nunits,
matches, npermutes,
matches, limit,
&this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
@@ -1708,12 +1724,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
&& is_gimple_assign (stmt_info->stmt)
/* Swapping operands for reductions breaks assumptions later on. */
&& STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
&& STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
/* Do so only if the number of not successful permutes was nor more
than a cut-ff as re-trying the recursive match on
possibly each level of the tree would expose exponential
behavior. */
&& *npermutes < 4)
&& STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
{
/* See whether we can swap the matching or the non-matching
stmt operands. */
@@ -1759,17 +1770,13 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
bool *tem = XALLOCAVEC (bool, group_size);
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
group_size, &this_max_nunits,
tem, npermutes,
tem, limit,
&this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
continue;
}
/* We do not undo the swapping here since it might still be
the better order for the second operand in case we build
the first one from scalars below. */
++*npermutes;
}
fail:
@@ -2213,7 +2220,7 @@ static bool
vect_analyze_slp_instance (vec_info *vinfo,
scalar_stmts_to_slp_tree_map_t *bst_map,
stmt_vec_info stmt_info, slp_instance_kind kind,
unsigned max_tree_size);
unsigned max_tree_size, unsigned *limit);
/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
of KIND. Return true if successful. */
@@ -2223,7 +2230,7 @@ vect_build_slp_instance (vec_info *vinfo,
slp_instance_kind kind,
vec<stmt_vec_info> &scalar_stmts,
stmt_vec_info root_stmt_info,
unsigned max_tree_size,
unsigned max_tree_size, unsigned *limit,
scalar_stmts_to_slp_tree_map_t *bst_map,
/* ??? We need stmt_info for group splitting. */
stmt_vec_info stmt_info_)
@@ -2240,12 +2247,11 @@ vect_build_slp_instance (vec_info *vinfo,
/* Build the tree for the SLP instance. */
unsigned int group_size = scalar_stmts.length ();
bool *matches = XALLOCAVEC (bool, group_size);
unsigned npermutes = 0;
poly_uint64 max_nunits = 1;
unsigned tree_size = 0;
unsigned i;
slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
&max_nunits, matches, &npermutes,
&max_nunits, matches, limit,
&tree_size, bst_map);
if (node != NULL)
{
@@ -2413,7 +2419,8 @@ vect_build_slp_instance (vec_info *vinfo,
stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
group1_size);
bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
kind, max_tree_size);
kind, max_tree_size,
limit);
/* Split the rest at the failure point and possibly
re-analyze the remaining matching part if it has
at least two lanes. */
@@ -2425,13 +2432,15 @@ vect_build_slp_instance (vec_info *vinfo,
rest = vect_split_slp_store_group (rest, i - group1_size);
if (i - group1_size > 1)
res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
kind, max_tree_size);
kind, max_tree_size,
limit);
}
/* Re-analyze the non-matching tail if it has at least
two lanes. */
if (i + 1 < group_size)
res |= vect_analyze_slp_instance (vinfo, bst_map,
rest, kind, max_tree_size);
rest, kind, max_tree_size,
limit);
return res;
}
}
@@ -2456,10 +2465,10 @@ vect_build_slp_instance (vec_info *vinfo,
DR_GROUP_GAP (stmt_info) = 0;
bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
kind, max_tree_size);
kind, max_tree_size, limit);
if (i + 1 < group_size)
res |= vect_analyze_slp_instance (vinfo, bst_map,
rest, kind, max_tree_size);
rest, kind, max_tree_size, limit);
return res;
}
@@ -2484,7 +2493,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
scalar_stmts_to_slp_tree_map_t *bst_map,
stmt_vec_info stmt_info,
slp_instance_kind kind,
unsigned max_tree_size)
unsigned max_tree_size, unsigned *limit)
{
unsigned int i;
vec<stmt_vec_info> scalar_stmts;
@@ -2556,7 +2565,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
kind == slp_inst_kind_ctor
? stmt_info : NULL,
max_tree_size, bst_map,
max_tree_size, limit, bst_map,
kind == slp_inst_kind_store
? stmt_info : NULL);
@@ -2577,6 +2586,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
DUMP_VECT_SCOPE ("vect_analyze_slp");
unsigned limit = max_tree_size;
scalar_stmts_to_slp_tree_map_t *bst_map
= new scalar_stmts_to_slp_tree_map_t ();
@@ -2585,7 +2596,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
vect_analyze_slp_instance (vinfo, bst_map, first_element,
STMT_VINFO_GROUPED_ACCESS (first_element)
? slp_inst_kind_store : slp_inst_kind_ctor,
max_tree_size);
max_tree_size, &limit);
if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
{
@@ -2595,7 +2606,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
bb_vinfo->roots[i].stmts,
bb_vinfo->roots[i].root,
max_tree_size, bst_map, NULL))
max_tree_size, &limit, bst_map, NULL))
bb_vinfo->roots[i].stmts = vNULL;
}
}
@@ -2609,7 +2620,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
;
else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
slp_inst_kind_reduc_chain,
max_tree_size))
max_tree_size, &limit))
{
/* Dissolve reduction chain group. */
stmt_vec_info vinfo = first_element;
@@ -2630,7 +2641,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
/* Find SLP sequences starting from groups of reductions. */
if (loop_vinfo->reductions.length () > 1)
vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
slp_inst_kind_reduc_group, max_tree_size);
slp_inst_kind_reduc_group, max_tree_size,
&limit);
}
/* The map keeps a reference on SLP nodes built, release that. */