cfgloopmanip.c (duplicate_subloops): Export.

2009-10-22  Razya Ladelsky  <razya@il.ibm.com>

        * cfgloopmanip.c  (duplicate_subloops): Export.
        * tree-parloops.c (loop_parallel_p): Dump if loop is innermost.
        (transform_to_exit_first_loop): Duplicate bbs starting from 
        header up to loop->latch instead of exit->src.
        Initialize control variable to the correct number of iterations.
        (gather_scalar_reductions): Do not register double reductions.
        (parallelize_loops): Dump which loop is tested. 
        Indicate whether the parallelized loop is inner or not. 
        Remove the innermost-loop requirement.
        * cfgloop.h (duplicate_subloops): Export. 
        * tree-cfg.c (add_phi_args_after_redirect): New function.
        (gimple_duplicate_sese_tail): Remove the no-subloops constraint.
        Call duplicate_subloops.
        Update number of iterations at the exit condition.
        Don't redirect nexits always to the loop exit.
        Redirect copied edges from latch to the loop exit.
        * testsuite/libgomp.graphite/force-parallel-2.c: Adjust scan.
        * testsuite/gcc.dg/autopar/outer-1.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-2.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-3.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-4.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-5.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-6.c: New testcase.

From-SVN: r153457
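
As a reading aid for the ChangeLog entries on transform_to_exit_first_loop and gimple_duplicate_sese_tail, here is a stand-alone, source-level sketch of the exit-first loop shape that parloops builds before splitting a loop among threads. It is an illustration only, not code from this commit, and the helper names (body, original, exit_first) are made up: the exit test is moved to the loop entry and the last iteration is peeled off, which is also why the tail duplication now runs from the header up to loop->latch.

#include <stdio.h>

/* Illustrative per-iteration work; any side effect would do.  */
static void body (int i, int *acc)
{
  *acc += i;
}

/* Original shape: the exit test follows the body, as in a do-while
   style loop (assumes n >= 1).  */
static int original (int n)
{
  int acc = 0, i = 0;
  do
    {
      body (i, &acc);
      i++;
    }
  while (i < n);
  return acc;
}

/* Exit-first shape: the test is evaluated at the loop entry and the
   last iteration is peeled off after the loop, so the loop proper
   runs exactly n - 1 times and its iterations can be divided among
   threads.  */
static int exit_first (int n)
{
  int acc = 0, i = 0;
  while (i < n - 1)
    {
      body (i, &acc);
      i++;
    }
  body (i, &acc);               /* peeled last iteration */
  return acc;
}

int main (void)
{
  /* Both variants compute the same value (45 for n == 10).  */
  printf ("%d %d\n", original (10), exit_first (10));
  return 0;
}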

gcc/ChangeLog

@@ -1,3 +1,29 @@
2009-10-22 Razya Ladelsky <razya@il.ibm.com>
* cfgloopmanip.c (duplicate_subloops): Export.
* tree-parloops.c (loop_parallel_p): Dump if loop is innermost.
(transform_to_exit_first_loop): Duplicate bbs starting from
header up to loop->latch instead of exit->src.
Initialize control variable to the correct number of iterations.
(gather_scalar_reductions): Do not register double reductions.
(parallelize_loops): Dump which loop is tested.
Indicate whether the parallelized loop is inner or not.
Remove the innermost-loop requirement.
* cfgloop.h (duplicate_subloops): Export.
* tree-cfg.c (add_phi_args_after_redirect): New function.
(gimple_duplicate_sese_tail): Remove the no-subloops constraint.
Call duplicate_subloops.
Update number of iterations at the exit condition.
Don't redirect nexits always to the loop exit.
Redirect copied edges from latch to the loop exit.
* testsuite/libgomp.graphite/force-parallel-2.c: Adjust scan.
* testsuite/gcc.dg/autopar/outer-1.c: New testcase.
* testsuite/gcc.dg/autopar/outer-2.c: New testcase.
* testsuite/gcc.dg/autopar/outer-3.c: New testcase.
* testsuite/gcc.dg/autopar/outer-4.c: New testcase.
* testsuite/gcc.dg/autopar/outer-5.c: New testcase.
* testsuite/gcc.dg/autopar/outer-6.c: New testcase.
2009-10-22 Jan Hubicka <jh@suse.cz>
* ipa-cp.c (ipcp_read_summary): Remove now invalid FIXME and

gcc/cfgloop.h

@@ -288,6 +288,7 @@ extern edge create_empty_if_region_on_edge (edge, tree);
extern struct loop *create_empty_loop_on_edge (edge, tree, tree, tree, tree,
tree *, tree *, struct loop *);
extern struct loop * duplicate_loop (struct loop *, struct loop *);
extern void duplicate_subloops (struct loop *, struct loop *);
extern bool duplicate_loop_to_header_edge (struct loop *, edge,
unsigned, sbitmap, edge,
VEC (edge, heap) **, int);

gcc/cfgloopmanip.c

@@ -32,7 +32,6 @@ along with GCC; see the file COPYING3. If not see
#include "output.h"
#include "tree-flow.h"
static void duplicate_subloops (struct loop *, struct loop *);
static void copy_loops_to (struct loop **, int,
struct loop *);
static void loop_redirect_edge (edge, basic_block);
@@ -886,7 +885,7 @@ duplicate_loop (struct loop *loop, struct loop *target)
/* Copies structure of subloops of LOOP into TARGET loop, placing
newly created loops into loop tree. */
static void
void
duplicate_subloops (struct loop *loop, struct loop *target)
{
struct loop *aloop, *cloop;

gcc/testsuite/gcc.dg/autopar/outer-1.c

@@ -0,0 +1,33 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
void parloop (int N)
{
int i, j;
int x[10000][10000];
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
x[i][j] = i + j + 3;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
if (x[i][j] != i + j + 3)
abort ();
}
int main(void)
{
parloop(10000);
return 0;
}
/* Check that outer loop is parallelized. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-2.c

@@ -0,0 +1,33 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
void parloop (int N)
{
int i, j,ii;
int x[400][10][400];
for (ii = 0; ii < N; ii++)
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
x[i][j][ii] = ii+i + j + 3;
for (ii = 0; ii < N; ii++)
for (i = 0; i < N;i++)
for (j = 0; j < N; j++)
if (x[i][j][ii] != ii+i + j + 3)
abort ();
}
int main(void)
{
parloop(400);
return 0;
}
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-3.c

@@ -0,0 +1,33 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
void parloop (int N)
{
int i, j;
int x[500][500];
for (i = 0; i < N; i++)
for (j = 0; j < i; j++)
x[i][j] = i + j + 3;
for (i = 0; i < N; i++)
for (j = 0; j < i; j++)
if (x[i][j] != i + j + 3)
abort ();
}
int main(void)
{
parloop(500);
return 0;
}
/* Check that outer loop is parallelized. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-4.c

@@ -0,0 +1,38 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
int g_sum=0;
int x[500][500];
__attribute__((noinline))
void parloop (int N)
{
int i, j;
int sum;
/* Double reduction is currently not supported, so the outer loop is
not parallelized.  The inner reduction is detected and the inner
loop is parallelized. */
sum = 0;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
sum += x[i][j];
g_sum = sum;
}
int main(void)
{
parloop(500);
return 0;
}
/* Check that the outer loop is not parallelized, but the inner loop is. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 0 "parloops" } } */
/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 1 "parloops" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-5.c

@@ -0,0 +1,52 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
int x[500][500];
int y[500];
int g_sum=0;
__attribute__((noinline))
void init (int i, int j)
{
x[i][j]=1;
}
__attribute__((noinline))
void parloop (int N)
{
int i, j;
int sum;
/* The inner cycle is currently not supported, so the outer loop is
not parallelized.  The inner reduction is detected and the inner
loop is parallelized. */
for (i = 0; i < N; i++)
{
sum = 0;
for (j = 0; j < N; j++)
sum += x[i][j];
y[i]=sum;
}
g_sum = sum;
}
int main(void)
{
int i,j;
for (i = 0; i < 500; i++)
for (j = 0; j < 500; j++)
init(i, j);
parloop(500);
return 0;
}
/* Check that the outer loop is not parallelized, but the inner loop is. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 0 "parloops" } } */
/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 1 "parloops" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-6.c

@@ -0,0 +1,50 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
int x[500][500];
int y[500];
int g_sum=0;
__attribute__((noinline))
void init (int i, int j)
{
x[i][j]=1;
}
__attribute__((noinline))
void parloop (int N)
{
int i, j;
int sum;
/* Outer-loop reduction; the outer loop is parallelized. */
sum=0;
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
y[i]=x[i][j];
sum += y[i];
}
g_sum = sum;
}
int main(void)
{
int i,j;
for (i = 0; i < 500; i++)
for (j = 0; j < 500; j++)
init(i, j);
parloop(500);
return 0;
}
/* Check that outer loop is parallelized. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 0 "parloops" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/tree-cfg.c

@@ -4850,6 +4850,31 @@ gimple_duplicate_bb (basic_block bb)
return new_bb;
}
/* Add phi arguments to the phi nodes in E_COPY->dest, copying for each
phi node the argument that the corresponding phi node in ORIG_E->dest
takes from the equivalent edge ORIG_E. */
static void
add_phi_args_after_redirect (edge e_copy, edge orig_e)
{
gimple_stmt_iterator psi, psi_copy;
gimple phi, phi_copy;
tree def;
for (psi = gsi_start_phis (orig_e->dest),
psi_copy = gsi_start_phis (e_copy->dest);
!gsi_end_p (psi);
gsi_next (&psi), gsi_next (&psi_copy))
{
phi = gsi_stmt (psi);
phi_copy = gsi_stmt (psi_copy);
def = PHI_ARG_DEF_FROM_EDGE (phi, orig_e);
add_phi_arg (phi_copy, def, e_copy,
gimple_phi_arg_location_from_edge (phi, orig_e));
}
}
/* Adds phi node arguments for edge E_COPY after basic block duplication. */
static void
@@ -5131,9 +5156,14 @@ gimple_duplicate_sese_tail (edge entry ATTRIBUTE_UNUSED, edge exit ATTRIBUTE_UNU
int total_freq = 0, exit_freq = 0;
gcov_type total_count = 0, exit_count = 0;
edge exits[2], nexits[2], e;
gimple_stmt_iterator gsi;
gimple_stmt_iterator gsi,gsi1;
gimple cond_stmt;
edge sorig, snew;
edge sorig, snew, orig_e;
basic_block exit_bb;
edge_iterator ei;
VEC (edge, heap) *redirect_edges;
basic_block iters_bb, orig_src;
tree new_rhs;
gcc_assert (EDGE_COUNT (exit->src->succs) == 2);
exits[0] = exit;
@@ -5149,17 +5179,13 @@ gimple_duplicate_sese_tail (edge entry ATTRIBUTE_UNUSED, edge exit ATTRIBUTE_UNU
it will work, but the resulting code will not be correct. */
for (i = 0; i < n_region; i++)
{
/* We do not handle subloops, i.e. all the blocks must belong to the
same loop. */
if (region[i]->loop_father != orig_loop)
return false;
if (region[i] == orig_loop->latch)
return false;
}
initialize_original_copy_tables ();
set_loop_copy (orig_loop, loop);
duplicate_subloops (orig_loop, loop);
if (!region_copy)
{
@@ -5225,8 +5251,36 @@ gimple_duplicate_sese_tail (edge entry ATTRIBUTE_UNUSED, edge exit ATTRIBUTE_UNU
cond_stmt = last_stmt (exit->src);
gcc_assert (gimple_code (cond_stmt) == GIMPLE_COND);
cond_stmt = gimple_copy (cond_stmt);
/* If the block consisting of the exit condition has the latch as
successor, then the body of the loop is executed before
the exit condition is tested.  In that case, moving the
condition to the entry makes the loop iterate one time less,
which is the desired outcome, since we peel off the last
iteration.  If the body is executed after the condition,
moving the condition to the entry requires decrementing the
iteration count by one. */
if (exits[1]->dest == orig_loop->latch)
new_rhs = gimple_cond_rhs (cond_stmt);
else
{
new_rhs = fold_build2 (MINUS_EXPR, TREE_TYPE (gimple_cond_rhs (cond_stmt)),
gimple_cond_rhs (cond_stmt),
build_int_cst (TREE_TYPE (gimple_cond_rhs (cond_stmt)), 1));
if (TREE_CODE (gimple_cond_rhs (cond_stmt)) == SSA_NAME)
{
iters_bb = gimple_bb (SSA_NAME_DEF_STMT (gimple_cond_rhs (cond_stmt)));
for (gsi1 = gsi_start_bb (iters_bb); !gsi_end_p (gsi1); gsi_next (&gsi1))
if (gsi_stmt (gsi1)==SSA_NAME_DEF_STMT (gimple_cond_rhs (cond_stmt)))
break;
new_rhs = force_gimple_operand_gsi (&gsi1, new_rhs, true,
NULL_TREE,false,GSI_CONTINUE_LINKING);
}
}
gimple_cond_set_rhs (cond_stmt, unshare_expr (new_rhs));
gimple_cond_set_lhs (cond_stmt, unshare_expr (gimple_cond_lhs (cond_stmt)));
gimple_cond_set_rhs (cond_stmt, unshare_expr (gimple_cond_rhs (cond_stmt)));
gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
sorig = single_succ_edge (switch_bb);
@@ -5238,25 +5292,87 @@ gimple_duplicate_sese_tail (edge entry ATTRIBUTE_UNUSED, edge exit ATTRIBUTE_UNU
/* Add the PHI node arguments. */
add_phi_args_after_copy (region_copy, n_region, snew);
/* Get rid of now superfluous conditions and associated edges (and phi node
arguments). */
exit_bb = exit->dest;
e = redirect_edge_and_branch (exits[0], exits[1]->dest);
PENDING_STMT (e) = NULL;
e = redirect_edge_and_branch (nexits[1], nexits[0]->dest);
PENDING_STMT (e) = NULL;
/* If the block consisting of the exit condition has the latch as
successor, then the body of the loop is executed before
the exit condition is tested.
{ body }
{ cond } (exit[0]) -> { latch }
|
V (exit[1])
{ exit_bb }
In such a case, the equivalent copied edge nexits[1]
(for the peeled iteration) needs to be redirected to exit_bb.
Otherwise,
{ cond } (exit[0]) -> { body }
|
V (exit[1])
{ exit_bb }
exit[0] is pointing to the body of the loop,
and the equivalent nexits[0] needs to be redirected to
the copied body (of the peeled iteration). */
if (exits[1]->dest == orig_loop->latch)
e = redirect_edge_and_branch (nexits[1], nexits[0]->dest);
else
e = redirect_edge_and_branch (nexits[0], nexits[1]->dest);
PENDING_STMT (e) = NULL;
redirect_edges = VEC_alloc (edge, heap, 10);
for (i = 0; i < n_region; i++)
region_copy[i]->flags |= BB_DUPLICATED;
/* Iterate over all incoming edges to the latch.  All those coming from
copied bbs will be redirected to exit_bb. */
FOR_EACH_EDGE (e, ei, orig_loop->latch->preds)
{
if (e->src->flags & BB_DUPLICATED)
VEC_safe_push (edge, heap, redirect_edges, e);
}
for (i = 0; i < n_region; i++)
region_copy[i]->flags &= ~BB_DUPLICATED;
for (i = 0; VEC_iterate (edge, redirect_edges, i, e); ++i)
{
e = redirect_edge_and_branch (e, exit_bb);
PENDING_STMT (e) = NULL;
orig_src = get_bb_original (e->src);
orig_e = find_edge (orig_src, orig_loop->latch);
add_phi_args_after_redirect (e, orig_e);
}
VEC_free (edge, heap, redirect_edges);
/* Anything that is outside of the region, but was dominated by something
inside needs to update dominance info. */
iterate_fix_dominators (CDI_DOMINATORS, doms, false);
VEC_free (basic_block, heap, doms);
/* Update the SSA web. */
update_ssa (TODO_update_ssa);
if (free_region_copy)
free (region_copy);
free_original_copy_tables ();
return true;
}
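
The two comments above hinge on an off-by-one between the possible loop shapes. The following stand-alone C sketch is not part of the commit and only demonstrates the trip-count arithmetic (trips_test_last, trips_hoisted and the bounds are illustrative names): peeling one body execution out of a loop whose hoisted entry test keeps the original right-hand side yields one execution too many, and decrementing the bound restores the original count, which is the adjustment applied to new_rhs above.

#include <stdio.h>

/* Reference shape: the exit test comes after the body, so the body
   runs n times, for i = 0 .. n-1 (assumes n >= 1).  */
static int trips_test_last (int n)
{
  int trips = 0, i = 0;
  do
    {
      trips++;
      i++;
    }
  while (i < n);
  return trips;                 /* == n */
}

/* Hoisted shape: the test runs at the entry against RHS and one body
   execution is peeled off after the loop.  With rhs == n this is one
   execution too many; with rhs == n - 1 it matches the reference.  */
static int trips_hoisted (int rhs)
{
  int trips = 0, i = 0;
  while (i < rhs)
    {
      trips++;
      i++;
    }
  return trips + 1;             /* + 1 for the peeled execution */
}

int main (void)
{
  printf ("%d %d %d\n",
          trips_test_last (10),     /* 10 */
          trips_hoisted (10),       /* 11: bound not adjusted */
          trips_hoisted (9));       /* 10: bound decremented */
  return 0;
}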

gcc/tree-parloops.c

@@ -255,7 +255,13 @@ loop_parallel_p (struct loop *loop)
bool ret = false;
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
{
fprintf (dump_file, "Considering loop %d\n", loop->num);
if (!loop->inner)
fprintf (dump_file, "loop is innermost\n");
else
fprintf (dump_file, "loop NOT innermost\n");
}
/* Check for problems with dependences. If the loop can be reversed,
the iterations are independent. */
@@ -1289,8 +1295,9 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
bool ok;
edge exit = single_dom_exit (loop), hpred;
tree control, control_name, res, t;
gimple phi, nphi, cond_stmt, stmt;
gimple phi, nphi, cond_stmt, stmt, cond_nit;
gimple_stmt_iterator gsi;
tree nit_1;
split_block_after_labels (loop->header);
orig_header = single_succ (loop->header);
@@ -1308,7 +1315,6 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
res = PHI_RESULT (phi);
t = make_ssa_name (SSA_NAME_VAR (res), phi);
SET_PHI_RESULT (phi, t);
nphi = create_phi_node (res, orig_header);
SSA_NAME_DEF_STMT (res) = nphi;
add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
@@ -1320,10 +1326,11 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
control = t;
}
}
bbs = get_loop_body_in_dom_order (loop);
for (n = 0; bbs[n] != exit->src; n++)
for (n = 0; bbs[n] != loop->latch; n++)
continue;
n--;
nbbs = XNEWVEC (basic_block, n);
ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
bbs + 1, n, nbbs);
@@ -1358,7 +1365,6 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
struct reduction_info *red;
tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
if (red)
{
@@ -1374,12 +1380,15 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
}
gcc_assert (control_name != NULL_TREE);
/* Initialize the control variable to NIT. */
/* Initialize the control variable to the number of iterations
according to the rhs of the exit condition. */
gsi = gsi_after_labels (ex_bb);
nit = force_gimple_operand_gsi (&gsi,
fold_convert (TREE_TYPE (control_name), nit),
cond_nit = last_stmt (exit->src);
nit_1 = gimple_cond_rhs (cond_nit);
nit_1 = force_gimple_operand_gsi (&gsi,
fold_convert (TREE_TYPE (control_name), nit_1),
false, NULL_TREE, false, GSI_SAME_STMT);
stmt = gimple_build_assign (control_name, nit);
stmt = gimple_build_assign (control_name, nit_1);
gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
SSA_NAME_DEF_STMT (control_name) = stmt;
}
@@ -1740,7 +1749,7 @@ gather_scalar_reductions (loop_p loop, htab_t reduction_list)
&& simple_loop_info)
{
gimple reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi, true, &double_reduc);
if (reduc_stmt)
if (reduc_stmt && !double_reduc)
build_new_reduction (reduction_list, reduc_stmt, phi);
}
}
@@ -1890,15 +1899,32 @@ parallelize_loops (void)
FOR_EACH_LOOP (li, loop, 0)
{
htab_empty (reduction_list);
/* If we use autopar in graphite pass, we use it's marked dependency
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
if (loop->inner)
fprintf (dump_file, "loop %d is not innermost\n",loop->num);
else
fprintf (dump_file, "loop %d is innermost\n",loop->num);
}
/* If we use autopar in graphite pass, we use its marked dependency
checking results. */
if (flag_loop_parallelize_all && !loop->can_be_parallel)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "loop is not parallel according to graphite\n");
continue;
}
/* FIXME: Only consider innermost loops with just one exit. */
if (loop->inner || !single_dom_exit (loop))
if (!single_dom_exit (loop))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "loop is !single_dom_exit\n");
continue;
}
if (/* And of course, the loop must be parallelizable. */
!can_duplicate_loop_p (loop)
@@ -1915,7 +1941,7 @@ parallelize_loops (void)
/* Do not bother with loops in cold areas. */
|| optimize_loop_nest_for_size_p (loop)))
continue;
if (!try_get_loop_niter (loop, &niter_desc))
continue;
@@ -1926,6 +1952,14 @@ parallelize_loops (void)
continue;
changed = true;
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "parallelizing ");
if (loop->inner)
fprintf (dump_file, "outer loop\n");
else
fprintf (dump_file, "inner loop\n");
}
gen_parallel_loop (loop, reduction_list,
n_threads, &niter_desc);
verify_flow_info ();

libgomp/testsuite/libgomp.graphite/force-parallel-2.c

@@ -23,7 +23,7 @@ int main(void)
}
/* Check that the parallel code generation part gives the right answer. */
/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" } } */
/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 2 "graphite" } } */
/* { dg-final { cleanup-tree-dump "graphite" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */