cfgloopmanip.c (duplicate_subloops): Export.

2009-10-22  Razya Ladelsky  <razya@il.ibm.com>

        * cfgloopmanip.c  (duplicate_subloops): Export.
        * tree-parloops.c (loop_parallel_p): Dump if loop is innermost.
        (transform_to_exit_first_loop): Duplicate bbs starting from 
        header up to loop->latch instead of exit->src.
        Initialize control variable to the correct number of iterations.
        (gather_scalar_reductions): Do not register double reductions.
        (parallelize_loops): Dump which loop is tested. 
        Indicate whether the parallelized loop is inner or not. 
        Remove the innermost-loop requirement.
        * cfgloop.h (duplicate_subloops): Export. 
        * tree-cfg.c (add_phi_args_after_redirect): New function.
        (gimple_duplicate_sese_tail): Remove the no-subloops constraint.
        Call duplicate_subloops.
        Update number of iterations at the exit condition.
        Don't redirect nexits always to the loop exit.
        Redirect copied edges from latch to the loop exit.
        * testsuite/libgomp.graphite/force-parallel-2.c: Adjust scan.
        * testsuite/gcc.dg/autopar/outer-1.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-2.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-3.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-4.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-5.c: New testcase.
        * testsuite/gcc.dg/autopar/outer-6.c: New testcase.

From-SVN: r153457
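
As a reading aid for the ChangeLog entries on transform_to_exit_first_loop and gimple_duplicate_sese_tail, here is a stand-alone, source-level sketch of the exit-first loop shape that parloops builds before splitting a loop among threads. It is an illustration only, not code from this commit, and the helper names (body, original, exit_first) are made up: the exit test is moved to the loop entry and the last iteration is peeled off, which is also why the tail duplication now runs from the header up to loop->latch.

#include <stdio.h>

/* Illustrative per-iteration work; any side effect would do.  */
static void body (int i, int *acc)
{
  *acc += i;
}

/* Original shape: the exit test follows the body, as in a do-while
   style loop (assumes n >= 1).  */
static int original (int n)
{
  int acc = 0, i = 0;
  do
    {
      body (i, &acc);
      i++;
    }
  while (i < n);
  return acc;
}

/* Exit-first shape: the test is evaluated at the loop entry and the
   last iteration is peeled off after the loop, so the loop proper
   runs exactly n - 1 times and its iterations can be divided among
   threads.  */
static int exit_first (int n)
{
  int acc = 0, i = 0;
  while (i < n - 1)
    {
      body (i, &acc);
      i++;
    }
  body (i, &acc);               /* peeled last iteration */
  return acc;
}

int main (void)
{
  /* Both variants compute the same value (45 for n == 10).  */
  printf ("%d %d\n", original (10), exit_first (10));
  return 0;
}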

gcc/ChangeLog

@@ -1,3 +1,29 @@
2009-10-22 Razya Ladelsky <razya@il.ibm.com>
* cfgloopmanip.c (duplicate_subloops): Export.
* tree-parloops.c (loop_parallel_p): Dump if loop is innermost.
(transform_to_exit_first_loop): Duplicate bbs starting from
header up to loop->latch instead of exit->src.
Initialize control variable to the correct number of iterations.
(gather_scalar_reductions): Do not register double reductions.
(parallelize_loops): Dump which loop is tested.
Indicate whether the parallelized loop is inner or not.
Remove the innermost-loop requirement.
* cfgloop.h (duplicate_subloops): Export.
* tree-cfg.c (add_phi_args_after_redirect): New function.
(gimple_duplicate_sese_tail): Remove the no-subloops constraint.
Call duplicate_subloops.
Update number of iterations at the exit condition.
Don't redirect nexits always to the loop exit.
Redirect copied edges from latch to the loop exit.
* testsuite/libgomp.graphite/force-parallel-2.c: Adjust scan.
* testsuite/gcc.dg/autopar/outer-1.c: New testcase.
* testsuite/gcc.dg/autopar/outer-2.c: New testcase.
* testsuite/gcc.dg/autopar/outer-3.c: New testcase.
* testsuite/gcc.dg/autopar/outer-4.c: New testcase.
* testsuite/gcc.dg/autopar/outer-5.c: New testcase.
* testsuite/gcc.dg/autopar/outer-6.c: New testcase.
2009-10-22 Jan Hubicka <jh@suse.cz>
* ipa-cp.c (ipcp_read_summary): Remove now invalid FIXME and

gcc/cfgloop.h

@@ -288,6 +288,7 @@ extern edge create_empty_if_region_on_edge (edge, tree);
extern struct loop *create_empty_loop_on_edge (edge, tree, tree, tree, tree,
tree *, tree *, struct loop *);
extern struct loop * duplicate_loop (struct loop *, struct loop *);
extern void duplicate_subloops (struct loop *, struct loop *);
extern bool duplicate_loop_to_header_edge (struct loop *, edge,
unsigned, sbitmap, edge,
VEC (edge, heap) **, int);

gcc/cfgloopmanip.c

@@ -32,7 +32,6 @@ along with GCC; see the file COPYING3. If not see
#include "output.h"
#include "tree-flow.h"
static void duplicate_subloops (struct loop *, struct loop *);
static void copy_loops_to (struct loop **, int,
struct loop *);
static void loop_redirect_edge (edge, basic_block);
@@ -886,7 +885,7 @@ duplicate_loop (struct loop *loop, struct loop *target)
/* Copies structure of subloops of LOOP into TARGET loop, placing
newly created loops into loop tree. */
static void
void
duplicate_subloops (struct loop *loop, struct loop *target)
{
struct loop *aloop, *cloop;

gcc/testsuite/gcc.dg/autopar/outer-1.c

@@ -0,0 +1,33 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
void parloop (int N)
{
int i, j;
int x[10000][10000];
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
x[i][j] = i + j + 3;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
if (x[i][j] != i + j + 3)
abort ();
}
int main(void)
{
parloop(10000);
return 0;
}
/* Check that outer loop is parallelized. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-2.c

@@ -0,0 +1,33 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
void parloop (int N)
{
int i, j,ii;
int x[400][10][400];
for (ii = 0; ii < N; ii++)
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
x[i][j][ii] = ii+i + j + 3;
for (ii = 0; ii < N; ii++)
for (i = 0; i < N;i++)
for (j = 0; j < N; j++)
if (x[i][j][ii] != ii+i + j + 3)
abort ();
}
int main(void)
{
parloop(400);
return 0;
}
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-3.c

@@ -0,0 +1,33 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
void parloop (int N)
{
int i, j;
int x[500][500];
for (i = 0; i < N; i++)
for (j = 0; j < i; j++)
x[i][j] = i + j + 3;
for (i = 0; i < N; i++)
for (j = 0; j < i; j++)
if (x[i][j] != i + j + 3)
abort ();
}
int main(void)
{
parloop(500);
return 0;
}
/* Check that outer loop is parallelized. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-4.c

@@ -0,0 +1,38 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
int g_sum=0;
int x[500][500];
__attribute__((noinline))
void parloop (int N)
{
int i, j;
int sum;
/* Double reduction is currently not supported, so the outer loop is
not parallelized.  The inner reduction is detected and the inner
loop is parallelized. */
sum = 0;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
sum += x[i][j];
g_sum = sum;
}
int main(void)
{
parloop(500);
return 0;
}
/* Check that the outer loop is not parallelized, but the inner loop is. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 0 "parloops" } } */
/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 1 "parloops" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-5.c

@@ -0,0 +1,52 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
int x[500][500];
int y[500];
int g_sum=0;
__attribute__((noinline))
void init (int i, int j)
{
x[i][j]=1;
}
__attribute__((noinline))
void parloop (int N)
{
int i, j;
int sum;
/* The inner cycle is currently not supported, so the outer loop is
not parallelized.  The inner reduction is detected and the inner
loop is parallelized. */
for (i = 0; i < N; i++)
{
sum = 0;
for (j = 0; j < N; j++)
sum += x[i][j];
y[i]=sum;
}
g_sum = sum;
}
int main(void)
{
int i,j;
for (i = 0; i < 500; i++)
for (j = 0; j < 500; j++)
init(i, j);
parloop(500);
return 0;
}
/* Check that the outer loop is not parallelized, but the inner loop is. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 0 "parloops" } } */
/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 1 "parloops" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/autopar/outer-6.c

@@ -0,0 +1,50 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-optimized" } */
void abort (void);
int x[500][500];
int y[500];
int g_sum=0;
__attribute__((noinline))
void init (int i, int j)
{
x[i][j]=1;
}
__attribute__((noinline))
void parloop (int N)
{
int i, j;
int sum;
/* Outer-loop reduction; the outer loop is parallelized. */
sum=0;
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
y[i]=x[i][j];
sum += y[i];
}
g_sum = sum;
}
int main(void)
{
int i,j;
for (i = 0; i < 500; i++)
for (j = 0; j < 500; j++)
init(i, j);
parloop(500);
return 0;
}
/* Check that outer loop is parallelized. */
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 0 "parloops" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/tree-cfg.c

@@ -4850,6 +4850,31 @@ gimple_duplicate_bb (basic_block bb)
return new_bb;
}
/* Add phi arguments to the phi nodes in E_COPY->dest, copying for each
phi node the argument that the corresponding phi node in ORIG_E->dest
takes from the equivalent edge ORIG_E. */
static void
add_phi_args_after_redirect (edge e_copy, edge orig_e)
{
gimple_stmt_iterator psi, psi_copy;
gimple phi, phi_copy;
tree def;
for (psi = gsi_start_phis (orig_e->dest),
psi_copy = gsi_start_phis (e_copy->dest);
!gsi_end_p (psi);
gsi_next (&psi), gsi_next (&psi_copy))
{
phi = gsi_stmt (psi);
phi_copy = gsi_stmt (psi_copy);
def = PHI_ARG_DEF_FROM_EDGE (phi, orig_e);
add_phi_arg (phi_copy, def, e_copy,
gimple_phi_arg_location_from_edge (phi, orig_e));
}
}
/* Adds phi node arguments for edge E_COPY after basic block duplication. */
static void
@@ -5131,9 +5156,14 @@ gimple_duplicate_sese_tail (edge entry ATTRIBUTE_UNUSED, edge exit ATTRIBUTE_UNU
int total_freq = 0, exit_freq = 0;
gcov_type total_count = 0, exit_count = 0;
edge exits[2], nexits[2], e;
gimple_stmt_iterator gsi;
gimple_stmt_iterator gsi,gsi1;
gimple cond_stmt;
edge sorig, snew;
edge sorig, snew, orig_e;
basic_block exit_bb;
edge_iterator ei;
VEC (edge, heap) *redirect_edges;
basic_block iters_bb, orig_src;
tree new_rhs;
gcc_assert (EDGE_COUNT (exit->src->succs) == 2);
exits[0] = exit;
@@ -5149,17 +5179,13 @@ gimple_duplicate_sese_tail (edge entry ATTRIBUTE_UNUSED, edge exit ATTRIBUTE_UNU
it will work, but the resulting code will not be correct. */
for (i = 0; i < n_region; i++)
{
/* We do not handle subloops, i.e. all the blocks must belong to the
same loop. */
if (region[i]->loop_father != orig_loop)
return false;
if (region[i] == orig_loop->latch)
return false;
}
initialize_original_copy_tables ();
set_loop_copy (orig_loop, loop);
duplicate_subloops (orig_loop, loop);
if (!region_copy)
{
@@ -5225,8 +5251,36 @@ gimple_duplicate_sese_tail (edge entry ATTRIBUTE_UNUSED, edge exit ATTRIBUTE_UNU
cond_stmt = last_stmt (exit->src);
gcc_assert (gimple_code (cond_stmt) == GIMPLE_COND);
cond_stmt = gimple_copy (cond_stmt);
/* If the block consisting of the exit condition has the latch as
successor, then the body of the loop is executed before
the exit condition is tested.  In that case, moving the
condition to the entry makes the loop iterate one time less,
which is the desired outcome, since we peel off the last
iteration.  If the body is executed after the condition,
moving the condition to the entry requires decrementing the
iteration count by one. */
if (exits[1]->dest == orig_loop->latch)
new_rhs = gimple_cond_rhs (cond_stmt);
else
{
new_rhs = fold_build2 (MINUS_EXPR, TREE_TYPE (gimple_cond_rhs (cond_stmt)),
gimple_cond_rhs (cond_stmt),
build_int_cst (TREE_TYPE (gimple_cond_rhs (cond_stmt)), 1));
if (TREE_CODE (gimple_cond_rhs (cond_stmt)) == SSA_NAME)
{
iters_bb = gimple_bb (SSA_NAME_DEF_STMT (gimple_cond_rhs (cond_stmt)));
for (gsi1 = gsi_start_bb (iters_bb); !gsi_end_p (gsi1); gsi_next (&gsi1))
if (gsi_stmt (gsi1)==SSA_NAME_DEF_STMT (gimple_cond_rhs (cond_stmt)))
break;
new_rhs = force_gimple_operand_gsi (&gsi1, new_rhs, true,
NULL_TREE,false,GSI_CONTINUE_LINKING);
}
}
gimple_cond_set_rhs (cond_stmt, unshare_expr (new_rhs));
gimple_cond_set_lhs (cond_stmt, unshare_expr (gimple_cond_lhs (cond_stmt)));
gimple_cond_set_rhs (cond_stmt, unshare_expr (gimple_cond_rhs (cond_stmt)));
gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
sorig = single_succ_edge (switch_bb);
@@ -5238,25 +5292,87 @@ gimple_duplicate_sese_tail (edge entry ATTRIBUTE_UNUSED, edge exit ATTRIBUTE_UNU
/* Add the PHI node arguments. */
add_phi_args_after_copy (region_copy, n_region, snew);
/* Get rid of now superfluous conditions and associated edges (and phi node
arguments). */
exit_bb = exit->dest;
e = redirect_edge_and_branch (exits[0], exits[1]->dest);
PENDING_STMT (e) = NULL;
e = redirect_edge_and_branch (nexits[1], nexits[0]->dest);
PENDING_STMT (e) = NULL;
/* If the block consisting of the exit condition has the latch as
successor, then the body of the loop is executed before
the exit condition is tested.
{ body }
{ cond } (exit[0]) -> { latch }
|
V (exit[1])
{ exit_bb }
In such a case, the equivalent copied edge nexits[1]
(for the peeled iteration) needs to be redirected to exit_bb.
Otherwise,
{ cond } (exit[0]) -> { body }
|
V (exit[1])
{ exit_bb }
exit[0] is pointing to the body of the loop,
and the equivalent nexits[0] needs to be redirected to
the copied body (of the peeled iteration). */
if (exits[1]->dest == orig_loop->latch)
e = redirect_edge_and_branch (nexits[1], nexits[0]->dest);
else
e = redirect_edge_and_branch (nexits[0], nexits[1]->dest);
PENDING_STMT (e) = NULL;
redirect_edges = VEC_alloc (edge, heap, 10);
for (i = 0; i < n_region; i++)
region_copy[i]->flags |= BB_DUPLICATED;
/* Iterate over all incoming edges to the latch.  All those coming from
copied bbs will be redirected to exit_bb. */
FOR_EACH_EDGE (e, ei, orig_loop->latch->preds)
{
if (e->src->flags & BB_DUPLICATED)
VEC_safe_push (edge, heap, redirect_edges, e);
}
for (i = 0; i < n_region; i++)
region_copy[i]->flags &= ~BB_DUPLICATED;
for (i = 0; VEC_iterate (edge, redirect_edges, i, e); ++i)
{
e = redirect_edge_and_branch (e, exit_bb);
PENDING_STMT (e) = NULL;
orig_src = get_bb_original (e->src);
orig_e = find_edge (orig_src, orig_loop->latch);
add_phi_args_after_redirect (e, orig_e);
}
VEC_free (edge, heap, redirect_edges);
/* Anything that is outside of the region, but was dominated by something
inside needs to update dominance info. */
iterate_fix_dominators (CDI_DOMINATORS, doms, false);
VEC_free (basic_block, heap, doms);
/* Update the SSA web. */
update_ssa (TODO_update_ssa);
if (free_region_copy)
free (region_copy);
free_original_copy_tables ();
return true;
}
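
The two comments above hinge on an off-by-one between the possible loop shapes. The following stand-alone C sketch is not part of the commit and only demonstrates the trip-count arithmetic (trips_test_last, trips_hoisted and the bounds are illustrative names): peeling one body execution out of a loop whose hoisted entry test keeps the original right-hand side yields one execution too many, and decrementing the bound restores the original count, which is the adjustment applied to new_rhs above.

#include <stdio.h>

/* Reference shape: the exit test comes after the body, so the body
   runs n times, for i = 0 .. n-1 (assumes n >= 1).  */
static int trips_test_last (int n)
{
  int trips = 0, i = 0;
  do
    {
      trips++;
      i++;
    }
  while (i < n);
  return trips;                 /* == n */
}

/* Hoisted shape: the test runs at the entry against RHS and one body
   execution is peeled off after the loop.  With rhs == n this is one
   execution too many; with rhs == n - 1 it matches the reference.  */
static int trips_hoisted (int rhs)
{
  int trips = 0, i = 0;
  while (i < rhs)
    {
      trips++;
      i++;
    }
  return trips + 1;             /* + 1 for the peeled execution */
}

int main (void)
{
  printf ("%d %d %d\n",
          trips_test_last (10),     /* 10 */
          trips_hoisted (10),       /* 11: bound not adjusted */
          trips_hoisted (9));       /* 10: bound decremented */
  return 0;
}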

gcc/tree-parloops.c

@@ -255,7 +255,13 @@ loop_parallel_p (struct loop *loop)
bool ret = false;
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
{
fprintf (dump_file, "Considering loop %d\n", loop->num);
if (!loop->inner)
fprintf (dump_file, "loop is innermost\n");
else
fprintf (dump_file, "loop NOT innermost\n");
}
/* Check for problems with dependences. If the loop can be reversed,
the iterations are independent. */
@@ -1289,8 +1295,9 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
bool ok;
edge exit = single_dom_exit (loop), hpred;
tree control, control_name, res, t;
gimple phi, nphi, cond_stmt, stmt;
gimple phi, nphi, cond_stmt, stmt, cond_nit;
gimple_stmt_iterator gsi;
tree nit_1;
split_block_after_labels (loop->header);
orig_header = single_succ (loop->header);
@@ -1308,7 +1315,6 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
res = PHI_RESULT (phi);
t = make_ssa_name (SSA_NAME_VAR (res), phi);
SET_PHI_RESULT (phi, t);
nphi = create_phi_node (res, orig_header);
SSA_NAME_DEF_STMT (res) = nphi;
add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
@@ -1320,10 +1326,11 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
control = t;
}
}
bbs = get_loop_body_in_dom_order (loop);
for (n = 0; bbs[n] != exit->src; n++)
for (n = 0; bbs[n] != loop->latch; n++)
continue;
n--;
nbbs = XNEWVEC (basic_block, n);
ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
bbs + 1, n, nbbs);
@@ -1358,7 +1365,6 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
struct reduction_info *red;
tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
if (red)
{
@@ -1374,12 +1380,15 @@ transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit
}
gcc_assert (control_name != NULL_TREE);
/* Initialize the control variable to NIT. */
/* Initialize the control variable to the number of iterations
according to the rhs of the exit condition. */
gsi = gsi_after_labels (ex_bb);
nit = force_gimple_operand_gsi (&gsi,
fold_convert (TREE_TYPE (control_name), nit),
cond_nit = last_stmt (exit->src);
nit_1 = gimple_cond_rhs (cond_nit);
nit_1 = force_gimple_operand_gsi (&gsi,
fold_convert (TREE_TYPE (control_name), nit_1),
false, NULL_TREE, false, GSI_SAME_STMT);
stmt = gimple_build_assign (control_name, nit);
stmt = gimple_build_assign (control_name, nit_1);
gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
SSA_NAME_DEF_STMT (control_name) = stmt;
}
@@ -1740,7 +1749,7 @@ gather_scalar_reductions (loop_p loop, htab_t reduction_list)
&& simple_loop_info)
{
gimple reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi, true, &double_reduc);
if (reduc_stmt)
if (reduc_stmt && !double_reduc)
build_new_reduction (reduction_list, reduc_stmt, phi);
}
}
@@ -1890,15 +1899,32 @@ parallelize_loops (void)
FOR_EACH_LOOP (li, loop, 0)
{
htab_empty (reduction_list);
/* If we use autopar in graphite pass, we use it's marked dependency
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
if (loop->inner)
fprintf (dump_file, "loop %d is not innermost\n",loop->num);
else
fprintf (dump_file, "loop %d is innermost\n",loop->num);
}
/* If we use autopar in graphite pass, we use its marked dependency
checking results. */
if (flag_loop_parallelize_all && !loop->can_be_parallel)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "loop is not parallel according to graphite\n");
continue;
}
/* FIXME: Only consider innermost loops with just one exit. */
if (loop->inner || !single_dom_exit (loop))
if (!single_dom_exit (loop))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "loop is !single_dom_exit\n");
continue;
}
if (/* And of course, the loop must be parallelizable. */
!can_duplicate_loop_p (loop)
@@ -1915,7 +1941,7 @@ parallelize_loops (void)
/* Do not bother with loops in cold areas. */
|| optimize_loop_nest_for_size_p (loop)))
continue;
if (!try_get_loop_niter (loop, &niter_desc))
continue;
@@ -1926,6 +1952,14 @@ parallelize_loops (void)
continue;
changed = true;
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "parallelizing ");
if (loop->inner)
fprintf (dump_file, "outer loop\n");
else
fprintf (dump_file, "inner loop\n");
}
gen_parallel_loop (loop, reduction_list,
n_threads, &niter_desc);
verify_flow_info ();

libgomp/testsuite/libgomp.graphite/force-parallel-2.c

@@ -23,7 +23,7 @@ int main(void)
}
/* Check that the parallel code generation part gives the right answer. */
/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" } } */
/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 2 "graphite" } } */
/* { dg-final { cleanup-tree-dump "graphite" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
/* { dg-final { cleanup-tree-dump "parloops" } } */