1/* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "tree.h"
27#include "gimple.h"
28#include "cfghooks.h"
29#include "tree-pass.h"
30#include "ssa.h"
31#include "fold-const.h"
32#include "cfganal.h"
33#include "gimplify.h"
34#include "gimple-iterator.h"
35#include "gimplify-me.h"
36#include "tree-cfg.h"
37#include "tree-ssa-loop-manip.h"
38#include "tree-into-ssa.h"
39#include "tree-ssa.h"
40#include "cfgloop.h"
41#include "tree-scalar-evolution.h"
42#include "tree-vectorizer.h"
43#include "tree-ssa-loop-ivopts.h"
44
45/*************************************************************************
46 Simple Loop Peeling Utilities
47
48 Utilities to support loop peeling for vectorization purposes.
49 *************************************************************************/
50
51
52/* Renames the use *OP_P. */
53
54static void
55rename_use_op (use_operand_p op_p)
56{
57 tree new_name;
58
59 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
60 return;
61
62 new_name = get_current_def (USE_FROM_PTR (op_p));
63
64 /* Something defined outside of the loop. */
65 if (!new_name)
66 return;
67
68 /* An ordinary ssa name defined in the loop. */
69
70 SET_USE (op_p, new_name);
71}
72
73
74/* Renames the variables in basic block BB. Allow renaming of PHI arguments
75 on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
76 true. */
77
78static void
79rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
80{
81 gimple *stmt;
82 use_operand_p use_p;
83 ssa_op_iter iter;
84 edge e;
85 edge_iterator ei;
86 struct loop *loop = bb->loop_father;
87 struct loop *outer_loop = NULL;
88
89 if (rename_from_outer_loop)
90 {
91 gcc_assert (loop);
92 outer_loop = loop_outer (loop);
93 }
94
95 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
96 gsi_next (&gsi))
97 {
98 stmt = gsi_stmt (gsi);
99 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
100 rename_use_op (use_p);
101 }
102
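  /* Rename the PHI arguments on the incoming edges; arguments on edges
     coming from outside the loop are only considered when renaming from
     the outer loop was requested.  */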
103 FOR_EACH_EDGE (e, ei, bb->preds)
104 {
105 if (!flow_bb_inside_loop_p (loop, e->src))
106 {
107 if (!rename_from_outer_loop)
108 continue;
109 if (e->src != outer_loop->header)
110 {
111 if (outer_loop->inner->next)
112 {
113 /* If outer_loop has 2 inner loops, allow there to
114 be an extra basic block which decides which of the
115 two loops to use, based on LOOP_VECTORIZED. */
116 if (!single_pred_p (e->src)
117 || single_pred (e->src) != outer_loop->header)
118 continue;
119 }
120 }
121 }
122 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
123 gsi_next (&gsi))
124 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
125 }
126}
127
128
129struct adjust_info
130{
131 tree from, to;
132 basic_block bb;
133};
134
135/* A stack of values to be adjusted in debug stmts. We have to
136 process them LIFO, so that the closest substitution applies. If we
137 processed them FIFO, without the stack, we might substitute uses
138 with a PHI DEF that would soon become non-dominant, and when we got
139 to the suitable one, it wouldn't have anything to substitute any
140 more. */
141static vec<adjust_info, va_heap> adjust_vec;
142
143/* Adjust any debug stmts that referenced AI->from values to use the
144 loop-closed AI->to, if the references are dominated by AI->bb and
145 not by the definition of AI->from. */
146
147static void
148adjust_debug_stmts_now (adjust_info *ai)
149{
150 basic_block bbphi = ai->bb;
151 tree orig_def = ai->from;
152 tree new_def = ai->to;
153 imm_use_iterator imm_iter;
154 gimple *stmt;
155 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
156
157 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
158
159 /* Adjust any debug stmts that held onto non-loop-closed
160 references. */
161 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
162 {
163 use_operand_p use_p;
164 basic_block bbuse;
165
166 if (!is_gimple_debug (stmt))
167 continue;
168
169 gcc_assert (gimple_debug_bind_p (stmt));
170
171 bbuse = gimple_bb (stmt);
172
173 if ((bbuse == bbphi
174 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
175 && !(bbuse == bbdef
176 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
177 {
178 if (new_def)
179 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
180 SET_USE (use_p, new_def);
181 else
182 {
183 gimple_debug_bind_reset_value (stmt);
184 update_stmt (stmt);
185 }
186 }
187 }
188}
189
190/* Adjust debug stmts as scheduled before. */
191
192static void
193adjust_vec_debug_stmts (void)
194{
195 if (!MAY_HAVE_DEBUG_BIND_STMTS)
196 return;
197
198 gcc_assert (adjust_vec.exists ());
199
200 while (!adjust_vec.is_empty ())
201 {
202 adjust_debug_stmts_now (&adjust_vec.last ());
203 adjust_vec.pop ();
204 }
205}
206
207/* Adjust any debug stmts that referenced FROM values to use the
208 loop-closed TO, if the references are dominated by BB and not by
209 the definition of FROM. If adjust_vec is non-NULL, adjustments
210 will be postponed until adjust_vec_debug_stmts is called. */
211
212static void
213adjust_debug_stmts (tree from, tree to, basic_block bb)
214{
215 adjust_info ai;
216
217 if (MAY_HAVE_DEBUG_BIND_STMTS
218 && TREE_CODE (from) == SSA_NAME
219 && ! SSA_NAME_IS_DEFAULT_DEF (from)
220 && ! virtual_operand_p (from))
221 {
222 ai.from = from;
223 ai.to = to;
224 ai.bb = bb;
225
226 if (adjust_vec.exists ())
227 adjust_vec.safe_push (ai);
228 else
229 adjust_debug_stmts_now (&ai);
230 }
231}
232
233/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
234 to adjust any debug stmts that referenced the old phi arg,
235 presumably non-loop-closed references left over from other
236 transformations. */
237
238static void
239adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
240{
241 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
242
243 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
244
245 if (MAY_HAVE_DEBUG_BIND_STMTS)
246 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
247 gimple_bb (update_phi));
248}
249
250/* Make the LOOP iterate NITERS times. This is done by adding a new IV
251 that starts at zero, increases by one each iteration, and whose limit is NITERS.
252
253 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
254
255void
256slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
257{
258 tree indx_before_incr, indx_after_incr;
259 gcond *cond_stmt;
260 gcond *orig_cond;
261 edge exit_edge = single_exit (loop);
262 gimple_stmt_iterator loop_cond_gsi;
263 gimple_stmt_iterator incr_gsi;
264 bool insert_after;
265 tree init = build_int_cst (TREE_TYPE (niters), 0);
266 tree step = build_int_cst (TREE_TYPE (niters), 1);
267 source_location loop_loc;
268 enum tree_code code;
269
270 orig_cond = get_loop_exit_condition (loop);
271 gcc_assert (orig_cond);
272 loop_cond_gsi = gsi_for_stmt (orig_cond);
273
274 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
275 create_iv (init, step, NULL_TREE, loop,
276 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
277
278 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
279 true, NULL_TREE, true,
280 GSI_SAME_STMT);
281 niters = force_gimple_operand_gsi (&loop_cond_gsi, niters, true, NULL_TREE,
282 true, GSI_SAME_STMT);
283
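  /* If the original exit is taken when its condition is true, the new
     condition must become true once the IV reaches NITERS; otherwise keep
     looping while the IV is still below NITERS.  */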
284 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
285 cond_stmt = gimple_build_cond (code, indx_after_incr, niters, NULL_TREE,
286 NULL_TREE);
287
288 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
289
290 /* Remove old loop exit test: */
291 gsi_remove (&loop_cond_gsi, true);
292 free_stmt_vec_info (orig_cond);
293
294 loop_loc = find_loop_location (loop);
295 if (dump_enabled_p ())
296 {
297 if (LOCATION_LOCUS (loop_loc) != UNKNOWN_LOCATION)
298 dump_printf (MSG_NOTE, "\nloop at %s:%d: ", LOCATION_FILE (loop_loc),
299 LOCATION_LINE (loop_loc));
300 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
301 }
302
303 /* Record the number of latch iterations. */
304 loop->nb_iterations = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), niters,
305 build_int_cst (TREE_TYPE (niters), 1));
306}
307
308/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
309 For all PHI arguments in FROM->dest and TO->dest that come from those
310 edges, ensure that each TO->dest PHI argument has its current_def
311 set to that of the corresponding argument in FROM->dest. */
312
313static void
314slpeel_duplicate_current_defs_from_edges (edge from, edge to)
315{
316 gimple_stmt_iterator gsi_from, gsi_to;
317
318 for (gsi_from = gsi_start_phis (from->dest),
319 gsi_to = gsi_start_phis (to->dest);
320 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
321 {
322 gimple *from_phi = gsi_stmt (gsi_from);
323 gimple *to_phi = gsi_stmt (gsi_to);
324 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
325 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
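      /* Virtual PHIs need not pair up between the two blocks, so advance
         only the iterator that is positioned on one and keep the real PHIs
         matched.  */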
326 if (virtual_operand_p (from_arg))
327 {
328 gsi_next (&gsi_from);
329 continue;
330 }
331 if (virtual_operand_p (to_arg))
332 {
333 gsi_next (&gsi_to);
334 continue;
335 }
336 if (TREE_CODE (from_arg) != SSA_NAME)
337 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
338 else
339 {
340 if (get_current_def (to_arg) == NULL_TREE)
341 set_current_def (to_arg, get_current_def (from_arg));
342 }
343 gsi_next (&gsi_from);
344 gsi_next (&gsi_to);
345 }
346
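  /* The virtual operands are handled separately: make the TO edge's virtual
     PHI argument carry the current def of the FROM edge's argument.  */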
347 gphi *from_phi = get_virtual_phi (from->dest);
348 gphi *to_phi = get_virtual_phi (to->dest);
349 if (from_phi && to_phi)
350 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
351 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
352}
353
354
355/* Given LOOP this function generates a new copy of it and puts it
356 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
357 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
358 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
359 entry or exit of LOOP. */
360
361struct loop *
362slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
363 struct loop *scalar_loop, edge e)
364{
365 struct loop *new_loop;
366 basic_block *new_bbs, *bbs, *pbbs;
367 bool at_exit;
368 bool was_imm_dom;
369 basic_block exit_dest;
370 edge exit, new_exit;
371 bool duplicate_outer_loop = false;
372
373 exit = single_exit (loop);
374 at_exit = (e == exit);
375 if (!at_exit && e != loop_preheader_edge (loop))
376 return NULL;
377
378 if (scalar_loop == NULL)
379 scalar_loop = loop;
380
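  /* Reserve bbs[0] for the pre-header created below; the loop body blocks
     are gathered starting at bbs[1].  */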
381 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
382 pbbs = bbs + 1;
383 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
384 /* Allow duplication of outer loops. */
385 if (scalar_loop->inner)
386 duplicate_outer_loop = true;
387 /* Check whether duplication is possible. */
388 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
389 {
390 free (bbs);
391 return NULL;
392 }
393
394 /* Generate new loop structure. */
395 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
396 duplicate_subloops (scalar_loop, new_loop);
397
398 exit_dest = exit->dest;
399 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
400 exit_dest) == loop->header ?
401 true : false);
402
403 /* Also copy the pre-header, this avoids jumping through hoops to
404 duplicate the loop entry PHI arguments. Create an empty
405 pre-header unconditionally for this. */
406 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
407 edge entry_e = single_pred_edge (preheader);
408 bbs[0] = preheader;
409 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
410
411 exit = single_exit (scalar_loop);
412 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
413 &exit, 1, &new_exit, NULL,
414 at_exit ? loop->latch : e->src, true);
415 exit = single_exit (loop);
416 basic_block new_preheader = new_bbs[0];
417
418 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
419
420 if (scalar_loop != loop)
421 {
422 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
423 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
424 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
425 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
426 header) to have current_def set, so copy them over. */
427 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
428 exit);
429 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
430 0),
431 EDGE_SUCC (loop->latch, 0));
432 }
433
434 if (at_exit) /* Add the loop copy at exit. */
435 {
436 if (scalar_loop != loop)
437 {
438 gphi_iterator gsi;
439 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
440
441 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
442 gsi_next (&gsi))
443 {
444 gphi *phi = gsi.phi ();
445 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
446 location_t orig_locus
447 = gimple_phi_arg_location_from_edge (phi, e);
448
449 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
450 }
451 }
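      /* Redirect the original exit edge E to the pre-header of the copied
         loop and update the dominator information accordingly.  */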
452 redirect_edge_and_branch_force (e, new_preheader);
453 flush_pending_stmts (e);
454 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
455 if (was_imm_dom || duplicate_outer_loop)
456 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
457
458 /* And remove the unnecessary forwarder again. Keep the other
459 one so we have a proper pre-header for the loop at the exit edge. */
460 redirect_edge_pred (single_succ_edge (preheader),
461 single_pred (preheader));
462 delete_basic_block (preheader);
463 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
464 loop_preheader_edge (scalar_loop)->src);
465 }
466 else /* Add the copy at entry. */
467 {
468 if (scalar_loop != loop)
469 {
470 /* Remove the unnecessary forwarder of scalar_loop again. */
471 redirect_edge_pred (single_succ_edge (preheader),
472 single_pred (preheader));
473 delete_basic_block (preheader);
474 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
475 loop_preheader_edge (scalar_loop)->src);
476 preheader = split_edge (loop_preheader_edge (loop));
477 entry_e = single_pred_edge (preheader);
478 }
479
480 redirect_edge_and_branch_force (entry_e, new_preheader);
481 flush_pending_stmts (entry_e);
482 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
483
484 redirect_edge_and_branch_force (new_exit, preheader);
485 flush_pending_stmts (new_exit);
486 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
487
488 /* And remove the unnecessary forwarder again. Keep the other
489 one so we have a proper pre-header for the loop at the exit edge. */
490 redirect_edge_pred (single_succ_edge (new_preheader),
491 single_pred (new_preheader));
492 delete_basic_block (new_preheader);
493 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
494 loop_preheader_edge (new_loop)->src);
495 }
496
497 /* Skip new preheader since it's deleted if copy loop is added at entry. */
498 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
499 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
500
501 if (scalar_loop != loop)
502 {
503 /* Update new_loop->header PHIs, so that on the preheader
504 edge they are the ones from loop rather than scalar_loop. */
505 gphi_iterator gsi_orig, gsi_new;
506 edge orig_e = loop_preheader_edge (loop);
507 edge new_e = loop_preheader_edge (new_loop);
508
509 for (gsi_orig = gsi_start_phis (loop->header),
510 gsi_new = gsi_start_phis (new_loop->header);
511 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
512 gsi_next (&gsi_orig), gsi_next (&gsi_new))
513 {
514 gphi *orig_phi = gsi_orig.phi ();
515 gphi *new_phi = gsi_new.phi ();
516 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
517 location_t orig_locus
518 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
519
520 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
521 }
522 }
523
524 free (new_bbs);
525 free (bbs);
526
527 checking_verify_dominators (CDI_DOMINATORS);
528
529 return new_loop;
530}
531
532
533/* Given the condition expression COND, put it as the last statement of
534 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
535 DOM_BB; return the skip edge. GUARD_TO is the basic block reached when
536 skipping the loop. PROBABILITY is the skip edge's probability. Mark the
537 new edge as irreducible if IRREDUCIBLE_P is true. */
538
539static edge
540slpeel_add_loop_guard (basic_block guard_bb, tree cond,
541 basic_block guard_to, basic_block dom_bb,
542 profile_probability probability, bool irreducible_p)
543{
544 gimple_stmt_iterator gsi;
545 edge new_e, enter_e;
546 gcond *cond_stmt;
547 gimple_seq gimplify_stmt_list = NULL;
548
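  /* The existing successor edge of GUARD_BB becomes the false edge of the
     new condition, i.e. the edge that enters the loop.  */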
549 enter_e = EDGE_SUCC (guard_bb, 0);
550 enter_e->flags &= ~EDGE_FALLTHRU;
551 enter_e->flags |= EDGE_FALSE_VALUE;
552 gsi = gsi_last_bb (guard_bb);
553
554 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
555 NULL_TREE);
556 if (gimplify_stmt_list)
557 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
558
559 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
560 gsi = gsi_last_bb (guard_bb);
561 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
562
563 /* Add new edge to connect guard block to the merge/loop-exit block. */
564 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
565
566 new_e->probability = probability;
567 if (irreducible_p)
568 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
569
570 enter_e->probability = probability.invert ();
571 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
572
573 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
574 if (enter_e->dest->loop_father->header == enter_e->dest)
575 split_edge (enter_e);
576
577 return new_e;
578}
579
580
581/* This function verifies that the following restrictions apply to LOOP:
582 (1) it consists of exactly 2 basic blocks (a header and an empty latch)
583 for an innermost loop, or of 5 basic blocks for an outer loop.
584 (2) it is single entry, single exit
585 (3) its exit condition is the last stmt in the header
586 (4) E is the entry/exit edge of LOOP.
587 */
588
589bool
590slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
591{
592 edge exit_e = single_exit (loop);
593 edge entry_e = loop_preheader_edge (loop);
594 gcond *orig_cond = get_loop_exit_condition (loop);
595 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
596 unsigned int num_bb = loop->inner? 5 : 2;
597
598 /* All loops have an outer scope; the only case loop->outer is NULL is for
599 the function itself. */
600 if (!loop_outer (loop)
601 || loop->num_nodes != num_bb
602 || !empty_block_p (loop->latch)
603 || !single_exit (loop)
604 /* Verify that new loop exit condition can be trivially modified. */
605 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
606 || (e != exit_e && e != entry_e))
607 return false;
608
609 return true;
610}
611
612/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
613 in the exit bb and rename all the uses after the loop. This simplifies
614 the *guard[12] routines, which assume loop closed SSA form for all PHIs
615 (but normally loop closed SSA form doesn't require virtual PHIs to be
616 in the same form). Doing this early simplifies checking which
617 uses should be renamed. */
618
619static void
620create_lcssa_for_virtual_phi (struct loop *loop)
621{
622 gphi_iterator gsi;
623 edge exit_e = single_exit (loop);
624
625 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
626 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
627 {
628 gphi *phi = gsi.phi ();
629 for (gsi = gsi_start_phis (exit_e->dest);
630 !gsi_end_p (gsi); gsi_next (&gsi))
631 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
632 break;
633 if (gsi_end_p (gsi))
634 {
635 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
636 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
637 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
638 imm_use_iterator imm_iter;
639 gimple *stmt;
640 use_operand_p use_p;
641
642 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
643 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
644 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
645 gimple_phi_set_result (new_phi, new_vop);
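          /* Rewrite all uses of VOP outside of LOOP, other than the new PHI
             itself, to the loop-closed name.  */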
646 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
647 if (stmt != new_phi
648 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
649 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
650 SET_USE (use_p, new_vop);
651 }
652 break;
653 }
654
655}
656
657/* Function vect_get_loop_location.
658
659 Extract the location of the loop in the source code.
660 If the loop is not well formed for vectorization, an estimated
661 location is calculated.
662 Return the loop location on success and UNKNOWN_LOCATION otherwise. */
663
664source_location
665find_loop_location (struct loop *loop)
666{
667 gimple *stmt = NULL;
668 basic_block bb;
669 gimple_stmt_iterator si;
670
671 if (!loop)
672 return UNKNOWN_LOCATION;
673
674 stmt = get_loop_exit_condition (loop);
675
676 if (stmt
677 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
678 return gimple_location (stmt);
679
680 /* If we got here the loop is probably not "well formed",
681 try to estimate the loop location */
682
683 if (!loop->header)
684 return UNKNOWN_LOCATION;
685
686 bb = loop->header;
687
688 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
689 {
690 stmt = gsi_stmt (si);
691 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
692 return gimple_location (stmt);
693 }
694
695 return UNKNOWN_LOCATION;
696}
697
698/* Return true if PHI defines an IV of the loop to be vectorized. */
699
700static bool
701iv_phi_p (gphi *phi)
702{
703 if (virtual_operand_p (PHI_RESULT (phi)))
704 return false;
705
706 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
707 gcc_assert (stmt_info != NULL);
708 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
709 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
710 return false;
711
712 return true;
713}
714
715/* Function vect_can_advance_ivs_p
716
717 In case the number of iterations of LOOP is unknown at compile
718 time, an epilog loop will be generated, and the loop induction variables
719 (IVs) will be "advanced" to the value they are supposed to take just before
720 the epilog loop. Here we check that the access function of the loop IVs
721 and the expression that represents the loop bound are simple enough.
722 These restrictions will be relaxed in the future. */
723
724bool
725vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
726{
727 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
728 basic_block bb = loop->header;
729 gphi_iterator gsi;
730
731 /* Analyze phi functions of the loop header. */
732
733 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
735 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
736 {
737 tree evolution_part;
738
739 gphi *phi = gsi.phi ();
740 if (dump_enabled_p ())
741 {
742 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
743 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
744 }
745
746 /* Skip virtual phi's. The data dependences that are associated with
747 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
748
749 Skip reduction phis. */
750 if (!iv_phi_p (phi))
751 {
752 if (dump_enabled_p ())
753 dump_printf_loc (MSG_NOTE, vect_location,
754 "reduc or virtual phi. skip.\n");
755 continue;
756 }
757
758 /* Analyze the evolution function. */
759
760 evolution_part
761 = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
762 if (evolution_part == NULL_TREE)
763 {
764 if (dump_enabled_p ())
765 dump_printf (MSG_MISSED_OPTIMIZATION,
766 "No access function or evolution.\n");
767 return false;
768 }
769
770 /* FORNOW: We do not transform initial conditions of IVs
771 whose evolution functions are not invariant in the loop. */
772
773 if (!expr_invariant_in_loop_p (loop, evolution_part))
774 {
775 if (dump_enabled_p ())
776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
777 "evolution not invariant in loop.\n");
778 return false;
779 }
780
781 /* FORNOW: We do not transform initial conditions of IVs
782 whose evolution functions are a polynomial of degree >= 2. */
783
784 if (tree_is_chrec (evolution_part))
785 {
786 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
788 "evolution is chrec.\n");
789 return false;
790 }
791 }
792
793 return true;
794}
795
796
797/* Function vect_update_ivs_after_vectorizer.
798
799 "Advance" the induction variables of LOOP to the value they should take
800 after the execution of LOOP. This is currently necessary because the
801 vectorizer does not handle induction variables that are used after the
802 loop. Such a situation occurs when the last iterations of LOOP are
803 peeled, because:
804 1. We introduced new uses after LOOP for IVs that were not originally used
805 after LOOP: the IVs of LOOP are now used by an epilog loop.
806 2. LOOP is going to be vectorized; this means that it will iterate N/VF
807 times, whereas the loop IVs should be bumped N times.
808
809 Input:
810 - LOOP - a loop that is going to be vectorized. The last few iterations
811 of LOOP were peeled.
812 - NITERS - the number of iterations that LOOP executes (before it is
813 vectorized). i.e, the number of times the ivs should be bumped.
814 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
815 coming out from LOOP on which there are uses of the LOOP ivs
816 (this is the path from LOOP->exit to epilog_loop->preheader).
817
818 The new definitions of the ivs are placed in LOOP->exit.
819 The phi args associated with the edge UPDATE_E in the bb
820 UPDATE_E->dest are updated accordingly.
821
822 Assumption 1: Like the rest of the vectorizer, this function assumes
823 a single loop exit that has a single predecessor.
824
825 Assumption 2: The phi nodes in the LOOP header and in update_bb are
826 organized in the same order.
827
828 Assumption 3: The access function of the ivs is simple enough (see
829 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
830
831 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
832 coming out of LOOP on which the ivs of LOOP are used (this is the path
833 that leads to the epilog loop; other paths skip the epilog loop). This
834 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
835 needs to have its phis updated.
836 */
837
838static void
839vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
840 tree niters, edge update_e)
841{
842 gphi_iterator gsi, gsi1;
843 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
844 basic_block update_bb = update_e->dest;
845 basic_block exit_bb = single_exit (loop)->dest;
846
847 /* Make sure there exists a single-predecessor exit bb: */
848 gcc_assert (single_pred_p (exit_bb));
849 gcc_assert (single_succ_edge (exit_bb) == update_e);
850
851 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
852 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
853 gsi_next (&gsi), gsi_next (&gsi1))
854 {
855 tree init_expr;
856 tree step_expr, off;
857 tree type;
858 tree var, ni, ni_name;
859 gimple_stmt_iterator last_gsi;
860
861 gphi *phi = gsi.phi ();
862 gphi *phi1 = gsi1.phi ();
863 if (dump_enabled_p ())
864 {
865 dump_printf_loc (MSG_NOTE, vect_location,
866 "vect_update_ivs_after_vectorizer: phi: ");
867 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
868 }
869
870 /* Skip reduction and virtual phis. */
871 if (!iv_phi_p (phi))
872 {
873 if (dump_enabled_p ())
874 dump_printf_loc (MSG_NOTE, vect_location,
875 "reduc or virtual phi. skip.\n");
876 continue;
877 }
878
879 type = TREE_TYPE (gimple_phi_result (phi));
880 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
881 step_expr = unshare_expr (step_expr);
882
883 /* FORNOW: We do not support IVs whose evolution function is a polynomial
884 of degree >= 2 or exponential. */
885 gcc_assert (!tree_is_chrec (step_expr));
886
887 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
888
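      /* Compute the value the IV reaches after NITERS iterations:
         INIT_EXPR + NITERS * STEP_EXPR, using pointer arithmetic for
         pointer-typed IVs.  */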
889 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
890 fold_convert (TREE_TYPE (step_expr), niters),
891 step_expr);
892 if (POINTER_TYPE_P (type))
893 ni = fold_build_pointer_plus (init_expr, off);
894 else
895 ni = fold_build2 (PLUS_EXPR, type,
896 init_expr, fold_convert (type, off));
897
898 var = create_tmp_var (type, "tmp");
899
900 last_gsi = gsi_last_bb (exit_bb);
901 gimple_seq new_stmts = NULL;
902 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
903 /* Exit_bb shouldn't be empty. */
904 if (!gsi_end_p (last_gsi))
905 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
906 else
907 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
908
909 /* Fix phi expressions in the successor bb. */
910 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
911 }
912}
913
914/* Function vect_gen_prolog_loop_niters
915
916 Generate the number of iterations which should be peeled as prolog for the
917 loop represented by LOOP_VINFO. It is calculated as the misalignment of
918 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
919 As a result, after the execution of this loop, the data reference DR will
920 refer to an aligned location. The following computation is generated:
921
922 If the misalignment of DR is known at compile time:
923 addr_mis = DR_MISALIGNMENT (dr);
924 Else, compute address misalignment in bytes:
925 addr_mis = addr & (vectype_align - 1)
926
927 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
928
929 (elem_size = element type size; an element is the scalar element whose type
930 is the inner type of the vectype)
931
932 The computations will be emitted at the end of BB. We also compute and
933 store the upper bound (inclusive) of the result in BOUND.
934
935 When the step of the data-ref in the loop is not 1 (as in interleaved data
936 and SLP), the number of iterations of the prolog must be divided by the step
937 (which is equal to the size of interleaved group).
938
939 The above formulas assume that VF == number of elements in the vector. This
940 may not hold when there are multiple-types in the loop.
941 In this case, for some data-references in the loop the VF does not represent
942 the number of elements that fit in the vector. Therefore, instead of VF we
943 use TYPE_VECTOR_SUBPARTS. */
944
945static tree
946vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
947 basic_block bb, int *bound)
948{
949 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
950 tree var;
951 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
952 gimple_seq stmts = NULL, new_stmts = NULL;
953 tree iters, iters_name;
954 gimple *dr_stmt = DR_STMT (dr);
955 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
957 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
958
959 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
960 {
961 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
962
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location,
965 "known peeling = %d.\n", npeel);
966
967 iters = build_int_cst (niters_type, npeel);
968 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
969 }
970 else
971 {
972 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
973 tree offset = negative
974 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
975 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
976 &stmts, offset);
977 tree type = unsigned_type_for (TREE_TYPE (start_addr));
978 tree target_align_minus_1 = build_int_cst (type, target_align - 1);
979 HOST_WIDE_INT elem_size
980 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
981 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
982 HOST_WIDE_INT align_in_elems = target_align / elem_size;
983 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
984 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
985 tree misalign_in_bytes;
986 tree misalign_in_elems;
987
988 /* Create: misalign_in_bytes = addr & (target_align - 1). */
989 misalign_in_bytes
990 = fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
991 target_align_minus_1);
992
993 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
994 misalign_in_elems
995 = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, elem_size_log);
996
997 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
998 & (align_in_elems - 1)). */
999 if (negative)
1000 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1001 align_in_elems_tree);
1002 else
1003 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1004 misalign_in_elems);
1005 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1006 iters = fold_convert (niters_type, iters);
1007 *bound = align_in_elems - 1;
1008 }
1009
1010 if (dump_enabled_p ())
1011 {
1012 dump_printf_loc (MSG_NOTE, vect_location,
1013 "niters for prolog loop: ");
1014 dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1015 dump_printf (MSG_NOTE, "\n");
1016 }
1017
1018 var = create_tmp_var (niters_type, "prolog_loop_niters");
1019 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1020
1021 if (new_stmts)
1022 gimple_seq_add_seq (&stmts, new_stmts);
1023 if (stmts)
1024 {
1025 gcc_assert (single_succ_p (bb));
1026 gimple_stmt_iterator gsi = gsi_last_bb (bb);
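      /* An empty BB yields an end iterator, in which case insert the
         statements at the start of the block rather than after its last
         statement.  */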
1027 if (gsi_end_p (gsi))
1028 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1029 else
1030 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1031 }
1032 return iters_name;
1033}
1034
1035
1036/* Function vect_update_init_of_dr
1037
1038 NITERS iterations were peeled from LOOP. DR represents a data reference
1039 in LOOP. This function updates the information recorded in DR to
1040 account for the fact that the first NITERS iterations had already been
1041 executed. Specifically, it updates the OFFSET field of DR. */
1042
1043static void
1044vect_update_init_of_dr (struct data_reference *dr, tree niters)
1045{
1046 tree offset = DR_OFFSET (dr);
1047
1048 niters = fold_build2 (MULT_EXPR, sizetype,
1049 fold_convert (sizetype, niters),
1050 fold_convert (sizetype, DR_STEP (dr)));
1051 offset = fold_build2 (PLUS_EXPR, sizetype,
1052 fold_convert (sizetype, offset), niters);
1053 DR_OFFSET (dr) = offset;
1054}
1055
1056
1057/* Function vect_update_inits_of_drs
1058
1059 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
1060 This function updates the information recorded for the data references in
1061 the loop to account for the fact that the first NITERS iterations had
1062 already been executed. Specifically, it updates the OFFSET field of each
1063 data_reference in the loop (see vect_update_init_of_dr). */
1064
1065static void
1066vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
1067{
1068 unsigned int i;
1069 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1070 struct data_reference *dr;
1071
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_NOTE, vect_location,
1074 "=== vect_update_inits_of_dr ===\n");
1075
1076 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1077 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1078 {
1079 gimple_seq seq;
1080 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1081 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1082
1083 niters = fold_convert (sizetype, niters);
1084 niters = force_gimple_operand (niters, &seq, false, var);
1085 if (seq)
1086 {
1087 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1088 gcc_assert (!new_bb);
1089 }
1090 }
1091
1092 FOR_EACH_VEC_ELT (datarefs, i, dr)
1093 vect_update_init_of_dr (dr, niters);
1094}
1095
1096
1097/* This function builds ni_name = number of iterations. Statements
1098 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1099 it to TRUE if a new ssa_var is generated. */
1100
1101tree
1102vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1103{
1104 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1105 if (TREE_CODE (ni) == INTEGER_CST)
1106 return ni;
1107 else
1108 {
1109 tree ni_name, var;
1110 gimple_seq stmts = NULL;
1111 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1112
1113 var = create_tmp_var (TREE_TYPE (ni), "niters");
1114 ni_name = force_gimple_operand (ni, &stmts, false, var);
1115 if (stmts)
1116 {
1117 gsi_insert_seq_on_edge_immediate (pe, stmts);
1118 if (new_var_p != NULL)
1119 *new_var_p = true;
1120 }
1121
1122 return ni_name;
1123 }
1124}
1125
1126/* Calculate the number of iterations above which the vectorized loop is
1127 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1128 of prolog loop. If it's integer const, the integer number is also passed
1129 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1130 number of iterations of prolog loop. VFM1 is vector factor minus one.
1131 If CHECK_PROFITABILITY is true, TH is the threshold below which the scalar
1132 (rather than vectorized) loop will be executed. This function stores the
1133 upper bound (inclusive) of the result in BOUND_SCALAR. */
1134
1135static tree
1136vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1137 int bound_prolog, int vfm1, int th,
1138 int *bound_scalar, bool check_profitability)
1139{
1140 tree type = TREE_TYPE (niters_prolog);
1141 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1142 build_int_cst (type, vfm1));
1143
1144 *bound_scalar = vfm1 + bound_prolog;
1145 if (check_profitability)
1146 {
1147 /* TH indicates the minimum niters of vectorized loop, while we
1148 compute the maximum niters of scalar loop. */
1149 th--;
1150 /* Peeling for constant times. */
1151 if (int_niters_prolog >= 0)
1152 {
1153 *bound_scalar = (int_niters_prolog + vfm1 < th
1154 ? th
1155 : vfm1 + int_niters_prolog);
1156 return build_int_cst (type, *bound_scalar);
1157 }
1158 /* Peeling for unknown times. Note BOUND_PROLOG is the upper
1159 bound (included) of niters of prolog loop. */
1160 if (th >= vfm1 + bound_prolog)
1161 {
1162 *bound_scalar = th;
1163 return build_int_cst (type, th);
1164 }
1165 /* Need to do runtime comparison, but BOUND_SCALAR remains the same. */
1166 else if (th > vfm1)
1167 return fold_build2 (MAX_EXPR, type, build_int_cst (type, th), niters);
1168 }
1169 return niters;
1170}
1171
1172/* This function generates the following statements:
1173
1174 niters = number of iterations loop executes (after peeling)
1175 niters_vector = niters / vf
1176
1177 and places them on the loop preheader edge. NITERS_NO_OVERFLOW is
1178 true if NITERS doesn't overflow. */
1179
1180void
1181vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1182 tree *niters_vector_ptr, bool niters_no_overflow)
1183{
1184 tree ni_minus_gap, var;
1185 tree niters_vector, type = TREE_TYPE (niters);
1186 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1187 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
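  /* The vectorization factor is a power of two, so the divisions by VF
     below can be done as right shifts by log2 (VF).  */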
1188 tree log_vf = build_int_cst (type, exact_log2 (vf));
1189
1190 /* If epilogue loop is required because of data accesses with gaps, we
1191 subtract one iteration from the total number of iterations here for
1192 correct calculation of the number of vector iterations. */
1193 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1194 {
1195 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1196 build_one_cst (type));
1197 if (!is_gimple_val (ni_minus_gap))
1198 {
1199 var = create_tmp_var (type, "ni_gap");
1200 gimple *stmts = NULL;
1201 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1202 true, var);
1203 gsi_insert_seq_on_edge_immediate (pe, stmts);
1204 }
1205 }
1206 else
1207 ni_minus_gap = niters;
1208
1209 /* Create: niters >> log2(vf) */
1210 /* If it's known that niters == number of latch executions + 1 doesn't
1211 overflow, we can generate niters >> log2(vf); otherwise we generate
1212 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1213 will be at least one. */
1214 if (niters_no_overflow)
1215 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1216 else
1217 niters_vector
1218 = fold_build2 (PLUS_EXPR, type,
1219 fold_build2 (RSHIFT_EXPR, type,
1220 fold_build2 (MINUS_EXPR, type, ni_minus_gap,
1221 build_int_cst (type, vf)),
1222 log_vf),
1223 build_int_cst (type, 1));
1224
1225 if (!is_gimple_val (niters_vector))
1226 {
1227 var = create_tmp_var (type, "bnd");
1228 gimple_seq stmts = NULL;
1229 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1230 gsi_insert_seq_on_edge_immediate (pe, stmts);
1231 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1232 so we set range information to make the niters analyzer's life easier. */
1233 if (stmts != NULL)
1234 set_range_info (niters_vector, VR_RANGE,
1235 wi::to_wide (build_int_cst (type, 1)),
1236 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1237 TYPE_MAX_VALUE (type),
1238 log_vf)));
1239 }
1240 *niters_vector_ptr = niters_vector;
1241
1242 return;
1243}
1244
1245/* Given NITERS_VECTOR which is the number of iterations for vectorized
1246 loop specified by LOOP_VINFO after vectorization, compute the number
1247 of iterations before vectorization (niters_vector * vf) and store it
1248 to NITERS_VECTOR_MULT_VF_PTR. */
1249
1250static void
1251vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1252 tree niters_vector,
1253 tree *niters_vector_mult_vf_ptr)
1254{
1255 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1256 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1257 tree type = TREE_TYPE (niters_vector);
1258 tree log_vf = build_int_cst (type, exact_log2 (vf));
1259 basic_block exit_bb = single_exit (loop)->dest;
1260
1261 gcc_assert (niters_vector_mult_vf_ptr != NULL);
1262 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
1263 niters_vector, log_vf);
1264 if (!is_gimple_val (niters_vector_mult_vf))
1265 {
1266 tree var = create_tmp_var (type, "niters_vector_mult_vf");
1267 gimple_seq stmts = NULL;
1268 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
1269 &stmts, true, var);
1270 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
1271 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1272 }
1273 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
1274}
1275
1276/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
1277 from SECOND/FIRST and puts it at the original loop's preheader/exit
1278 edge; the two loops are arranged as below:
1279
1280 preheader_a:
1281 first_loop:
1282 header_a:
1283 i_1 = PHI<i_0, i_2>;
1284 ...
1285 i_2 = i_1 + 1;
1286 if (cond_a)
1287 goto latch_a;
1288 else
1289 goto between_bb;
1290 latch_a:
1291 goto header_a;
1292
1293 between_bb:
1294 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
1295
1296 second_loop:
1297 header_b:
1298 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
1299 or with i_2 if no LCSSA phi is created
1300 under condition of CREATE_LCSSA_FOR_IV_PHIS.
1301 ...
1302 i_4 = i_3 + 1;
1303 if (cond_b)
1304 goto latch_b;
1305 else
1306 goto exit_bb;
1307 latch_b:
1308 goto header_b;
1309
1310 exit_bb:
1311
1312 This function creates loop closed SSA for the first loop; it updates the
1313 second loop's PHI nodes by replacing the argument on the incoming edge with
1314 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
1315 is false, loop closed ssa phis will only be created for the non-iv phis of
1316 the first loop.
1317
1318 This function assumes exit bb of the first loop is preheader bb of the
1319 second loop, i.e., between_bb in the example code. With PHIs updated,
1320 the second loop will execute the remaining iterations of the first. */
1321
1322static void
1323slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
1324 struct loop *first, struct loop *second,
1325 bool create_lcssa_for_iv_phis)
1326{
1327 gphi_iterator gsi_update, gsi_orig;
1328 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1329
1330 edge first_latch_e = EDGE_SUCC (first->latch, 0);
1331 edge second_preheader_e = loop_preheader_edge (second);
1332 basic_block between_bb = single_exit (first)->dest;
1333
1334 gcc_assert (between_bb == second_preheader_e->src);
1335 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
1336 /* Either the first loop or the second is the loop to be vectorized. */
1337 gcc_assert (loop == first || loop == second);
1338
1339 for (gsi_orig = gsi_start_phis (first->header),
1340 gsi_update = gsi_start_phis (second->header);
1341 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
1342 gsi_next (&gsi_orig), gsi_next (&gsi_update))
1343 {
1344 gphi *orig_phi = gsi_orig.phi ();
1345 gphi *update_phi = gsi_update.phi ();
1346
1347 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
1348 /* Generate lcssa PHI node for the first loop. */
1349 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
1350 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi))
1351 {
1352 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
1353 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
1354 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
1355 arg = new_res;
1356 }
1357
1358 /* Update PHI node in the second loop by replacing arg on the loop's
1359 incoming edge. */
1360 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
1361 }
1362}
1363
1364/* Function slpeel_add_loop_guard adds guard skipping from the beginning
1365 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
1366 are two pred edges of the merge point before UPDATE_LOOP. The two loops
1367 appear like below:
1368
1369 guard_bb:
1370 if (cond)
1371 goto merge_bb;
1372 else
1373 goto skip_loop;
1374
1375 skip_loop:
1376 header_a:
1377 i_1 = PHI<i_0, i_2>;
1378 ...
1379 i_2 = i_1 + 1;
1380 if (cond_a)
1381 goto latch_a;
1382 else
1383 goto exit_a;
1384 latch_a:
1385 goto header_a;
1386
1387 exit_a:
1388 i_5 = PHI<i_2>;
1389
1390 merge_bb:
1391 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
1392
1393 update_loop:
1394 header_b:
1395 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
1396 ...
1397 i_4 = i_3 + 1;
1398 if (cond_b)
1399 goto latch_b;
1400 else
1401 goto exit_bb;
1402 latch_b:
1403 goto header_b;
1404
1405 exit_bb:
1406
1407 This function creates PHI nodes at merge_bb and replaces the use of i_5
1408 in the update_loop's PHI node with the new PHI result. */
1409
1410static void
1411slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
1412 struct loop *update_loop,
1413 edge guard_edge, edge merge_edge)
1414{
1415 source_location merge_loc, guard_loc;
1416 edge orig_e = loop_preheader_edge (skip_loop);
1417 edge update_e = loop_preheader_edge (update_loop);
1418 gphi_iterator gsi_orig, gsi_update;
1419
1420 for ((gsi_orig = gsi_start_phis (skip_loop->header),
1421 gsi_update = gsi_start_phis (update_loop->header));
1422 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
1423 gsi_next (&gsi_orig), gsi_next (&gsi_update))
1424 {
1425 gphi *orig_phi = gsi_orig.phi ();
1426 gphi *update_phi = gsi_update.phi ();
1427
1428 /* Generate new phi node at merge bb of the guard. */
1429 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
1430 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
1431
1432 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
1433 args in NEW_PHI for these edges. */
1434 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
1435 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1436 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
1437 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1438 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
1439 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
1440
1441 /* Update phi in UPDATE_PHI. */
1442 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
1443 }
1444}
1445
1446/* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
1447 This function searches for the corresponding lcssa phi node in the exit
1448 bb of LOOP. If it is found, return the phi result; otherwise return
1449 NULL. */
1450
1451static tree
1452find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
1453 gphi *lcssa_phi)
1454{
1455 gphi_iterator gsi;
1456 edge e = single_exit (loop);
1457
1458 gcc_assert (single_pred_p (e->dest));
1459 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
1460 {
1461 gphi *phi = gsi.phi ();
1462 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
1463 PHI_ARG_DEF (lcssa_phi, 0), 0))
1464 return PHI_RESULT (phi);
1465 }
1466 return NULL_TREE;
1467}
1468
1469/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
1470 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
1471 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
1472 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
1473 The CFG looks like:
1474
1475 loop:
1476 header_a:
1477 i_1 = PHI<i_0, i_2>;
1478 ...
1479 i_2 = i_1 + 1;
1480 if (cond_a)
1481 goto latch_a;
1482 else
1483 goto exit_a;
1484 latch_a:
1485 goto header_a;
1486
1487 exit_a:
1488
1489 guard_bb:
1490 if (cond)
1491 goto merge_bb;
1492 else
1493 goto epilog_loop;
1494
1495 ;; fall_through_bb
1496
1497 epilog_loop:
1498 header_b:
1499 i_3 = PHI<i_2, i_4>;
1500 ...
1501 i_4 = i_3 + 1;
1502 if (cond_b)
1503 goto latch_b;
1504 else
1505 goto merge_bb;
1506 latch_b:
1507 goto header_b;
1508
1509 merge_bb:
1510 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
1511
1512 exit_bb:
1513 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
1514
1515 For each name used outside EPILOG (i.e. - for each name that has a lcssa
1516 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
1517 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
1518 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
1519 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
1520 in exit_bb will also be updated. */
1521
1522static void
1523slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
1524 edge guard_edge, edge merge_edge)
1525{
1526 gphi_iterator gsi;
1527 basic_block merge_bb = guard_edge->dest;
1528
1529 gcc_assert (single_succ_p (merge_bb));
1530 edge e = single_succ_edge (merge_bb);
1531 basic_block exit_bb = e->dest;
1532 gcc_assert (single_pred_p (exit_bb));
1533 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
1534
1535 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
1536 {
1537 gphi *update_phi = gsi.phi ();
1538 tree old_arg = PHI_ARG_DEF (update_phi, 0);
1539 /* This loop-closed-phi actually doesn't represent a use out of the
1540 loop - the phi arg is a constant. */
1541 if (TREE_CODE (old_arg) != SSA_NAME)
1542 continue;
1543
1544 tree merge_arg = get_current_def (old_arg);
1545 if (!merge_arg)
1546 merge_arg = old_arg;
1547
1548 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
1549 /* If the var is live after loop but not a reduction, we simply
1550 use the old arg. */
1551 if (!guard_arg)
1552 guard_arg = old_arg;
1553
1554 /* Create new phi node in MERGE_BB: */
1555 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
1556 gphi *merge_phi = create_phi_node (new_res, merge_bb);
1557
1558 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
1559 the two PHI args in merge_phi for these edges. */
1560 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
1561 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
1562
1563 /* Update the original phi in exit_bb. */
1564 adjust_phi_and_debug_stmts (update_phi, e, new_res);
1565 }
1566}
1567
1568/* The EPILOG loop is duplicated from the original loop for vectorizing;
1569 the args of its loop closed ssa PHIs need to be updated. */
1570
1571static void
1572slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
1573{
1574 gphi_iterator gsi;
1575 basic_block exit_bb = single_exit (epilog)->dest;
1576
1577 gcc_assert (single_pred_p (exit_bb));
1578 edge e = EDGE_PRED (exit_bb, 0);
1579 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
1580 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
1581}
1582
1583/* Function vect_do_peeling.
1584
1585 Input:
1586 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
1587
1588 preheader:
1589 LOOP:
1590 header_bb:
1591 loop_body
1592 if (exit_loop_cond) goto exit_bb
1593 else goto header_bb
1594 exit_bb:
1595
1596 - NITERS: The number of iterations of the loop.
1597 - NITERSM1: The number of iterations of the loop's latch.
1598 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
1599 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
1600 CHECK_PROFITABILITY is true.
1601 Output:
1602 - NITERS_VECTOR: The number of iterations of loop after vectorization.
1603
1604 This function peels prolog and epilog from the loop, adds guards skipping
1605 PROLOG and EPILOG for various conditions. As a result, the changed CFG
1606 would look like:
1607
1608 guard_bb_1:
1609 if (prefer_scalar_loop) goto merge_bb_1
1610 else goto guard_bb_2
1611
1612 guard_bb_2:
1613 if (skip_prolog) goto merge_bb_2
1614 else goto prolog_preheader
1615
1616 prolog_preheader:
1617 PROLOG:
1618 prolog_header_bb:
1619 prolog_body
1620 if (exit_prolog_cond) goto prolog_exit_bb
1621 else goto prolog_header_bb
1622 prolog_exit_bb:
1623
1624 merge_bb_2:
1625
1626 vector_preheader:
1627 VECTOR LOOP:
1628 vector_header_bb:
1629 vector_body
1630 if (exit_vector_cond) goto vector_exit_bb
1631 else goto vector_header_bb
1632 vector_exit_bb:
1633
1634 guard_bb_3:
1635 if (skip_epilog) goto merge_bb_3
1636 else goto epilog_preheader
1637
1638 merge_bb_1:
1639
1640 epilog_preheader:
1641 EPILOG:
1642 epilog_header_bb:
1643 epilog_body
1644 if (exit_epilog_cond) goto merge_bb_3
1645 else goto epilog_header_bb
1646
1647 merge_bb_3:
1648
1649 Note this function peels prolog and epilog, and adds the guards,
1650 only when necessary.
1651 Returns created epilogue or NULL.
1652
1653 TODO: Guard for prefer_scalar_loop should be emitted along with
1654 versioning conditions if loop versioning is needed. */
1655
1656
1657struct loop *
1658vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
1659 tree *niters_vector, int th, bool check_profitability,
1660 bool niters_no_overflow)
1661{
1662 edge e, guard_e;
1663 tree type = TREE_TYPE (niters), guard_cond;
1664 basic_block guard_bb, guard_to;
1665 profile_probability prob_prolog, prob_vector, prob_epilog;
1666 int bound_prolog = 0, bound_scalar = 0, bound = 0;
1667 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1668 int prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1669 bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1670 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
1671
1672 if (!prolog_peeling && !epilog_peeling)
1673 return NULL;
1674
1675 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
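  /* For the static probability guess below, treat VF == 2 as 3 so the
     peeled loops are estimated with probability 2/3 rather than an even
     1/2.  */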
1676 if ((vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo)) == 2)
1677 vf = 3;
1678 prob_prolog = prob_epilog = profile_probability::guessed_always ()
1679 .apply_scale (vf - 1, vf);
1680 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1681
1682 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
1683 struct loop *first_loop = loop;
1684 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
1685 create_lcssa_for_virtual_phi (loop);
1686 update_ssa (TODO_update_ssa_only_virtuals);
1687
1688 if (MAY_HAVE_DEBUG_BIND_STMTS)
1689 {
1690 gcc_assert (!adjust_vec.exists ());
1691 adjust_vec.create (32);
1692 }
1693 initialize_original_copy_tables ();
1694
1695 /* Prolog loop may be skipped. */
1696 bool skip_prolog = (prolog_peeling != 0);
  /* Skip straight to the epilog if the scalar loop may be preferred.  This
     guard is only needed when we peel for the epilog loop and when the
     check hasn't already been done by loop versioning.  */
1700 bool skip_vector = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1701 && !LOOP_REQUIRES_VERSIONING (loop_vinfo));
  /* If the number of iterations of the epilog loop is known at compile
     time, the epilog loop must be executed and no guard is needed.
     Otherwise we add a check at the end of the vector loop that skips
     to the end of the epilog loop when it would run zero iterations.  */
1705 bool skip_epilog = (prolog_peeling < 0
1706 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo));
1707 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
1708 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1709 skip_epilog = false;
1710
1711 /* Record the anchor bb at which guard should be placed if scalar loop
1712 may be preferred. */
1713 basic_block anchor = loop_preheader_edge (loop)->src;
1714 if (skip_vector)
1715 {
1716 split_edge (loop_preheader_edge (loop));
1717
1718 /* Due to the order in which we peel prolog and epilog, we first
1719 propagate probability to the whole loop. The purpose is to
1720 avoid adjusting probabilities of both prolog and vector loops
1721 separately. Note in this case, the probability of epilog loop
1722 needs to be scaled back later. */
1723 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
1724 if (prob_vector.initialized_p ())
1725 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
1726 scale_loop_profile (loop, prob_vector, bound);
1727 }
1728
1729 tree niters_prolog = build_int_cst (type, 0);
1730 source_location loop_loc = find_loop_location (loop);
1731 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
1732 if (prolog_peeling)
1733 {
1734 e = loop_preheader_edge (loop);
1735 if (!slpeel_can_duplicate_loop_p (loop, e))
1736 {
1737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
1738 "loop can't be duplicated to preheader edge.\n");
1739 gcc_unreachable ();
1740 }
1741 /* Peel prolog and put it on preheader edge of loop. */
1742 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
1743 if (!prolog)
1744 {
1745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
1746 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
1747 gcc_unreachable ();
1748 }
1749 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
1750 first_loop = prolog;
1751 reset_original_copy_tables ();
1752
1753 /* Generate and update the number of iterations for prolog loop. */
1754 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
1755 &bound_prolog);
1756 slpeel_make_loop_iterate_ntimes (prolog, niters_prolog);
1757
1758 /* Skip the prolog loop. */
1759 if (skip_prolog)
1760 {
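	  /* If no prolog iterations are required (NITERS_PROLOG == 0),
	     branch directly to the vector loop preheader, bypassing the
	     prolog loop.  */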
1761 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
1762 niters_prolog, build_int_cst (type, 0));
1763 guard_bb = loop_preheader_edge (prolog)->src;
1764 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
1765 guard_to = split_edge (loop_preheader_edge (loop));
1766 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
1767 guard_to, guard_bb,
1768 prob_prolog.invert (),
1769 irred_flag);
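	  /* Pick the edge into GUARD_TO that comes from the prolog exit
	     rather than from the guard, and use it to update the PHIs
	     that merge the two paths.  */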
1770 e = EDGE_PRED (guard_to, 0);
1771 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
1772 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
1773
1774 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
1775 scale_loop_profile (prolog, prob_prolog, bound_prolog);
1776 }
1777 /* Update init address of DRs. */
1778 vect_update_inits_of_drs (loop_vinfo, niters_prolog);
1779 /* Update niters for vector loop. */
1780 LOOP_VINFO_NITERS (loop_vinfo)
1781 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
1782 LOOP_VINFO_NITERSM1 (loop_vinfo)
1783 = fold_build2 (MINUS_EXPR, type,
1784 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
1785 bool new_var_p = false;
1786 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
1787 /* It's guaranteed that vector loop bound before vectorization is at
1788 least VF, so set range information for newly generated var. */
1789 if (new_var_p)
1790 set_range_info (niters, VR_RANGE,
1791 wi::to_wide (build_int_cst (type, vf)),
1792 wi::to_wide (TYPE_MAX_VALUE (type)));
1793
1794 /* Prolog iterates at most bound_prolog times, latch iterates at
1795 most bound_prolog - 1 times. */
1796 record_niter_bound (prolog, bound_prolog - 1, false, true);
1797 delete_update_ssa ();
1798 adjust_vec_debug_stmts ();
1799 scev_reset ();
1800 }
1801
1802 if (epilog_peeling)
1803 {
1804 e = single_exit (loop);
1805 if (!slpeel_can_duplicate_loop_p (loop, e))
1806 {
1807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
1808 "loop can't be duplicated to exit edge.\n");
1809 gcc_unreachable ();
1810 }
1811 /* Peel epilog and put it on exit edge of loop. */
1812 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
1813 if (!epilog)
1814 {
1815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
1816 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
1817 gcc_unreachable ();
1818 }
1819 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
1820
1821 /* Scalar version loop may be preferred. In this case, add guard
1822 and skip to epilog. Note this only happens when the number of
1823 iterations of loop is unknown at compile time, otherwise this
1824 won't be vectorized. */
1825 if (skip_vector)
1826 {
	  /* An additional epilogue iteration is peeled if a gap exists.  */
1828 bool peel_for_gaps = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
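	  /* T is the threshold on the scalar iteration count below which
	     the vector loop (and the prolog) is skipped and only the
	     epilog (scalar) loop is executed.  It accounts for the prolog
	     iterations, gap peeling and, when requested, the profitability
	     threshold TH.  */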
1829 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
1830 bound_prolog,
1831 peel_for_gaps ? vf : vf - 1,
1832 th, &bound_scalar,
1833 check_profitability);
1834 /* Build guard against NITERSM1 since NITERS may overflow. */
1835 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
1836 guard_bb = anchor;
1837 guard_to = split_edge (loop_preheader_edge (epilog));
1838 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
1839 guard_to, guard_bb,
1840 prob_vector.invert (),
1841 irred_flag);
1842 e = EDGE_PRED (guard_to, 0);
1843 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
1844 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
1845
1846 /* Simply propagate profile info from guard_bb to guard_to which is
1847 a merge point of control flow. */
1848 guard_to->count = guard_bb->count;
1849
1850 /* Scale probability of epilog loop back.
1851 FIXME: We should avoid scaling down and back up. Profile may
1852 get lost if we scale down to 0. */
1853 basic_block *bbs = get_loop_body (epilog);
1854 for (unsigned int i = 0; i < epilog->num_nodes; i++)
	    bbs[i]->count
	      = bbs[i]->count.apply_scale
		  (bbs[i]->count,
		   bbs[i]->count.apply_probability (prob_vector));
1859 free (bbs);
1860 }
1861
1862 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
1863 tree niters_vector_mult_vf;
      /* If the loop is peeled a non-zero constant number of times, NITERS
	 now refers to ORIG_NITERS - PROLOG_PEELING; it won't overflow even
	 if ORIG_NITERS overflows.  */
1867 niters_no_overflow |= (prolog_peeling > 0);
1868 vect_gen_vector_loop_niters (loop_vinfo, niters,
1869 niters_vector, niters_no_overflow);
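      /* NITERS_VECTOR_MULT_VF is the number of scalar iterations covered
	 by the vector loop, i.e. NITERS_VECTOR * VF.  */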
1870 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
1871 &niters_vector_mult_vf);
1872 /* Update IVs of original loop as if they were advanced by
1873 niters_vector_mult_vf steps. */
1874 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
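      /* Advance the IVs on the edge along which control reaches the epilog
	 from the vector loop: if the vector loop is guarded, that is E
	 computed above (the non-guard predecessor of GUARD_TO); otherwise
	 it is simply the epilog preheader edge.  */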
1875 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
1876 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
1877 update_e);
1878
1879 if (skip_epilog)
1880 {
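	  /* If the vector loop consumed all scalar iterations, i.e.
	     NITERS == NITERS_VECTOR_MULT_VF, branch past the epilog loop.  */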
1881 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
1882 niters, niters_vector_mult_vf);
1883 guard_bb = single_exit (loop)->dest;
1884 guard_to = split_edge (single_exit (epilog));
1885 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
1886 skip_vector ? anchor : guard_bb,
1887 prob_epilog.invert (),
1888 irred_flag);
1889 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
1890 single_exit (epilog));
1891 /* Only need to handle basic block before epilog loop if it's not
1892 the guard_bb, which is the case when skip_vector is true. */
1893 if (guard_bb != bb_before_epilog)
1894 {
1895 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
1896
1897 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
1898 }
1899 scale_loop_profile (epilog, prob_epilog, bound);
1900 }
1901 else
1902 slpeel_update_phi_nodes_for_lcssa (epilog);
1903
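      /* Without peeling for gaps the epilog loop runs at most VF - 1
	 iterations, so its latch executes at most VF - 2 times; with gap
	 peeling it can run one iteration more.  */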
1904 bound = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? vf - 1 : vf - 2;
1905 /* We share epilog loop with scalar version loop. */
1906 bound = MAX (bound, bound_scalar - 1);
1907 record_niter_bound (epilog, bound, false, true);
1908
1909 delete_update_ssa ();
1910 adjust_vec_debug_stmts ();
1911 scev_reset ();
1912 }
1913 adjust_vec.release ();
1914 free_original_copy_tables ();
1915
1916 return epilog;
1917}
1918
1919/* Function vect_create_cond_for_niters_checks.
1920
   Create a conditional expression that represents the run-time checks for
   the loop's niter.  The loop is guaranteed to terminate if the run-time
   checks hold.

   Input:
   COND_EXPR  - input conditional expression.  The new condition will be
		chained to it with a logical AND operation; if it is NULL,
		the new condition is used on its own.
   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
		under which the loop's niter analysis is valid.
1931
1932 Output:
1933 COND_EXPR - conditional expression.
1934
1935 The returned COND_EXPR is the conditional expression to be used in the
1936 if statement that controls which version of the loop gets executed at
1937 runtime. */
1938
1939static void
1940vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
1941{
1942 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
1943
1944 if (*cond_expr)
1945 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1946 *cond_expr, part_cond_expr);
1947 else
1948 *cond_expr = part_cond_expr;
1949}
1950
1951/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
1952 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
1953
1954static void
1955chain_cond_expr (tree *cond_expr, tree part_cond_expr)
1956{
1957 if (*cond_expr)
1958 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1959 *cond_expr, part_cond_expr);
1960 else
1961 *cond_expr = part_cond_expr;
1962}
1963
1964/* Function vect_create_cond_for_align_checks.
1965
1966 Create a conditional expression that represents the alignment checks for
1967 all of data references (array element references) whose alignment must be
1968 checked at runtime.
1969
1970 Input:
1971 COND_EXPR - input conditional expression. New conditions will be chained
1972 with logical AND operation.
1973 LOOP_VINFO - two fields of the loop information are used.
1974 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
1975 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
1976
1977 Output:
1978 COND_EXPR_STMT_LIST - statements needed to construct the conditional
1979 expression.
1980 The returned value is the conditional expression to be used in the if
1981 statement that controls which version of the loop gets executed at runtime.
1982
1983 The algorithm makes two assumptions:
1984 1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, and this test
        can be done as a&(n-1) == 0.  For example, for 16-byte
        vectors the test is a&0xf == 0.  */
1988
1989static void
1990vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
1991 tree *cond_expr,
1992 gimple_seq *cond_expr_stmt_list)
1993{
1994 vec<gimple *> may_misalign_stmts
1995 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1996 gimple *ref_stmt;
1997 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
1998 tree mask_cst;
1999 unsigned int i;
2000 tree int_ptrsize_type;
2001 char tmp_name[20];
2002 tree or_tmp_name = NULL_TREE;
2003 tree and_tmp_name;
2004 gimple *and_stmt;
2005 tree ptrsize_zero;
2006 tree part_cond_expr;
2007
2008 /* Check that mask is one less than a power of 2, i.e., mask is
2009 all zeros followed by all ones. */
2010 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2011
2012 int_ptrsize_type = signed_type_for (ptr_type_node);
2013
  /* Create expression (mask & (dr_1 | ... | dr_n)), where dr_i is the address
     of the first vector of the i'th data reference.  */
2016
2017 FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
2018 {
2019 gimple_seq new_stmt_list = NULL;
2020 tree addr_base;
2021 tree addr_tmp_name;
2022 tree new_or_tmp_name;
2023 gimple *addr_stmt, *or_stmt;
2024 stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
2025 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
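      /* For a data reference with a negative step, the first vector accessed
	 starts TYPE_VECTOR_SUBPARTS - 1 elements before the DR's base
	 address, so check the alignment of that address instead.  */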
2026 bool negative = tree_int_cst_compare
2027 (DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
2028 tree offset = negative
2029 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2030
2031 /* create: addr_tmp = (int)(address_of_first_vector) */
2032 addr_base =
2033 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
2034 offset);
2035 if (new_stmt_list != NULL)
2036 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2037
2038 sprintf (tmp_name, "addr2int%d", i);
2039 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2040 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2041 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2042
      /* The addresses are ORed together.  */
2044
2045 if (or_tmp_name != NULL_TREE)
2046 {
2047 /* create: or_tmp = or_tmp | addr_tmp */
2048 sprintf (tmp_name, "orptrs%d", i);
2049 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2050 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2051 or_tmp_name, addr_tmp_name);
2052 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2053 or_tmp_name = new_or_tmp_name;
2054 }
2055 else
2056 or_tmp_name = addr_tmp_name;
2057
2058 } /* end for i */
2059
2060 mask_cst = build_int_cst (int_ptrsize_type, mask);
2061
2062 /* create: and_tmp = or_tmp & mask */
2063 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2064
2065 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2066 or_tmp_name, mask_cst);
2067 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2068
  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
2071 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2072 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2073 and_tmp_name, ptrsize_zero);
2074 chain_cond_expr (cond_expr, part_cond_expr);
2075}
2076
2077/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2078 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2079 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2080 and this new condition are true. Treat a null *COND_EXPR as "true". */
2081
2082static void
2083vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2084{
2085 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2086 unsigned int i;
2087 vec_object_pair *pair;
2088 FOR_EACH_VEC_ELT (pairs, i, pair)
2089 {
2090 tree addr1 = build_fold_addr_expr (pair->first);
2091 tree addr2 = build_fold_addr_expr (pair->second);
2092 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2093 addr1, addr2);
2094 chain_cond_expr (cond_expr, part_cond_expr);
2095 }
2096}
2097
2098/* Function vect_create_cond_for_alias_checks.
2099
2100 Create a conditional expression that represents the run-time checks for
2101 overlapping of address ranges represented by a list of data references
2102 relations passed as input.
2103
2104 Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		to it with a logical AND operation; if it is NULL, the new
		conditions are used on their own.
   LOOP_VINFO - field LOOP_VINFO_COMP_ALIAS_DDRS contains the list of
		data-reference pairs, with segment lengths, for which
		run-time alias checks need to be generated.
2110
2111 Output:
2112 COND_EXPR - conditional expression.
2113
2114 The returned COND_EXPR is the conditional expression to be used in the if
2115 statement that controls which version of the loop gets executed at runtime.
2116*/
2117
2118void
2119vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2120{
2121 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2122 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2123
2124 if (comp_alias_ddrs.is_empty ())
2125 return;
2126
2127 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2128 &comp_alias_ddrs, cond_expr);
2129 if (dump_enabled_p ())
2130 dump_printf_loc (MSG_NOTE, vect_location,
2131 "created %u versioning for alias checks.\n",
2132 comp_alias_ddrs.length ());
2133}
2134
2135
2136/* Function vect_loop_versioning.
2137
   If the loop has data references that may or may not be aligned and/or
2139 has data reference relations whose independence was not proven then
2140 two versions of the loop need to be generated, one which is vectorized
2141 and one which isn't. A test is then generated to control which of the
2142 loops is executed. The test checks for the alignment of all of the
2143 data references that may or may not be aligned. An additional
   sequence of runtime tests is generated for each pair of DDRs whose
2145 independence was not proven. The vectorized version of loop is
2146 executed only if both alias and alignment tests are passed.
2147
2148 The test generated to check which version of loop is executed
2149 is modified to also check for profitability as indicated by the
2150 cost model threshold TH.
2151
   The versioning conditions, and any statements needed to compute them,
   are emitted in the newly created CONDITION_BB.  */
2154
2155void
2156vect_loop_versioning (loop_vec_info loop_vinfo,
2157 unsigned int th, bool check_profitability)
2158{
2159 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2160 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2161 basic_block condition_bb;
2162 gphi_iterator gsi;
2163 gimple_stmt_iterator cond_exp_gsi;
2164 basic_block merge_bb;
2165 basic_block new_exit_bb;
2166 edge new_exit_e, e;
2167 gphi *orig_phi, *new_phi;
2168 tree cond_expr = NULL_TREE;
2169 gimple_seq cond_expr_stmt_list = NULL;
2170 tree arg;
2171 profile_probability prob = profile_probability::likely ();
2172 gimple_seq gimplify_stmt_list = NULL;
2173 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2174 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2175 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2176 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2177
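  /* SCALAR_LOOP_ITERS is NITERSM1, so compare against TH - 1: the loop
     must run at least TH iterations for vectorization to be profitable.  */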
2178 if (check_profitability)
2179 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2180 build_int_cst (TREE_TYPE (scalar_loop_iters),
2181 th - 1));
2182
2183 if (version_niter)
2184 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
2185
2186 if (cond_expr)
2187 cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
2188 is_gimple_condexpr, NULL_TREE);
2189
2190 if (version_align)
2191 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
2192 &cond_expr_stmt_list);
2193
2194 if (version_alias)
2195 {
2196 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
2197 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
2198 }
2199
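  /* Gimplify the final condition, including the alignment and alias parts
     added above, and append any statements it needs to the sequence that
     will be emitted in CONDITION_BB.  */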
2200 cond_expr = force_gimple_operand_1 (cond_expr, &gimplify_stmt_list,
2201 is_gimple_condexpr, NULL_TREE);
2202 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
2203
2204 initialize_original_copy_tables ();
2205 if (scalar_loop)
2206 {
2207 edge scalar_e;
2208 basic_block preheader, scalar_preheader;
2209
      /* We don't want to scale SCALAR_LOOP's frequencies; we need to
	 scale LOOP's frequencies instead.  */
2212 nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
2213 prob, prob.invert (), prob, prob.invert (), true);
2214 scale_loop_frequencies (loop, prob);
2215 /* CONDITION_BB was created above SCALAR_LOOP's preheader,
2216 while we need to move it above LOOP's preheader. */
2217 e = loop_preheader_edge (loop);
2218 scalar_e = loop_preheader_edge (scalar_loop);
2219 gcc_assert (empty_block_p (e->src)
2220 && single_pred_p (e->src));
2221 gcc_assert (empty_block_p (scalar_e->src)
2222 && single_pred_p (scalar_e->src));
2223 gcc_assert (single_pred_p (condition_bb));
2224 preheader = e->src;
2225 scalar_preheader = scalar_e->src;
2226 scalar_e = find_edge (condition_bb, scalar_preheader);
2227 e = single_pred_edge (preheader);
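      /* Move CONDITION_BB by redirecting edges: its old incoming edge now
	 goes to SCALAR_LOOP's preheader, its edge to SCALAR_LOOP's preheader
	 now goes to LOOP's preheader, and LOOP's old incoming edge now goes
	 to CONDITION_BB.  Then update the dominators.  */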
2228 redirect_edge_and_branch_force (single_pred_edge (condition_bb),
2229 scalar_preheader);
2230 redirect_edge_and_branch_force (scalar_e, preheader);
2231 redirect_edge_and_branch_force (e, condition_bb);
2232 set_immediate_dominator (CDI_DOMINATORS, condition_bb,
2233 single_pred (condition_bb));
2234 set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
2235 single_pred (scalar_preheader));
2236 set_immediate_dominator (CDI_DOMINATORS, preheader,
2237 condition_bb);
2238 }
2239 else
2240 nloop = loop_version (loop, cond_expr, &condition_bb,
2241 prob, prob.invert (), prob, prob.invert (), true);
2242
2243 if (version_niter)
2244 {
      /* The versioned loop could be infinite, so we need to clear the
	 existing niter information, which was copied from the original
	 loop.  */
2247 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
2248 vect_free_loop_info_assumptions (nloop);
2249 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
2250 loop_constraint_set (loop, LOOP_C_INFINITE);
2251 }
2252
2253 if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
2254 && dump_enabled_p ())
2255 {
2256 if (version_alias)
2257 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2258 "loop versioned for vectorization because of "
2259 "possible aliasing\n");
2260 if (version_align)
2261 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2262 "loop versioned for vectorization to enhance "
2263 "alignment\n");
2264
2265 }
2266 free_original_copy_tables ();
2267
2268 /* Loop versioning violates an assumption we try to maintain during
2269 vectorization - that the loop exit block has a single predecessor.
2270 After versioning, the exit block of both loop versions is the same
2271 basic block (i.e. it has two predecessors). Just in order to simplify
2272 following transformations in the vectorizer, we fix this situation
2273 here by adding a new (empty) block on the exit-edge of the loop,
2274 with the proper loop-exit phis to maintain loop-closed-form.
     If loop versioning wasn't done from LOOP but from SCALAR_LOOP instead,
     merge_bb will already have just a single predecessor.  */
2277
2278 merge_bb = single_exit (loop)->dest;
2279 if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
2280 {
2281 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
2282 new_exit_bb = split_edge (single_exit (loop));
2283 new_exit_e = single_exit (loop);
2284 e = EDGE_SUCC (new_exit_bb, 0);
2285
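      /* For each PHI in MERGE_BB create a matching PHI in NEW_EXIT_BB that
	 receives the value coming from LOOP's exit, and make the original
	 PHI take its value from the new PHI instead, restoring loop-closed
	 SSA form for LOOP.  */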
2286 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2287 {
2288 tree new_res;
2289 orig_phi = gsi.phi ();
2290 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2291 new_phi = create_phi_node (new_res, new_exit_bb);
2292 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
2293 add_phi_arg (new_phi, arg, new_exit_e,
2294 gimple_phi_arg_location_from_edge (orig_phi, e));
2295 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
2296 }
2297 }
2298
2299 /* End loop-exit-fixes after versioning. */
2300
2301 if (cond_expr_stmt_list)
2302 {
2303 cond_exp_gsi = gsi_last_bb (condition_bb);
2304 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
2305 GSI_SAME_STMT);
2306 }
2307 update_ssa (TODO_update_ssa);
2308}
2309