1 | /* Loop unroll-and-jam. |
2 | Copyright (C) 2017-2024 Free Software Foundation, Inc. |
3 | |
4 | This file is part of GCC. |
5 | |
6 | GCC is free software; you can redistribute it and/or modify it |
7 | under the terms of the GNU General Public License as published by the |
8 | Free Software Foundation; either version 3, or (at your option) any |
9 | later version. |
10 | |
11 | GCC is distributed in the hope that it will be useful, but WITHOUT |
12 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
14 | for more details. |
15 | |
16 | You should have received a copy of the GNU General Public License |
17 | along with GCC; see the file COPYING3. If not see |
18 | <http://www.gnu.org/licenses/>. */ |
19 | |
20 | #include "config.h" |
21 | #include "system.h" |
22 | #include "coretypes.h" |
23 | #include "tree-pass.h" |
24 | #include "backend.h" |
25 | #include "tree.h" |
26 | #include "gimple.h" |
27 | #include "ssa.h" |
28 | #include "fold-const.h" |
29 | #include "tree-cfg.h" |
30 | #include "tree-ssa.h" |
31 | #include "tree-ssa-loop-niter.h" |
32 | #include "tree-ssa-loop.h" |
33 | #include "tree-ssa-loop-manip.h" |
34 | #include "cfgloop.h" |
35 | #include "tree-scalar-evolution.h" |
36 | #include "gimple-iterator.h" |
37 | #include "cfghooks.h" |
38 | #include "tree-data-ref.h" |
39 | #include "tree-ssa-loop-ivopts.h" |
40 | #include "tree-vectorizer.h" |
41 | #include "tree-ssa-sccvn.h" |
42 | #include "tree-cfgcleanup.h" |
43 | |
44 | /* Unroll and Jam transformation |
45 | |
46 | This is a combination of two transformations, where the second |
47 | is not always valid. It's applicable if a loop nest has redundancies |
48 | over the iterations of an outer loop while not having that with |
49 | an inner loop. |
50 | |
51 | Given this nest: |
52 | for (i) { |
53 | for (j) { |
54 | B(i,j) |
55 | } |
56 | } |
57 | |
58 | first unroll: |
59 | for (i by 2) { |
60 | for (j) { |
61 | B(i,j) |
62 | } |
63 | for (j) { |
64 | B(i+1,j) |
65 | } |
66 | } |
67 | |
68 | then fuse the two adjacent inner loops resulting from that: |
69 | for (i by 2) { |
70 | for (j) { |
71 | B(i,j) |
72 | B(i+1,j) |
73 | } |
74 | } |
75 | |
76 | As the order of evaluations of the body B changes this is valid |
77 | only in certain situations: all distance vectors need to be forward. |
   Additionally if there are multiple induction variables other than just
   a counting control IV (j above) we can also deal with some situations.
80 | |
81 | The validity is checked by unroll_jam_possible_p, and the data-dep |
82 | testing below. |
83 | |
84 | A trivial example where the fusion is wrong would be when |
85 | B(i,j) == x[j-1] = x[j]; |
86 | for (i by 2) { |
87 | for (j) { |
88 | x[j-1] = x[j]; |
89 | } |
90 | for (j) { |
91 | x[j-1] = x[j]; |
92 | } |
93 | } effect: move content to front by two elements |
94 | --> |
95 | for (i by 2) { |
96 | for (j) { |
97 | x[j-1] = x[j]; |
98 | x[j-1] = x[j]; |
99 | } |
100 | } effect: move content to front by one element |
101 | */ |
102 | |
103 | /* Modify the loop tree for the fact that all code once belonging |
104 | to the OLD loop or the outer loop of OLD now is inside LOOP. */ |
105 | |
106 | static void |
107 | merge_loop_tree (class loop *loop, class loop *old) |
108 | { |
109 | basic_block *bbs; |
110 | int i, n; |
111 | class loop *subloop; |
112 | edge e; |
113 | edge_iterator ei; |
114 | |
115 | /* Find its nodes. */ |
116 | bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); |
117 | n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); |
118 | |
119 | for (i = 0; i < n; i++) |
120 | { |
121 | /* If the block was direct child of OLD loop it's now part |
122 | of LOOP. If it was outside OLD, then it moved into LOOP |
123 | as well. This avoids changing the loop father for BBs |
124 | in inner loops of OLD. */ |
125 | if (bbs[i]->loop_father == old |
126 | || loop_depth (loop: bbs[i]->loop_father) < loop_depth (loop: old)) |
127 | { |
128 | remove_bb_from_loops (bbs[i]); |
129 | add_bb_to_loop (bbs[i], loop); |
130 | continue; |
131 | } |
132 | |
133 | /* If we find a direct subloop of OLD, move it to LOOP. */ |
134 | subloop = bbs[i]->loop_father; |
135 | if (loop_outer (loop: subloop) == old && subloop->header == bbs[i]) |
136 | { |
137 | flow_loop_tree_node_remove (subloop); |
138 | flow_loop_tree_node_add (loop, subloop); |
139 | } |
140 | } |
141 | |
142 | /* Update the information about loop exit edges. */ |
143 | for (i = 0; i < n; i++) |
144 | { |
145 | FOR_EACH_EDGE (e, ei, bbs[i]->succs) |
146 | { |
147 | rescan_loop_exit (e, false, false); |
148 | } |
149 | } |
150 | |
151 | loop->num_nodes = n; |
152 | |
153 | free (ptr: bbs); |
154 | } |
155 | |
156 | /* BB is part of the outer loop of an unroll-and-jam situation. |
157 | Check if any statements therein would prevent the transformation. */ |
158 | |
159 | static bool |
160 | bb_prevents_fusion_p (basic_block bb) |
161 | { |
162 | gimple_stmt_iterator gsi; |
163 | /* BB is duplicated by outer unrolling and then all N-1 first copies |
164 | move into the body of the fused inner loop. If BB exits the outer loop |
165 | the last copy still does so, and the first N-1 copies are cancelled |
166 | by loop unrolling, so also after fusion it's the exit block. |
167 | But there might be other reasons that prevent fusion: |
168 | * stores or unknown side-effects prevent fusion |
169 | * loads don't |
170 | * computations into SSA names: these aren't problematic. Their |
171 | result will be unused on the exit edges of the first N-1 copies |
172 | (those aren't taken after unrolling). If they are used on the |
173 | other edge (the one leading to the outer latch block) they are |
174 | loop-carried (on the outer loop) and the Nth copy of BB will |
175 | compute them again (i.e. the first N-1 copies will be dead). */ |
176 | for (gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
177 | { |
178 | gimple *g = gsi_stmt (i: gsi); |
179 | if (gimple_vdef (g) || gimple_has_side_effects (g)) |
180 | return true; |
181 | } |
182 | return false; |
183 | } |
184 | |
185 | /* Given an inner loop LOOP (of some OUTER loop) determine if |
186 | we can safely fuse copies of it (generated by outer unrolling). |
187 | If so return true, otherwise return false. */ |
188 | |
189 | static bool |
190 | unroll_jam_possible_p (class loop *outer, class loop *loop) |
191 | { |
192 | basic_block *bbs; |
193 | int i, n; |
194 | class tree_niter_desc niter; |
195 | |
196 | /* When fusing the loops we skip the latch block |
197 | of the first one, so it mustn't have any effects to |
198 | preserve. */ |
199 | if (!empty_block_p (loop->latch)) |
200 | return false; |
201 | |
202 | edge exit; |
203 | if (!(exit = single_exit (loop))) |
204 | return false; |
205 | |
206 | /* We need a perfect nest. Quick check for adjacent inner loops. */ |
207 | if (outer->inner != loop || loop->next) |
208 | return false; |
209 | |
210 | /* Prevent head-controlled inner loops, that we usually have. |
211 | The guard block would need to be accepted |
212 | (invariant condition either entering or skipping the loop), |
213 | without also accepting arbitrary control flow. When unswitching |
214 | ran before us (as with -O3) this won't be a problem because its |
215 | outer loop unswitching will have moved out the invariant condition. |
216 | |
217 | If we do that we need to extend fuse_loops() to cope with this |
218 | by threading through the (still invariant) copied condition |
219 | between the two loop copies. */ |
220 | if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header)) |
221 | return false; |
222 | |
223 | /* The number of iterations of the inner loop must be loop invariant |
224 | with respect to the outer loop. */ |
225 | if (!number_of_iterations_exit (loop, single_exit (loop), niter: &niter, |
226 | false, every_iteration: true) |
227 | || niter.cmp == ERROR_MARK |
228 | || !integer_zerop (niter.may_be_zero) |
229 | || !expr_invariant_in_loop_p (outer, niter.niter)) |
230 | return false; |
231 | |
232 | /* If the inner loop produces any values that are used inside the |
233 | outer loop (except the virtual op) then it can flow |
234 | back (perhaps indirectly) into the inner loop. This prevents |
235 | fusion: without fusion the value at the last iteration is used, |
236 | with fusion the value after the initial iteration is used. |
237 | |
238 | If all uses are outside the outer loop this doesn't prevent fusion; |
239 | the value of the last iteration is still used (and the values from |
240 | all intermediate iterations are dead). */ |
241 | gphi_iterator psi; |
242 | for (psi = gsi_start_phis (single_exit (loop)->dest); |
243 | !gsi_end_p (i: psi); gsi_next (i: &psi)) |
244 | { |
245 | imm_use_iterator imm_iter; |
246 | use_operand_p use_p; |
247 | tree op = gimple_phi_result (gs: psi.phi ()); |
248 | if (virtual_operand_p (op)) |
249 | continue; |
250 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op) |
251 | { |
252 | gimple *use_stmt = USE_STMT (use_p); |
253 | if (!is_gimple_debug (gs: use_stmt) |
254 | && flow_bb_inside_loop_p (outer, gimple_bb (g: use_stmt))) |
255 | return false; |
256 | } |
257 | } |
258 | |
259 | /* And check blocks belonging to just outer loop. */ |
260 | bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); |
261 | n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun)); |
262 | |
263 | for (i = 0; i < n; i++) |
264 | if (bbs[i]->loop_father == outer |
265 | && (bb_prevents_fusion_p (bb: bbs[i]) |
266 | /* Outer loop exits must come after the inner loop, otherwise |
267 | we'll put the outer loop exit into the fused inner loop. */ |
268 | || (loop_exits_from_bb_p (outer, bbs[i]) |
269 | && !dominated_by_p (CDI_DOMINATORS, bbs[i], exit->src)))) |
270 | break; |
271 | free (ptr: bbs); |
272 | if (i != n) |
273 | return false; |
274 | |
275 | /* For now we can safely fuse copies of LOOP only if all |
276 | loop carried variables are inductions (or the virtual op). |
277 | |
278 | We could handle reductions as well (the initial value in the second |
279 | body would be the after-iter value of the first body) if it's over |
280 | an associative and commutative operation. We wouldn't |
281 | be able to handle unknown cycles. */ |
282 | for (psi = gsi_start_phis (loop->header); !gsi_end_p (i: psi); gsi_next (i: &psi)) |
283 | { |
284 | affine_iv iv; |
285 | tree op = gimple_phi_result (gs: psi.phi ()); |
286 | |
287 | if (virtual_operand_p (op)) |
288 | continue; |
289 | if (!simple_iv (loop, loop, op, &iv, true)) |
290 | return false; |
291 | /* The inductions must be regular, loop invariant step and initial |
292 | value. */ |
293 | if (!expr_invariant_in_loop_p (outer, iv.step) |
294 | || !expr_invariant_in_loop_p (outer, iv.base)) |
295 | return false; |
296 | /* XXX With more effort we could also be able to deal with inductions |
297 | where the initial value is loop variant but a simple IV in the |
298 | outer loop. The initial value for the second body would be |
299 | the original initial value plus iv.base.step. The next value |
300 | for the fused loop would be the original next value of the first |
301 | copy, _not_ the next value of the second body. */ |
302 | } |
303 | |
304 | return true; |
305 | } |
306 | |
307 | /* Fuse LOOP with all further neighbors. The loops are expected to |
308 | be in appropriate form. */ |
309 | |
310 | static void |
311 | fuse_loops (class loop *loop) |
312 | { |
313 | class loop *next = loop->next; |
314 | |
315 | while (next) |
316 | { |
317 | edge e; |
318 | |
319 | remove_branch (single_pred_edge (bb: loop->latch)); |
320 | /* Make delete_basic_block not fiddle with the loop structure. */ |
321 | basic_block oldlatch = loop->latch; |
322 | loop->latch = NULL; |
323 | delete_basic_block (oldlatch); |
324 | e = redirect_edge_and_branch (loop_latch_edge (next), |
325 | loop->header); |
326 | loop->latch = e->src; |
327 | flush_pending_stmts (e); |
328 | |
329 | gcc_assert (EDGE_COUNT (next->header->preds) == 1); |
330 | |
331 | /* The PHI nodes of the second body (single-argument now) |
332 | need adjustments to use the right values: either directly |
333 | the value of the corresponding PHI in the first copy or |
334 | the one leaving the first body which unrolling did for us. |
335 | |
336 | See also unroll_jam_possible_p() for further possibilities. */ |
337 | gphi_iterator psi_first, psi_second; |
338 | e = single_pred_edge (bb: next->header); |
339 | for (psi_first = gsi_start_phis (loop->header), |
340 | psi_second = gsi_start_phis (next->header); |
341 | !gsi_end_p (i: psi_first); |
342 | gsi_next (i: &psi_first), gsi_next (i: &psi_second)) |
343 | { |
344 | gphi *phi_first = psi_first.phi (); |
345 | gphi *phi_second = psi_second.phi (); |
346 | tree firstop = gimple_phi_result (gs: phi_first); |
347 | /* The virtual operand is correct already as it's |
348 | always live at exit, hence has a LCSSA node and outer |
349 | loop unrolling updated SSA form. */ |
350 | if (virtual_operand_p (op: firstop)) |
351 | continue; |
352 | |
353 | /* Due to unroll_jam_possible_p() we know that this is |
354 | an induction. The second body goes over the same |
355 | iteration space. */ |
356 | add_phi_arg (phi_second, firstop, e, |
357 | gimple_location (g: phi_first)); |
358 | } |
359 | gcc_assert (gsi_end_p (psi_second)); |
360 | |
361 | merge_loop_tree (loop, old: next); |
362 | gcc_assert (!next->num_nodes); |
363 | class loop *ln = next->next; |
364 | delete_loop (next); |
365 | next = ln; |
366 | } |
367 | } |
368 | |
369 | /* Return true if any of the access functions for dataref A |
370 | isn't invariant with respect to loop LOOP_NEST. */ |
371 | static bool |
372 | any_access_function_variant_p (const struct data_reference *a, |
373 | const class loop *loop_nest) |
374 | { |
375 | vec<tree> fns = DR_ACCESS_FNS (a); |
376 | |
377 | for (tree t : fns) |
378 | if (!evolution_function_is_invariant_p (t, loop_nest->num)) |
379 | return true; |
380 | |
381 | return false; |
382 | } |
383 | |
384 | /* Returns true if the distance in DDR can be determined and adjusts |
385 | the unroll factor in *UNROLL to make unrolling valid for that distance. |
386 | Otherwise return false. DDR is with respect to the outer loop of INNER. |
387 | |
388 | If this data dep can lead to a removed memory reference, increment |
389 | *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor |
390 | for this to happen. */ |
391 | |
392 | static bool |
393 | adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr, |
394 | unsigned *unroll, unsigned *profit_unroll, |
395 | unsigned *removed) |
396 | { |
397 | bool ret = false; |
398 | if (DDR_ARE_DEPENDENT (ddr) != chrec_known) |
399 | { |
400 | if (DDR_NUM_DIST_VECTS (ddr) == 0) |
401 | return false; |
402 | unsigned i; |
403 | lambda_vector dist_v; |
404 | FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) |
405 | { |
406 | /* A distance (a,b) is at worst transformed into (a/N,b) by the |
407 | unrolling (factor N), so the transformation is valid if |
408 | a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll |
409 | factor needs to be limited so that the first condition holds. |
410 | That may limit the factor down to zero in the worst case. */ |
411 | lambda_int dist = dist_v[0]; |
412 | if (dist < 0) |
413 | gcc_unreachable (); |
414 | else if (dist >= (lambda_int)*unroll) |
415 | ; |
416 | else if (lambda_vector_zerop (vec1: dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) |
417 | { |
418 | /* We have (a,0) with a < N, so this will be transformed into |
419 | (0,0) after unrolling by N. This might potentially be a |
420 | problem, if it's not a read-read dependency. */ |
421 | if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))) |
422 | ; |
423 | else |
424 | { |
425 | /* So, at least one is a write, and we might reduce the |
426 | distance vector to (0,0). This is still no problem |
427 | if both data-refs are affine with respect to the inner |
428 | loops. But if one of them is invariant with respect |
429 | to an inner loop our reordering implicit in loop fusion |
430 | corrupts the program, as our data dependences don't |
431 | capture this. E.g. for: |
432 | for (0 <= i < n) |
433 | for (0 <= j < m) |
434 | a[i][0] = a[i+1][0] + 2; // (1) |
435 | b[i][j] = b[i+1][j] + 2; // (2) |
436 | the distance vector for both statements is (-1,0), |
437 | but exchanging the order for (2) is okay, while |
438 | for (1) it is not. To see this, write out the original |
439 | accesses (assume m is 2): |
440 | a i j original |
441 | 0 0 0 r a[1][0] b[1][0] |
442 | 1 0 0 w a[0][0] b[0][0] |
443 | 2 0 1 r a[1][0] b[1][1] |
444 | 3 0 1 w a[0][0] b[0][1] |
445 | 4 1 0 r a[2][0] b[2][0] |
446 | 5 1 0 w a[1][0] b[1][0] |
447 | after unroll-by-2 and fusion the accesses are done in |
448 | this order (from column a): 0,1, 4,5, 2,3, i.e. this: |
449 | a i j transformed |
450 | 0 0 0 r a[1][0] b[1][0] |
451 | 1 0 0 w a[0][0] b[0][0] |
452 | 4 1 0 r a[2][0] b[2][0] |
453 | 5 1 0 w a[1][0] b[1][0] |
454 | 2 0 1 r a[1][0] b[1][1] |
455 | 3 0 1 w a[0][0] b[0][1] |
456 | Note how access 2 accesses the same element as access 5 |
457 | for array 'a' but not for array 'b'. */ |
458 | if (any_access_function_variant_p (DDR_A (ddr), loop_nest: inner) |
459 | && any_access_function_variant_p (DDR_B (ddr), loop_nest: inner)) |
460 | ; |
461 | else |
462 | /* And if any dataref of this pair is invariant with |
463 | respect to the inner loop, we have no chance than |
464 | to reduce the unroll factor. */ |
465 | *unroll = dist; |
466 | } |
467 | } |
468 | else if (lambda_vector_lexico_pos (v: dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) |
469 | ; |
470 | else |
471 | *unroll = dist; |
472 | |
473 | /* With a distance (a,0) it's always profitable to unroll-and-jam |
474 | (by a+1), because one memory reference will go away. With |
475 | (a,b) and b != 0 that's less clear. We will increase the |
476 | number of streams without lowering the number of mem refs. |
477 | So for now only handle the first situation. */ |
478 | if (lambda_vector_zerop (vec1: dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) |
479 | { |
480 | *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1); |
481 | (*removed)++; |
482 | } |
483 | |
484 | ret = true; |
485 | } |
486 | } |
487 | return ret; |
488 | } |
489 | |
490 | /* Main entry point for the unroll-and-jam transformation |
491 | described above. */ |
492 | |
493 | static unsigned int |
494 | tree_loop_unroll_and_jam (void) |
495 | { |
496 | unsigned int todo = 0; |
497 | |
498 | gcc_assert (scev_initialized_p ()); |
499 | |
500 | /* Go through all innermost loops. */ |
501 | for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST)) |
502 | { |
503 | class loop *outer = loop_outer (loop); |
504 | |
505 | if (loop_depth (loop) < 2 |
506 | || optimize_loop_nest_for_size_p (outer)) |
507 | continue; |
508 | |
509 | if (!unroll_jam_possible_p (outer, loop)) |
510 | continue; |
511 | |
512 | vec<data_reference_p> datarefs = vNULL; |
513 | vec<ddr_p> dependences = vNULL; |
514 | unsigned unroll_factor, profit_unroll, removed; |
515 | class tree_niter_desc desc; |
516 | bool unroll = false; |
517 | |
518 | auto_vec<loop_p, 3> loop_nest; |
519 | if (!compute_data_dependences_for_loop (outer, true, &loop_nest, |
520 | &datarefs, &dependences)) |
521 | { |
522 | if (dump_file && (dump_flags & TDF_DETAILS)) |
523 | fprintf (stream: dump_file, format: "Cannot analyze data dependencies\n" ); |
524 | free_data_refs (datarefs); |
525 | free_dependence_relations (dependences); |
526 | continue; |
527 | } |
528 | if (!datarefs.length ()) |
529 | continue; |
530 | |
531 | if (dump_file && (dump_flags & TDF_DETAILS)) |
532 | dump_data_dependence_relations (dump_file, dependences); |
533 | |
534 | unroll_factor = (unsigned)-1; |
535 | profit_unroll = 1; |
536 | removed = 0; |
537 | |
538 | /* Check all dependencies. */ |
539 | unsigned i; |
540 | struct data_dependence_relation *ddr; |
541 | FOR_EACH_VEC_ELT (dependences, i, ddr) |
542 | { |
543 | struct data_reference *dra, *drb; |
544 | |
545 | /* If the refs are independend there's nothing to do. */ |
546 | if (DDR_ARE_DEPENDENT (ddr) == chrec_known) |
547 | continue; |
548 | |
549 | dra = DDR_A (ddr); |
550 | drb = DDR_B (ddr); |
551 | |
552 | /* Nothing interesting for the self dependencies, except for WAW if |
553 | the access function is not affine or constant because we may end |
554 | up reordering writes to the same location. */ |
555 | if (dra == drb) |
556 | { |
557 | if (DR_IS_WRITE (dra) |
558 | && !DR_ACCESS_FNS (dra).is_empty () |
559 | && DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) |
560 | { |
561 | unroll_factor = 0; |
562 | break; |
563 | } |
564 | else |
565 | continue; |
566 | } |
567 | |
568 | /* Now check the distance vector, for determining a sensible |
569 | outer unroll factor, and for validity of merging the inner |
570 | loop copies. */ |
571 | if (!adjust_unroll_factor (inner: loop, ddr, unroll: &unroll_factor, profit_unroll: &profit_unroll, |
572 | removed: &removed)) |
573 | { |
574 | /* Couldn't get the distance vector. For two reads that's |
575 | harmless (we assume we should unroll). For at least |
576 | one write this means we can't check the dependence direction |
577 | and hence can't determine safety. */ |
578 | |
579 | if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) |
580 | { |
581 | unroll_factor = 0; |
582 | break; |
583 | } |
584 | } |
585 | } |
586 | |
587 | /* We regard a user-specified minimum percentage of zero as a request |
588 | to ignore all profitability concerns and apply the transformation |
589 | always. */ |
590 | if (!param_unroll_jam_min_percent) |
591 | profit_unroll = MAX(2, profit_unroll); |
592 | else if (removed * 100 / datarefs.length () |
593 | < (unsigned)param_unroll_jam_min_percent) |
594 | profit_unroll = 1; |
595 | if (unroll_factor > profit_unroll) |
596 | unroll_factor = profit_unroll; |
597 | if (unroll_factor > (unsigned)param_unroll_jam_max_unroll) |
598 | unroll_factor = param_unroll_jam_max_unroll; |
599 | unroll = (unroll_factor > 1 |
600 | && can_unroll_loop_p (loop: outer, factor: unroll_factor, niter: &desc)); |
601 | |
602 | if (unroll) |
603 | { |
604 | if (dump_enabled_p ()) |
605 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, |
606 | find_loop_location (outer), |
607 | "applying unroll and jam with factor %d\n" , |
608 | unroll_factor); |
609 | initialize_original_copy_tables (); |
610 | tree_unroll_loop (outer, unroll_factor, &desc); |
611 | free_original_copy_tables (); |
612 | fuse_loops (loop: outer->inner); |
613 | todo |= TODO_cleanup_cfg; |
614 | |
615 | auto_bitmap exit_bbs; |
616 | bitmap_set_bit (exit_bbs, single_exit (outer)->dest->index); |
617 | todo |= do_rpo_vn (cfun, loop_preheader_edge (outer), exit_bbs); |
618 | } |
619 | |
620 | loop_nest.release (); |
621 | free_dependence_relations (dependences); |
622 | free_data_refs (datarefs); |
623 | } |
624 | |
625 | if (todo) |
626 | { |
627 | free_dominance_info (CDI_DOMINATORS); |
628 | /* We need to cleanup the CFG first since otherwise SSA form can |
629 | be not up-to-date from the VN run. */ |
630 | if (todo & TODO_cleanup_cfg) |
631 | { |
632 | cleanup_tree_cfg (); |
633 | todo &= ~TODO_cleanup_cfg; |
634 | } |
635 | rewrite_into_loop_closed_ssa (NULL, 0); |
636 | scev_reset (); |
637 | } |
638 | return todo; |
639 | } |
640 | |
641 | /* Pass boilerplate */ |
642 | |
643 | namespace { |
644 | |
645 | const pass_data pass_data_loop_jam = |
646 | { |
647 | .type: GIMPLE_PASS, /* type */ |
648 | .name: "unrolljam" , /* name */ |
649 | .optinfo_flags: OPTGROUP_LOOP, /* optinfo_flags */ |
650 | .tv_id: TV_LOOP_JAM, /* tv_id */ |
651 | PROP_cfg, /* properties_required */ |
652 | .properties_provided: 0, /* properties_provided */ |
653 | .properties_destroyed: 0, /* properties_destroyed */ |
654 | .todo_flags_start: 0, /* todo_flags_start */ |
655 | .todo_flags_finish: 0, /* todo_flags_finish */ |
656 | }; |
657 | |
658 | class pass_loop_jam : public gimple_opt_pass |
659 | { |
660 | public: |
661 | pass_loop_jam (gcc::context *ctxt) |
662 | : gimple_opt_pass (pass_data_loop_jam, ctxt) |
663 | {} |
664 | |
665 | /* opt_pass methods: */ |
666 | bool gate (function *) final override { return flag_unroll_jam != 0; } |
667 | unsigned int execute (function *) final override; |
668 | |
669 | }; |
670 | |
671 | unsigned int |
672 | pass_loop_jam::execute (function *fun) |
673 | { |
674 | if (number_of_loops (fn: fun) <= 1) |
675 | return 0; |
676 | |
677 | return tree_loop_unroll_and_jam (); |
678 | } |
679 | |
680 | } |
681 | |
682 | gimple_opt_pass * |
683 | make_pass_loop_jam (gcc::context *ctxt) |
684 | { |
685 | return new pass_loop_jam (ctxt); |
686 | } |
687 | |