/* Loop Vectorization
   Copyright (C) 2003-2024 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
21 | |
22 | #define INCLUDE_ALGORITHM |
23 | #include "config.h" |
24 | #include "system.h" |
25 | #include "coretypes.h" |
26 | #include "backend.h" |
27 | #include "target.h" |
28 | #include "rtl.h" |
29 | #include "tree.h" |
30 | #include "gimple.h" |
31 | #include "cfghooks.h" |
32 | #include "tree-pass.h" |
33 | #include "ssa.h" |
34 | #include "optabs-tree.h" |
35 | #include "memmodel.h" |
36 | #include "optabs.h" |
37 | #include "diagnostic-core.h" |
38 | #include "fold-const.h" |
39 | #include "stor-layout.h" |
40 | #include "cfganal.h" |
41 | #include "gimplify.h" |
42 | #include "gimple-iterator.h" |
43 | #include "gimplify-me.h" |
44 | #include "tree-ssa-loop-ivopts.h" |
45 | #include "tree-ssa-loop-manip.h" |
46 | #include "tree-ssa-loop-niter.h" |
47 | #include "tree-ssa-loop.h" |
48 | #include "cfgloop.h" |
49 | #include "tree-scalar-evolution.h" |
50 | #include "tree-vectorizer.h" |
51 | #include "gimple-fold.h" |
52 | #include "cgraph.h" |
53 | #include "tree-cfg.h" |
54 | #include "tree-if-conv.h" |
55 | #include "internal-fn.h" |
56 | #include "tree-vector-builder.h" |
57 | #include "vec-perm-indices.h" |
58 | #include "tree-eh.h" |
59 | #include "case-cfn-macros.h" |
60 | #include "langhooks.h" |
61 | |
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it was manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N]; short b[N]; short c[N]; int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}

	The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
	Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS which base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
	The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

	During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
	The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

	For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:	b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:	a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:	b = x[i];	STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:	a = b;		STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

	Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
	Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

	Since we only vectorize operations which vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
159 | |
160 | static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *, |
161 | unsigned *); |
162 | static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info, |
163 | bool *, bool *, bool); |
164 | |
165 | /* Subroutine of vect_determine_vf_for_stmt that handles only one |
166 | statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE |
167 | may already be set for general statements (not just data refs). */ |
168 | |
169 | static opt_result |
170 | vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info, |
171 | bool vectype_maybe_set_p, |
172 | poly_uint64 *vf) |
173 | { |
174 | gimple *stmt = stmt_info->stmt; |
175 | |
176 | if ((!STMT_VINFO_RELEVANT_P (stmt_info) |
177 | && !STMT_VINFO_LIVE_P (stmt_info)) |
178 | || gimple_clobber_p (s: stmt)) |
179 | { |
180 | if (dump_enabled_p ()) |
181 | dump_printf_loc (MSG_NOTE, vect_location, "skip.\n" ); |
182 | return opt_result::success (); |
183 | } |
184 | |
185 | tree stmt_vectype, nunits_vectype; |
186 | opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info, |
187 | &stmt_vectype, |
188 | &nunits_vectype); |
189 | if (!res) |
190 | return res; |
191 | |
192 | if (stmt_vectype) |
193 | { |
194 | if (STMT_VINFO_VECTYPE (stmt_info)) |
195 | /* The only case when a vectype had been already set is for stmts |
196 | that contain a data ref, or for "pattern-stmts" (stmts generated |
197 | by the vectorizer to represent/replace a certain idiom). */ |
198 | gcc_assert ((STMT_VINFO_DATA_REF (stmt_info) |
199 | || vectype_maybe_set_p) |
200 | && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype); |
201 | else |
202 | STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype; |
203 | } |
204 | |
205 | if (nunits_vectype) |
206 | vect_update_max_nunits (max_nunits: vf, vectype: nunits_vectype); |
207 | |
208 | return opt_result::success (); |
209 | } |
210 | |
211 | /* Subroutine of vect_determine_vectorization_factor. Set the vector |
212 | types of STMT_INFO and all attached pattern statements and update |
213 | the vectorization factor VF accordingly. Return true on success |
214 | or false if something prevented vectorization. */ |
215 | |
216 | static opt_result |
217 | vect_determine_vf_for_stmt (vec_info *vinfo, |
218 | stmt_vec_info stmt_info, poly_uint64 *vf) |
219 | { |
220 | if (dump_enabled_p ()) |
221 | dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G" , |
222 | stmt_info->stmt); |
223 | opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, vectype_maybe_set_p: false, vf); |
224 | if (!res) |
225 | return res; |
226 | |
227 | if (STMT_VINFO_IN_PATTERN_P (stmt_info) |
228 | && STMT_VINFO_RELATED_STMT (stmt_info)) |
229 | { |
230 | gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); |
231 | stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); |
232 | |
233 | /* If a pattern statement has def stmts, analyze them too. */ |
234 | for (gimple_stmt_iterator si = gsi_start (seq&: pattern_def_seq); |
235 | !gsi_end_p (i: si); gsi_next (i: &si)) |
236 | { |
237 | stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (i: si)); |
238 | if (dump_enabled_p ()) |
239 | dump_printf_loc (MSG_NOTE, vect_location, |
240 | "==> examining pattern def stmt: %G" , |
241 | def_stmt_info->stmt); |
242 | res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info: def_stmt_info, vectype_maybe_set_p: true, vf); |
243 | if (!res) |
244 | return res; |
245 | } |
246 | |
247 | if (dump_enabled_p ()) |
248 | dump_printf_loc (MSG_NOTE, vect_location, |
249 | "==> examining pattern statement: %G" , |
250 | stmt_info->stmt); |
251 | res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, vectype_maybe_set_p: true, vf); |
252 | if (!res) |
253 | return res; |
254 | } |
255 | |
256 | return opt_result::success (); |
257 | } |
258 | |
259 | /* Function vect_determine_vectorization_factor |
260 | |
261 | Determine the vectorization factor (VF). VF is the number of data elements |
262 | that are operated upon in parallel in a single iteration of the vectorized |
263 | loop. For example, when vectorizing a loop that operates on 4byte elements, |
264 | on a target with vector size (VS) 16byte, the VF is set to 4, since 4 |
265 | elements can fit in a single vector register. |
266 | |
267 | We currently support vectorization of loops in which all types operated upon |
268 | are of the same size. Therefore this function currently sets VF according to |
269 | the size of the types operated upon, and fails if there are multiple sizes |
270 | in the loop. |
271 | |
272 | VF is also the factor by which the loop iterations are strip-mined, e.g.: |
273 | original loop: |
274 | for (i=0; i<N; i++){ |
275 | a[i] = b[i] + c[i]; |
276 | } |
277 | |
278 | vectorized loop: |
279 | for (i=0; i<N; i+=VF){ |
280 | a[i:VF] = b[i:VF] + c[i:VF]; |
281 | } |
282 | */ |
283 | |
284 | static opt_result |
285 | vect_determine_vectorization_factor (loop_vec_info loop_vinfo) |
286 | { |
287 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
288 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
289 | unsigned nbbs = loop->num_nodes; |
290 | poly_uint64 vectorization_factor = 1; |
291 | tree scalar_type = NULL_TREE; |
292 | gphi *phi; |
293 | tree vectype; |
294 | stmt_vec_info stmt_info; |
295 | unsigned i; |
296 | |
297 | DUMP_VECT_SCOPE ("vect_determine_vectorization_factor" ); |
298 | |
299 | for (i = 0; i < nbbs; i++) |
300 | { |
301 | basic_block bb = bbs[i]; |
302 | |
303 | for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si); |
304 | gsi_next (i: &si)) |
305 | { |
306 | phi = si.phi (); |
307 | stmt_info = loop_vinfo->lookup_stmt (phi); |
308 | if (dump_enabled_p ()) |
309 | dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G" , |
310 | (gimple *) phi); |
311 | |
312 | gcc_assert (stmt_info); |
313 | |
314 | if (STMT_VINFO_RELEVANT_P (stmt_info) |
315 | || STMT_VINFO_LIVE_P (stmt_info)) |
316 | { |
317 | gcc_assert (!STMT_VINFO_VECTYPE (stmt_info)); |
318 | scalar_type = TREE_TYPE (PHI_RESULT (phi)); |
319 | |
320 | if (dump_enabled_p ()) |
321 | dump_printf_loc (MSG_NOTE, vect_location, |
322 | "get vectype for scalar type: %T\n" , |
323 | scalar_type); |
324 | |
325 | vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); |
326 | if (!vectype) |
327 | return opt_result::failure_at (loc: phi, |
328 | fmt: "not vectorized: unsupported " |
329 | "data-type %T\n" , |
330 | scalar_type); |
331 | STMT_VINFO_VECTYPE (stmt_info) = vectype; |
332 | |
333 | if (dump_enabled_p ()) |
334 | dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n" , |
335 | vectype); |
336 | |
337 | if (dump_enabled_p ()) |
338 | { |
339 | dump_printf_loc (MSG_NOTE, vect_location, "nunits = " ); |
340 | dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (node: vectype)); |
341 | dump_printf (MSG_NOTE, "\n" ); |
342 | } |
343 | |
344 | vect_update_max_nunits (max_nunits: &vectorization_factor, vectype); |
345 | } |
346 | } |
347 | |
348 | for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (i: si); |
349 | gsi_next (i: &si)) |
350 | { |
351 | if (is_gimple_debug (gs: gsi_stmt (i: si))) |
352 | continue; |
353 | stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (i: si)); |
354 | opt_result res |
355 | = vect_determine_vf_for_stmt (vinfo: loop_vinfo, |
356 | stmt_info, vf: &vectorization_factor); |
357 | if (!res) |
358 | return res; |
359 | } |
360 | } |
361 | |
362 | /* TODO: Analyze cost. Decide if worth while to vectorize. */ |
363 | if (dump_enabled_p ()) |
364 | { |
365 | dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = " ); |
366 | dump_dec (MSG_NOTE, vectorization_factor); |
367 | dump_printf (MSG_NOTE, "\n" ); |
368 | } |
369 | |
370 | if (known_le (vectorization_factor, 1U)) |
371 | return opt_result::failure_at (loc: vect_location, |
372 | fmt: "not vectorized: unsupported data-type\n" ); |
373 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; |
374 | return opt_result::success (); |
375 | } |
376 | |
377 | |
378 | /* Function vect_is_simple_iv_evolution. |
379 | |
380 | FORNOW: A simple evolution of an induction variables in the loop is |
381 | considered a polynomial evolution. */ |
382 | |
383 | static bool |
384 | vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, |
385 | tree * step) |
386 | { |
387 | tree init_expr; |
388 | tree step_expr; |
389 | tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb); |
390 | basic_block bb; |
391 | |
392 | /* When there is no evolution in this loop, the evolution function |
393 | is not "simple". */ |
394 | if (evolution_part == NULL_TREE) |
395 | return false; |
396 | |
397 | /* When the evolution is a polynomial of degree >= 2 |
398 | the evolution function is not "simple". */ |
399 | if (tree_is_chrec (expr: evolution_part)) |
400 | return false; |
401 | |
402 | step_expr = evolution_part; |
403 | init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb)); |
404 | |
405 | if (dump_enabled_p ()) |
406 | dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n" , |
407 | step_expr, init_expr); |
408 | |
409 | *init = init_expr; |
410 | *step = step_expr; |
411 | |
412 | if (TREE_CODE (step_expr) != INTEGER_CST |
413 | && (TREE_CODE (step_expr) != SSA_NAME |
414 | || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr))) |
415 | && flow_bb_inside_loop_p (get_loop (cfun, num: loop_nb), bb)) |
416 | || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr)) |
417 | && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)) |
418 | || !flag_associative_math))) |
419 | && (TREE_CODE (step_expr) != REAL_CST |
420 | || !flag_associative_math)) |
421 | { |
422 | if (dump_enabled_p ()) |
423 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
424 | "step unknown.\n" ); |
425 | return false; |
426 | } |
427 | |
428 | return true; |
429 | } |
430 | |
431 | /* Function vect_is_nonlinear_iv_evolution |
432 | |
433 | Only support nonlinear induction for integer type |
434 | 1. neg |
435 | 2. mul by constant |
436 | 3. lshift/rshift by constant. |
437 | |
438 | For neg induction, return a fake step as integer -1. */ |
439 | static bool |
440 | vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info, |
441 | gphi* loop_phi_node, tree *init, tree *step) |
442 | { |
443 | tree init_expr, ev_expr, result, op1, op2; |
444 | gimple* def; |
445 | |
446 | if (gimple_phi_num_args (gs: loop_phi_node) != 2) |
447 | return false; |
448 | |
449 | init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop)); |
450 | ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop)); |
451 | |
452 | /* Support nonlinear induction only for integer type. */ |
453 | if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr))) |
454 | return false; |
455 | |
456 | *init = init_expr; |
457 | result = PHI_RESULT (loop_phi_node); |
458 | |
459 | if (TREE_CODE (ev_expr) != SSA_NAME |
460 | || ((def = SSA_NAME_DEF_STMT (ev_expr)), false) |
461 | || !is_gimple_assign (gs: def)) |
462 | return false; |
463 | |
464 | enum tree_code t_code = gimple_assign_rhs_code (gs: def); |
465 | switch (t_code) |
466 | { |
467 | case NEGATE_EXPR: |
468 | if (gimple_assign_rhs1 (gs: def) != result) |
469 | return false; |
470 | *step = build_int_cst (TREE_TYPE (init_expr), -1); |
471 | STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg; |
472 | break; |
473 | |
474 | case RSHIFT_EXPR: |
475 | case LSHIFT_EXPR: |
476 | case MULT_EXPR: |
477 | op1 = gimple_assign_rhs1 (gs: def); |
478 | op2 = gimple_assign_rhs2 (gs: def); |
479 | if (TREE_CODE (op2) != INTEGER_CST |
480 | || op1 != result) |
481 | return false; |
482 | *step = op2; |
483 | if (t_code == LSHIFT_EXPR) |
484 | STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl; |
485 | else if (t_code == RSHIFT_EXPR) |
486 | STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr; |
487 | /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */ |
488 | else |
489 | STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul; |
490 | break; |
491 | |
492 | default: |
493 | return false; |
494 | } |
495 | |
496 | STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init; |
497 | STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step; |
498 | |
499 | return true; |
500 | } |
501 | |
502 | /* Return true if PHI, described by STMT_INFO, is the inner PHI in |
503 | what we are assuming is a double reduction. For example, given |
504 | a structure like this: |
505 | |
506 | outer1: |
507 | x_1 = PHI <x_4(outer2), ...>; |
508 | ... |
509 | |
510 | inner: |
511 | x_2 = PHI <x_1(outer1), ...>; |
512 | ... |
513 | x_3 = ...; |
514 | ... |
515 | |
516 | outer2: |
517 | x_4 = PHI <x_3(inner)>; |
518 | ... |
519 | |
520 | outer loop analysis would treat x_1 as a double reduction phi and |
521 | this function would then return true for x_2. */ |
522 | |
523 | static bool |
524 | vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi) |
525 | { |
526 | use_operand_p use_p; |
527 | ssa_op_iter op_iter; |
528 | FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE) |
529 | if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p))) |
530 | if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def) |
531 | return true; |
532 | return false; |
533 | } |
534 | |
535 | /* Returns true if Phi is a first-order recurrence. A first-order |
536 | recurrence is a non-reduction recurrence relation in which the value of |
537 | the recurrence in the current loop iteration equals a value defined in |
538 | the previous iteration. */ |
539 | |
540 | static bool |
541 | vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop, |
542 | gphi *phi) |
543 | { |
544 | /* A nested cycle isn't vectorizable as first order recurrence. */ |
545 | if (LOOP_VINFO_LOOP (loop_vinfo) != loop) |
546 | return false; |
547 | |
548 | /* Ensure the loop latch definition is from within the loop. */ |
549 | edge latch = loop_latch_edge (loop); |
550 | tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch); |
551 | if (TREE_CODE (ldef) != SSA_NAME |
552 | || SSA_NAME_IS_DEFAULT_DEF (ldef) |
553 | || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef)) |
554 | || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef)))) |
555 | return false; |
556 | |
557 | tree def = gimple_phi_result (gs: phi); |
558 | |
559 | /* Ensure every use_stmt of the phi node is dominated by the latch |
560 | definition. */ |
561 | imm_use_iterator imm_iter; |
562 | use_operand_p use_p; |
563 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def) |
564 | if (!is_gimple_debug (USE_STMT (use_p)) |
565 | && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p) |
566 | || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef), |
567 | USE_STMT (use_p)))) |
568 | return false; |
569 | |
570 | /* First-order recurrence autovectorization needs shuffle vector. */ |
571 | tree scalar_type = TREE_TYPE (def); |
572 | tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); |
573 | if (!vectype) |
574 | return false; |
575 | |
576 | return true; |
577 | } |
578 | |
579 | /* Function vect_analyze_scalar_cycles_1. |
580 | |
581 | Examine the cross iteration def-use cycles of scalar variables |
582 | in LOOP. LOOP_VINFO represents the loop that is now being |
583 | considered for vectorization (can be LOOP, or an outer-loop |
584 | enclosing LOOP). SLP indicates there will be some subsequent |
585 | slp analyses or not. */ |
586 | |
587 | static void |
588 | vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop, |
589 | bool slp) |
590 | { |
591 | basic_block bb = loop->header; |
592 | tree init, step; |
593 | auto_vec<stmt_vec_info, 64> worklist; |
594 | gphi_iterator gsi; |
595 | bool double_reduc, reduc_chain; |
596 | |
597 | DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles" ); |
598 | |
599 | /* First - identify all inductions. Reduction detection assumes that all the |
600 | inductions have been identified, therefore, this order must not be |
601 | changed. */ |
602 | for (gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
603 | { |
604 | gphi *phi = gsi.phi (); |
605 | tree access_fn = NULL; |
606 | tree def = PHI_RESULT (phi); |
607 | stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi); |
608 | |
609 | if (dump_enabled_p ()) |
610 | dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G" , |
611 | (gimple *) phi); |
612 | |
613 | /* Skip virtual phi's. The data dependences that are associated with |
614 | virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */ |
615 | if (virtual_operand_p (op: def)) |
616 | continue; |
617 | |
618 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; |
619 | |
620 | /* Analyze the evolution function. */ |
621 | access_fn = analyze_scalar_evolution (loop, def); |
622 | if (access_fn) |
623 | { |
624 | STRIP_NOPS (access_fn); |
625 | if (dump_enabled_p ()) |
626 | dump_printf_loc (MSG_NOTE, vect_location, |
627 | "Access function of PHI: %T\n" , access_fn); |
628 | STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) |
629 | = initial_condition_in_loop_num (access_fn, loop->num); |
630 | STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) |
631 | = evolution_part_in_loop_num (access_fn, loop->num); |
632 | } |
633 | |
634 | if ((!access_fn |
635 | || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) |
636 | || !vect_is_simple_iv_evolution (loop_nb: loop->num, access_fn, |
637 | init: &init, step: &step) |
638 | || (LOOP_VINFO_LOOP (loop_vinfo) != loop |
639 | && TREE_CODE (step) != INTEGER_CST)) |
640 | /* Only handle nonlinear iv for same loop. */ |
641 | && (LOOP_VINFO_LOOP (loop_vinfo) != loop |
642 | || !vect_is_nonlinear_iv_evolution (loop, stmt_info: stmt_vinfo, |
643 | loop_phi_node: phi, init: &init, step: &step))) |
644 | { |
645 | worklist.safe_push (obj: stmt_vinfo); |
646 | continue; |
647 | } |
648 | |
649 | gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) |
650 | != NULL_TREE); |
651 | gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE); |
652 | |
653 | if (dump_enabled_p ()) |
654 | dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n" ); |
655 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def; |
656 | } |
657 | |
658 | |
659 | /* Second - identify all reductions and nested cycles. */ |
660 | while (worklist.length () > 0) |
661 | { |
662 | stmt_vec_info stmt_vinfo = worklist.pop (); |
663 | gphi *phi = as_a <gphi *> (p: stmt_vinfo->stmt); |
664 | tree def = PHI_RESULT (phi); |
665 | |
666 | if (dump_enabled_p ()) |
667 | dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G" , |
668 | (gimple *) phi); |
669 | |
670 | gcc_assert (!virtual_operand_p (def) |
671 | && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); |
672 | |
673 | stmt_vec_info reduc_stmt_info |
674 | = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc, |
675 | &reduc_chain, slp); |
676 | if (reduc_stmt_info) |
677 | { |
678 | STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info; |
679 | STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo; |
680 | if (double_reduc) |
681 | { |
682 | if (dump_enabled_p ()) |
683 | dump_printf_loc (MSG_NOTE, vect_location, |
684 | "Detected double reduction.\n" ); |
685 | |
686 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; |
687 | STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def; |
688 | } |
689 | else |
690 | { |
691 | if (loop != LOOP_VINFO_LOOP (loop_vinfo)) |
692 | { |
693 | if (dump_enabled_p ()) |
694 | dump_printf_loc (MSG_NOTE, vect_location, |
695 | "Detected vectorizable nested cycle.\n" ); |
696 | |
697 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; |
698 | } |
699 | else |
700 | { |
701 | if (dump_enabled_p ()) |
702 | dump_printf_loc (MSG_NOTE, vect_location, |
703 | "Detected reduction.\n" ); |
704 | |
705 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; |
706 | STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def; |
707 | /* Store the reduction cycles for possible vectorization in |
708 | loop-aware SLP if it was not detected as reduction |
709 | chain. */ |
710 | if (! reduc_chain) |
711 | LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push |
712 | (obj: reduc_stmt_info); |
713 | } |
714 | } |
715 | } |
716 | else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi)) |
717 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence; |
718 | else |
719 | if (dump_enabled_p ()) |
720 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
721 | "Unknown def-use cycle pattern.\n" ); |
722 | } |
723 | } |
724 | |
725 | |
726 | /* Function vect_analyze_scalar_cycles. |
727 | |
728 | Examine the cross iteration def-use cycles of scalar variables, by |
729 | analyzing the loop-header PHIs of scalar variables. Classify each |
730 | cycle as one of the following: invariant, induction, reduction, unknown. |
731 | We do that for the loop represented by LOOP_VINFO, and also to its |
732 | inner-loop, if exists. |
733 | Examples for scalar cycles: |
734 | |
735 | Example1: reduction: |
736 | |
737 | loop1: |
738 | for (i=0; i<N; i++) |
739 | sum += a[i]; |
740 | |
741 | Example2: induction: |
742 | |
743 | loop2: |
744 | for (i=0; i<N; i++) |
745 | a[i] = i; */ |
746 | |
747 | static void |
748 | vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp) |
749 | { |
750 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
751 | |
752 | vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp); |
753 | |
754 | /* When vectorizing an outer-loop, the inner-loop is executed sequentially. |
755 | Reductions in such inner-loop therefore have different properties than |
756 | the reductions in the nest that gets vectorized: |
757 | 1. When vectorized, they are executed in the same order as in the original |
758 | scalar loop, so we can't change the order of computation when |
759 | vectorizing them. |
760 | 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the |
761 | current checks are too strict. */ |
762 | |
763 | if (loop->inner) |
764 | vect_analyze_scalar_cycles_1 (loop_vinfo, loop: loop->inner, slp); |
765 | } |
766 | |
767 | /* Transfer group and reduction information from STMT_INFO to its |
768 | pattern stmt. */ |
769 | |
770 | static void |
771 | vect_fixup_reduc_chain (stmt_vec_info stmt_info) |
772 | { |
773 | stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info); |
774 | stmt_vec_info stmtp; |
775 | gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp) |
776 | && REDUC_GROUP_FIRST_ELEMENT (stmt_info)); |
777 | REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info); |
778 | do |
779 | { |
780 | stmtp = STMT_VINFO_RELATED_STMT (stmt_info); |
781 | gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp) |
782 | == STMT_VINFO_DEF_TYPE (stmt_info)); |
783 | REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp; |
784 | stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info); |
785 | if (stmt_info) |
786 | REDUC_GROUP_NEXT_ELEMENT (stmtp) |
787 | = STMT_VINFO_RELATED_STMT (stmt_info); |
788 | } |
789 | while (stmt_info); |
790 | } |
791 | |
792 | /* Fixup scalar cycles that now have their stmts detected as patterns. */ |
793 | |
794 | static void |
795 | vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) |
796 | { |
797 | stmt_vec_info first; |
798 | unsigned i; |
799 | |
800 | FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first) |
801 | { |
802 | stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); |
803 | while (next) |
804 | { |
805 | if ((STMT_VINFO_IN_PATTERN_P (next) |
806 | != STMT_VINFO_IN_PATTERN_P (first)) |
807 | || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1) |
808 | break; |
809 | next = REDUC_GROUP_NEXT_ELEMENT (next); |
810 | } |
811 | /* If all reduction chain members are well-formed patterns adjust |
812 | the group to group the pattern stmts instead. */ |
813 | if (! next |
814 | && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1) |
815 | { |
816 | if (STMT_VINFO_IN_PATTERN_P (first)) |
817 | { |
818 | vect_fixup_reduc_chain (stmt_info: first); |
819 | LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] |
820 | = STMT_VINFO_RELATED_STMT (first); |
821 | } |
822 | } |
823 | /* If not all stmt in the chain are patterns or if we failed |
824 | to update STMT_VINFO_REDUC_IDX dissolve the chain and handle |
825 | it as regular reduction instead. */ |
826 | else |
827 | { |
828 | stmt_vec_info vinfo = first; |
829 | stmt_vec_info last = NULL; |
830 | while (vinfo) |
831 | { |
832 | next = REDUC_GROUP_NEXT_ELEMENT (vinfo); |
833 | REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL; |
834 | REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL; |
835 | last = vinfo; |
836 | vinfo = next; |
837 | } |
838 | STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first)) |
839 | = vect_internal_def; |
840 | loop_vinfo->reductions.safe_push (obj: vect_stmt_to_vectorize (stmt_info: last)); |
841 | LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (ix: i); |
842 | --i; |
843 | } |
844 | } |
845 | } |
846 | |
847 | /* Function vect_get_loop_niters. |
848 | |
849 | Determine how many iterations the loop is executed and place it |
850 | in NUMBER_OF_ITERATIONS. Place the number of latch iterations |
851 | in NUMBER_OF_ITERATIONSM1. Place the condition under which the |
852 | niter information holds in ASSUMPTIONS. |
853 | |
854 | Return the loop exit conditions. */ |
855 | |
856 | |
static vec<gcond *>
vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
		      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  vec<gcond *> conds;
  conds.create (nelems: exits.length ());
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;

  /* Start with pessimistic defaults; they are overwritten below once the
     main exit has been analyzed successfully.  */
  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;

  DUMP_VECT_SCOPE ("get_loop_niters" );

  if (exits.is_empty ())
    return conds;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n" ,
		     exits.length ());

  edge exit;
  unsigned int i;
  FOR_EACH_VEC_ELT (exits, i, exit)
    {
      /* Collect the exit condition of every exit; the niter analysis
	 below is only performed for MAIN_EXIT.  */
      gcond *cond = get_loop_exit_condition (exit);
      if (cond)
	conds.safe_push (obj: cond);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n" , i);

      if (exit != main_exit)
	continue;

      may_be_zero = NULL_TREE;
      if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
	  || chrec_contains_undetermined (niter_desc.niter))
	continue;

      niter_assumptions = niter_desc.assumptions;
      may_be_zero = niter_desc.may_be_zero;
      niter = niter_desc.niter;

      /* A trivially false MAY_BE_ZERO needs no special handling.  */
      if (may_be_zero && integer_zerop (may_be_zero))
	may_be_zero = NULL_TREE;

      if (may_be_zero)
	{
	  if (COMPARISON_CLASS_P (may_be_zero))
	    {
	      /* Try to combine may_be_zero with assumptions, this can simplify
		 computation of niter expression.  */
	      if (niter_assumptions && !integer_nonzerop (niter_assumptions))
		niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
						 niter_assumptions,
						 fold_build1 (TRUTH_NOT_EXPR,
							      boolean_type_node,
							      may_be_zero));
	      else
		/* Otherwise fold the zero-iteration case into the niter
		   expression itself: may_be_zero ? 0 : niter.  */
		niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
				     build_int_cst (TREE_TYPE (niter), 0),
				     rewrite_to_non_trapping_overflow (niter));

	      may_be_zero = NULL_TREE;
	    }
	  else if (integer_nonzerop (may_be_zero))
	    {
	      /* MAY_BE_ZERO is known true: the loop header runs exactly
		 once, the latch never.  */
	      *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
	      *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
	      continue;
	    }
	  else
	    continue;
	}

      /* Loop assumptions are based off the normal exit.  */
      *assumptions = niter_assumptions;
      *number_of_iterationsm1 = niter;

      /* We want the number of loop header executions which is the number
	 of latch executions plus one.
	 ??? For UINT_MAX latch executions this number overflows to zero
	 for loops like do { n++; } while (n != 0);  */
      if (niter && !chrec_contains_undetermined (niter))
	{
	  niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
			       unshare_expr (niter),
			       build_int_cst (TREE_TYPE (niter), 1));
	  if (TREE_CODE (niter) == INTEGER_CST
	      && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
	    {
	      /* If we manage to fold niter + 1 into INTEGER_CST even when
		 niter is some complex expression, ensure back
		 *number_of_iterationsm1 is an INTEGER_CST as well.  See
		 PR113210.  */
	      *number_of_iterationsm1
		= fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
			       build_minus_one_cst (TREE_TYPE (niter)));
	    }
	}
      *number_of_iterations = niter;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n" );

  return conds;
}
968 | |
969 | /* Determine the main loop exit for the vectorizer. */ |
970 | |
971 | edge |
972 | vec_init_loop_exit_info (class loop *loop) |
973 | { |
974 | /* Before we begin we must first determine which exit is the main one and |
975 | which are auxilary exits. */ |
976 | auto_vec<edge> exits = get_loop_exit_edges (loop); |
977 | if (exits.length () == 1) |
978 | return exits[0]; |
979 | |
980 | /* If we have multiple exits we only support counting IV at the moment. |
981 | Analyze all exits and return the last one we can analyze. */ |
982 | class tree_niter_desc niter_desc; |
983 | edge candidate = NULL; |
984 | for (edge exit : exits) |
985 | { |
986 | if (!get_loop_exit_condition (exit)) |
987 | continue; |
988 | |
989 | if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) |
990 | && !chrec_contains_undetermined (niter_desc.niter)) |
991 | { |
992 | tree may_be_zero = niter_desc.may_be_zero; |
993 | if ((integer_zerop (may_be_zero) |
994 | /* As we are handling may_be_zero that's not false by |
995 | rewriting niter to may_be_zero ? 0 : niter we require |
996 | an empty latch. */ |
997 | || (single_pred_p (bb: loop->latch) |
998 | && exit->src == single_pred (bb: loop->latch) |
999 | && (integer_nonzerop (may_be_zero) |
1000 | || COMPARISON_CLASS_P (may_be_zero)))) |
1001 | && (!candidate |
1002 | || dominated_by_p (CDI_DOMINATORS, exit->src, |
1003 | candidate->src))) |
1004 | candidate = exit; |
1005 | } |
1006 | } |
1007 | |
1008 | return candidate; |
1009 | } |
1010 | |
1011 | /* Function bb_in_loop_p |
1012 | |
1013 | Used as predicate for dfs order traversal of the loop bbs. */ |
1014 | |
1015 | static bool |
1016 | bb_in_loop_p (const_basic_block bb, const void *data) |
1017 | { |
1018 | const class loop *const loop = (const class loop *)data; |
1019 | if (flow_bb_inside_loop_p (loop, bb)) |
1020 | return true; |
1021 | return false; |
1022 | } |
1023 | |
1024 | |
1025 | /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as |
1026 | stmt_vec_info structs for all the stmts in LOOP_IN. */ |
1027 | |
_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, shared),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    vector_costs (nullptr),
    scalar_costs (nullptr),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    main_loop_edge (nullptr),
    skip_main_loop_edge (nullptr),
    skip_this_loop_edge (nullptr),
    reusable_accumulators (),
    suggested_unroll_factor (1),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    partial_vector_style (vect_partial_vectors_none),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    ivexpr_map (NULL),
    scan_map (NULL),
    slp_unrolling_factor (1),
    inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
    vectorizable (false),
    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
    using_partial_vectors_p (false),
    using_decrementing_iv_p (false),
    using_select_vl_p (false),
    epil_using_partial_vectors_p (false),
    partial_load_store_bias (0),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    early_breaks (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    scalar_loop (NULL),
    orig_loop_info (NULL),
    vec_loop_iv_exit (NULL),
    vec_epilogue_loop_iv_exit (NULL),
    scalar_loop_iv_exit (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
					  bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  /* Create stmt_vec_infos for every PHI and statement in the loop and
     reset their UIDs.  */
  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (i: si); gsi_next (i: &si))
	{
	  gimple *phi = gsi_stmt (i: si);
	  gimple_set_uid (g: phi, uid: 0);
	  add_stmt (phi);
	}

      for (si = gsi_start_bb (bb); !gsi_end_p (i: si); gsi_next (i: &si))
	{
	  gimple *stmt = gsi_stmt (i: si);
	  gimple_set_uid (g: stmt, uid: 0);
	  if (is_gimple_debug (gs: stmt))
	    continue;
	  add_stmt (stmt);
	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
	     third argument is the #pragma omp simd if (x) condition, when 0,
	     loop shouldn't be vectorized, when non-zero constant, it should
	     be vectorized normally, otherwise versioned with vectorized loop
	     done if the condition is non-zero at runtime.  */
	  if (loop_in->simduid
	      && is_gimple_call (gs: stmt)
	      && gimple_call_internal_p (gs: stmt)
	      && gimple_call_internal_fn (gs: stmt) == IFN_GOMP_SIMD_LANE
	      && gimple_call_num_args (gs: stmt) >= 3
	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	      && (loop_in->simduid
		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
	    {
	      tree arg = gimple_call_arg (gs: stmt, index: 2);
	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
		simd_if_cond = arg;
	      else
		gcc_assert (integer_nonzerop (arg));
	    }
	}
    }

  epilogue_vinfos.create (nelems: 6);
}
1130 | |
1131 | /* Free all levels of rgroup CONTROLS. */ |
1132 | |
1133 | void |
1134 | release_vec_loop_controls (vec<rgroup_controls> *controls) |
1135 | { |
1136 | rgroup_controls *rgc; |
1137 | unsigned int i; |
1138 | FOR_EACH_VEC_ELT (*controls, i, rgc) |
1139 | rgc->controls.release (); |
1140 | controls->release (); |
1141 | } |
1142 | |
1143 | /* Free all memory used by the _loop_vec_info, as well as all the |
1144 | stmt_vec_info structs of all the stmts in the loop. */ |
1145 | |
_loop_vec_info::~_loop_vec_info ()
{
  /* The BB array was allocated with XCNEWVEC in the constructor.  */
  free (ptr: bbs);

  release_vec_loop_controls (controls: &masks.rgc_vec);
  release_vec_loop_controls (controls: &lens);
  delete ivexpr_map;
  delete scan_map;
  epilogue_vinfos.release ();
  delete scalar_costs;
  delete vector_costs;

  /* When we release an epilogue vinfo that we do not intend to use
     avoid clearing AUX of the main loop which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
  if (loop->aux == this)
    loop->aux = NULL;
}
1164 | |
1165 | /* Return an invariant or register for EXPR and emit necessary |
1166 | computations in the LOOP_VINFO loop preheader. */ |
1167 | |
1168 | tree |
1169 | cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr) |
1170 | { |
1171 | if (is_gimple_reg (expr) |
1172 | || is_gimple_min_invariant (expr)) |
1173 | return expr; |
1174 | |
1175 | if (! loop_vinfo->ivexpr_map) |
1176 | loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>; |
1177 | tree &cached = loop_vinfo->ivexpr_map->get_or_insert (k: expr); |
1178 | if (! cached) |
1179 | { |
1180 | gimple_seq stmts = NULL; |
1181 | cached = force_gimple_operand (unshare_expr (expr), |
1182 | &stmts, true, NULL_TREE); |
1183 | if (stmts) |
1184 | { |
1185 | edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); |
1186 | gsi_insert_seq_on_edge_immediate (e, stmts); |
1187 | } |
1188 | } |
1189 | return cached; |
1190 | } |
1191 | |
1192 | /* Return true if we can use CMP_TYPE as the comparison type to produce |
1193 | all masks required to mask LOOP_VINFO. */ |
1194 | |
1195 | static bool |
1196 | can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) |
1197 | { |
1198 | rgroup_controls *rgm; |
1199 | unsigned int i; |
1200 | FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm) |
1201 | if (rgm->type != NULL_TREE |
1202 | && !direct_internal_fn_supported_p (fn: IFN_WHILE_ULT, |
1203 | type0: cmp_type, type1: rgm->type, |
1204 | opt_type: OPTIMIZE_FOR_SPEED)) |
1205 | return false; |
1206 | return true; |
1207 | } |
1208 | |
1209 | /* Calculate the maximum number of scalars per iteration for every |
1210 | rgroup in LOOP_VINFO. */ |
1211 | |
1212 | static unsigned int |
1213 | vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo) |
1214 | { |
1215 | unsigned int res = 1; |
1216 | unsigned int i; |
1217 | rgroup_controls *rgm; |
1218 | FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm) |
1219 | res = MAX (res, rgm->max_nscalars_per_iter); |
1220 | return res; |
1221 | } |
1222 | |
1223 | /* Calculate the minimum precision necessary to represent: |
1224 | |
1225 | MAX_NITERS * FACTOR |
1226 | |
1227 | as an unsigned integer, where MAX_NITERS is the maximum number of |
1228 | loop header iterations for the original scalar form of LOOP_VINFO. */ |
1229 | |
1230 | static unsigned |
1231 | vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor) |
1232 | { |
1233 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1234 | |
1235 | /* Get the maximum number of iterations that is representable |
1236 | in the counter type. */ |
1237 | tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo)); |
1238 | widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1; |
1239 | |
1240 | /* Get a more refined estimate for the number of iterations. */ |
1241 | widest_int max_back_edges; |
1242 | if (max_loop_iterations (loop, &max_back_edges)) |
1243 | max_ni = wi::smin (x: max_ni, y: max_back_edges + 1); |
1244 | |
1245 | /* Work out how many bits we need to represent the limit. */ |
1246 | return wi::min_precision (x: max_ni * factor, sgn: UNSIGNED); |
1247 | } |
1248 | |
1249 | /* True if the loop needs peeling or partial vectors when vectorized. */ |
1250 | |
static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  /* For an epilogue loop fall back to the threshold of the loop it
     belongs to.  */
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
					  (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
	 peeled for reasons other than niters.  */
      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	peel_niter += 1;
      /* Peeling or partial vectors are needed iff the remaining
	 iterations are not a multiple of the vectorization factor.  */
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
	return true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	   /* ??? When peeling for gaps but not alignment, we could
	      try to check whether the (variable) niters is known to be
	      VF * N + 1.  That's something of a niche case though.  */
	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (const_value: &const_vf)
	   /* Variable niters: we need an epilogue unless niters is known
	      to be a multiple of the (power-of-two) VF via its trailing
	      zero bits.  */
	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
		< (unsigned) exact_log2 (x: const_vf))
	       /* In case of versioning, check if the maximum number of
		  iterations is greater than th.  If they are identical,
		  the epilogue is unnecessary.  */
	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
		   || ((unsigned HOST_WIDE_INT) max_niter
		       /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
			  but that's only computed later based on our result.
			  The following is the most conservative approximation.  */
		       > (std::max (a: (unsigned HOST_WIDE_INT) th,
				    b: const_vf) / const_vf) * const_vf))))
    return true;

  return false;
}
1297 | |
1298 | /* Each statement in LOOP_VINFO can be masked where necessary. Check |
1299 | whether we can actually generate the masks required. Return true if so, |
1300 | storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */ |
1301 | |
static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Produce the rgroup controls.  */
  for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      /* The rgroup vector is indexed by the number of vectors minus one.  */
      if (masks->rgc_vec.length () < nvectors)
	masks->rgc_vec.safe_grow_cleared (len: nvectors, exact: true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (a: nvectors * TYPE_VECTOR_SUBPARTS (node: vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* Remember the widest mask requirement seen for this rgroup.  */
      if (rgm->max_nscalars_per_iter < nscalars_per_iter)
	{
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->type = truth_type_for (vectype);
	  rgm->factor = 1;
	}
    }

  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, factor: max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (x: iv_limit * max_nscalars_per_iter,
				      sgn: UNSIGNED);

  /* Iterate over integer modes from narrowest to widest.  */
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (mode: cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, cmp_type: this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.  */
	      iv_type = this_type;
	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
		cmp_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }
	}
    }

  /* No usable comparison type: full masking via WHILE_ULT is not
     possible; drop the rgroup controls built above.  */
  if (!cmp_type)
    {
      LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
  return true;
}
1409 | |
1410 | /* Each statement in LOOP_VINFO can be masked where necessary. Check |
1411 | whether we can actually generate AVX512 style masks. Return true if so, |
1412 | storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */ |
1413 | |
static bool
vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
{
  /* Produce differently organized rgc_vec and differently check
     we can produce masks.  */

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* For the decrementing IV we need to represent all values in
     [0, niter + niter_skip] where niter_skip is the elements we
     skip in the first iteration for prologue peeling.  */
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;
  if (iv_limit != -1)
    iv_precision = wi::min_precision (x: iv_limit, sgn: UNSIGNED);

  /* First compute the type for the IV we use to track the remaining
     scalar iterations.  */
  opt_scalar_int_mode cmp_mode_iter;
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (mode: cmp_mode_iter.require ());
      if (cmp_bits >= iv_precision
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  iv_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (iv_type)
	    break;
	}
    }
  if (!iv_type)
    return false;

  /* Produce the rgroup controls.  */
  for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (a: nvectors * TYPE_VECTOR_SUBPARTS (node: vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* We index the rgroup_controls vector with nscalars_per_iter
	 which we keep constant and instead have a varying nvectors,
	 remembering the vector mask with the fewest nV.  */
      if (masks->rgc_vec.length () < nscalars_per_iter)
	masks->rgc_vec.safe_grow_cleared (len: nscalars_per_iter, exact: true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];

      if (!rgm->type || rgm->factor > nvectors)
	{
	  rgm->type = truth_type_for (vectype);
	  rgm->compare_type = NULL_TREE;
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->factor = nvectors;
	  rgm->bias_adjusted_ctrl = NULL_TREE;
	}
    }

  /* There is no fixed compare type we are going to use but we have to
     be able to get at one for each mask group.  */
  unsigned int min_ni_width
    = wi::min_precision (x: vect_max_vf (loop_vinfo), sgn: UNSIGNED);

  bool ok = true;
  for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
    {
      tree mask_type = rgc.type;
      if (!mask_type)
	continue;

      /* For now vect_get_loop_mask only supports integer mode masks
	 when we need to split it.  */
      if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
	  || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
	{
	  ok = false;
	  break;
	}

      /* If iv_type is usable as compare type use that - we can elide the
	 saturation in that case.   */
      if (TYPE_PRECISION (iv_type) >= min_ni_width)
	{
	  tree cmp_vectype
	    = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (node: mask_type));
	  if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
	    rgc.compare_type = cmp_vectype;
	}
      /* Otherwise search for the narrowest integer mode whose vector
	 compare against MASK_TYPE is supported.  */
      if (!rgc.compare_type)
	FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
	  {
	    unsigned int cmp_bits = GET_MODE_BITSIZE (mode: cmp_mode_iter.require ());
	    if (cmp_bits >= min_ni_width
		&& targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	      {
		tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
		if (!cmp_type)
		  continue;

		/* Check whether we can produce the mask with cmp_type.  */
		tree cmp_vectype
		  = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (node: mask_type));
		if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
		  {
		    rgc.compare_type = cmp_vectype;
		    break;
		  }
	      }
	  }
      if (!rgc.compare_type)
	{
	  ok = false;
	  break;
	}
    }
  /* On failure release the rgroup controls built above.  */
  if (!ok)
    {
      release_vec_loop_controls (controls: &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
      return false;
    }

  /* The compare type varies per mask group, so mark the global one
     invalid on purpose.  */
  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
  return true;
}
1550 | |
/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target supported length is larger than the precision
   required by loop niters.  */
1555 | |
static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  /* Nothing to do when no length-controlled accesses were recorded.  */
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  /* The target must support length-controlled loads and stores for the
     chosen vector mode.  */
  machine_mode len_load_mode, len_store_mode;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
      .exists (mode: &len_load_mode))
    return false;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
      .exists (mode: &len_store_mode))
    return false;

  signed char partial_load_bias = internal_len_load_store_bias
    (ifn: IFN_LEN_LOAD, len_load_mode);

  signed char partial_store_bias = internal_len_load_store_bias
    (ifn: IFN_LEN_STORE, len_store_mode);

  gcc_assert (partial_load_bias == partial_store_bias);

  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
    return false;

  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, factor: max_nitems_per_iter);

  /* Now use the maximum of below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  /* Search for the narrowest integer mode of at least MIN_NI_PREC bits.  */
  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (mode: tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
	 BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
	break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
	{
	  iv_type = build_nonstandard_integer_type (tbits, true);
	  break;
	}
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't vectorize with length-based partial vectors"
			 " because there is no suitable iv type.\n" );
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;

  return true;
}
1654 | |
1655 | /* Calculate the cost of one scalar iteration of the loop. */ |
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost" );

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      /* Statements in the inner loop execute more often per outer-loop
	 iteration; scale their cost accordingly.  */
      if (bb->loop_father == loop->inner)
	factor = innerloop_iters;
      else
	factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (i: si); gsi_next (i: &si))
	{
	  gimple *stmt = gsi_stmt (i: si);
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

	  if (!is_gimple_assign (gs: stmt) && !is_gimple_call (gs: stmt))
	    continue;

	  /* Skip stmts that are not vectorized inside the loop.  */
	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (vstmt_info)
	      && (!STMT_VINFO_LIVE_P (vstmt_info)
		  || !VECTORIZABLE_CYCLE_DEF
			(STMT_VINFO_DEF_TYPE (vstmt_info))))
	    continue;

	  /* Classify the statement as a load, store or generic stmt.  */
	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else if (vect_nop_conversion_p (stmt_info))
	    continue;
	  else
	    kind = scalar_stmt;

	  /* We are using vect_prologue here to avoid scaling twice
	     by the inner loop factor.  */
	  record_stmt_cost (body_cost_vec: &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    count: factor, kind, stmt_info, misalign: 0, where: vect_prologue);
	}
    }

  /* Now accumulate cost.  */
  loop_vinfo->scalar_costs = init_cost (vinfo: loop_vinfo, costing_for_scalar: true);
  add_stmt_costs (costs: loop_vinfo->scalar_costs,
		  cost_vec: &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
  loop_vinfo->scalar_costs->finish_cost (scalar_costs: nullptr);
}
1725 | |
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry
   - nested loops can have only a single exit.
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e, a countable loop.  The
     niter could be analyzed under some assumptions.

   On success the exit edge, the exit conditions, the niter expressions
   and (for outer loops) the inner loop's exit condition are recorded
   in *INFO.  */

opt_result
vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form" );

  /* Pick the exit to use as the main IV exit; fail if none can be
     determined (e.g. for some multi-exit loops).  */
  edge exit_e = vec_init_loop_exit_info (loop);
  if (!exit_e)
    return opt_result::failure_at (loc: vect_location,
				   fmt: "not vectorized:"
				   " could not determine main exit from"
				   " loop with multiple exits.\n" );
  info->loop_exit = exit_e;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "using as main loop exit: %d -> %d [AUX: %p]\n" ,
		     exit_e->src->index, exit_e->dest->index, exit_e->aux);

  /* Check if we have any control flow that doesn't leave the loop.  */
  class loop *v_loop = loop->inner ? loop->inner : loop;
  basic_block *bbs = get_loop_body (v_loop);
  for (unsigned i = 0; i < v_loop->num_nodes; i++)
    if (EDGE_COUNT (bbs[i]->succs) != 1
	&& (EDGE_COUNT (bbs[i]->succs) != 2
	    || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
      {
	free (ptr: bbs);
	return opt_result::failure_at (loc: vect_location,
				       fmt: "not vectorized:"
				       " unsupported control flow in loop.\n" );
      }
  free (ptr: bbs);

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  info->inner_loop_cond = NULL;
  if (!loop->inner)
    {
      /* Inner-most loop.  */

      if (empty_block_p (loop->header))
	return opt_result::failure_at (loc: vect_location,
				       fmt: "not vectorized: empty loop.\n" );
    }
  else
    {
      class loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop. We currently require that the loop is doubly-nested,
	 contains a single inner loop with a single exit to the block
	 with the single exit condition in the outer loop.
	 Vectorizable outer-loops look like this:

			(pre-header)
			   |
			  header <---+
			   |         |
			  inner-loop |
			   |         |
			  tail ------+
			   |
			(exit-bb)

	 The inner-loop also has the properties expected of inner-most loops
	 as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
	return opt_result::failure_at (loc: vect_location,
				       fmt: "not vectorized:"
				       " multiple nested loops.\n" );

      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
	  || !single_exit (innerloop)
	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
	return opt_result::failure_at (loc: vect_location,
				       fmt: "not vectorized:"
				       " unsupported outerloop form.\n" );

      /* Analyze the inner-loop (recursive call on LOOP->inner).  */
      vect_loop_form_info inner;
      opt_result res = vect_analyze_loop_form (loop: loop->inner, info: &inner);
      if (!res)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: Bad inner loop.\n" );
	  return res;
	}

      /* Don't support analyzing niter under assumptions for inner
	 loop.  */
      if (!integer_onep (inner.assumptions))
	return opt_result::failure_at (loc: vect_location,
				       fmt: "not vectorized: Bad inner loop.\n" );

      if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
	return opt_result::failure_at (loc: vect_location,
				       fmt: "not vectorized: inner-loop count not"
				       " invariant.\n" );

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Considering outer-loop vectorization.\n" );
      info->inner_loop_cond = inner.conds[0];
    }

  /* A single loop entry (besides the latch) is required.  */
  if (EDGE_COUNT (loop->header->preds) != 2)
    return opt_result::failure_at (loc: vect_location,
				   fmt: "not vectorized:"
				   " too many incoming edges.\n" );

  /* We assume that the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (s: phi_nodes (bb: loop->latch)))
    return opt_result::failure_at (loc: vect_location,
				   fmt: "not vectorized: latch block not empty.\n" );

  /* Make sure there is no abnormal exit.  */
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  for (edge e : exits)
    {
      if (e->flags & EDGE_ABNORMAL)
	return opt_result::failure_at (loc: vect_location,
				       fmt: "not vectorized:"
				       " abnormal loop exit edge.\n" );
    }

  /* Compute the niter expressions and collect all exit conditions.  */
  info->conds
    = vect_get_loop_niters (loop, main_exit: exit_e, assumptions: &info->assumptions,
			    number_of_iterations: &info->number_of_iterations,
			    number_of_iterationsm1: &info->number_of_iterationsm1);
  if (info->conds.is_empty ())
    return opt_result::failure_at
      (loc: vect_location,
       fmt: "not vectorized: complicated exit condition.\n" );

  /* Determine what the primary and alternate exit conds are.
     The condition belonging to the main exit is moved to conds[0].  */
  for (unsigned i = 0; i < info->conds.length (); i++)
    {
      gcond *cond = info->conds[i];
      if (exit_e->src == gimple_bb (g: cond))
	std::swap (a&: info->conds[0], b&: info->conds[i]);
    }

  if (integer_zerop (info->assumptions)
      || !info->number_of_iterations
      || chrec_contains_undetermined (info->number_of_iterations))
    return opt_result::failure_at
      (loc: info->conds[0],
       fmt: "not vectorized: number of iterations cannot be computed.\n" );

  if (integer_zerop (info->number_of_iterations))
    return opt_result::failure_at
      (loc: info->conds[0],
       fmt: "not vectorized: number of iterations = 0.\n" );

  /* A symbolic (non-constant) iteration count is fine; just dump it.  */
  if (!(tree_fits_shwi_p (info->number_of_iterations)
	&& tree_to_shwi (info->number_of_iterations) > 0))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Symbolic number of iterations is " );
	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
	  dump_printf (MSG_NOTE, "\n" );
	}
    }

  return opt_result::success ();
}
1909 | |
/* Create a loop_vec_info for LOOP with SHARED and the
   vect_analyze_loop_form result INFO.  MAIN_LOOP_INFO is non-NULL when
   this is for an epilogue loop, in which case niter assumptions are not
   re-recorded (they were already versioned for on the main loop).  */

loop_vec_info
vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
			const vect_loop_form_info *info,
			loop_vec_info main_loop_info)
{
  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
  /* Also record the assumptions for versioning.  */
  if (!integer_onep (info->assumptions) && !main_loop_info)
    LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;

  /* Tag every exit condition statement collected by loop-form
     analysis.  */
  for (gcond *cond : info->conds)
    {
      stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
      STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* Mark the statement as a condition.  */
      STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
    }

  /* conds[0] is the main IV condition; the rest are alternate exits.  */
  for (unsigned i = 1; i < info->conds.length (); i ++)
    LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (obj: info->conds[i]);
  LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];

  LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;

  /* Check to see if we're vectorizing multiple exits.  */
  LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
    = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();

  if (info->inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
	= loop_vinfo->lookup_stmt (info->inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* If we have an estimate on the number of iterations of the inner
	 loop use that to limit the scale for costing, otherwise use
	 --param vect-inner-loop-cost-factor literally.  */
      widest_int nit;
      if (estimated_stmt_executions (loop->inner, &nit))
	LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
	  = wi::smin (x: nit, param_vect_inner_loop_cost_factor).to_uhwi ();
    }

  return loop_vinfo;
}
1961 | |
1962 | |
1963 | |
/* Scan the loop stmts and dependent on whether there are any (non-)SLP
   statements update the vectorization factor.

   If every relevant statement is pure SLP, the vectorization factor
   becomes the SLP unrolling factor; otherwise it becomes the least
   common multiple of the current factor and the SLP unrolling
   factor.  */

static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  DUMP_VECT_SCOPE ("vect_update_vf_for_slp" );

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say, that we
     perform pure SLP on loop - cross iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      /* Check the PHIs of this block.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si);
	   gsi_next (i: &si))
	{
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
	  if (!stmt_info)
	    continue;
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
      /* Check the non-PHI statements of this block.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (i: si);
	   gsi_next (i: &si))
	{
	  if (is_gimple_debug (gs: gsi_stmt (i: si)))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (i: si));
	  stmt_info = vect_stmt_to_vectorize (stmt_info);
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
    }

  if (only_slp_in_loop)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains only SLP stmts\n" );
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains SLP and non-SLP stmts\n" );
      /* Both the vectorization factor and unroll factor have the form
	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
	 so they must have a common multiple.  */
      vectorization_factor
	= force_common_multiple (a: vectorization_factor,
				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Updating vectorization factor to " );
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n" );
    }
}
2046 | |
2047 | /* Return true if STMT_INFO describes a double reduction phi and if |
2048 | the other phi in the reduction is also relevant for vectorization. |
2049 | This rejects cases such as: |
2050 | |
2051 | outer1: |
2052 | x_1 = PHI <x_3(outer2), ...>; |
2053 | ... |
2054 | |
2055 | inner: |
2056 | x_2 = ...; |
2057 | ... |
2058 | |
2059 | outer2: |
2060 | x_3 = PHI <x_2(inner)>; |
2061 | |
2062 | if nothing in x_2 or elsewhere makes x_1 relevant. */ |
2063 | |
2064 | static bool |
2065 | vect_active_double_reduction_p (stmt_vec_info stmt_info) |
2066 | { |
2067 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) |
2068 | return false; |
2069 | |
2070 | return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info)); |
2071 | } |
2072 | |
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.
   On success the per-statement costs collected during analysis have
   been added to loop_vinfo->vector_costs.  Fails (besides for
   unvectorizable statements) when nothing in the loop needs
   vectorizing at all.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations" );

  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si);
	   gsi_next (i: &si))
	{
	  gphi *phi = si.phi ();
	  ok = true;

	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G" ,
			     (gimple *) phi);
	  /* Virtual (memory SSA) PHIs need no vectorization.  */
	  if (virtual_operand_p (op: gimple_phi_result (gs: phi)))
	    continue;

	  /* Inner-loop loop-closed exit phi in outer-loop vectorization
	     (i.e., a phi in the tail of the outer-loop).  */
	  if (! is_loop_header_bb_p (bb))
	    {
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outerloop (unless it is double reduction,
		 i.e., this phi is vect_reduction_def), cause this case
		 requires to actually do something here.  */
	      if (STMT_VINFO_LIVE_P (stmt_info)
		  && !vect_active_double_reduction_p (stmt_info))
		return opt_result::failure_at (loc: phi,
					       fmt: "Unsupported loop-closed phi"
					       " in outer-loop.\n" );

	      /* If PHI is used in the outer loop, we check that its operand
		 is defined in the inner loop.  */
	      if (STMT_VINFO_RELEVANT_P (stmt_info))
		{
		  tree phi_op;

		  if (gimple_phi_num_args (gs: phi) != 1)
		    return opt_result::failure_at (loc: phi, fmt: "unsupported phi" );

		  phi_op = PHI_ARG_DEF (phi, 0);
		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
		  if (!op_def_info)
		    return opt_result::failure_at (loc: phi, fmt: "unsupported phi\n" );

		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
		      && (STMT_VINFO_RELEVANT (op_def_info)
			  != vect_used_in_outer_by_reduction))
		    return opt_result::failure_at (loc: phi, fmt: "unsupported phi\n" );

		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
		       || (STMT_VINFO_DEF_TYPE (stmt_info)
			   == vect_double_reduction_def))
		      && !vectorizable_lc_phi (loop_vinfo,
					       stmt_info, NULL, NULL))
		    return opt_result::failure_at (loc: phi, fmt: "unsupported phi\n" );
		}

	      continue;
	    }

	  gcc_assert (stmt_info);

	  if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
	       || STMT_VINFO_LIVE_P (stmt_info))
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
	    /* A scalar-dependence cycle that we don't support.  */
	    return opt_result::failure_at (loc: phi,
					   fmt: "not vectorized:"
					   " scalar dependence cycle.\n" );

	  /* Dispatch on the def type: induction, reduction variants,
	     or first-order recurrence.  Pure SLP PHIs are analyzed
	     separately (see below).  */
	  if (STMT_VINFO_RELEVANT_P (stmt_info))
	    {
	      need_to_vectorize = true;
	      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
		  && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_induction (loop_vinfo,
					     stmt_info, NULL, NULL,
					     &cost_vec);
	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
			|| (STMT_VINFO_DEF_TYPE (stmt_info)
			    == vect_double_reduction_def)
			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
		       && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_reduction (loop_vinfo,
					     stmt_info, NULL, NULL, &cost_vec);
	      else if ((STMT_VINFO_DEF_TYPE (stmt_info)
			== vect_first_order_recurrence)
		       && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
					  &cost_vec);
	    }

	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
	  if (ok
	      && STMT_VINFO_LIVE_P (stmt_info)
	      && !PURE_SLP_STMT (stmt_info))
	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
					      -1, false, &cost_vec);

	  if (!ok)
	    return opt_result::failure_at (loc: phi,
					   fmt: "not vectorized: relevant phi not "
					   "supported: %G" ,
					   static_cast <gimple *> (phi));
	}

      /* Analyze all non-PHI statements of the block.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (i: si);
	   gsi_next (i: &si))
	{
	  gimple *stmt = gsi_stmt (i: si);
	  if (!gimple_clobber_p (s: stmt)
	      && !is_gimple_debug (gs: stmt))
	    {
	      opt_result res
		= vect_analyze_stmt (loop_vinfo,
				     loop_vinfo->lookup_stmt (stmt),
				     &need_to_vectorize,
				     NULL, NULL, &cost_vec);
	      if (!res)
		return res;
	    }
	}
    } /* bbs */

  add_stmt_costs (costs: loop_vinfo->vector_costs, cost_vec: &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "All the computation can be taken out of the loop.\n" );
      return opt_result::failure_at
	(loc: vect_location,
	 fmt: "not vectorized: redundant loop. no profit to vectorize.\n" );
    }

  return opt_result::success ();
}
2237 | |
2238 | /* Return true if we know that the iteration count is smaller than the |
2239 | vectorization factor. Return false if it isn't, or if we can't be sure |
2240 | either way. */ |
2241 | |
2242 | static bool |
2243 | vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo) |
2244 | { |
2245 | unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
2246 | |
2247 | HOST_WIDE_INT max_niter; |
2248 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) |
2249 | max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo); |
2250 | else |
2251 | max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); |
2252 | |
2253 | if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf) |
2254 | return true; |
2255 | |
2256 | return false; |
2257 | } |
2258 | |
/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
   definitely no, or -1 if it's worth retrying.

   *SUGGESTED_UNROLL_FACTOR may be set by the target's cost model via
   vect_estimate_min_profitable_iters.  */

static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo,
			   unsigned *suggested_unroll_factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* Only loops that can handle partially-populated vectors can have iteration
     counts less than the vectorization factor.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && vect_known_niters_smaller_than_vf (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: iteration count smaller than "
			 "vectorization factor.\n" );
      return 0;
    }

  /* If we know the number of iterations we can do better, for the
     epilogue we can also decide whether the main loop leaves us
     with enough iterations, prefering a smaller vector epilog then
     also possibly used for the case we skip the vector loop.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      widest_int scalar_niters
	= wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  /* For an epilogue, compute how many scalar iterations remain
	     after the main vector loop (and any prologue peeling).  */
	  loop_vec_info orig_loop_vinfo
	    = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
	  unsigned lowest_vf
	    = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
	  int prolog_peeling = 0;
	  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
	    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
	  if (prolog_peeling >= 0
	      && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
			   lowest_vf))
	    {
	      unsigned gap
		= LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
	      scalar_niters = ((scalar_niters - gap - prolog_peeling)
			       % lowest_vf + gap);
	    }
	}
      /* Reject vectorizing for a single scalar iteration, even if
	 we could in principle implement that using partial vectors.  */
      unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
      if (scalar_niters <= peeling_gap + 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: loop only has a single "
			     "scalar iteration.\n" );
	  return 0;
	}

      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  /* Check that the loop processes at least one full vector.  */
	  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
	  if (known_lt (scalar_niters, vf))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "loop does not have enough iterations "
				 "to support vectorization.\n" );
	      return 0;
	    }

	  /* If we need to peel an extra epilogue iteration to handle data
	     accesses with gaps, check that there are enough scalar iterations
	     available.

	     The check above is redundant with this one when peeling for gaps,
	     but the distinction is useful for diagnostics.  */
	  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	      && known_le (scalar_niters, vf))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "loop does not have enough iterations "
				 "to support peeling for gaps.\n" );
	      return 0;
	    }
	}
    }

  /* If using the "very cheap" model. reject cases in which we'd keep
     a copy of the scalar code (even if we might be able to vectorize it).  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "some scalar iterations would need to be peeled\n" );
      return 0;
    }

  /* Ask the (target) cost model for the profitability thresholds.  */
  int min_profitable_iters, min_profitable_estimate;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
				      &min_profitable_estimate,
				      suggested_unroll_factor);

  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n" );
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vector version will never be "
			 "profitable.\n" );
      return -1;
    }

  int min_scalar_loop_bound = (param_min_vect_loop_bound
			       * assumed_vf);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */
  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
				    min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n" );
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: iteration count smaller than user "
			 "specified loop bound parameter or minimum profitable "
			 "iterations (whichever is more conservative).\n" );
      return 0;
    }

  /* The static profitablity threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
  if (min_profitable_estimate > min_profitable_iters
      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
			 " choice between the scalar and vector loops\n" );
      min_profitable_estimate = min_profitable_iters;
    }

  /* If the vector loop needs multiple iterations to be beneficial then
     things are probably too close to call, and the conservative thing
     would be to stick with the scalar code.  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "one iteration of the vector loop would be"
			 " more expensive than the equivalent number of"
			 " iterations of the scalar loop\n" );
      return 0;
    }

  HOST_WIDE_INT estimated_niter;

  /* If we are vectorizing an epilogue then we know the maximum number of
     scalar iterations it will cover is at least one lower than the
     vectorization factor of the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    estimated_niter
      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
  else
    {
      estimated_niter = estimated_stmt_executions_int (loop);
      if (estimated_niter == -1)
	estimated_niter = likely_max_stmt_executions_int (loop);
    }
  if (estimated_niter != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
	  < MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: estimated iteration count too "
			 "small.\n" );
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: estimated iteration count smaller "
			 "than specified loop bound parameter or minimum "
			 "profitable iterations (whichever is more "
			 "conservative).\n" );
      return -1;
    }

  return 1;
}
2471 | |
/* Collect the data references of all statements in the body blocks BBS
   of LOOP into DATAREFS, counting the analyzed statements in *N_STMTS.
   Returns an opt_result failure when a statement's data reference
   cannot be analyzed (except for calls with usable simd clones in
   safelen loops, which are skipped) or when the number of datarefs
   exceeds --param loop-max-datarefs-for-datadeps.  */
static opt_result
vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
			   vec<data_reference_p> *datarefs,
			   unsigned int *n_stmts)
{
  *n_stmts = 0;
  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bbs[i]);
	 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
      {
	gimple *stmt = gsi_stmt (i: gsi);
	if (is_gimple_debug (gs: stmt))
	  continue;
	++(*n_stmts);
	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
							NULL, 0);
	if (!res)
	  {
	    /* In a loop annotated with safelen, a call whose target has
	       simd clones may still be vectorizable even though we could
	       not analyze its data references, so don't fail for it.  */
	    if (is_gimple_call (gs: stmt) && loop->safelen)
	      {
		tree fndecl = gimple_call_fndecl (gs: stmt), op;
		/* For IFN_MASK_CALL the real callee is the first
		   argument.  */
		if (fndecl == NULL_TREE
		    && gimple_call_internal_p (gs: stmt, fn: IFN_MASK_CALL))
		  {
		    fndecl = gimple_call_arg (gs: stmt, index: 0);
		    gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
		    fndecl = TREE_OPERAND (fndecl, 0);
		    gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
		  }
		if (fndecl != NULL_TREE)
		  {
		    cgraph_node *node = cgraph_node::get (decl: fndecl);
		    if (node != NULL && node->simd_clones != NULL)
		      {
			/* Look for an argument (or the lhs) that is itself
			   a memory reference; only then must the call's
			   datarefs really be analyzable.  */
			unsigned int j, n = gimple_call_num_args (gs: stmt);
			for (j = 0; j < n; j++)
			  {
			    op = gimple_call_arg (gs: stmt, index: j);
			    if (DECL_P (op)
				|| (REFERENCE_CLASS_P (op)
				    && get_base_address (t: op)))
			      break;
			  }
			op = gimple_call_lhs (gs: stmt);
			/* Ignore #pragma omp declare simd functions
			   if they don't have data references in the
			   call stmt itself.  */
			if (j == n
			    && !(op
				 && (DECL_P (op)
				     || (REFERENCE_CLASS_P (op)
					 && get_base_address (t: op)))))
			  continue;
		      }
		  }
	      }
	    return res;
	  }
	/* If dependence analysis will give up due to the limit on the
	   number of datarefs stop here and fail fatally.  */
	if (datarefs->length ()
	    > (unsigned)param_loop_max_datarefs_for_datadeps)
	  return opt_result::failure_at (loc: stmt, fmt: "exceeded param "
					 "loop-max-datarefs-for-datadeps\n" );
      }
  return opt_result::success ();
}
2539 | |
2540 | /* Look for SLP-only access groups and turn each individual access into its own |
2541 | group. */ |
2542 | static void |
2543 | vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo) |
2544 | { |
2545 | unsigned int i; |
2546 | struct data_reference *dr; |
2547 | |
2548 | DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups" ); |
2549 | |
2550 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
2551 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
2552 | { |
2553 | gcc_assert (DR_REF (dr)); |
2554 | stmt_vec_info stmt_info |
2555 | = vect_stmt_to_vectorize (stmt_info: loop_vinfo->lookup_stmt (DR_STMT (dr))); |
2556 | |
2557 | /* Check if the load is a part of an interleaving chain. */ |
2558 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
2559 | { |
2560 | stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info); |
2561 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element); |
2562 | unsigned int group_size = DR_GROUP_SIZE (first_element); |
2563 | |
2564 | /* Check if SLP-only groups. */ |
2565 | if (!STMT_SLP_TYPE (stmt_info) |
2566 | && STMT_VINFO_SLP_VECT_ONLY (first_element)) |
2567 | { |
2568 | /* Dissolve the group. */ |
2569 | STMT_VINFO_SLP_VECT_ONLY (first_element) = false; |
2570 | |
2571 | stmt_vec_info vinfo = first_element; |
2572 | while (vinfo) |
2573 | { |
2574 | stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo); |
2575 | DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo; |
2576 | DR_GROUP_NEXT_ELEMENT (vinfo) = NULL; |
2577 | DR_GROUP_SIZE (vinfo) = 1; |
2578 | if (STMT_VINFO_STRIDED_P (first_element) |
2579 | /* We cannot handle stores with gaps. */ |
2580 | || DR_IS_WRITE (dr_info->dr)) |
2581 | { |
2582 | STMT_VINFO_STRIDED_P (vinfo) = true; |
2583 | DR_GROUP_GAP (vinfo) = 0; |
2584 | } |
2585 | else |
2586 | DR_GROUP_GAP (vinfo) = group_size - 1; |
2587 | /* Duplicate and adjust alignment info, it needs to |
2588 | be present on each group leader, see dr_misalignment. */ |
2589 | if (vinfo != first_element) |
2590 | { |
2591 | dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo); |
2592 | dr_info2->target_alignment = dr_info->target_alignment; |
2593 | int misalignment = dr_info->misalignment; |
2594 | if (misalignment != DR_MISALIGNMENT_UNKNOWN) |
2595 | { |
2596 | HOST_WIDE_INT diff |
2597 | = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr)) |
2598 | - TREE_INT_CST_LOW (DR_INIT (dr_info->dr))); |
2599 | unsigned HOST_WIDE_INT align_c |
2600 | = dr_info->target_alignment.to_constant (); |
2601 | misalignment = (misalignment + diff) % align_c; |
2602 | } |
2603 | dr_info2->misalignment = misalignment; |
2604 | } |
2605 | vinfo = next; |
2606 | } |
2607 | } |
2608 | } |
2609 | } |
2610 | } |
2611 | |
/* Determine if operating on full vectors for LOOP_VINFO might leave
   some scalar iterations still to do.  If so, decide how we should
   handle those scalar iterations.  The possibilities are:

   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
       In this case:

	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
	 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
	 LOOP_VINFO_PEELING_FOR_NITER == false

   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
       to handle the remaining scalar iterations.  In this case:

	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
	 LOOP_VINFO_PEELING_FOR_NITER == true

       There are two choices:

       (2a) Consider vectorizing the epilogue loop at the same VF as the
	    main loop, but using partial vectors instead of full vectors.
	    In this case:

	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true

       (2b) Consider vectorizing the epilogue loop at lower VFs only.
	    In this case:

	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
 */

opt_result
vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
{
  /* Determine whether there would be any scalar iterations left over.  */
  bool need_peeling_or_partial_vectors_p
    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);

  /* Decide whether to vectorize the loop with partial vectors.  */
  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && need_peeling_or_partial_vectors_p)
    {
      /* For partial-vector-usage=1, try to push the handling of partial
	 vectors to the epilogue, with the main loop continuing to operate
	 on full vectors.

	 If we are unrolling we also do not want to use partial vectors.  This
	 is to avoid the overhead of generating multiple masks and also to
	 avoid having to execute entire iterations of FALSE masked instructions
	 when dealing with one or less full iterations.

	 ??? We could then end up failing to use partial vectors if we
	 decide to peel iterations into a prologue, and if the main loop
	 then ends up processing fewer than VF iterations.  */
      if ((param_vect_partial_vector_usage == 1
	   || loop_vinfo->suggested_unroll_factor > 1)
	  && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
	  && !vect_known_niters_smaller_than_vf (loop_vinfo))
	LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
      else
	LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "operating on %s vectors%s.\n",
		     LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
		     ? "partial" : "full",
		     LOOP_VINFO_EPILOGUE_P (loop_vinfo)
		     ? " for epilogue loop" : "");

  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
       && need_peeling_or_partial_vectors_p);

  /* LOOP_VINFO_USING_SELECT_VL_P is set to true before loop vectorization
     analysis, when we don't yet know whether the loop will be vectorized
     using partial vectors (for more details see tree-vect-loop-manip.cc).

     However, the SELECT_VL vectorization style should only be applied to
     partially-vectorized loops, since SELECT_VL is the GIMPLE IR that
     calculates the number of elements to be processed in each iteration.

     So after loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
     if the loop is not using partial vectors.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;

  return opt_result::success ();
}
2704 | |
/* Function vect_analyze_loop_2.

   Apply a set of analyses on the loop specified by LOOP_VINFO; the
   different analyses record information in members of LOOP_VINFO.
   FATAL indicates whether some analysis met a fatal error.  If the
   pointer SUGGESTED_UNROLL_FACTOR is non-NULL, it is intended to be
   filled with the suggested unroll factor that is worked out, while a
   NULL pointer indicates that we are applying a previously suggested
   unroll factor.  SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made
   when the suggested unroll factor was worked out.  */
2715 | static opt_result |
2716 | vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, |
2717 | unsigned *suggested_unroll_factor, |
2718 | bool& slp_done_for_suggested_uf) |
2719 | { |
2720 | opt_result ok = opt_result::success (); |
2721 | int res; |
2722 | unsigned int max_vf = MAX_VECTORIZATION_FACTOR; |
2723 | poly_uint64 min_vf = 2; |
2724 | loop_vec_info orig_loop_vinfo = NULL; |
2725 | |
2726 | /* If we are dealing with an epilogue then orig_loop_vinfo points to the |
2727 | loop_vec_info of the first vectorized loop. */ |
2728 | if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) |
2729 | orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); |
2730 | else |
2731 | orig_loop_vinfo = loop_vinfo; |
2732 | gcc_assert (orig_loop_vinfo); |
2733 | |
2734 | /* The first group of checks is independent of the vector size. */ |
2735 | fatal = true; |
2736 | |
2737 | if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo) |
2738 | && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo))) |
2739 | return opt_result::failure_at (loc: vect_location, |
2740 | fmt: "not vectorized: simd if(0)\n" ); |
2741 | |
2742 | /* Find all data references in the loop (which correspond to vdefs/vuses) |
2743 | and analyze their evolution in the loop. */ |
2744 | |
2745 | loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); |
2746 | |
2747 | /* Gather the data references and count stmts in the loop. */ |
2748 | if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ()) |
2749 | { |
2750 | opt_result res |
2751 | = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo), |
2752 | datarefs: &LOOP_VINFO_DATAREFS (loop_vinfo), |
2753 | n_stmts: &LOOP_VINFO_N_STMTS (loop_vinfo)); |
2754 | if (!res) |
2755 | { |
2756 | if (dump_enabled_p ()) |
2757 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2758 | "not vectorized: loop contains function " |
2759 | "calls or data references that cannot " |
2760 | "be analyzed\n" ); |
2761 | return res; |
2762 | } |
2763 | loop_vinfo->shared->save_datarefs (); |
2764 | } |
2765 | else |
2766 | loop_vinfo->shared->check_datarefs (); |
2767 | |
2768 | /* Analyze the data references and also adjust the minimal |
2769 | vectorization factor according to the loads and stores. */ |
2770 | |
2771 | ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal); |
2772 | if (!ok) |
2773 | { |
2774 | if (dump_enabled_p ()) |
2775 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2776 | "bad data references.\n" ); |
2777 | return ok; |
2778 | } |
2779 | |
2780 | /* Check if we are applying unroll factor now. */ |
2781 | bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1; |
2782 | gcc_assert (!applying_suggested_uf || !suggested_unroll_factor); |
2783 | |
2784 | /* If the slp decision is false when suggested unroll factor is worked |
2785 | out, and we are applying suggested unroll factor, we can simply skip |
2786 | all slp related analyses this time. */ |
2787 | bool slp = !applying_suggested_uf || slp_done_for_suggested_uf; |
2788 | |
2789 | /* Classify all cross-iteration scalar data-flow cycles. |
2790 | Cross-iteration cycles caused by virtual phis are analyzed separately. */ |
2791 | vect_analyze_scalar_cycles (loop_vinfo, slp); |
2792 | |
2793 | vect_pattern_recog (loop_vinfo); |
2794 | |
2795 | vect_fixup_scalar_cycles_with_patterns (loop_vinfo); |
2796 | |
2797 | /* Analyze the access patterns of the data-refs in the loop (consecutive, |
2798 | complex, etc.). FORNOW: Only handle consecutive access pattern. */ |
2799 | |
2800 | ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL); |
2801 | if (!ok) |
2802 | { |
2803 | if (dump_enabled_p ()) |
2804 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2805 | "bad data access.\n" ); |
2806 | return ok; |
2807 | } |
2808 | |
2809 | /* Data-flow analysis to detect stmts that do not need to be vectorized. */ |
2810 | |
2811 | ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal); |
2812 | if (!ok) |
2813 | { |
2814 | if (dump_enabled_p ()) |
2815 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2816 | "unexpected pattern.\n" ); |
2817 | return ok; |
2818 | } |
2819 | |
2820 | /* While the rest of the analysis below depends on it in some way. */ |
2821 | fatal = false; |
2822 | |
2823 | /* Analyze data dependences between the data-refs in the loop |
2824 | and adjust the maximum vectorization factor according to |
2825 | the dependences. |
2826 | FORNOW: fail at the first data dependence that we encounter. */ |
2827 | |
2828 | ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); |
2829 | if (!ok) |
2830 | { |
2831 | if (dump_enabled_p ()) |
2832 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2833 | "bad data dependence.\n" ); |
2834 | return ok; |
2835 | } |
2836 | if (max_vf != MAX_VECTORIZATION_FACTOR |
2837 | && maybe_lt (a: max_vf, b: min_vf)) |
2838 | return opt_result::failure_at (loc: vect_location, fmt: "bad data dependence.\n" ); |
2839 | LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; |
2840 | |
2841 | ok = vect_determine_vectorization_factor (loop_vinfo); |
2842 | if (!ok) |
2843 | { |
2844 | if (dump_enabled_p ()) |
2845 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2846 | "can't determine vectorization factor.\n" ); |
2847 | return ok; |
2848 | } |
2849 | |
2850 | /* Compute the scalar iteration cost. */ |
2851 | vect_compute_single_scalar_iteration_cost (loop_vinfo); |
2852 | |
2853 | poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2854 | |
2855 | if (slp) |
2856 | { |
2857 | /* Check the SLP opportunities in the loop, analyze and build |
2858 | SLP trees. */ |
2859 | ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo)); |
2860 | if (!ok) |
2861 | return ok; |
2862 | |
2863 | /* If there are any SLP instances mark them as pure_slp. */ |
2864 | slp = vect_make_slp_decision (loop_vinfo); |
2865 | if (slp) |
2866 | { |
2867 | /* Find stmts that need to be both vectorized and SLPed. */ |
2868 | vect_detect_hybrid_slp (loop_vinfo); |
2869 | |
2870 | /* Update the vectorization factor based on the SLP decision. */ |
2871 | vect_update_vf_for_slp (loop_vinfo); |
2872 | |
2873 | /* Optimize the SLP graph with the vectorization factor fixed. */ |
2874 | vect_optimize_slp (loop_vinfo); |
2875 | |
2876 | /* Gather the loads reachable from the SLP graph entries. */ |
2877 | vect_gather_slp_loads (loop_vinfo); |
2878 | } |
2879 | } |
2880 | |
2881 | bool saved_can_use_partial_vectors_p |
2882 | = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo); |
2883 | |
2884 | /* We don't expect to have to roll back to anything other than an empty |
2885 | set of rgroups. */ |
2886 | gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); |
2887 | |
2888 | /* This is the point where we can re-start analysis with SLP forced off. */ |
2889 | start_over: |
2890 | |
  /* Apply the suggested unrolling factor, this was determined by the backend
     during finish_cost the first time we ran the analysis for this
     vector mode.  */
2894 | if (applying_suggested_uf) |
2895 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor; |
2896 | |
2897 | /* Now the vectorization factor is final. */ |
2898 | poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2899 | gcc_assert (known_ne (vectorization_factor, 0U)); |
2900 | |
2901 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) |
2902 | { |
2903 | dump_printf_loc (MSG_NOTE, vect_location, |
2904 | "vectorization_factor = " ); |
2905 | dump_dec (MSG_NOTE, vectorization_factor); |
2906 | dump_printf (MSG_NOTE, ", niters = %wd\n" , |
2907 | LOOP_VINFO_INT_NITERS (loop_vinfo)); |
2908 | } |
2909 | |
2910 | if (max_vf != MAX_VECTORIZATION_FACTOR |
2911 | && maybe_lt (a: max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) |
2912 | return opt_result::failure_at (loc: vect_location, fmt: "bad data dependence.\n" ); |
2913 | |
2914 | loop_vinfo->vector_costs = init_cost (vinfo: loop_vinfo, costing_for_scalar: false); |
2915 | |
2916 | /* Analyze the alignment of the data-refs in the loop. |
2917 | Fail if a data reference is found that cannot be vectorized. */ |
2918 | |
2919 | ok = vect_analyze_data_refs_alignment (loop_vinfo); |
2920 | if (!ok) |
2921 | { |
2922 | if (dump_enabled_p ()) |
2923 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2924 | "bad data alignment.\n" ); |
2925 | return ok; |
2926 | } |
2927 | |
2928 | /* Prune the list of ddrs to be tested at run-time by versioning for alias. |
2929 | It is important to call pruning after vect_analyze_data_ref_accesses, |
2930 | since we use grouping information gathered by interleaving analysis. */ |
2931 | ok = vect_prune_runtime_alias_test_list (loop_vinfo); |
2932 | if (!ok) |
2933 | return ok; |
2934 | |
2935 | /* Do not invoke vect_enhance_data_refs_alignment for epilogue |
2936 | vectorization, since we do not want to add extra peeling or |
2937 | add versioning for alignment. */ |
2938 | if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) |
2939 | /* This pass will decide on using loop versioning and/or loop peeling in |
2940 | order to enhance the alignment of data references in the loop. */ |
2941 | ok = vect_enhance_data_refs_alignment (loop_vinfo); |
2942 | if (!ok) |
2943 | return ok; |
2944 | |
2945 | if (slp) |
2946 | { |
2947 | /* Analyze operations in the SLP instances. Note this may |
2948 | remove unsupported SLP instances which makes the above |
2949 | SLP kind detection invalid. */ |
2950 | unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length (); |
2951 | vect_slp_analyze_operations (loop_vinfo); |
2952 | if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size) |
2953 | { |
2954 | ok = opt_result::failure_at (loc: vect_location, |
2955 | fmt: "unsupported SLP instances\n" ); |
2956 | goto again; |
2957 | } |
2958 | |
2959 | /* Check whether any load in ALL SLP instances is possibly permuted. */ |
2960 | slp_tree load_node, slp_root; |
2961 | unsigned i, x; |
2962 | slp_instance instance; |
2963 | bool can_use_lanes = true; |
2964 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance) |
2965 | { |
2966 | slp_root = SLP_INSTANCE_TREE (instance); |
2967 | int group_size = SLP_TREE_LANES (slp_root); |
2968 | tree vectype = SLP_TREE_VECTYPE (slp_root); |
2969 | bool loads_permuted = false; |
2970 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) |
2971 | { |
2972 | if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) |
2973 | continue; |
2974 | unsigned j; |
2975 | stmt_vec_info load_info; |
2976 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) |
2977 | if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) |
2978 | { |
2979 | loads_permuted = true; |
2980 | break; |
2981 | } |
2982 | } |
2983 | |
2984 | /* If the loads and stores can be handled with load/store-lane |
2985 | instructions record it and move on to the next instance. */ |
2986 | if (loads_permuted |
2987 | && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store |
2988 | && vect_store_lanes_supported (vectype, group_size, false) |
2989 | != IFN_LAST) |
2990 | { |
2991 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) |
2992 | if (STMT_VINFO_GROUPED_ACCESS |
2993 | (SLP_TREE_REPRESENTATIVE (load_node))) |
2994 | { |
2995 | stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT |
2996 | (SLP_TREE_REPRESENTATIVE (load_node)); |
2997 | /* Use SLP for strided accesses (or if we can't |
2998 | load-lanes). */ |
2999 | if (STMT_VINFO_STRIDED_P (stmt_vinfo) |
3000 | || vect_load_lanes_supported |
3001 | (STMT_VINFO_VECTYPE (stmt_vinfo), |
3002 | DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST) |
3003 | break; |
3004 | } |
3005 | |
3006 | can_use_lanes |
3007 | = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length (); |
3008 | |
3009 | if (can_use_lanes && dump_enabled_p ()) |
3010 | dump_printf_loc (MSG_NOTE, vect_location, |
3011 | "SLP instance %p can use load/store-lanes\n" , |
3012 | (void *) instance); |
3013 | } |
3014 | else |
3015 | { |
3016 | can_use_lanes = false; |
3017 | break; |
3018 | } |
3019 | } |
3020 | |
3021 | /* If all SLP instances can use load/store-lanes abort SLP and try again |
3022 | with SLP disabled. */ |
3023 | if (can_use_lanes) |
3024 | { |
3025 | ok = opt_result::failure_at (loc: vect_location, |
3026 | fmt: "Built SLP cancelled: can use " |
3027 | "load/store-lanes\n" ); |
3028 | if (dump_enabled_p ()) |
3029 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3030 | "Built SLP cancelled: all SLP instances support " |
3031 | "load/store-lanes\n" ); |
3032 | goto again; |
3033 | } |
3034 | } |
3035 | |
3036 | /* Dissolve SLP-only groups. */ |
3037 | vect_dissolve_slp_only_groups (loop_vinfo); |
3038 | |
3039 | /* Scan all the remaining operations in the loop that are not subject |
3040 | to SLP and make sure they are vectorizable. */ |
3041 | ok = vect_analyze_loop_operations (loop_vinfo); |
3042 | if (!ok) |
3043 | { |
3044 | if (dump_enabled_p ()) |
3045 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3046 | "bad operation or unsupported loop bound.\n" ); |
3047 | return ok; |
3048 | } |
3049 | |
3050 | /* For now, we don't expect to mix both masking and length approaches for one |
3051 | loop, disable it if both are recorded. */ |
3052 | if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) |
3053 | && !LOOP_VINFO_MASKS (loop_vinfo).is_empty () |
3054 | && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()) |
3055 | { |
3056 | if (dump_enabled_p ()) |
3057 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3058 | "can't vectorize a loop with partial vectors" |
3059 | " because we don't expect to mix different" |
3060 | " approaches with partial vectors for the" |
3061 | " same loop.\n" ); |
3062 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
3063 | } |
3064 | |
3065 | /* If we still have the option of using partial vectors, |
3066 | check whether we can generate the necessary loop controls. */ |
3067 | if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) |
3068 | { |
3069 | if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) |
3070 | { |
3071 | if (!vect_verify_full_masking (loop_vinfo) |
3072 | && !vect_verify_full_masking_avx512 (loop_vinfo)) |
3073 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
3074 | } |
3075 | else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */ |
3076 | if (!vect_verify_loop_lens (loop_vinfo)) |
3077 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
3078 | } |
3079 | |
3080 | /* If we're vectorizing a loop that uses length "controls" and |
3081 | can iterate more than once, we apply decrementing IV approach |
3082 | in loop control. */ |
3083 | if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) |
3084 | && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len |
3085 | && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0 |
3086 | && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
3087 | && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), |
3088 | LOOP_VINFO_VECT_FACTOR (loop_vinfo)))) |
3089 | LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true; |
3090 | |
  /* If a loop uses length controls and has a decrementing loop control IV,
     we will normally pass that IV through a MIN_EXPR to calculate the
     basis for the length controls.  E.g. in a loop that processes one
3094 | element per scalar iteration, the number of elements would be |
3095 | MIN_EXPR <N, VF>, where N is the number of scalar iterations left. |
3096 | |
3097 | This MIN_EXPR approach allows us to use pointer IVs with an invariant |
3098 | step, since only the final iteration of the vector loop can have |
3099 | inactive lanes. |
3100 | |
3101 | However, some targets have a dedicated instruction for calculating the |
3102 | preferred length, given the total number of elements that still need to |
3103 | be processed. This is encapsulated in the SELECT_VL internal function. |
3104 | |
3105 | If the target supports SELECT_VL, we can use it instead of MIN_EXPR |
3106 | to determine the basis for the length controls. However, unlike the |
3107 | MIN_EXPR calculation, the SELECT_VL calculation can decide to make |
3108 | lanes inactive in any iteration of the vector loop, not just the last |
3109 | iteration. This SELECT_VL approach therefore requires us to use pointer |
3110 | IVs with variable steps. |
3111 | |
3112 | Once we've decided how many elements should be processed by one |
3113 | iteration of the vector loop, we need to populate the rgroup controls. |
3114 | If a loop has multiple rgroups, we need to make sure that those rgroups |
3115 | "line up" (that is, they must be consistent about which elements are |
3116 | active and which aren't). This is done by vect_adjust_loop_lens_control. |
3117 | |
3118 | In principle, it would be possible to use vect_adjust_loop_lens_control |
3119 | on either the result of a MIN_EXPR or the result of a SELECT_VL. |
3120 | However: |
3121 | |
3122 | (1) In practice, it only makes sense to use SELECT_VL when a vector |
3123 | operation will be controlled directly by the result. It is not |
3124 | worth using SELECT_VL if it would only be the input to other |
3125 | calculations. |
3126 | |
3127 | (2) If we use SELECT_VL for an rgroup that has N controls, each associated |
3128 | pointer IV will need N updates by a variable amount (N-1 updates |
3129 | within the iteration and 1 update to move to the next iteration). |
3130 | |
3131 | Because of this, we prefer to use the MIN_EXPR approach whenever there |
3132 | is more than one length control. |
3133 | |
3134 | In addition, SELECT_VL always operates to a granularity of 1 unit. |
3135 | If we wanted to use it to control an SLP operation on N consecutive |
3136 | elements, we would need to make the SELECT_VL inputs measure scalar |
3137 | iterations (rather than elements) and then multiply the SELECT_VL |
3138 | result by N. But using SELECT_VL this way is inefficient because |
3139 | of (1) above. |
3140 | |
3141 | 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are |
3142 | satisfied: |
3143 | |
3144 | (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true. |
3145 | (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true. |
3146 | |
3147 | Since SELECT_VL (variable step) will make SCEV analysis failed and then |
3148 | we will fail to gain benefits of following unroll optimizations. We prefer |
3149 | using the MIN_EXPR approach in this situation. */ |
3150 | if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) |
3151 | { |
3152 | tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); |
3153 | if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type, |
3154 | OPTIMIZE_FOR_SPEED) |
3155 | && LOOP_VINFO_LENS (loop_vinfo).length () == 1 |
3156 | && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp |
3157 | && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
3158 | || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())) |
3159 | LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true; |
3160 | } |
3161 | |
3162 | /* Decide whether this loop_vinfo should use partial vectors or peeling, |
3163 | assuming that the loop will be used as a main loop. We will redo |
3164 | this analysis later if we instead decide to use the loop as an |
3165 | epilogue loop. */ |
3166 | ok = vect_determine_partial_vectors_and_peeling (loop_vinfo); |
3167 | if (!ok) |
3168 | return ok; |
3169 | |
3170 | /* If we're vectorizing an epilogue loop, the vectorized loop either needs |
3171 | to be able to handle fewer than VF scalars, or needs to have a lower VF |
3172 | than the main loop. */ |
3173 | if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) |
3174 | && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
3175 | { |
3176 | poly_uint64 unscaled_vf |
3177 | = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo), |
3178 | b: orig_loop_vinfo->suggested_unroll_factor); |
3179 | if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf)) |
3180 | return opt_result::failure_at (loc: vect_location, |
3181 | fmt: "Vectorization factor too high for" |
3182 | " epilogue loop.\n" ); |
3183 | } |
3184 | |
3185 | /* Check the costings of the loop make vectorizing worthwhile. */ |
3186 | res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor); |
3187 | if (res < 0) |
3188 | { |
3189 | ok = opt_result::failure_at (loc: vect_location, |
3190 | fmt: "Loop costings may not be worthwhile.\n" ); |
3191 | goto again; |
3192 | } |
3193 | if (!res) |
3194 | return opt_result::failure_at (loc: vect_location, |
3195 | fmt: "Loop costings not worthwhile.\n" ); |
3196 | |
3197 | /* If an epilogue loop is required make sure we can create one. */ |
3198 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) |
3199 | || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) |
3200 | || LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
3201 | { |
3202 | if (dump_enabled_p ()) |
3203 | dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n" ); |
3204 | if (!vect_can_advance_ivs_p (loop_vinfo) |
3205 | || !slpeel_can_duplicate_loop_p (loop, |
3206 | LOOP_VINFO_IV_EXIT (loop_vinfo), |
3207 | LOOP_VINFO_IV_EXIT (loop_vinfo))) |
3208 | { |
3209 | ok = opt_result::failure_at (loc: vect_location, |
3210 | fmt: "not vectorized: can't create required " |
3211 | "epilog loop\n" ); |
3212 | goto again; |
3213 | } |
3214 | } |
3215 | |
3216 | /* During peeling, we need to check if number of loop iterations is |
3217 | enough for both peeled prolog loop and vector loop. This check |
3218 | can be merged along with threshold check of loop versioning, so |
3219 | increase threshold for this case if necessary. |
3220 | |
3221 | If we are analyzing an epilogue we still want to check what its |
3222 | versioning threshold would be. If we decide to vectorize the epilogues we |
3223 | will want to use the lowest versioning threshold of all epilogues and main |
3224 | loop. This will enable us to enter a vectorized epilogue even when |
3225 | versioning the loop. We can't simply check whether the epilogue requires |
3226 | versioning though since we may have skipped some versioning checks when |
3227 | analyzing the epilogue. For instance, checks for alias versioning will be |
3228 | skipped when dealing with epilogues as we assume we already checked them |
3229 | for the main loop. So instead we always check the 'orig_loop_vinfo'. */ |
3230 | if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)) |
3231 | { |
3232 | poly_uint64 niters_th = 0; |
3233 | unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); |
3234 | |
3235 | if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
3236 | { |
3237 | /* Niters for peeled prolog loop. */ |
3238 | if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) |
3239 | { |
3240 | dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); |
3241 | tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); |
3242 | niters_th += TYPE_VECTOR_SUBPARTS (node: vectype) - 1; |
3243 | } |
3244 | else |
3245 | niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
3246 | } |
3247 | |
3248 | /* Niters for at least one iteration of vectorized loop. */ |
3249 | if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
3250 | niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
3251 | /* One additional iteration because of peeling for gap. */ |
3252 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) |
3253 | niters_th += 1; |
3254 | |
3255 | /* Use the same condition as vect_transform_loop to decide when to use |
3256 | the cost to determine a versioning threshold. */ |
3257 | if (vect_apply_runtime_profitability_check_p (loop_vinfo) |
3258 | && ordered_p (a: th, b: niters_th)) |
3259 | niters_th = ordered_max (a: poly_uint64 (th), b: niters_th); |
3260 | |
3261 | LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; |
3262 | } |
3263 | |
3264 | gcc_assert (known_eq (vectorization_factor, |
3265 | LOOP_VINFO_VECT_FACTOR (loop_vinfo))); |
3266 | |
3267 | slp_done_for_suggested_uf = slp; |
3268 | |
3269 | /* Ok to vectorize! */ |
3270 | LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; |
3271 | return opt_result::success (); |
3272 | |
3273 | again: |
3274 | /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */ |
3275 | gcc_assert (!ok); |
3276 | |
3277 | /* Try again with SLP forced off but if we didn't do any SLP there is |
3278 | no point in re-trying. */ |
3279 | if (!slp) |
3280 | return ok; |
3281 | |
3282 | /* If the slp decision is true when suggested unroll factor is worked |
3283 | out, and we are applying suggested unroll factor, we don't need to |
3284 | re-try any more. */ |
3285 | if (applying_suggested_uf && slp_done_for_suggested_uf) |
3286 | return ok; |
3287 | |
3288 | /* If there are reduction chains re-trying will fail anyway. */ |
3289 | if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()) |
3290 | return ok; |
3291 | |
3292 | /* Likewise if the grouped loads or stores in the SLP cannot be handled |
3293 | via interleaving or lane instructions. */ |
3294 | slp_instance instance; |
3295 | slp_tree node; |
3296 | unsigned i, j; |
3297 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) |
3298 | { |
3299 | stmt_vec_info vinfo; |
3300 | vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]; |
3301 | if (! STMT_VINFO_GROUPED_ACCESS (vinfo)) |
3302 | continue; |
3303 | vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); |
3304 | unsigned int size = DR_GROUP_SIZE (vinfo); |
3305 | tree vectype = STMT_VINFO_VECTYPE (vinfo); |
3306 | if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST |
3307 | && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U) |
3308 | && ! vect_grouped_store_supported (vectype, size)) |
3309 | return opt_result::failure_at (loc: vinfo->stmt, |
3310 | fmt: "unsupported grouped store\n" ); |
3311 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) |
3312 | { |
3313 | vinfo = SLP_TREE_REPRESENTATIVE (node); |
3314 | if (STMT_VINFO_GROUPED_ACCESS (vinfo)) |
3315 | { |
3316 | vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); |
3317 | bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo); |
3318 | size = DR_GROUP_SIZE (vinfo); |
3319 | vectype = STMT_VINFO_VECTYPE (vinfo); |
3320 | if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST |
3321 | && ! vect_grouped_load_supported (vectype, single_element_p, |
3322 | size)) |
3323 | return opt_result::failure_at (loc: vinfo->stmt, |
3324 | fmt: "unsupported grouped load\n" ); |
3325 | } |
3326 | } |
3327 | } |
3328 | |
3329 | if (dump_enabled_p ()) |
3330 | dump_printf_loc (MSG_NOTE, vect_location, |
3331 | "re-trying with SLP disabled\n" ); |
3332 | |
3333 | /* Roll back state appropriately. No SLP this time. */ |
3334 | slp = false; |
3335 | /* Restore vectorization factor as it were without SLP. */ |
3336 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor; |
3337 | /* Free the SLP instances. */ |
3338 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance) |
3339 | vect_free_slp_instance (instance); |
3340 | LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); |
3341 | /* Reset SLP type to loop_vect on all stmts. */ |
3342 | for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) |
3343 | { |
3344 | basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; |
3345 | for (gimple_stmt_iterator si = gsi_start_phis (bb); |
3346 | !gsi_end_p (i: si); gsi_next (i: &si)) |
3347 | { |
3348 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (i: si)); |
3349 | STMT_SLP_TYPE (stmt_info) = loop_vect; |
3350 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
3351 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
3352 | { |
3353 | /* vectorizable_reduction adjusts reduction stmt def-types, |
3354 | restore them to that of the PHI. */ |
3355 | STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info)) |
3356 | = STMT_VINFO_DEF_TYPE (stmt_info); |
3357 | STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize |
3358 | (STMT_VINFO_REDUC_DEF (stmt_info))) |
3359 | = STMT_VINFO_DEF_TYPE (stmt_info); |
3360 | } |
3361 | } |
3362 | for (gimple_stmt_iterator si = gsi_start_bb (bb); |
3363 | !gsi_end_p (i: si); gsi_next (i: &si)) |
3364 | { |
3365 | if (is_gimple_debug (gs: gsi_stmt (i: si))) |
3366 | continue; |
3367 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (i: si)); |
3368 | STMT_SLP_TYPE (stmt_info) = loop_vect; |
3369 | if (STMT_VINFO_IN_PATTERN_P (stmt_info)) |
3370 | { |
3371 | stmt_vec_info pattern_stmt_info |
3372 | = STMT_VINFO_RELATED_STMT (stmt_info); |
3373 | if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info)) |
3374 | STMT_VINFO_IN_PATTERN_P (stmt_info) = false; |
3375 | |
3376 | gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); |
3377 | STMT_SLP_TYPE (pattern_stmt_info) = loop_vect; |
3378 | for (gimple_stmt_iterator pi = gsi_start (seq&: pattern_def_seq); |
3379 | !gsi_end_p (i: pi); gsi_next (i: &pi)) |
3380 | STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi))) |
3381 | = loop_vect; |
3382 | } |
3383 | } |
3384 | } |
3385 | /* Free optimized alias test DDRS. */ |
3386 | LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (size: 0); |
3387 | LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); |
3388 | LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release (); |
3389 | /* Reset target cost data. */ |
3390 | delete loop_vinfo->vector_costs; |
3391 | loop_vinfo->vector_costs = nullptr; |
3392 | /* Reset accumulated rgroup information. */ |
3393 | LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty (); |
3394 | release_vec_loop_controls (controls: &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec); |
3395 | release_vec_loop_controls (controls: &LOOP_VINFO_LENS (loop_vinfo)); |
3396 | /* Reset assorted flags. */ |
3397 | LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; |
3398 | LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; |
3399 | LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; |
3400 | LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0; |
3401 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) |
3402 | = saved_can_use_partial_vectors_p; |
3403 | LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; |
3404 | |
3405 | goto start_over; |
3406 | } |
3407 | |
3408 | /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears |
3409 | to be better than vectorizing it using OLD_LOOP_VINFO. Assume that |
3410 | OLD_LOOP_VINFO is better unless something specifically indicates |
3411 | otherwise. |
3412 | |
3413 | Note that this deliberately isn't a partial order. */ |
3414 | |
3415 | static bool |
3416 | vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, |
3417 | loop_vec_info old_loop_vinfo) |
3418 | { |
3419 | struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); |
3420 | gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); |
3421 | |
3422 | poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); |
3423 | poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); |
3424 | |
3425 | /* Always prefer a VF of loop->simdlen over any other VF. */ |
3426 | if (loop->simdlen) |
3427 | { |
3428 | bool new_simdlen_p = known_eq (new_vf, loop->simdlen); |
3429 | bool old_simdlen_p = known_eq (old_vf, loop->simdlen); |
3430 | if (new_simdlen_p != old_simdlen_p) |
3431 | return new_simdlen_p; |
3432 | } |
3433 | |
3434 | const auto *old_costs = old_loop_vinfo->vector_costs; |
3435 | const auto *new_costs = new_loop_vinfo->vector_costs; |
3436 | if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo)) |
3437 | return new_costs->better_epilogue_loop_than_p (other: old_costs, main_loop); |
3438 | |
3439 | return new_costs->better_main_loop_than_p (other: old_costs); |
3440 | } |
3441 | |
3442 | /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return |
3443 | true if we should. */ |
3444 | |
3445 | static bool |
3446 | vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, |
3447 | loop_vec_info old_loop_vinfo) |
3448 | { |
3449 | if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo)) |
3450 | return false; |
3451 | |
3452 | if (dump_enabled_p ()) |
3453 | dump_printf_loc (MSG_NOTE, vect_location, |
3454 | "***** Preferring vector mode %s to vector mode %s\n" , |
3455 | GET_MODE_NAME (new_loop_vinfo->vector_mode), |
3456 | GET_MODE_NAME (old_loop_vinfo->vector_mode)); |
3457 | return true; |
3458 | } |
3459 | |
/* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
   not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
   MODE_I to the next mode useful to analyze.
   Return the loop_vinfo on success and wrapped null on failure.

   FATAL is set when the failure is such that trying further vector
   modes cannot succeed either; on success it is left untouched.  */

static opt_loop_vec_info
vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
		     const vect_loop_form_info *loop_form_info,
		     loop_vec_info main_loop_vinfo,
		     const vector_modes &vector_modes, unsigned &mode_i,
		     machine_mode &autodetected_vector_mode,
		     bool &fatal)
{
  loop_vec_info loop_vinfo
    = vect_create_loop_vinfo (loop, shared, info: loop_form_info, main_loop_info: main_loop_vinfo);

  machine_mode vector_mode = vector_modes[mode_i];
  loop_vinfo->vector_mode = vector_mode;
  unsigned int suggested_unroll_factor = 1;
  bool slp_done_for_suggested_uf = false;

  /* Run the main analysis.  */
  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
					suggested_unroll_factor: &suggested_unroll_factor,
					slp_done_for_suggested_uf);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "***** Analysis %s with vector mode %s\n" ,
		     res ? "succeeded" : " failed" ,
		     GET_MODE_NAME (loop_vinfo->vector_mode));

  /* If the first analysis succeeded and suggested unrolling, redo the
     whole analysis with the suggested unroll factor applied; keep the
     unrolled variant only if that second analysis also succeeds.
     Unrolling is only attempted for main loops, not epilogues.  */
  if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis for unrolling"
			 " with unroll factor %d and slp %s.\n" ,
			 suggested_unroll_factor,
			 slp_done_for_suggested_uf ? "on" : "off" );
      loop_vec_info unroll_vinfo
	= vect_create_loop_vinfo (loop, shared, info: loop_form_info, main_loop_info: main_loop_vinfo);
      unroll_vinfo->vector_mode = vector_mode;
      unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
      opt_result new_res = vect_analyze_loop_2 (loop_vinfo: unroll_vinfo, fatal, NULL,
						slp_done_for_suggested_uf);
      if (new_res)
	{
	  delete loop_vinfo;
	  loop_vinfo = unroll_vinfo;
	}
      else
	delete unroll_vinfo;
    }

  /* Remember the autodetected vector mode.  */
  if (vector_mode == VOIDmode)
    autodetected_vector_mode = loop_vinfo->vector_mode;

  /* Advance mode_i, first skipping modes that would result in the
     same analysis result.  */
  while (mode_i + 1 < vector_modes.length ()
	 && vect_chooses_same_modes_p (loop_vinfo,
				       vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** The result for vector mode %s would"
			 " be the same\n" ,
			 GET_MODE_NAME (vector_modes[mode_i + 1]));
      mode_i += 1;
    }
  /* Also skip the next mode when it and the autodetected mode are
     related_vector_mode images of each other (same element types,
     corresponding vector sizes), since analyzing it would just repeat
     the autodetected analysis.  */
  if (mode_i + 1 < vector_modes.length ()
      && VECTOR_MODE_P (autodetected_vector_mode)
      && (related_vector_mode (vector_modes[mode_i + 1],
			       GET_MODE_INNER (autodetected_vector_mode))
	  == autodetected_vector_mode)
      && (related_vector_mode (autodetected_vector_mode,
			       GET_MODE_INNER (vector_modes[mode_i + 1]))
	  == vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Skipping vector mode %s, which would"
			 " repeat the analysis for %s\n" ,
			 GET_MODE_NAME (vector_modes[mode_i + 1]),
			 GET_MODE_NAME (autodetected_vector_mode));
      mode_i += 1;
    }
  mode_i++;

  if (!res)
    {
      delete loop_vinfo;
      if (fatal)
	gcc_checking_assert (main_loop_vinfo == NULL);
      return opt_loop_vec_info::propagate_failure (other: res);
    }

  return opt_loop_vec_info::success (ptr: loop_vinfo);
}
3560 | |
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.

   This is the top-level driver: it checks the loop form, then tries
   the target's vector modes in order (or compares their costs) to pick
   a main-loop mode, and finally analyzes candidate modes for a
   vectorized epilogue.  */
opt_loop_vec_info
vect_analyze_loop (class loop *loop, vec_info_shared *shared)
{
  DUMP_VECT_SCOPE ("analyze_loop_nest" );

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop: loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    return opt_loop_vec_info::failure_at (loc: vect_location,
					  fmt: "outer-loop already vectorized.\n" );

  if (!find_loop_nest (loop, &shared->loop_nest))
    return opt_loop_vec_info::failure_at
      (loc: vect_location,
       fmt: "not vectorized: loop nest containing two or more consecutive inner"
       " loops cannot be vectorized\n" );

  /* Analyze the loop form.  */
  vect_loop_form_info loop_form_info;
  opt_result res = vect_analyze_loop_form (loop, info: &loop_form_info);
  if (!res)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad loop form.\n" );
      return opt_loop_vec_info::propagate_failure (other: res);
    }
  if (!integer_onep (loop_form_info.assumptions))
    {
      /* We consider to vectorize this loop by versioning it under
	 some assumptions.  In order to do this, we need to clear
	 existing information computed by scev and niter analyzer.  */
      scev_reset_htab ();
      free_numbers_of_iterations_estimates (loop);
      /* Also set flag for this loop so that following scev and niter
	 analysis are done under the assumptions.  */
      loop_constraint_set (loop, LOOP_C_FINITE);
    }
  else
    /* Clear the existing niter information to make sure the nonwrapping flag
       will be calculated and set appropriately.  */
    free_numbers_of_iterations_estimates (loop);

  auto_vector_modes vector_modes;
  /* Autodetect first vector size we try.  */
  vector_modes.safe_push (VOIDmode);
  unsigned int autovec_flags
    = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
						    loop->simdlen != 0);
  /* Only compare costs across modes when the target asks for it and a
     cost model is in effect at all.  */
  bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
			     && !unlimited_cost_model (loop));
  machine_mode autodetected_vector_mode = VOIDmode;
  opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
  unsigned int mode_i = 0;
  unsigned HOST_WIDE_INT simdlen = loop->simdlen;

  /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
     a mode has not been analyzed.  */
  auto_vec<poly_uint64, 8> cached_vf_per_mode;
  for (unsigned i = 0; i < vector_modes.length (); ++i)
    cached_vf_per_mode.safe_push (obj: 0);

  /* First determine the main loop vectorization mode, either the first
     one that works, starting with auto-detecting the vector mode and then
     following the targets order of preference, or the one with the
     lowest cost if pick_lowest_cost_p.  */
  while (1)
    {
      bool fatal;
      unsigned int last_mode_i = mode_i;
      /* Set cached VF to -1 prior to analysis, which indicates a mode has
	 failed.  */
      cached_vf_per_mode[last_mode_i] = -1;
      opt_loop_vec_info loop_vinfo
	= vect_analyze_loop_1 (loop, shared, loop_form_info: &loop_form_info,
			       NULL, vector_modes, mode_i,
			       autodetected_vector_mode, fatal);
      if (fatal)
	break;

      if (loop_vinfo)
	{
	  /* Analysis has been successful so update the VF value.  The
	     VF should always be a multiple of unroll_factor and we want to
	     capture the original VF here.  */
	  cached_vf_per_mode[last_mode_i]
	    = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
			 b: loop_vinfo->suggested_unroll_factor);
	  /* Once we hit the desired simdlen for the first time,
	     discard any previous attempts.  */
	  if (simdlen
	      && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
	    {
	      delete first_loop_vinfo;
	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
	      simdlen = 0;
	    }
	  else if (pick_lowest_cost_p
		   && first_loop_vinfo
		   && vect_joust_loop_vinfos (new_loop_vinfo: loop_vinfo, old_loop_vinfo: first_loop_vinfo))
	    {
	      /* Pick loop_vinfo over first_loop_vinfo.  */
	      delete first_loop_vinfo;
	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
	    }
	  /* Either keep this candidate (becoming the current best) or
	     discard it because the previous best was preferred.  */
	  if (first_loop_vinfo == NULL)
	    first_loop_vinfo = loop_vinfo;
	  else
	    {
	      delete loop_vinfo;
	      loop_vinfo = opt_loop_vec_info::success (NULL);
	    }

	  /* Commit to first_loop_vinfo if we have no reason to try
	     alternatives.  */
	  if (!simdlen && !pick_lowest_cost_p)
	    break;
	}
      if (mode_i == vector_modes.length ()
	  || autodetected_vector_mode == VOIDmode)
	break;

      /* Try the next biggest vector size.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis with vector mode %s\n" ,
			 GET_MODE_NAME (vector_modes[mode_i]));
    }
  if (!first_loop_vinfo)
    return opt_loop_vec_info::propagate_failure (other: res);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "***** Choosing vector mode %s\n" ,
		     GET_MODE_NAME (first_loop_vinfo->vector_mode));

  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
     enabled, SIMDUID is not set, it is the innermost loop and we have
     either already found the loop's SIMDLEN or there was no SIMDLEN to
     begin with.
     TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
  bool vect_epilogues = (!simdlen
			 && loop->inner == NULL
			 && param_vect_epilogues_nomask
			 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
			 /* No code motion support for multiple epilogues so for now
			    not supported when multiple exits.  */
			 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
			 && !loop->simduid);
  if (!vect_epilogues)
    return first_loop_vinfo;

  /* Now analyze first_loop_vinfo for epilogue vectorization.  */
  poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);

  /* For epilogues start the analysis from the first mode.  The motivation
     behind starting from the beginning comes from cases where the VECTOR_MODES
     array may contain length-agnostic and length-specific modes.  Their
     ordering is not guaranteed, so we could end up picking a mode for the main
     loop that is after the epilogue's optimal mode.  */
  vector_modes[0] = autodetected_vector_mode;
  mode_i = 0;

  bool supports_partial_vectors =
    partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
  poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);

  while (1)
    {
      /* If the target does not support partial vectors we can shorten the
	 number of modes to analyze for the epilogue as we know we can't pick a
	 mode that would lead to a VF at least as big as the
	 FIRST_VINFO_VF.  */
      if (!supports_partial_vectors
	  && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
	{
	  mode_i++;
	  if (mode_i == vector_modes.length ())
	    break;
	  continue;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying epilogue analysis with vector "
			 "mode %s\n" , GET_MODE_NAME (vector_modes[mode_i]));

      bool fatal;
      opt_loop_vec_info loop_vinfo
	= vect_analyze_loop_1 (loop, shared, loop_form_info: &loop_form_info,
			       main_loop_vinfo: first_loop_vinfo,
			       vector_modes, mode_i,
			       autodetected_vector_mode, fatal);
      if (fatal)
	break;

      if (loop_vinfo)
	{
	  if (pick_lowest_cost_p)
	    {
	      /* Keep trying to roll back vectorization attempts while the
		 loop_vec_infos they produced were worse than this one.  */
	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
	      while (!vinfos.is_empty ()
		     && vect_joust_loop_vinfos (new_loop_vinfo: loop_vinfo, old_loop_vinfo: vinfos.last ()))
		{
		  gcc_assert (vect_epilogues);
		  delete vinfos.pop ();
		}
	    }
	  /* For now only allow one epilogue loop.  */
	  if (first_loop_vinfo->epilogue_vinfos.is_empty ())
	    {
	      first_loop_vinfo->epilogue_vinfos.safe_push (obj: loop_vinfo);
	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
	      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
			  || maybe_ne (lowest_th, 0U));
	      /* Keep track of the known smallest versioning
		 threshold.  */
	      if (ordered_p (a: lowest_th, b: th))
		lowest_th = ordered_min (a: lowest_th, b: th);
	    }
	  else
	    {
	      delete loop_vinfo;
	      loop_vinfo = opt_loop_vec_info::success (NULL);
	    }

	  /* For now only allow one epilogue loop, but allow
	     pick_lowest_cost_p to replace it, so commit to the
	     first epilogue if we have no reason to try alternatives.  */
	  if (!pick_lowest_cost_p)
	    break;
	}

      if (mode_i == vector_modes.length ())
	break;

    }

  if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
    {
      LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Choosing epilogue vector mode %s\n" ,
			 GET_MODE_NAME
			 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
    }

  return first_loop_vinfo;
}
3818 | |
3819 | /* Return true if there is an in-order reduction function for CODE, storing |
3820 | it in *REDUC_FN if so. */ |
3821 | |
3822 | static bool |
3823 | fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn) |
3824 | { |
3825 | /* We support MINUS_EXPR by negating the operand. This also preserves an |
3826 | initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 + |
3827 | (-0.0) = -0.0. */ |
3828 | if (code == PLUS_EXPR || code == MINUS_EXPR) |
3829 | { |
3830 | *reduc_fn = IFN_FOLD_LEFT_PLUS; |
3831 | return true; |
3832 | } |
3833 | return false; |
3834 | } |
3835 | |
3836 | /* Function reduction_fn_for_scalar_code |
3837 | |
3838 | Input: |
3839 | CODE - tree_code of a reduction operations. |
3840 | |
3841 | Output: |
3842 | REDUC_FN - the corresponding internal function to be used to reduce the |
3843 | vector of partial results into a single scalar result, or IFN_LAST |
3844 | if the operation is a supported reduction operation, but does not have |
3845 | such an internal function. |
3846 | |
3847 | Return FALSE if CODE currently cannot be vectorized as reduction. */ |
3848 | |
3849 | bool |
3850 | reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn) |
3851 | { |
3852 | if (code.is_tree_code ()) |
3853 | switch (tree_code (code)) |
3854 | { |
3855 | case MAX_EXPR: |
3856 | *reduc_fn = IFN_REDUC_MAX; |
3857 | return true; |
3858 | |
3859 | case MIN_EXPR: |
3860 | *reduc_fn = IFN_REDUC_MIN; |
3861 | return true; |
3862 | |
3863 | case PLUS_EXPR: |
3864 | *reduc_fn = IFN_REDUC_PLUS; |
3865 | return true; |
3866 | |
3867 | case BIT_AND_EXPR: |
3868 | *reduc_fn = IFN_REDUC_AND; |
3869 | return true; |
3870 | |
3871 | case BIT_IOR_EXPR: |
3872 | *reduc_fn = IFN_REDUC_IOR; |
3873 | return true; |
3874 | |
3875 | case BIT_XOR_EXPR: |
3876 | *reduc_fn = IFN_REDUC_XOR; |
3877 | return true; |
3878 | |
3879 | case MULT_EXPR: |
3880 | case MINUS_EXPR: |
3881 | *reduc_fn = IFN_LAST; |
3882 | return true; |
3883 | |
3884 | default: |
3885 | return false; |
3886 | } |
3887 | else |
3888 | switch (combined_fn (code)) |
3889 | { |
3890 | CASE_CFN_FMAX: |
3891 | *reduc_fn = IFN_REDUC_FMAX; |
3892 | return true; |
3893 | |
3894 | CASE_CFN_FMIN: |
3895 | *reduc_fn = IFN_REDUC_FMIN; |
3896 | return true; |
3897 | |
3898 | default: |
3899 | return false; |
3900 | } |
3901 | } |
3902 | |
3903 | /* If there is a neutral value X such that a reduction would not be affected |
3904 | by the introduction of additional X elements, return that X, otherwise |
3905 | return null. CODE is the code of the reduction and SCALAR_TYPE is type |
3906 | of the scalar elements. If the reduction has just a single initial value |
3907 | then INITIAL_VALUE is that value, otherwise it is null. |
3908 | If AS_INITIAL is TRUE the value is supposed to be used as initial value. |
3909 | In that case no signed zero is returned. */ |
3910 | |
3911 | tree |
3912 | neutral_op_for_reduction (tree scalar_type, code_helper code, |
3913 | tree initial_value, bool as_initial) |
3914 | { |
3915 | if (code.is_tree_code ()) |
3916 | switch (tree_code (code)) |
3917 | { |
3918 | case DOT_PROD_EXPR: |
3919 | case SAD_EXPR: |
3920 | case MINUS_EXPR: |
3921 | case BIT_IOR_EXPR: |
3922 | case BIT_XOR_EXPR: |
3923 | return build_zero_cst (scalar_type); |
3924 | case WIDEN_SUM_EXPR: |
3925 | case PLUS_EXPR: |
3926 | if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type)) |
3927 | return build_real (scalar_type, dconstm0); |
3928 | else |
3929 | return build_zero_cst (scalar_type); |
3930 | |
3931 | case MULT_EXPR: |
3932 | return build_one_cst (scalar_type); |
3933 | |
3934 | case BIT_AND_EXPR: |
3935 | return build_all_ones_cst (scalar_type); |
3936 | |
3937 | case MAX_EXPR: |
3938 | case MIN_EXPR: |
3939 | return initial_value; |
3940 | |
3941 | default: |
3942 | return NULL_TREE; |
3943 | } |
3944 | else |
3945 | switch (combined_fn (code)) |
3946 | { |
3947 | CASE_CFN_FMIN: |
3948 | CASE_CFN_FMAX: |
3949 | return initial_value; |
3950 | |
3951 | default: |
3952 | return NULL_TREE; |
3953 | } |
3954 | } |
3955 | |
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  MSG_TYPE selects the dump category
   (e.g. MSG_NOTE or MSG_MISSED_OPTIMIZATION) the text is emitted under.  */

static void
report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s%G" , msg, stmt);
}
3964 | |
3965 | /* Return true if we need an in-order reduction for operation CODE |
3966 | on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer |
3967 | overflow must wrap. */ |
3968 | |
3969 | bool |
3970 | needs_fold_left_reduction_p (tree type, code_helper code) |
3971 | { |
3972 | /* CHECKME: check for !flag_finite_math_only too? */ |
3973 | if (SCALAR_FLOAT_TYPE_P (type)) |
3974 | { |
3975 | if (code.is_tree_code ()) |
3976 | switch (tree_code (code)) |
3977 | { |
3978 | case MIN_EXPR: |
3979 | case MAX_EXPR: |
3980 | return false; |
3981 | |
3982 | default: |
3983 | return !flag_associative_math; |
3984 | } |
3985 | else |
3986 | switch (combined_fn (code)) |
3987 | { |
3988 | CASE_CFN_FMIN: |
3989 | CASE_CFN_FMAX: |
3990 | return false; |
3991 | |
3992 | default: |
3993 | return !flag_associative_math; |
3994 | } |
3995 | } |
3996 | |
3997 | if (INTEGRAL_TYPE_P (type)) |
3998 | return (!code.is_tree_code () |
3999 | || !operation_no_trapping_overflow (type, tree_code (code))); |
4000 | |
4001 | if (SAT_FIXED_POINT_TYPE_P (type)) |
4002 | return true; |
4003 | |
4004 | return false; |
4005 | } |
4006 | |
4007 | /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and |
4008 | has a handled computation expression. Store the main reduction |
4009 | operation in *CODE. */ |
4010 | |
static bool
check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
		      tree loop_arg, code_helper *code,
		      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
{
  auto_bitmap visited;
  /* The walk below terminates successfully once it reaches the PHI
     result again, i.e. once the use-def cycle is closed.  */
  tree lookfor = PHI_RESULT (phi);
  ssa_op_iter curri;
  /* Position the PHI use iterator on the argument LOOP_ARG.  */
  use_operand_p curr = op_iter_init_phiuse (ptr: &curri, phi, SSA_OP_USE);
  while (USE_FROM_PTR (curr) != loop_arg)
    curr = op_iter_next_use (ptr: &curri);
  /* Mark the iterator as exhausted so that backtracking over this initial
     element does not explore the other PHI arguments.  */
  curri.i = curri.numops;
  do
    {
      path.safe_push (obj: std::make_pair (x&: curri, y&: curr));
      tree use = USE_FROM_PTR (curr);
      if (use == lookfor)
	break;
      gimple *def = SSA_NAME_DEF_STMT (use);
      if (gimple_nop_p (g: def)
	  || ! flow_bb_inside_loop_p (loop, gimple_bb (g: def)))
	{
pop:
	  /* Dead end (definition not inside the loop): backtrack by
	     popping path elements until an unvisited alternative use
	     is found.  */
	  do
	    {
	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
	      curri = x.first;
	      curr = x.second;
	      do
		curr = op_iter_next_use (ptr: &curri);
	      /* Skip already visited or non-SSA operands (from iterating
	         over PHI args).  */
	      while (curr != NULL_USE_OPERAND_P
		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
			 || ! bitmap_set_bit (visited,
					      SSA_NAME_VERSION
						(USE_FROM_PTR (curr)))));
	    }
	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
	  if (curr == NULL_USE_OPERAND_P)
	    break;
	}
      else
	{
	  /* Descend into DEF and continue the walk at its first
	     not-yet-visited SSA use operand.  */
	  if (gimple_code (g: def) == GIMPLE_PHI)
	    curr = op_iter_init_phiuse (ptr: &curri, phi: as_a <gphi *>(p: def), SSA_OP_USE);
	  else
	    curr = op_iter_init_use (ptr: &curri, stmt: def, SSA_OP_USE);
	  while (curr != NULL_USE_OPERAND_P
		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
		     || ! bitmap_set_bit (visited,
					  SSA_NAME_VERSION
					    (USE_FROM_PTR (curr)))))
	    curr = op_iter_next_use (ptr: &curri);
	  if (curr == NULL_USE_OPERAND_P)
	    goto pop;
	}
    }
  while (1);
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      dump_printf_loc (MSG_NOTE, loc, "reduction path: " );
      unsigned i;
      std::pair<ssa_op_iter, use_operand_p> *x;
      FOR_EACH_VEC_ELT (path, i, x)
	dump_printf (MSG_NOTE, "%T " , USE_FROM_PTR (x->second));
      dump_printf (MSG_NOTE, "\n" );
    }

  /* Check whether the reduction path detected is valid.  */
  bool fail = path.length () == 0;
  /* Whether the reduction value gets negated each iteration (through a
     MINUS_EXPR / IFN_COND_SUB second operand); such cycles are rejected
     at the end.  */
  bool neg = false;
  /* TYPE_SIGN of the first non-conversion operation on the path; all
     MIN/MAX operations have to agree on it.  */
  int sign = -1;
  *code = ERROR_MARK;
  for (unsigned i = 1; i < path.length (); ++i)
    {
      gimple *use_stmt = USE_STMT (path[i].second);
      gimple_match_op op;
      if (!gimple_extract_op (use_stmt, &op))
	{
	  fail = true;
	  break;
	}
      /* Compute OPI, the operand index of the use within USE_STMT.  */
      unsigned int opi = op.num_ops;
      if (gassign *assign = dyn_cast<gassign *> (p: use_stmt))
	{
	  /* The following make sure we can compute the operand index
	     easily plus it mostly disallows chaining via COND_EXPR condition
	     operands.  */
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_assign_rhs1_ptr (gs: assign) + opi == path[i].second->use)
	      break;
	}
      else if (gcall *call = dyn_cast<gcall *> (p: use_stmt))
	{
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_call_arg_ptr (gs: call, index: opi) == path[i].second->use)
	      break;
	}
      if (opi == op.num_ops)
	{
	  fail = true;
	  break;
	}
      op.code = canonicalize_code (op.code, op.type);
      if (op.code == MINUS_EXPR)
	{
	  op.code = PLUS_EXPR;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (op.ops[1] == op.ops[opi])
	    neg = ! neg;
	}
      else if (op.code == IFN_COND_SUB)
	{
	  op.code = IFN_COND_ADD;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (op.ops[2] == op.ops[opi])
	    neg = ! neg;
	}
      /* A no-op conversion is transparent; anything else has to use the
	 same operation code throughout the whole path.  */
      if (CONVERT_EXPR_CODE_P (op.code)
	  && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
	;
      else if (*code == ERROR_MARK)
	{
	  *code = op.code;
	  sign = TYPE_SIGN (op.type);
	}
      else if (op.code != *code)
	{
	  fail = true;
	  break;
	}
      else if ((op.code == MIN_EXPR
		|| op.code == MAX_EXPR)
	       && sign != TYPE_SIGN (op.type))
	{
	  fail = true;
	  break;
	}
      /* Check there's only a single stmt the op is used on.  For the
	 not value-changing tail and the last stmt allow out-of-loop uses.
	 ??? We could relax this and handle arbitrary live stmts by
	 forcing a scalar epilogue for example.  */
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *op_use_stmt;
      unsigned cnt = 0;
      bool cond_fn_p = op.code.is_internal_fn ()
	&& (conditional_internal_fn_code (internal_fn (op.code))
	    != ERROR_MARK);

      FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
	{
	  /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
	     op1 twice (once as definition, once as else) in the same operation.
	     Allow this.  */
	  if (cond_fn_p && op_use_stmt == use_stmt)
	    {
	      gcall *call = as_a<gcall *> (p: use_stmt);
	      unsigned else_pos
		= internal_fn_else_index (internal_fn (op.code));

	      for (unsigned int j = 0; j < gimple_call_num_args (gs: call); ++j)
		{
		  if (j == else_pos)
		    continue;
		  if (gimple_call_arg (gs: call, index: j) == op.ops[opi])
		    cnt++;
		}
	    }
	  else if (!is_gimple_debug (gs: op_use_stmt)
		   && (*code != ERROR_MARK
		       || flow_bb_inside_loop_p (loop,
						 gimple_bb (g: op_use_stmt))))
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      cnt++;
	}

      if (cnt != 1)
	{
	  fail = true;
	  break;
	}
    }
  /* Success requires a non-empty path, no per-iteration negation and a
     recognized (non-ERROR_MARK) reduction operation.  */
  return ! fail && ! neg && *code != ERROR_MARK;
}
4197 | |
4198 | bool |
4199 | check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, |
4200 | tree loop_arg, enum tree_code code) |
4201 | { |
4202 | auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; |
4203 | code_helper code_; |
4204 | return (check_reduction_path (loc, loop, phi, loop_arg, code: &code_, path) |
4205 | && code_ == code); |
4206 | } |
4207 | |
4208 | |
4209 | |
4210 | /* Function vect_is_simple_reduction |
4211 | |
4212 | (1) Detect a cross-iteration def-use cycle that represents a simple |
4213 | reduction computation. We look for the following pattern: |
4214 | |
4215 | loop_header: |
4216 | a1 = phi < a0, a2 > |
4217 | a3 = ... |
4218 | a2 = operation (a3, a1) |
4219 | |
4220 | or |
4221 | |
4222 | a3 = ... |
4223 | loop_header: |
4224 | a1 = phi < a0, a2 > |
4225 | a2 = operation (a3, a1) |
4226 | |
4227 | such that: |
4228 | 1. operation is commutative and associative and it is safe to |
4229 | change the order of the computation |
4230 | 2. no uses for a2 in the loop (a2 is used out of the loop) |
4231 | 3. no uses of a1 in the loop besides the reduction operation |
4232 | 4. no uses of a1 outside the loop. |
4233 | |
4234 | Conditions 1,4 are tested here. |
4235 | Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. |
4236 | |
4237 | (2) Detect a cross-iteration def-use cycle in nested loops, i.e., |
4238 | nested cycles. |
4239 | |
4240 | (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double |
4241 | reductions: |
4242 | |
4243 | a1 = phi < a0, a2 > |
4244 | inner loop (def of a3) |
4245 | a2 = phi < a3 > |
4246 | |
   (4) Detect condition expressions, i.e.:
4248 | for (int i = 0; i < N; i++) |
4249 | if (a[i] < val) |
4250 | ret_val = a[i]; |
4251 | |
4252 | */ |
4253 | |
static stmt_vec_info
vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
			  bool *double_reduc, bool *reduc_chain_p, bool slp)
{
  gphi *phi = as_a <gphi *> (p: phi_info->stmt);
  gimple *phi_use_stmt = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  *double_reduc = false;
  *reduc_chain_p = false;
  STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;

  tree phi_name = PHI_RESULT (phi);
  /* ??? If there are no uses of the PHI result the inner loop reduction
     won't be detected as possibly double-reduction by vectorizable_reduction
     because that tries to walk the PHI arg from the preheader edge which
     can be constant.  See PR60382.  */
  if (has_zero_uses (var: phi_name))
    return NULL;
  class loop *loop = (gimple_bb (g: phi))->loop_father;
  /* Count the in-loop uses of the PHI result; more than one use inside
     the loop disqualifies the cycle further below.  */
  unsigned nphi_def_loop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (gs: use_stmt))
	continue;

      if (!flow_bb_inside_loop_p (loop, gimple_bb (g: use_stmt)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "intermediate value used outside loop.\n" );

	  return NULL;
	}

      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
	 op1 twice (once as definition, once as else) in the same operation.
	 Only count it as one.  */
      if (use_stmt != phi_use_stmt)
	{
	  nphi_def_loop_uses++;
	  phi_use_stmt = use_stmt;
	}
    }

  /* The latch value of the cycle has to be defined by an SSA name
     inside the loop.  */
  tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
  if (TREE_CODE (latch_def) != SSA_NAME)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction: not ssa_name: %T\n" , latch_def);
      return NULL;
    }

  stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
  if (!def_stmt_info
      || !flow_bb_inside_loop_p (loop, gimple_bb (g: def_stmt_info->stmt)))
    return NULL;

  /* Whether PHI sits in a loop nested inside the loop being vectorized.  */
  bool nested_in_vect_loop
    = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
  unsigned nlatch_def_loop_uses = 0;
  auto_vec<gphi *, 3> lcphis;
  bool inner_loop_of_double_reduc = false;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (gs: use_stmt))
	continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (g: use_stmt)))
	nlatch_def_loop_uses++;
      else
	{
	  /* We can have more than one loop-closed PHI.  */
	  lcphis.safe_push (obj: as_a <gphi *> (p: use_stmt));
	  if (nested_in_vect_loop
	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
		  == vect_double_reduction_def))
	    inner_loop_of_double_reduc = true;
	}
    }

  /* If we are vectorizing an inner reduction we are executing that
     in the original order only in case we are not dealing with a
     double reduction.  */
  if (nested_in_vect_loop && !inner_loop_of_double_reduc)
    {
      if (dump_enabled_p ())
	report_vect_op (msg_type: MSG_NOTE, stmt: def_stmt_info->stmt,
			msg: "detected nested cycle: " );
      return def_stmt_info;
    }

  /* When the inner loop of a double reduction ends up with more than
     one loop-closed PHI we have failed to classify alternate such
     PHIs as double reduction, leading to wrong code.  See PR103237.  */
  if (inner_loop_of_double_reduc && lcphis.length () != 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "unhandle double reduction\n" );
      return NULL;
    }

  /* If this isn't a nested cycle or if the nested cycle reduction value
     is used outside of the inner loop we cannot handle uses of the reduction
     value.  */
  if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction used in loop.\n" );
      return NULL;
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (gphi *def_stmt = dyn_cast <gphi *> (p: def_stmt_info->stmt))
    {
      tree op1 = PHI_ARG_DEF (def_stmt, 0);
      if (gimple_phi_num_args (gs: def_stmt) != 1
	  || TREE_CODE (op1) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported phi node definition.\n" );

	  return NULL;
	}

      /* Verify there is an inner cycle composed of the PHI phi_use_stmt
	 and the latch definition op1.  */
      gimple *def1 = SSA_NAME_DEF_STMT (op1);
      if (gimple_bb (g: def1)
	  && flow_bb_inside_loop_p (loop, gimple_bb (g: def_stmt))
	  && loop->inner
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (g: def1))
	  && (is_gimple_assign (gs: def1) || is_gimple_call (gs: def1))
	  && is_a <gphi *> (p: phi_use_stmt)
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (g: phi_use_stmt))
	  && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
					    loop_latch_edge (loop->inner))))
	{
	  if (dump_enabled_p ())
	    report_vect_op (msg_type: MSG_NOTE, stmt: def_stmt,
			    msg: "detected double reduction: " );

	  *double_reduc = true;
	  return def_stmt_info;
	}

      return NULL;
    }

  /* Look for the expression computing latch_def from the loop PHI result.  */
  auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
  code_helper code;
  if (check_reduction_path (loc: vect_location, loop, phi, loop_arg: latch_def, code: &code,
			    path))
    {
      STMT_VINFO_REDUC_CODE (phi_info) = code;
      if (code == COND_EXPR && !nested_in_vect_loop)
	STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;

      /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
	 reduction chain for which the additional restriction is that
	 all operations in the chain are the same.  */
      auto_vec<stmt_vec_info, 8> reduc_chain;
      unsigned i;
      bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
      /* Walk the path from the last stmt towards the PHI; path[0] is the
	 PHI latch use itself and is not visited.  */
      for (i = path.length () - 1; i >= 1; --i)
	{
	  gimple *stmt = USE_STMT (path[i].second);
	  stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
	  gimple_match_op op;
	  if (!gimple_extract_op (stmt, &op))
	    gcc_unreachable ();
	  /* The reduction operand index is the offset of the use within
	     the stmt's operands.  */
	  if (gassign *assign = dyn_cast<gassign *> (p: stmt))
	    STMT_VINFO_REDUC_IDX (stmt_info)
	      = path[i].second->use - gimple_assign_rhs1_ptr (gs: assign);
	  else
	    {
	      gcall *call = as_a<gcall *> (p: stmt);
	      STMT_VINFO_REDUC_IDX (stmt_info)
		= path[i].second->use - gimple_call_arg_ptr (gs: call, index: 0);
	    }
	  bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
				     && (i == 1 || i == path.length () - 1));
	  if ((op.code != code && !leading_conversion)
	      /* We can only handle the final value in epilogue
		 generation for reduction chains.  */
	      || (i != 1 && !has_single_use (var: gimple_get_lhs (stmt))))
	    is_slp_reduc = false;
	  /* For reduction chains we support trailing/leading
	     conversions.  We do not store those in the actual chain.  */
	  if (leading_conversion)
	    continue;
	  reduc_chain.safe_push (obj: stmt_info);
	}
      if (slp && is_slp_reduc && reduc_chain.length () > 1)
	{
	  /* Link the chain stmts via the REDUC_GROUP fields.  */
	  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
	    {
	      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
	      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
	    }
	  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
	  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;

	  /* Save the chain for further analysis in SLP detection.  */
	  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (obj: reduc_chain[0]);
	  REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();

	  *reduc_chain_p = true;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "reduction: detected reduction chain\n" );
	}
      else if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "reduction: detected reduction\n" );

      return def_stmt_info;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "reduction: unknown pattern\n" );

  return NULL;
}
4487 | |
4488 | /* Estimate the number of peeled epilogue iterations for LOOP_VINFO. |
4489 | PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations, |
4490 | or -1 if not known. */ |
4491 | |
4492 | static int |
4493 | vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue) |
4494 | { |
4495 | int assumed_vf = vect_vf_for_cost (loop_vinfo); |
4496 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1) |
4497 | { |
4498 | if (dump_enabled_p ()) |
4499 | dump_printf_loc (MSG_NOTE, vect_location, |
4500 | "cost model: epilogue peel iters set to vf/2 " |
4501 | "because loop iterations are unknown .\n" ); |
4502 | return assumed_vf / 2; |
4503 | } |
4504 | else |
4505 | { |
4506 | int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); |
4507 | peel_iters_prologue = MIN (niters, peel_iters_prologue); |
4508 | int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf; |
4509 | /* If we need to peel for gaps, but no peeling is required, we have to |
4510 | peel VF iterations. */ |
4511 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue) |
4512 | peel_iters_epilogue = assumed_vf; |
4513 | return peel_iters_epilogue; |
4514 | } |
4515 | } |
4516 | |
4517 | /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ |
4518 | int |
4519 | vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, |
4520 | int *peel_iters_epilogue, |
4521 | stmt_vector_for_cost *scalar_cost_vec, |
4522 | stmt_vector_for_cost *prologue_cost_vec, |
4523 | stmt_vector_for_cost *epilogue_cost_vec) |
4524 | { |
4525 | int retval = 0; |
4526 | |
4527 | *peel_iters_epilogue |
4528 | = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue); |
4529 | |
4530 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) |
4531 | { |
4532 | /* If peeled iterations are known but number of scalar loop |
4533 | iterations are unknown, count a taken branch per peeled loop. */ |
4534 | if (peel_iters_prologue > 0) |
4535 | retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, |
4536 | vect_prologue); |
4537 | if (*peel_iters_epilogue > 0) |
4538 | retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken, |
4539 | vect_epilogue); |
4540 | } |
4541 | |
4542 | stmt_info_for_cost *si; |
4543 | int j; |
4544 | if (peel_iters_prologue) |
4545 | FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) |
4546 | retval += record_stmt_cost (body_cost_vec: prologue_cost_vec, |
4547 | count: si->count * peel_iters_prologue, |
4548 | kind: si->kind, stmt_info: si->stmt_info, misalign: si->misalign, |
4549 | where: vect_prologue); |
4550 | if (*peel_iters_epilogue) |
4551 | FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) |
4552 | retval += record_stmt_cost (body_cost_vec: epilogue_cost_vec, |
4553 | count: si->count * *peel_iters_epilogue, |
4554 | kind: si->kind, stmt_info: si->stmt_info, misalign: si->misalign, |
4555 | where: vect_epilogue); |
4556 | |
4557 | return retval; |
4558 | } |
4559 | |
4560 | /* Function vect_estimate_min_profitable_iters |
4561 | |
4562 | Return the number of iterations required for the vector version of the |
4563 | loop to be profitable relative to the cost of the scalar version of the |
4564 | loop. |
4565 | |
4566 | *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold |
4567 | of iterations for vectorization. -1 value means loop vectorization |
4568 | is not profitable. This returned value may be used for dynamic |
4569 | profitability check. |
4570 | |
4571 | *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used |
4572 | for static check against estimated number of iterations. */ |
4573 | |
4574 | static void |
4575 | vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, |
4576 | int *ret_min_profitable_niters, |
4577 | int *ret_min_profitable_estimate, |
4578 | unsigned *suggested_unroll_factor) |
4579 | { |
4580 | int min_profitable_iters; |
4581 | int min_profitable_estimate; |
4582 | int peel_iters_prologue; |
4583 | int peel_iters_epilogue; |
4584 | unsigned vec_inside_cost = 0; |
4585 | int vec_outside_cost = 0; |
4586 | unsigned vec_prologue_cost = 0; |
4587 | unsigned vec_epilogue_cost = 0; |
4588 | int scalar_single_iter_cost = 0; |
4589 | int scalar_outside_cost = 0; |
4590 | int assumed_vf = vect_vf_for_cost (loop_vinfo); |
4591 | int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
4592 | vector_costs *target_cost_data = loop_vinfo->vector_costs; |
4593 | |
4594 | /* Cost model disabled. */ |
4595 | if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
4596 | { |
4597 | if (dump_enabled_p ()) |
4598 | dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n" ); |
4599 | *ret_min_profitable_niters = 0; |
4600 | *ret_min_profitable_estimate = 0; |
4601 | return; |
4602 | } |
4603 | |
4604 | /* Requires loop versioning tests to handle misalignment. */ |
4605 | if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) |
4606 | { |
4607 | /* FIXME: Make cost depend on complexity of individual check. */ |
4608 | unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); |
4609 | (void) add_stmt_cost (costs: target_cost_data, count: len, kind: scalar_stmt, where: vect_prologue); |
4610 | if (dump_enabled_p ()) |
4611 | dump_printf (MSG_NOTE, |
4612 | "cost model: Adding cost of checks for loop " |
4613 | "versioning to treat misalignment.\n" ); |
4614 | } |
4615 | |
4616 | /* Requires loop versioning with alias checks. */ |
4617 | if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) |
4618 | { |
4619 | /* FIXME: Make cost depend on complexity of individual check. */ |
4620 | unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length (); |
4621 | (void) add_stmt_cost (costs: target_cost_data, count: len, kind: scalar_stmt, where: vect_prologue); |
4622 | len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length (); |
4623 | if (len) |
4624 | /* Count LEN - 1 ANDs and LEN comparisons. */ |
4625 | (void) add_stmt_cost (costs: target_cost_data, count: len * 2 - 1, |
4626 | kind: scalar_stmt, where: vect_prologue); |
4627 | len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length (); |
4628 | if (len) |
4629 | { |
4630 | /* Count LEN - 1 ANDs and LEN comparisons. */ |
4631 | unsigned int nstmts = len * 2 - 1; |
4632 | /* +1 for each bias that needs adding. */ |
4633 | for (unsigned int i = 0; i < len; ++i) |
4634 | if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) |
4635 | nstmts += 1; |
4636 | (void) add_stmt_cost (costs: target_cost_data, count: nstmts, |
4637 | kind: scalar_stmt, where: vect_prologue); |
4638 | } |
4639 | if (dump_enabled_p ()) |
4640 | dump_printf (MSG_NOTE, |
4641 | "cost model: Adding cost of checks for loop " |
4642 | "versioning aliasing.\n" ); |
4643 | } |
4644 | |
4645 | /* Requires loop versioning with niter checks. */ |
4646 | if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) |
4647 | { |
4648 | /* FIXME: Make cost depend on complexity of individual check. */ |
4649 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: vector_stmt, |
4650 | NULL, NULL, NULL_TREE, misalign: 0, where: vect_prologue); |
4651 | if (dump_enabled_p ()) |
4652 | dump_printf (MSG_NOTE, |
4653 | "cost model: Adding cost of checks for loop " |
4654 | "versioning niters.\n" ); |
4655 | } |
4656 | |
4657 | if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
4658 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: cond_branch_taken, |
4659 | where: vect_prologue); |
4660 | |
4661 | /* Count statements in scalar loop. Using this as scalar cost for a single |
4662 | iteration for now. |
4663 | |
4664 | TODO: Add outer loop support. |
4665 | |
4666 | TODO: Consider assigning different costs to different scalar |
4667 | statements. */ |
4668 | |
4669 | scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost (); |
4670 | |
4671 | /* Add additional cost for the peeled instructions in prologue and epilogue |
4672 | loop. (For fully-masked loops there will be no peeling.) |
4673 | |
4674 | FORNOW: If we don't know the value of peel_iters for prologue or epilogue |
4675 | at compile-time - we assume it's vf/2 (the worst would be vf-1). |
4676 | |
4677 | TODO: Build an expression that represents peel_iters for prologue and |
4678 | epilogue to be used in a run-time test. */ |
4679 | |
4680 | bool prologue_need_br_taken_cost = false; |
4681 | bool prologue_need_br_not_taken_cost = false; |
4682 | |
4683 | /* Calculate peel_iters_prologue. */ |
4684 | if (vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
4685 | peel_iters_prologue = 0; |
4686 | else if (npeel < 0) |
4687 | { |
4688 | peel_iters_prologue = assumed_vf / 2; |
4689 | if (dump_enabled_p ()) |
4690 | dump_printf (MSG_NOTE, "cost model: " |
4691 | "prologue peel iters set to vf/2.\n" ); |
4692 | |
4693 | /* If peeled iterations are unknown, count a taken branch and a not taken |
4694 | branch per peeled loop. Even if scalar loop iterations are known, |
4695 | vector iterations are not known since peeled prologue iterations are |
4696 | not known. Hence guards remain the same. */ |
4697 | prologue_need_br_taken_cost = true; |
4698 | prologue_need_br_not_taken_cost = true; |
4699 | } |
4700 | else |
4701 | { |
4702 | peel_iters_prologue = npeel; |
4703 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0) |
4704 | /* If peeled iterations are known but number of scalar loop |
4705 | iterations are unknown, count a taken branch per peeled loop. */ |
4706 | prologue_need_br_taken_cost = true; |
4707 | } |
4708 | |
4709 | bool epilogue_need_br_taken_cost = false; |
4710 | bool epilogue_need_br_not_taken_cost = false; |
4711 | |
4712 | /* Calculate peel_iters_epilogue. */ |
4713 | if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
4714 | /* We need to peel exactly one iteration for gaps. */ |
4715 | peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0; |
4716 | else if (npeel < 0) |
4717 | { |
4718 | /* If peeling for alignment is unknown, loop bound of main loop |
4719 | becomes unknown. */ |
4720 | peel_iters_epilogue = assumed_vf / 2; |
4721 | if (dump_enabled_p ()) |
4722 | dump_printf (MSG_NOTE, "cost model: " |
4723 | "epilogue peel iters set to vf/2 because " |
4724 | "peeling for alignment is unknown.\n" ); |
4725 | |
4726 | /* See the same reason above in peel_iters_prologue calculation. */ |
4727 | epilogue_need_br_taken_cost = true; |
4728 | epilogue_need_br_not_taken_cost = true; |
4729 | } |
4730 | else |
4731 | { |
4732 | peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue: npeel); |
4733 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0) |
4734 | /* If peeled iterations are known but number of scalar loop |
4735 | iterations are unknown, count a taken branch per peeled loop. */ |
4736 | epilogue_need_br_taken_cost = true; |
4737 | } |
4738 | |
4739 | stmt_info_for_cost *si; |
4740 | int j; |
4741 | /* Add costs associated with peel_iters_prologue. */ |
4742 | if (peel_iters_prologue) |
4743 | FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) |
4744 | { |
4745 | (void) add_stmt_cost (costs: target_cost_data, |
4746 | count: si->count * peel_iters_prologue, kind: si->kind, |
4747 | stmt_info: si->stmt_info, node: si->node, vectype: si->vectype, |
4748 | misalign: si->misalign, where: vect_prologue); |
4749 | } |
4750 | |
4751 | /* Add costs associated with peel_iters_epilogue. */ |
4752 | if (peel_iters_epilogue) |
4753 | FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) |
4754 | { |
4755 | (void) add_stmt_cost (costs: target_cost_data, |
4756 | count: si->count * peel_iters_epilogue, kind: si->kind, |
4757 | stmt_info: si->stmt_info, node: si->node, vectype: si->vectype, |
4758 | misalign: si->misalign, where: vect_epilogue); |
4759 | } |
4760 | |
4761 | /* Add possible cond_branch_taken/cond_branch_not_taken cost. */ |
4762 | |
4763 | if (prologue_need_br_taken_cost) |
4764 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: cond_branch_taken, |
4765 | where: vect_prologue); |
4766 | |
4767 | if (prologue_need_br_not_taken_cost) |
4768 | (void) add_stmt_cost (costs: target_cost_data, count: 1, |
4769 | kind: cond_branch_not_taken, where: vect_prologue); |
4770 | |
4771 | if (epilogue_need_br_taken_cost) |
4772 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: cond_branch_taken, |
4773 | where: vect_epilogue); |
4774 | |
4775 | if (epilogue_need_br_not_taken_cost) |
4776 | (void) add_stmt_cost (costs: target_cost_data, count: 1, |
4777 | kind: cond_branch_not_taken, where: vect_epilogue); |
4778 | |
4779 | /* Take care of special costs for rgroup controls of partial vectors. */ |
4780 | if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
4781 | && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) |
4782 | == vect_partial_vectors_avx512)) |
4783 | { |
4784 | /* Calculate how many masks we need to generate. */ |
4785 | unsigned int num_masks = 0; |
4786 | bool need_saturation = false; |
4787 | for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec) |
4788 | if (rgm.type) |
4789 | { |
4790 | unsigned nvectors = rgm.factor; |
4791 | num_masks += nvectors; |
4792 | if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type)) |
4793 | < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo))) |
4794 | need_saturation = true; |
4795 | } |
4796 | |
4797 | /* ??? The target isn't able to identify the costs below as |
4798 | producing masks so it cannot penaltize cases where we'd run |
4799 | out of mask registers for example. */ |
4800 | |
4801 | /* ??? We are also failing to account for smaller vector masks |
4802 | we generate by splitting larger masks in vect_get_loop_mask. */ |
4803 | |
4804 | /* In the worst case, we need to generate each mask in the prologue |
4805 | and in the loop body. We need one splat per group and one |
4806 | compare per mask. |
4807 | |
4808 | Sometimes the prologue mask will fold to a constant, |
4809 | so the actual prologue cost might be smaller. However, it's |
4810 | simpler and safer to use the worst-case cost; if this ends up |
4811 | being the tie-breaker between vectorizing or not, then it's |
4812 | probably better not to vectorize. */ |
4813 | (void) add_stmt_cost (costs: target_cost_data, |
4814 | count: num_masks |
4815 | + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (), |
4816 | kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0, |
4817 | where: vect_prologue); |
4818 | (void) add_stmt_cost (costs: target_cost_data, |
4819 | count: num_masks |
4820 | + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (), |
4821 | kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0, where: vect_body); |
4822 | |
4823 | /* When we need saturation we need it both in the prologue and |
4824 | the epilogue. */ |
4825 | if (need_saturation) |
4826 | { |
4827 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: scalar_stmt, |
4828 | NULL, NULL, NULL_TREE, misalign: 0, where: vect_prologue); |
4829 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: scalar_stmt, |
4830 | NULL, NULL, NULL_TREE, misalign: 0, where: vect_body); |
4831 | } |
4832 | } |
4833 | else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
4834 | && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) |
4835 | == vect_partial_vectors_while_ult)) |
4836 | { |
4837 | /* Calculate how many masks we need to generate. */ |
4838 | unsigned int num_masks = 0; |
4839 | rgroup_controls *rgm; |
4840 | unsigned int num_vectors_m1; |
4841 | FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, |
4842 | num_vectors_m1, rgm) |
4843 | if (rgm->type) |
4844 | num_masks += num_vectors_m1 + 1; |
4845 | gcc_assert (num_masks > 0); |
4846 | |
4847 | /* In the worst case, we need to generate each mask in the prologue |
4848 | and in the loop body. One of the loop body mask instructions |
4849 | replaces the comparison in the scalar loop, and since we don't |
4850 | count the scalar comparison against the scalar body, we shouldn't |
4851 | count that vector instruction against the vector body either. |
4852 | |
4853 | Sometimes we can use unpacks instead of generating prologue |
4854 | masks and sometimes the prologue mask will fold to a constant, |
4855 | so the actual prologue cost might be smaller. However, it's |
4856 | simpler and safer to use the worst-case cost; if this ends up |
4857 | being the tie-breaker between vectorizing or not, then it's |
4858 | probably better not to vectorize. */ |
4859 | (void) add_stmt_cost (costs: target_cost_data, count: num_masks, |
4860 | kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0, |
4861 | where: vect_prologue); |
4862 | (void) add_stmt_cost (costs: target_cost_data, count: num_masks - 1, |
4863 | kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0, |
4864 | where: vect_body); |
4865 | } |
4866 | else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) |
4867 | { |
4868 | /* Referring to the functions vect_set_loop_condition_partial_vectors |
4869 | and vect_set_loop_controls_directly, we need to generate each |
4870 | length in the prologue and in the loop body if required. Although |
4871 | there are some possible optimizations, we consider the worst case |
4872 | here. */ |
4873 | |
4874 | bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo); |
4875 | signed char partial_load_store_bias |
4876 | = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); |
4877 | bool need_iterate_p |
4878 | = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) |
4879 | && !vect_known_niters_smaller_than_vf (loop_vinfo)); |
4880 | |
4881 | /* Calculate how many statements to be added. */ |
4882 | unsigned int prologue_stmts = 0; |
4883 | unsigned int body_stmts = 0; |
4884 | |
4885 | rgroup_controls *rgc; |
4886 | unsigned int num_vectors_m1; |
4887 | FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) |
4888 | if (rgc->type) |
4889 | { |
4890 | /* May need one SHIFT for nitems_total computation. */ |
4891 | unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor; |
4892 | if (nitems != 1 && !niters_known_p) |
4893 | prologue_stmts += 1; |
4894 | |
4895 | /* May need one MAX and one MINUS for wrap around. */ |
4896 | if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc)) |
4897 | prologue_stmts += 2; |
4898 | |
4899 | /* Need one MAX and one MINUS for each batch limit excepting for |
4900 | the 1st one. */ |
4901 | prologue_stmts += num_vectors_m1 * 2; |
4902 | |
4903 | unsigned int num_vectors = num_vectors_m1 + 1; |
4904 | |
4905 | /* Need to set up lengths in prologue, only one MIN required |
4906 | for each since start index is zero. */ |
4907 | prologue_stmts += num_vectors; |
4908 | |
4909 | /* If we have a non-zero partial load bias, we need one PLUS |
4910 | to adjust the load length. */ |
4911 | if (partial_load_store_bias != 0) |
4912 | body_stmts += 1; |
4913 | |
4914 | unsigned int length_update_cost = 0; |
	      if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
		/* For the decrement-IV style, each length only needs a single
		   SELECT_VL or MIN at the start of the iteration to compute
		   the number of elements to be processed in the current
		   iteration.  */
		length_update_cost = 1;
	      else
		/* For the increment-IV style, each length may need two MINs
		   and one MINUS to update the lengths in the body for the
		   next iteration.  */
		length_update_cost = 3;
4924 | |
4925 | if (need_iterate_p) |
4926 | body_stmts += length_update_cost * num_vectors; |
4927 | } |
4928 | |
4929 | (void) add_stmt_cost (costs: target_cost_data, count: prologue_stmts, |
4930 | kind: scalar_stmt, where: vect_prologue); |
4931 | (void) add_stmt_cost (costs: target_cost_data, count: body_stmts, |
4932 | kind: scalar_stmt, where: vect_body); |
4933 | } |
4934 | |
4935 | /* FORNOW: The scalar outside cost is incremented in one of the |
4936 | following ways: |
4937 | |
4938 | 1. The vectorizer checks for alignment and aliasing and generates |
4939 | a condition that allows dynamic vectorization. A cost model |
4940 | check is ANDED with the versioning condition. Hence scalar code |
4941 | path now has the added cost of the versioning check. |
4942 | |
4943 | if (cost > th & versioning_check) |
4944 | jmp to vector code |
4945 | |
4946 | Hence run-time scalar is incremented by not-taken branch cost. |
4947 | |
4948 | 2. The vectorizer then checks if a prologue is required. If the |
4949 | cost model check was not done before during versioning, it has to |
4950 | be done before the prologue check. |
4951 | |
4952 | if (cost <= th) |
4953 | prologue = scalar_iters |
4954 | if (prologue == 0) |
4955 | jmp to vector code |
4956 | else |
4957 | execute prologue |
4958 | if (prologue == num_iters) |
4959 | go to exit |
4960 | |
4961 | Hence the run-time scalar cost is incremented by a taken branch, |
4962 | plus a not-taken branch, plus a taken branch cost. |
4963 | |
4964 | 3. The vectorizer then checks if an epilogue is required. If the |
4965 | cost model check was not done before during prologue check, it |
4966 | has to be done with the epilogue check. |
4967 | |
4968 | if (prologue == 0) |
4969 | jmp to vector code |
4970 | else |
4971 | execute prologue |
4972 | if (prologue == num_iters) |
4973 | go to exit |
4974 | vector code: |
4975 | if ((cost <= th) | (scalar_iters-prologue-epilogue == 0)) |
4976 | jmp to epilogue |
4977 | |
4978 | Hence the run-time scalar cost should be incremented by 2 taken |
4979 | branches. |
4980 | |
4981 | TODO: The back end may reorder the BBS's differently and reverse |
4982 | conditions/branch directions. Change the estimates below to |
4983 | something more reasonable. */ |
4984 | |
4985 | /* If the number of iterations is known and we do not do versioning, we can |
4986 | decide whether to vectorize at compile time. Hence the scalar version |
4987 | do not carry cost model guard costs. */ |
4988 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
4989 | || LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
4990 | { |
4991 | /* Cost model check occurs at versioning. */ |
4992 | if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
4993 | scalar_outside_cost += vect_get_stmt_cost (type_of_cost: cond_branch_not_taken); |
4994 | else |
4995 | { |
4996 | /* Cost model check occurs at prologue generation. */ |
4997 | if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) |
4998 | scalar_outside_cost += 2 * vect_get_stmt_cost (type_of_cost: cond_branch_taken) |
4999 | + vect_get_stmt_cost (type_of_cost: cond_branch_not_taken); |
5000 | /* Cost model check occurs at epilogue generation. */ |
5001 | else |
5002 | scalar_outside_cost += 2 * vect_get_stmt_cost (type_of_cost: cond_branch_taken); |
5003 | } |
5004 | } |
5005 | |
5006 | /* Complete the target-specific cost calculations. */ |
5007 | finish_cost (costs: loop_vinfo->vector_costs, scalar_costs: loop_vinfo->scalar_costs, |
5008 | prologue_cost: &vec_prologue_cost, body_cost: &vec_inside_cost, epilogue_cost: &vec_epilogue_cost, |
5009 | suggested_unroll_factor); |
5010 | |
5011 | if (suggested_unroll_factor && *suggested_unroll_factor > 1 |
5012 | && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR |
5013 | && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) * |
5014 | *suggested_unroll_factor, |
5015 | LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo))) |
5016 | { |
5017 | if (dump_enabled_p ()) |
5018 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
5019 | "can't unroll as unrolled vectorization factor larger" |
5020 | " than maximum vectorization factor: " |
5021 | HOST_WIDE_INT_PRINT_UNSIGNED "\n" , |
5022 | LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); |
5023 | *suggested_unroll_factor = 1; |
5024 | } |
5025 | |
5026 | vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); |
5027 | |
5028 | if (dump_enabled_p ()) |
5029 | { |
5030 | dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n" ); |
5031 | dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n" , |
5032 | vec_inside_cost); |
5033 | dump_printf (MSG_NOTE, " Vector prologue cost: %d\n" , |
5034 | vec_prologue_cost); |
5035 | dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n" , |
5036 | vec_epilogue_cost); |
5037 | dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n" , |
5038 | scalar_single_iter_cost); |
5039 | dump_printf (MSG_NOTE, " Scalar outside cost: %d\n" , |
5040 | scalar_outside_cost); |
5041 | dump_printf (MSG_NOTE, " Vector outside cost: %d\n" , |
5042 | vec_outside_cost); |
5043 | dump_printf (MSG_NOTE, " prologue iterations: %d\n" , |
5044 | peel_iters_prologue); |
5045 | dump_printf (MSG_NOTE, " epilogue iterations: %d\n" , |
5046 | peel_iters_epilogue); |
5047 | } |
5048 | |
5049 | /* Calculate number of iterations required to make the vector version |
5050 | profitable, relative to the loop bodies only. The following condition |
5051 | must hold true: |
5052 | SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC |
5053 | where |
5054 | SIC = scalar iteration cost, VIC = vector iteration cost, |
5055 | VOC = vector outside cost, VF = vectorization factor, |
5056 | NPEEL = prologue iterations + epilogue iterations, |
5057 | SOC = scalar outside cost for run time cost model check. */ |
5058 | |
5059 | int saving_per_viter = (scalar_single_iter_cost * assumed_vf |
5060 | - vec_inside_cost); |
5061 | if (saving_per_viter <= 0) |
5062 | { |
5063 | if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) |
5064 | warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd, |
5065 | "vectorization did not happen for a simd loop" ); |
5066 | |
5067 | if (dump_enabled_p ()) |
5068 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
5069 | "cost model: the vector iteration cost = %d " |
5070 | "divided by the scalar iteration cost = %d " |
5071 | "is greater or equal to the vectorization factor = %d" |
5072 | ".\n" , |
5073 | vec_inside_cost, scalar_single_iter_cost, assumed_vf); |
5074 | *ret_min_profitable_niters = -1; |
5075 | *ret_min_profitable_estimate = -1; |
5076 | return; |
5077 | } |
5078 | |
5079 | /* ??? The "if" arm is written to handle all cases; see below for what |
5080 | we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */ |
5081 | if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
5082 | { |
5083 | /* Rewriting the condition above in terms of the number of |
5084 | vector iterations (vniters) rather than the number of |
5085 | scalar iterations (niters) gives: |
5086 | |
5087 | SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC |
5088 | |
5089 | <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC |
5090 | |
5091 | For integer N, X and Y when X > 0: |
5092 | |
5093 | N * X > Y <==> N >= (Y /[floor] X) + 1. */ |
5094 | int outside_overhead = (vec_outside_cost |
5095 | - scalar_single_iter_cost * peel_iters_prologue |
5096 | - scalar_single_iter_cost * peel_iters_epilogue |
5097 | - scalar_outside_cost); |
5098 | /* We're only interested in cases that require at least one |
5099 | vector iteration. */ |
5100 | int min_vec_niters = 1; |
5101 | if (outside_overhead > 0) |
5102 | min_vec_niters = outside_overhead / saving_per_viter + 1; |
5103 | |
5104 | if (dump_enabled_p ()) |
5105 | dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n" , |
5106 | min_vec_niters); |
5107 | |
5108 | if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
5109 | { |
5110 | /* Now that we know the minimum number of vector iterations, |
5111 | find the minimum niters for which the scalar cost is larger: |
5112 | |
5113 | SIC * niters > VIC * vniters + VOC - SOC |
5114 | |
5115 | We know that the minimum niters is no more than |
5116 | vniters * VF + NPEEL, but it might be (and often is) less |
5117 | than that if a partial vector iteration is cheaper than the |
5118 | equivalent scalar code. */ |
5119 | int threshold = (vec_inside_cost * min_vec_niters |
5120 | + vec_outside_cost |
5121 | - scalar_outside_cost); |
5122 | if (threshold <= 0) |
5123 | min_profitable_iters = 1; |
5124 | else |
5125 | min_profitable_iters = threshold / scalar_single_iter_cost + 1; |
5126 | } |
5127 | else |
5128 | /* Convert the number of vector iterations into a number of |
5129 | scalar iterations. */ |
5130 | min_profitable_iters = (min_vec_niters * assumed_vf |
5131 | + peel_iters_prologue |
5132 | + peel_iters_epilogue); |
5133 | } |
5134 | else |
5135 | { |
5136 | min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) |
5137 | * assumed_vf |
5138 | - vec_inside_cost * peel_iters_prologue |
5139 | - vec_inside_cost * peel_iters_epilogue); |
5140 | if (min_profitable_iters <= 0) |
5141 | min_profitable_iters = 0; |
5142 | else |
5143 | { |
5144 | min_profitable_iters /= saving_per_viter; |
5145 | |
5146 | if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) |
5147 | <= (((int) vec_inside_cost * min_profitable_iters) |
5148 | + (((int) vec_outside_cost - scalar_outside_cost) |
5149 | * assumed_vf))) |
5150 | min_profitable_iters++; |
5151 | } |
5152 | } |
5153 | |
5154 | if (dump_enabled_p ()) |
5155 | dump_printf (MSG_NOTE, |
5156 | " Calculated minimum iters for profitability: %d\n" , |
5157 | min_profitable_iters); |
5158 | |
5159 | if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
5160 | && min_profitable_iters < (assumed_vf + peel_iters_prologue)) |
5161 | /* We want the vectorized loop to execute at least once. */ |
5162 | min_profitable_iters = assumed_vf + peel_iters_prologue; |
5163 | else if (min_profitable_iters < peel_iters_prologue) |
5164 | /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the |
5165 | vectorized loop executes at least once. */ |
5166 | min_profitable_iters = peel_iters_prologue; |
5167 | |
5168 | if (dump_enabled_p ()) |
5169 | dump_printf_loc (MSG_NOTE, vect_location, |
5170 | " Runtime profitability threshold = %d\n" , |
5171 | min_profitable_iters); |
5172 | |
5173 | *ret_min_profitable_niters = min_profitable_iters; |
5174 | |
5175 | /* Calculate number of iterations required to make the vector version |
5176 | profitable, relative to the loop bodies only. |
5177 | |
5178 | Non-vectorized variant is SIC * niters and it must win over vector |
5179 | variant on the expected loop trip count. The following condition must hold true: |
5180 | SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */ |
5181 | |
5182 | if (vec_outside_cost <= 0) |
5183 | min_profitable_estimate = 0; |
5184 | /* ??? This "else if" arm is written to handle all cases; see below for |
5185 | what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */ |
5186 | else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
5187 | { |
5188 | /* This is a repeat of the code above, but with + SOC rather |
5189 | than - SOC. */ |
5190 | int outside_overhead = (vec_outside_cost |
5191 | - scalar_single_iter_cost * peel_iters_prologue |
5192 | - scalar_single_iter_cost * peel_iters_epilogue |
5193 | + scalar_outside_cost); |
5194 | int min_vec_niters = 1; |
5195 | if (outside_overhead > 0) |
5196 | min_vec_niters = outside_overhead / saving_per_viter + 1; |
5197 | |
5198 | if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
5199 | { |
5200 | int threshold = (vec_inside_cost * min_vec_niters |
5201 | + vec_outside_cost |
5202 | + scalar_outside_cost); |
5203 | min_profitable_estimate = threshold / scalar_single_iter_cost + 1; |
5204 | } |
5205 | else |
5206 | min_profitable_estimate = (min_vec_niters * assumed_vf |
5207 | + peel_iters_prologue |
5208 | + peel_iters_epilogue); |
5209 | } |
5210 | else |
5211 | { |
5212 | min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) |
5213 | * assumed_vf |
5214 | - vec_inside_cost * peel_iters_prologue |
5215 | - vec_inside_cost * peel_iters_epilogue) |
5216 | / ((scalar_single_iter_cost * assumed_vf) |
5217 | - vec_inside_cost); |
5218 | } |
5219 | min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters); |
5220 | if (dump_enabled_p ()) |
5221 | dump_printf_loc (MSG_NOTE, vect_location, |
5222 | " Static estimate profitability threshold = %d\n" , |
5223 | min_profitable_estimate); |
5224 | |
5225 | *ret_min_profitable_estimate = min_profitable_estimate; |
5226 | } |
5227 | |
5228 | /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET |
5229 | vector elements (not bits) for a vector with NELT elements. */ |
5230 | static void |
5231 | calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt, |
5232 | vec_perm_builder *sel) |
5233 | { |
5234 | /* The encoding is a single stepped pattern. Any wrap-around is handled |
5235 | by vec_perm_indices. */ |
5236 | sel->new_vector (full_nelts: nelt, npatterns: 1, nelts_per_pattern: 3); |
5237 | for (unsigned int i = 0; i < 3; i++) |
5238 | sel->quick_push (obj: i + offset); |
5239 | } |
5240 | |
5241 | /* Checks whether the target supports whole-vector shifts for vectors of mode |
5242 | MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_ |
5243 | it supports vec_perm_const with masks for all necessary shift amounts. */ |
5244 | static bool |
5245 | have_whole_vector_shift (machine_mode mode) |
5246 | { |
5247 | if (optab_handler (op: vec_shr_optab, mode) != CODE_FOR_nothing) |
5248 | return true; |
5249 | |
5250 | /* Variable-length vectors should be handled via the optab. */ |
5251 | unsigned int nelt; |
5252 | if (!GET_MODE_NUNITS (mode).is_constant (const_value: &nelt)) |
5253 | return false; |
5254 | |
5255 | vec_perm_builder sel; |
5256 | vec_perm_indices indices; |
5257 | for (unsigned int i = nelt / 2; i >= 1; i /= 2) |
5258 | { |
5259 | calc_vec_perm_mask_for_shift (offset: i, nelt, sel: &sel); |
5260 | indices.new_vector (sel, 2, nelt); |
5261 | if (!can_vec_perm_const_p (mode, mode, indices, false)) |
5262 | return false; |
5263 | } |
5264 | return true; |
5265 | } |
5266 | |
5267 | /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose |
5268 | multiplication operands have differing signs and (b) we intend |
5269 | to emulate the operation using a series of signed DOT_PROD_EXPRs. |
5270 | See vect_emulate_mixed_dot_prod for the actual sequence used. */ |
5271 | |
5272 | static bool |
5273 | vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo, |
5274 | stmt_vec_info stmt_info) |
5275 | { |
5276 | gassign *assign = dyn_cast<gassign *> (p: stmt_info->stmt); |
5277 | if (!assign || gimple_assign_rhs_code (gs: assign) != DOT_PROD_EXPR) |
5278 | return false; |
5279 | |
5280 | tree rhs1 = gimple_assign_rhs1 (gs: assign); |
5281 | tree rhs2 = gimple_assign_rhs2 (gs: assign); |
5282 | if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2))) |
5283 | return false; |
5284 | |
5285 | stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); |
5286 | gcc_assert (reduc_info->is_reduc_info); |
5287 | return !directly_supported_p (DOT_PROD_EXPR, |
5288 | STMT_VINFO_REDUC_VECTYPE_IN (reduc_info), |
5289 | optab_vector_mixed_sign); |
5290 | } |
5291 | |
5292 | /* TODO: Close dependency between vect_model_*_cost and vectorizable_* |
5293 | functions. Design better to avoid maintenance issues. */ |
5294 | |
/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop in some cases, the initial
   definition before the loop, and the epilogue code that must be generated.

   LOOP_VINFO describes the loop being vectorized (may be null when
   costing outside a loop context); STMT_INFO is the reduction statement;
   REDUC_FN is the internal reduction function (IFN_LAST if none);
   REDUCTION_TYPE selects the reduction scheme; NCOPIES is the number of
   vector copies needed; costs are recorded into COST_VEC.  */

static void
vect_model_reduction_cost (loop_vec_info loop_vinfo,
			   stmt_vec_info stmt_info, internal_fn reduc_fn,
			   vect_reduction_type reduction_type,
			   int ncopies, stmt_vector_for_cost *cost_vec)
{
  int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
  tree vectype;
  machine_mode mode;
  class loop *loop = NULL;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Condition reductions generate two reductions in the loop.  */
  if (reduction_type == COND_REDUCTION)
    ncopies *= 2;

  vectype = STMT_VINFO_VECTYPE (stmt_info);
  mode = TYPE_MODE (vectype);
  /* Cost against the original scalar statement, not a pattern stmt.  */
  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);

  gimple_match_op op;
  if (!gimple_extract_op (orig_stmt_info->stmt, &op))
    gcc_unreachable ();

  bool emulated_mixed_dot_prod
    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
  if (reduction_type == EXTRACT_LAST_REDUCTION)
    /* No extra instructions are needed in the prologue.  The loop body
       operations are costed in vectorizable_condition.  */
    inside_cost = 0;
  else if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* No extra instructions needed in the prologue.  */
      prologue_cost = 0;

      if (reduc_fn != IFN_LAST)
	/* Count one reduction-like operation per vector.  */
	inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vec_to_scalar,
					stmt_info, misalign: 0, where: vect_body);
      else
	{
	  /* No direct fold-left support: use NELEMENTS extracts and
	     NELEMENTS scalar ops.  */
	  unsigned int nelements = ncopies * vect_nunits_for_cost (vec_type: vectype);
	  inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: nelements,
					  kind: vec_to_scalar, stmt_info, misalign: 0,
					  where: vect_body);
	  inside_cost += record_stmt_cost (body_cost_vec: cost_vec, count: nelements,
					   kind: scalar_stmt, stmt_info, misalign: 0,
					   where: vect_body);
	}
    }
  else
    {
      /* Add in the cost of the initial definitions.  */
      int prologue_stmts;
      if (reduction_type == COND_REDUCTION)
	/* For cond reductions we have four vectors: initial index, step,
	   initial result of the data reduction, initial value of the index
	   reduction.  */
	prologue_stmts = 4;
      else if (emulated_mixed_dot_prod)
	/* We need the initial reduction value and two invariants:
	   one that contains the minimum signed value and one that
	   contains half of its negative.  */
	prologue_stmts = 3;
      else
	/* Just the initial reduction value.  */
	prologue_stmts = 1;
      prologue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: prologue_stmts,
					 kind: scalar_to_vec, stmt_info, misalign: 0,
					 where: vect_prologue);
    }

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  /* Reductions nested inside an outer loop need no epilogue code here.  */
  if (!loop || !nested_in_vect_loop_p (loop, stmt_info: orig_stmt_info))
    {
      if (reduc_fn != IFN_LAST)
	{
	  if (reduction_type == COND_REDUCTION)
	    {
	      /* An EQ stmt and a COND_EXPR stmt.  */
	      epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 2,
						 kind: vector_stmt, stmt_info, misalign: 0,
						 where: vect_epilogue);
	      /* Reduction of the max index and a reduction of the found
		 values.  */
	      epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 2,
						 kind: vec_to_scalar, stmt_info, misalign: 0,
						 where: vect_epilogue);
	      /* A broadcast of the max value.  */
	      epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1,
						 kind: scalar_to_vec, stmt_info, misalign: 0,
						 where: vect_epilogue);
	    }
	  else
	    {
	      /* One reduction operation plus the scalar extract.  */
	      epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1, kind: vector_stmt,
						 stmt_info, misalign: 0, where: vect_epilogue);
	      epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1,
						 kind: vec_to_scalar, stmt_info, misalign: 0,
						 where: vect_epilogue);
	    }
	}
      else if (reduction_type == COND_REDUCTION)
	{
	  /* No direct support: reduce element by element in scalar code.  */
	  unsigned estimated_nunits = vect_nunits_for_cost (vec_type: vectype);
	  /* Extraction of scalar elements.  */
	  epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec,
					     count: 2 * estimated_nunits,
					     kind: vec_to_scalar, stmt_info, misalign: 0,
					     where: vect_epilogue);
	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
	  epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec,
					     count: 2 * estimated_nunits - 3,
					     kind: scalar_stmt, stmt_info, misalign: 0,
					     where: vect_epilogue);
	}
      else if (reduction_type == EXTRACT_LAST_REDUCTION
	       || reduction_type == FOLD_LEFT_REDUCTION)
	/* No extra instructions needed in the epilogue.  */
	;
      else
	{
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
	  tree bitsize = TYPE_SIZE (op.type);
	  int element_bitsize = tree_to_uhwi (bitsize);
	  int nelements = vec_size_in_bits / element_bitsize;

	  /* COND_EXPR reductions are costed as if reduced via MAX_EXPR.  */
	  if (op.code == COND_EXPR)
	    op.code = MAX_EXPR;

	  /* We have a whole vector shift available.  */
	  if (VECTOR_MODE_P (mode)
	      && directly_supported_p (op.code, vectype)
	      && have_whole_vector_shift (mode))
	    {
	      /* Final reduction via vector shifts and the reduction operator.
		 Also requires scalar extract.  */
	      epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec,
						 count: exact_log2 (x: nelements) * 2,
						 kind: vector_stmt, stmt_info, misalign: 0,
						 where: vect_epilogue);
	      epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1,
						 kind: vec_to_scalar, stmt_info, misalign: 0,
						 where: vect_epilogue);
	    }
	  else
	    /* Use extracts and reduction op for final reduction.  For N
	       elements, we have N extracts and N-1 reduction ops.  */
	    epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec,
					       count: nelements + nelements - 1,
					       kind: vector_stmt, stmt_info, misalign: 0,
					       where: vect_epilogue);
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "vect_model_reduction_cost: inside_cost = %d, "
		 "prologue_cost = %d, epilogue_cost = %d .\n" , inside_cost,
		 prologue_cost, epilogue_cost);
}
5468 | |
5469 | /* SEQ is a sequence of instructions that initialize the reduction |
5470 | described by REDUC_INFO. Emit them in the appropriate place. */ |
5471 | |
5472 | static void |
5473 | vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo, |
5474 | stmt_vec_info reduc_info, gimple *seq) |
5475 | { |
5476 | if (reduc_info->reused_accumulator) |
5477 | { |
5478 | /* When reusing an accumulator from the main loop, we only need |
5479 | initialization instructions if the main loop can be skipped. |
5480 | In that case, emit the initialization instructions at the end |
5481 | of the guard block that does the skip. */ |
5482 | edge skip_edge = loop_vinfo->skip_main_loop_edge; |
5483 | gcc_assert (skip_edge); |
5484 | gimple_stmt_iterator gsi = gsi_last_bb (bb: skip_edge->src); |
5485 | gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); |
5486 | } |
5487 | else |
5488 | { |
5489 | /* The normal case: emit the initialization instructions on the |
5490 | preheader edge. */ |
5491 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
5492 | gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq); |
5493 | } |
5494 | } |
5495 | |
/* Function get_initial_def_for_reduction

   Input:
   REDUC_INFO - the info_for_reduction
   INIT_VAL - the initial value of the reduction variable
   NEUTRAL_OP - a value that has no effect on the reduction, as per
		neutral_op_for_reduction

   Output:
   Return a vector variable, initialized according to the operation that
	STMT_VINFO performs. This vector will be used as the initial value
	of the vector of partial results.

   The value we need is a vector in which element 0 has value INIT_VAL
   and every other element has value NEUTRAL_OP.  Any statements needed
   to build this vector are emitted via vect_emit_reduction_init_stmts.  */

static tree
get_initial_def_for_reduction (loop_vec_info loop_vinfo,
			       stmt_vec_info reduc_info,
			       tree init_val, tree neutral_op)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  tree init_def;
  gimple_seq stmts = NULL;

  gcc_assert (vectype);

  /* Reductions are only supported on pointer, integer and float
     scalar types.  */
  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
	      || SCALAR_FLOAT_TYPE_P (scalar_type));

  /* The reduction statement must belong to the loop being vectorized
     (or to a loop nested inside it).  */
  gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
	      || loop == (gimple_bb (reduc_info->stmt))->loop_father);

  if (operand_equal_p (init_val, neutral_op))
    {
      /* If both elements are equal then the vector described above is
	 just a splat.  */
      neutral_op = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: neutral_op);
      init_def = gimple_build_vector_from_val (seq: &stmts, type: vectype, op: neutral_op);
    }
  else
    {
      /* Convert both values to the vector element type first.  */
      neutral_op = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: neutral_op);
      init_val = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: init_val);
      if (!TYPE_VECTOR_SUBPARTS (node: vectype).is_constant ())
	{
	  /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
	     element 0.  VEC_SHL_INSERT shifts the splat up by one lane
	     and inserts INIT_VAL at the low end, which works for
	     variable-length vectors.  */
	  init_def = gimple_build_vector_from_val (seq: &stmts, type: vectype,
						   op: neutral_op);
	  init_def = gimple_build (seq: &stmts, fn: CFN_VEC_SHL_INSERT,
				   type: vectype, args: init_def, args: init_val);
	}
      else
	{
	  /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  A two-element
	     stepped pattern is enough; the builder repeats NEUTRAL_OP to
	     fill the remaining lanes.  */
	  tree_vector_builder elts (vectype, 1, 2);
	  elts.quick_push (obj: init_val);
	  elts.quick_push (obj: neutral_op);
	  init_def = gimple_build_vector (seq: &stmts, builder: &elts);
	}
    }

  /* Emit any statements the construction needed in the right place.  */
  if (stmts)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, seq: stmts);
  return init_def;
}
5565 | |
5566 | /* Get at the initial defs for the reduction PHIs for REDUC_INFO, |
5567 | which performs a reduction involving GROUP_SIZE scalar statements. |
5568 | NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP |
5569 | is nonnull, introducing extra elements of that value will not change the |
5570 | result. */ |
5571 | |
static void
get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
				stmt_vec_info reduc_info,
				vec<tree> *vec_oprnds,
				unsigned int number_of_vectors,
				unsigned int group_size, tree neutral_op)
{
  vec<tree> &initial_values = reduc_info->reduc_initial_values;
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
  unsigned int i;

  /* Without a neutral element we need one initial value per lane.  */
  gcc_assert (group_size == initial_values.length () || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors. It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector). The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
     will be 2).

     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}. */

  /* For variable-length vectors treat GROUP_SIZE elements as one unit of
     work; such vectors are built via the VEC_SHL_INSERT or
     duplicate_and_interleave paths below rather than directly from ELTS.  */
  if (!TYPE_VECTOR_SUBPARTS (node: vector_type).is_constant (const_value: &nunits))
    nunits = group_size;

  /* Fill ELTS one scalar at a time and emit a vector def whenever it
     becomes full.  */
  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (len: nunits);
  gimple_seq ctor_seq = NULL;
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      i = j % group_size;

      /* Get the def before the loop. In reduction chain we have only
	 one initial value. Else we have as many as PHIs in the group. */
      if (i >= initial_values.length () || (j > i && neutral_op))
	op = neutral_op;
      else
	op = initial_values[i];

      /* Create 'vect_ = {op0,op1,...,opn}'. */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
	constant_p = false;

      if (number_of_places_left_in_vector == 0)
	{
	  /* ELTS is full: emit one vector def using whichever of the
	     three strategies below applies.  */
	  tree init;
	  if (constant_p && !neutral_op
	      ? multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vector_type), b: nunits)
	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
	    /* Build the vector directly from ELTS. */
	    init = gimple_build_vector (seq: &ctor_seq, builder: &elts);
	  else if (neutral_op)
	    {
	      /* Build a vector of the neutral value and shift the
		 other elements into place. */
	      init = gimple_build_vector_from_val (seq: &ctor_seq, type: vector_type,
						   op: neutral_op);
	      int k = nunits;
	      /* Trailing neutral elements are already in place after the
		 splat; skip them.  */
	      while (k > 0 && elts[k - 1] == neutral_op)
		k -= 1;
	      while (k > 0)
		{
		  k -= 1;
		  init = gimple_build (seq: &ctor_seq, fn: CFN_VEC_SHL_INSERT,
				       type: vector_type, args: init, args: elts[k]);
		}
	    }
	  else
	    {
	      /* First time round, duplicate ELTS to fill the
		 required number of vectors. */
	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
					elts, number_of_vectors, *vec_oprnds);
	      break;
	    }
	  vec_oprnds->quick_push (obj: init);

	  /* Start collecting elements for the next vector.  */
	  number_of_places_left_in_vector = nunits;
	  elts.new_vector (type: vector_type, npatterns: nunits, nelts_per_pattern: 1);
	  elts.quick_grow (len: nunits);
	  constant_p = true;
	}
    }
  /* Emit any statements accumulated while building the initializers.  */
  if (ctor_seq != NULL)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, seq: ctor_seq);
}
5672 | |
5673 | /* For a statement STMT_INFO taking part in a reduction operation return |
5674 | the stmt_vec_info the meta information is stored on. */ |
5675 | |
5676 | stmt_vec_info |
5677 | info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info) |
5678 | { |
5679 | stmt_info = vect_orig_stmt (stmt_info); |
5680 | gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info)); |
5681 | if (!is_a <gphi *> (p: stmt_info->stmt) |
5682 | || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) |
5683 | stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); |
5684 | gphi *phi = as_a <gphi *> (p: stmt_info->stmt); |
5685 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
5686 | { |
5687 | if (gimple_phi_num_args (gs: phi) == 1) |
5688 | stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); |
5689 | } |
5690 | else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) |
5691 | { |
5692 | stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi)); |
5693 | if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def) |
5694 | stmt_info = info; |
5695 | } |
5696 | return stmt_info; |
5697 | } |
5698 | |
5699 | /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that |
5700 | REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise |
5701 | return false. */ |
5702 | |
static bool
vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
				stmt_vec_info reduc_info)
{
  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  if (!main_loop_vinfo)
    /* LOOP_VINFO is not an epilogue loop.  */
    return false;

  /* Accumulator reuse is only implemented for plain tree-code
     reductions.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
    return false;

  /* MAIN_LOOP_RESULTS[I] collects the value the main loop contributes to
     reduction PHI I; INITIAL_VALUES[I] the value used when the main loop
     is skipped.  */
  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
  auto_vec<tree, 16> main_loop_results (num_phis);
  auto_vec<tree, 16> initial_values (num_phis);
  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
    {
      /* The epilogue loop can be entered either from the main loop or
	 from an earlier guard block.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      for (tree incoming_value : reduc_info->reduc_initial_values)
	{
	  /* Look for:

	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
				    INITIAL_VALUE(guard block)>.  */
	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);

	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);

	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);

	  main_loop_results.quick_push (obj: from_main_loop);
	  initial_values.quick_push (obj: from_skip);
	}
    }
  else
    /* The main loop dominates the epilogue loop.  */
    main_loop_results.splice (src: reduc_info->reduc_initial_values);

  /* See if the main loop has the kind of accumulator we need.
     The lookup is keyed on the first result; all scalar results must
     then match element-wise.  */
  vect_reusable_accumulator *accumulator
    = main_loop_vinfo->reusable_accumulators.get (k: main_loop_results[0]);
  if (!accumulator
      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
      || !std::equal (first1: main_loop_results.begin (), last1: main_loop_results.end (),
		      first2: accumulator->reduc_info->reduc_scalar_results.begin ()))
    return false;

  /* Handle the case where we can reduce wider vectors to narrower ones.
     M itself is not needed beyond proving that the constant multiple
     exists.  */
  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
  unsigned HOST_WIDE_INT m;
  if (!constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: old_vectype),
			    b: TYPE_VECTOR_SUBPARTS (node: vectype), multiple: &m))
    return false;
  /* Check the intermediate vector types and operations are available.
     Each step halves the element count until VECTYPE's is reached.  */
  tree prev_vectype = old_vectype;
  poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (node: old_vectype);
  while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      intermediate_nunits = exact_div (a: intermediate_nunits, b: 2);
      tree intermediate_vectype = get_related_vectype_for_scalar_type
	(TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
      if (!intermediate_vectype
	  || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
				    intermediate_vectype)
	  || !can_vec_extract (TYPE_MODE (prev_vectype),
			       TYPE_MODE (intermediate_vectype)))
	return false;
      prev_vectype = intermediate_vectype;
    }

  /* Non-SLP reductions might apply an adjustment after the reduction
     operation, in order to simplify the initialization of the accumulator.
     If the epilogue loop carries on from where the main loop left off,
     it should apply the same adjustment to the final reduction result.

     If the epilogue loop can also be entered directly (rather than via
     the main loop), we need to be able to handle that case in the same way,
     with the same adjustment. (In principle we could add a PHI node
     to select the correct adjustment, but in practice that shouldn't be
     necessary.) */
  tree main_adjustment
    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
  if (loop_vinfo->main_loop_edge && main_adjustment)
    {
      gcc_assert (num_phis == 1);
      tree initial_value = initial_values[0];
      /* Check that we can use INITIAL_VALUE as the adjustment and
	 initialize the accumulator with a neutral value instead. */
      if (!operand_equal_p (initial_value, main_adjustment))
	return false;
      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
						    code, initial_value);
    }
  /* Commit: carry the main loop's adjustment over and restart the
     epilogue's reduction from the (possibly neutralized) initial
     values.  */
  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
  reduc_info->reduc_initial_values.truncate (size: 0);
  reduc_info->reduc_initial_values.splice (src: initial_values);
  reduc_info->reused_accumulator = accumulator;
  return true;
}
5807 | |
5808 | /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation |
5809 | CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */ |
5810 | |
static tree
vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
			    gimple_seq *seq)
{
  /* Both vector lengths must be compile-time constants here.  */
  unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
  unsigned nunits1 = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant ();
  tree stype = TREE_TYPE (vectype);
  tree new_temp = vec_def;
  /* Repeatedly split the current vector into two halves and combine
     them with CODE until we reach VECTYPE's length.  */
  while (nunits > nunits1)
    {
      nunits /= 2;
      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
							   stype, nunits);
      /* Size in bits of one half; also the offset of the upper half.  */
      unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));

      /* The target has to make sure we support lowpart/highpart
	 extraction, either via direct vector extract or through
	 an integer mode punning. */
      tree dst1, dst2;
      gimple *epilog_stmt;
      if (convert_optab_handler (op: vec_extract_optab,
				 TYPE_MODE (TREE_TYPE (new_temp)),
				 TYPE_MODE (vectype1))
	  != CODE_FOR_nothing)
	{
	  /* Extract sub-vectors directly once vec_extract becomes
	     a conversion optab. */
	  dst1 = make_ssa_name (var: vectype1);
	  epilog_stmt
	    = gimple_build_assign (dst1, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, vectype1,
					   new_temp, TYPE_SIZE (vectype1),
					   bitsize_int (0)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst2 = make_ssa_name (var: vectype1);
	  epilog_stmt
	    = gimple_build_assign (dst2, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, vectype1,
					   new_temp, TYPE_SIZE (vectype1),
					   bitsize_int (bitsize)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	}
      else
	{
	  /* Extract via punning to appropriately sized integer mode
	     vector.  View the input as a two-element vector of
	     half-sized integers, extract each element, and pun each
	     back to the half-sized vector type. */
	  tree eltype = build_nonstandard_integer_type (bitsize, 1);
	  tree etype = build_vector_type (eltype, 2);
	  gcc_assert (convert_optab_handler (vec_extract_optab,
					     TYPE_MODE (etype),
					     TYPE_MODE (eltype))
		      != CODE_FOR_nothing);
	  tree tem = make_ssa_name (var: etype);
	  epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     etype, new_temp));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  new_temp = tem;
	  /* Lower half.  */
	  tem = make_ssa_name (var: eltype);
	  epilog_stmt
	    = gimple_build_assign (tem, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, eltype,
					   new_temp, TYPE_SIZE (eltype),
					   bitsize_int (0)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst1 = make_ssa_name (var: vectype1);
	  epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     vectype1, tem));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  /* Upper half.  */
	  tem = make_ssa_name (var: eltype);
	  epilog_stmt
	    = gimple_build_assign (tem, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, eltype,
					   new_temp, TYPE_SIZE (eltype),
					   bitsize_int (bitsize)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst2 = make_ssa_name (var: vectype1);
	  epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     vectype1, tem));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	}

      /* Combine the two halves with the reduction operation.  */
      new_temp = gimple_build (seq, code, type: vectype1, ops: dst1, ops: dst2);
    }

  return new_temp;
}
5900 | |
5901 | /* Function vect_create_epilog_for_reduction |
5902 | |
5903 | Create code at the loop-epilog to finalize the result of a reduction |
5904 | computation. |
5905 | |
5906 | STMT_INFO is the scalar reduction stmt that is being vectorized. |
5907 | SLP_NODE is an SLP node containing a group of reduction statements. The |
5908 | first one in this group is STMT_INFO. |
5909 | SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE |
5910 | REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi |
5911 | (counting from 0) |
5912 | LOOP_EXIT is the edge to update in the merge block. In the case of a single |
5913 | exit this edge is always the main loop exit. |
5914 | |
5915 | This function: |
5916 | 1. Completes the reduction def-use cycles. |
5917 | 2. "Reduces" each vector of partial results VECT_DEFS into a single result, |
5918 | by calling the function specified by REDUC_FN if available, or by |
5919 | other means (whole-vector shifts or a scalar loop). |
5920 | The function also creates a new phi node at the loop exit to preserve |
5921 | loop-closed form, as illustrated below. |
5922 | |
5923 | The flow at the entry to this function: |
5924 | |
5925 | loop: |
5926 | vec_def = phi <vec_init, null> # REDUCTION_PHI |
5927 | VECT_DEF = vector_stmt # vectorized form of STMT_INFO |
5928 | s_loop = scalar_stmt # (scalar) STMT_INFO |
5929 | loop_exit: |
5930 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
5931 | use <s_out0> |
5932 | use <s_out0> |
5933 | |
5934 | The above is transformed by this function into: |
5935 | |
5936 | loop: |
5937 | vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI |
5938 | VECT_DEF = vector_stmt # vectorized form of STMT_INFO |
5939 | s_loop = scalar_stmt # (scalar) STMT_INFO |
5940 | loop_exit: |
5941 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
5942 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI |
5943 | v_out2 = reduce <v_out1> |
5944 | s_out3 = extract_field <v_out2, 0> |
5945 | s_out4 = adjust_result <s_out3> |
5946 | use <s_out4> |
5947 | use <s_out4> |
5948 | */ |
5949 | |
5950 | static void |
5951 | vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, |
5952 | stmt_vec_info stmt_info, |
5953 | slp_tree slp_node, |
5954 | slp_instance slp_node_instance, |
5955 | edge loop_exit) |
5956 | { |
5957 | stmt_vec_info reduc_info = info_for_reduction (vinfo: loop_vinfo, stmt_info); |
5958 | gcc_assert (reduc_info->is_reduc_info); |
5959 | /* For double reductions we need to get at the inner loop reduction |
5960 | stmt which has the meta info attached. Our stmt_info is that of the |
5961 | loop-closed PHI of the inner loop which we remember as |
5962 | def for the reduction PHI generation. */ |
5963 | bool double_reduc = false; |
5964 | stmt_vec_info rdef_info = stmt_info; |
5965 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
5966 | { |
5967 | gcc_assert (!slp_node); |
5968 | double_reduc = true; |
5969 | stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def |
5970 | (gs: stmt_info->stmt, index: 0)); |
5971 | stmt_info = vect_stmt_to_vectorize (stmt_info); |
5972 | } |
5973 | code_helper code = STMT_VINFO_REDUC_CODE (reduc_info); |
5974 | internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); |
5975 | tree vectype; |
5976 | machine_mode mode; |
5977 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; |
5978 | basic_block exit_bb; |
5979 | tree scalar_dest; |
5980 | tree scalar_type; |
5981 | gimple *new_phi = NULL, *phi = NULL; |
5982 | gimple_stmt_iterator exit_gsi; |
5983 | tree new_temp = NULL_TREE, new_name, new_scalar_dest; |
5984 | gimple *epilog_stmt = NULL; |
5985 | gimple *exit_phi; |
5986 | tree bitsize; |
5987 | tree def; |
5988 | tree orig_name, scalar_result; |
5989 | imm_use_iterator imm_iter, phi_imm_iter; |
5990 | use_operand_p use_p, phi_use_p; |
5991 | gimple *use_stmt; |
5992 | auto_vec<tree> reduc_inputs; |
5993 | int j, i; |
5994 | vec<tree> &scalar_results = reduc_info->reduc_scalar_results; |
5995 | unsigned int group_size = 1, k; |
5996 | /* SLP reduction without reduction chain, e.g., |
5997 | # a1 = phi <a2, a0> |
5998 | # b1 = phi <b2, b0> |
5999 | a2 = operation (a1) |
6000 | b2 = operation (b1) */ |
6001 | bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); |
6002 | bool direct_slp_reduc; |
6003 | tree induction_index = NULL_TREE; |
6004 | |
6005 | if (slp_node) |
6006 | group_size = SLP_TREE_LANES (slp_node); |
6007 | |
6008 | if (nested_in_vect_loop_p (loop, stmt_info)) |
6009 | { |
6010 | outer_loop = loop; |
6011 | loop = loop->inner; |
6012 | gcc_assert (!slp_node && double_reduc); |
6013 | } |
6014 | |
6015 | vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); |
6016 | gcc_assert (vectype); |
6017 | mode = TYPE_MODE (vectype); |
6018 | |
6019 | tree induc_val = NULL_TREE; |
6020 | tree adjustment_def = NULL; |
6021 | if (slp_node) |
6022 | ; |
6023 | else |
6024 | { |
6025 | /* Optimize: for induction condition reduction, if we can't use zero |
6026 | for induc_val, use initial_def. */ |
6027 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
6028 | induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); |
6029 | else if (double_reduc) |
6030 | ; |
6031 | else |
6032 | adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); |
6033 | } |
6034 | |
6035 | stmt_vec_info single_live_out_stmt[] = { stmt_info }; |
6036 | array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt; |
6037 | if (slp_reduc) |
6038 | /* All statements produce live-out values. */ |
6039 | live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node); |
6040 | |
6041 | unsigned vec_num; |
6042 | int ncopies; |
6043 | if (slp_node) |
6044 | { |
6045 | vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length (); |
6046 | ncopies = 1; |
6047 | } |
6048 | else |
6049 | { |
6050 | vec_num = 1; |
6051 | ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length (); |
6052 | } |
6053 | |
6054 | /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) |
6055 | which is updated with the current index of the loop for every match of |
6056 | the original loop's cond_expr (VEC_STMT). This results in a vector |
6057 | containing the last time the condition passed for that vector lane. |
6058 | The first match will be a 1 to allow 0 to be used for non-matching |
6059 | indexes. If there are no matches at all then the vector will be all |
6060 | zeroes. |
6061 | |
6062 | PR92772: This algorithm is broken for architectures that support |
6063 | masked vectors, but do not provide fold_extract_last. */ |
6064 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) |
6065 | { |
6066 | auto_vec<std::pair<tree, bool>, 2> ccompares; |
6067 | stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); |
6068 | cond_info = vect_stmt_to_vectorize (stmt_info: cond_info); |
6069 | while (cond_info != reduc_info) |
6070 | { |
6071 | if (gimple_assign_rhs_code (gs: cond_info->stmt) == COND_EXPR) |
6072 | { |
6073 | gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0]; |
6074 | gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); |
6075 | ccompares.safe_push |
6076 | (obj: std::make_pair (x: unshare_expr (gimple_assign_rhs1 (gs: vec_stmt)), |
6077 | STMT_VINFO_REDUC_IDX (cond_info) == 2)); |
6078 | } |
6079 | cond_info |
6080 | = loop_vinfo->lookup_def (gimple_op (gs: cond_info->stmt, |
6081 | i: 1 + STMT_VINFO_REDUC_IDX |
6082 | (cond_info))); |
6083 | cond_info = vect_stmt_to_vectorize (stmt_info: cond_info); |
6084 | } |
6085 | gcc_assert (ccompares.length () != 0); |
6086 | |
6087 | tree indx_before_incr, indx_after_incr; |
6088 | poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (node: vectype); |
6089 | int scalar_precision |
6090 | = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); |
6091 | tree cr_index_scalar_type = make_unsigned_type (scalar_precision); |
6092 | tree cr_index_vector_type = get_related_vectype_for_scalar_type |
6093 | (TYPE_MODE (vectype), cr_index_scalar_type, |
6094 | TYPE_VECTOR_SUBPARTS (node: vectype)); |
6095 | |
6096 | /* First we create a simple vector induction variable which starts |
6097 | with the values {1,2,3,...} (SERIES_VECT) and increments by the |
6098 | vector size (STEP). */ |
6099 | |
6100 | /* Create a {1,2,3,...} vector. */ |
6101 | tree series_vect = build_index_vector (cr_index_vector_type, 1, 1); |
6102 | |
6103 | /* Create a vector of the step value. */ |
6104 | tree step = build_int_cst (cr_index_scalar_type, nunits_out); |
6105 | tree vec_step = build_vector_from_val (cr_index_vector_type, step); |
6106 | |
6107 | /* Create an induction variable. */ |
6108 | gimple_stmt_iterator incr_gsi; |
6109 | bool insert_after; |
6110 | vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after); |
6111 | create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi, |
6112 | insert_after, &indx_before_incr, &indx_after_incr); |
6113 | |
6114 | /* Next create a new phi node vector (NEW_PHI_TREE) which starts |
6115 | filled with zeros (VEC_ZERO). */ |
6116 | |
6117 | /* Create a vector of 0s. */ |
6118 | tree zero = build_zero_cst (cr_index_scalar_type); |
6119 | tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); |
6120 | |
6121 | /* Create a vector phi node. */ |
6122 | tree new_phi_tree = make_ssa_name (var: cr_index_vector_type); |
6123 | new_phi = create_phi_node (new_phi_tree, loop->header); |
6124 | add_phi_arg (as_a <gphi *> (p: new_phi), vec_zero, |
6125 | loop_preheader_edge (loop), UNKNOWN_LOCATION); |
6126 | |
6127 | /* Now take the condition from the loops original cond_exprs |
6128 | and produce a new cond_exprs (INDEX_COND_EXPR) which for |
6129 | every match uses values from the induction variable |
6130 | (INDEX_BEFORE_INCR) otherwise uses values from the phi node |
6131 | (NEW_PHI_TREE). |
6132 | Finally, we update the phi (NEW_PHI_TREE) to take the value of |
6133 | the new cond_expr (INDEX_COND_EXPR). */ |
6134 | gimple_seq stmts = NULL; |
6135 | for (int i = ccompares.length () - 1; i != -1; --i) |
6136 | { |
6137 | tree ccompare = ccompares[i].first; |
6138 | if (ccompares[i].second) |
6139 | new_phi_tree = gimple_build (seq: &stmts, code: VEC_COND_EXPR, |
6140 | type: cr_index_vector_type, |
6141 | ops: ccompare, |
6142 | ops: indx_before_incr, ops: new_phi_tree); |
6143 | else |
6144 | new_phi_tree = gimple_build (seq: &stmts, code: VEC_COND_EXPR, |
6145 | type: cr_index_vector_type, |
6146 | ops: ccompare, |
6147 | ops: new_phi_tree, ops: indx_before_incr); |
6148 | } |
6149 | gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT); |
6150 | |
6151 | /* Update the phi with the vec cond. */ |
6152 | induction_index = new_phi_tree; |
6153 | add_phi_arg (as_a <gphi *> (p: new_phi), induction_index, |
6154 | loop_latch_edge (loop), UNKNOWN_LOCATION); |
6155 | } |
6156 | |
6157 | /* 2. Create epilog code. |
6158 | The reduction epilog code operates across the elements of the vector |
6159 | of partial results computed by the vectorized loop. |
6160 | The reduction epilog code consists of: |
6161 | |
6162 | step 1: compute the scalar result in a vector (v_out2) |
6163 | step 2: extract the scalar result (s_out3) from the vector (v_out2) |
6164 | step 3: adjust the scalar result (s_out3) if needed. |
6165 | |
     Step 1 can be accomplished using one of the following three schemes:
6167 | (scheme 1) using reduc_fn, if available. |
6168 | (scheme 2) using whole-vector shifts, if available. |
6169 | (scheme 3) using a scalar loop. In this case steps 1+2 above are |
6170 | combined. |
6171 | |
6172 | The overall epilog code looks like this: |
6173 | |
6174 | s_out0 = phi <s_loop> # original EXIT_PHI |
6175 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI |
6176 | v_out2 = reduce <v_out1> # step 1 |
6177 | s_out3 = extract_field <v_out2, 0> # step 2 |
6178 | s_out4 = adjust_result <s_out3> # step 3 |
6179 | |
6180 | (step 3 is optional, and steps 1 and 2 may be combined). |
6181 | Lastly, the uses of s_out0 are replaced by s_out4. */ |
6182 | |
6183 | |
6184 | /* 2.1 Create new loop-exit-phis to preserve loop-closed form: |
6185 | v_out1 = phi <VECT_DEF> |
6186 | Store them in NEW_PHIS. */ |
6187 | if (double_reduc) |
6188 | loop = outer_loop; |
6189 | /* We need to reduce values in all exits. */ |
6190 | exit_bb = loop_exit->dest; |
6191 | exit_gsi = gsi_after_labels (bb: exit_bb); |
6192 | reduc_inputs.create (nelems: slp_node ? vec_num : ncopies); |
6193 | for (unsigned i = 0; i < vec_num; i++) |
6194 | { |
6195 | gimple_seq stmts = NULL; |
6196 | if (slp_node) |
6197 | def = vect_get_slp_vect_def (slp_node, i); |
6198 | else |
6199 | def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]); |
6200 | for (j = 0; j < ncopies; j++) |
6201 | { |
6202 | tree new_def = copy_ssa_name (var: def); |
6203 | phi = create_phi_node (new_def, exit_bb); |
6204 | if (j) |
6205 | def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); |
6206 | if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit) |
6207 | SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def); |
6208 | else |
6209 | { |
6210 | for (unsigned k = 0; k < gimple_phi_num_args (gs: phi); k++) |
6211 | SET_PHI_ARG_DEF (phi, k, def); |
6212 | } |
6213 | new_def = gimple_convert (seq: &stmts, type: vectype, op: new_def); |
6214 | reduc_inputs.quick_push (obj: new_def); |
6215 | } |
6216 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6217 | } |
6218 | |
6219 | /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 |
6220 | (i.e. when reduc_fn is not available) and in the final adjustment |
6221 | code (if needed). Also get the original scalar reduction variable as |
6222 | defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it |
6223 | represents a reduction pattern), the tree-code and scalar-def are |
6224 | taken from the original stmt that the pattern-stmt (STMT) replaces. |
6225 | Otherwise (it is a regular reduction) - the tree-code and scalar-def |
6226 | are taken from STMT. */ |
6227 | |
6228 | stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
6229 | if (orig_stmt_info != stmt_info) |
6230 | { |
6231 | /* Reduction pattern */ |
6232 | gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); |
6233 | gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); |
6234 | } |
6235 | |
6236 | scalar_dest = gimple_get_lhs (orig_stmt_info->stmt); |
6237 | scalar_type = TREE_TYPE (scalar_dest); |
6238 | scalar_results.truncate (size: 0); |
6239 | scalar_results.reserve_exact (nelems: group_size); |
6240 | new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); |
6241 | bitsize = TYPE_SIZE (scalar_type); |
6242 | |
6243 | /* True if we should implement SLP_REDUC using native reduction operations |
6244 | instead of scalar operations. */ |
6245 | direct_slp_reduc = (reduc_fn != IFN_LAST |
6246 | && slp_reduc |
6247 | && !TYPE_VECTOR_SUBPARTS (node: vectype).is_constant ()); |
6248 | |
6249 | /* In case of reduction chain, e.g., |
6250 | # a1 = phi <a3, a0> |
6251 | a2 = operation (a1) |
6252 | a3 = operation (a2), |
6253 | |
6254 | we may end up with more than one vector result. Here we reduce them |
6255 | to one vector. |
6256 | |
6257 | The same is true for a SLP reduction, e.g., |
6258 | # a1 = phi <a2, a0> |
6259 | # b1 = phi <b2, b0> |
6260 | a2 = operation (a1) |
       b2 = operation (b1),
6262 | |
6263 | where we can end up with more than one vector as well. We can |
6264 | easily accumulate vectors when the number of vector elements is |
6265 | a multiple of the SLP group size. |
6266 | |
6267 | The same is true if we couldn't use a single defuse cycle. */ |
6268 | if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) |
6269 | || direct_slp_reduc |
6270 | || (slp_reduc |
6271 | && constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vectype), b: group_size)) |
6272 | || ncopies > 1) |
6273 | { |
6274 | gimple_seq stmts = NULL; |
6275 | tree single_input = reduc_inputs[0]; |
6276 | for (k = 1; k < reduc_inputs.length (); k++) |
6277 | single_input = gimple_build (seq: &stmts, code, type: vectype, |
6278 | ops: single_input, ops: reduc_inputs[k]); |
6279 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6280 | |
6281 | reduc_inputs.truncate (size: 0); |
6282 | reduc_inputs.safe_push (obj: single_input); |
6283 | } |
6284 | |
6285 | tree orig_reduc_input = reduc_inputs[0]; |
6286 | |
6287 | /* If this loop is an epilogue loop that can be skipped after the |
6288 | main loop, we can only share a reduction operation between the |
6289 | main loop and the epilogue if we put it at the target of the |
6290 | skip edge. |
6291 | |
6292 | We can still reuse accumulators if this check fails. Doing so has |
6293 | the minor(?) benefit of making the epilogue loop's scalar result |
6294 | independent of the main loop's scalar result. */ |
6295 | bool unify_with_main_loop_p = false; |
6296 | if (reduc_info->reused_accumulator |
6297 | && loop_vinfo->skip_this_loop_edge |
6298 | && single_succ_p (bb: exit_bb) |
6299 | && single_succ (bb: exit_bb) == loop_vinfo->skip_this_loop_edge->dest) |
6300 | { |
6301 | unify_with_main_loop_p = true; |
6302 | |
6303 | basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest; |
6304 | reduc_inputs[0] = make_ssa_name (var: vectype); |
6305 | gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block); |
6306 | add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (bb: exit_bb), |
6307 | UNKNOWN_LOCATION); |
6308 | add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input, |
6309 | loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION); |
6310 | exit_gsi = gsi_after_labels (bb: reduc_block); |
6311 | } |
6312 | |
6313 | /* Shouldn't be used beyond this point. */ |
6314 | exit_bb = nullptr; |
6315 | |
6316 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION |
6317 | && reduc_fn != IFN_LAST) |
6318 | { |
6319 | /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing |
6320 | various data values where the condition matched and another vector |
6321 | (INDUCTION_INDEX) containing all the indexes of those matches. We |
6322 | need to extract the last matching index (which will be the index with |
6323 | highest value) and use this to index into the data vector. |
6324 | For the case where there were no matches, the data vector will contain |
6325 | all default values and the index vector will be all zeros. */ |
6326 | |
6327 | /* Get various versions of the type of the vector of indexes. */ |
6328 | tree index_vec_type = TREE_TYPE (induction_index); |
6329 | gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); |
6330 | tree index_scalar_type = TREE_TYPE (index_vec_type); |
6331 | tree index_vec_cmp_type = truth_type_for (index_vec_type); |
6332 | |
6333 | /* Get an unsigned integer version of the type of the data vector. */ |
6334 | int scalar_precision |
6335 | = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); |
6336 | tree scalar_type_unsigned = make_unsigned_type (scalar_precision); |
6337 | tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned, |
6338 | vectype); |
6339 | |
6340 | /* First we need to create a vector (ZERO_VEC) of zeros and another |
6341 | vector (MAX_INDEX_VEC) filled with the last matching index, which we |
6342 | can create using a MAX reduction and then expanding. |
6343 | In the case where the loop never made any matches, the max index will |
6344 | be zero. */ |
6345 | |
6346 | /* Vector of {0, 0, 0,...}. */ |
6347 | tree zero_vec = build_zero_cst (vectype); |
6348 | |
6349 | /* Find maximum value from the vector of found indexes. */ |
6350 | tree max_index = make_ssa_name (var: index_scalar_type); |
6351 | gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, |
6352 | 1, induction_index); |
6353 | gimple_call_set_lhs (gs: max_index_stmt, lhs: max_index); |
6354 | gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT); |
6355 | |
6356 | /* Vector of {max_index, max_index, max_index,...}. */ |
6357 | tree max_index_vec = make_ssa_name (var: index_vec_type); |
6358 | tree max_index_vec_rhs = build_vector_from_val (index_vec_type, |
6359 | max_index); |
6360 | gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec, |
6361 | max_index_vec_rhs); |
6362 | gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT); |
6363 | |
6364 | /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes |
6365 | with the vector (INDUCTION_INDEX) of found indexes, choosing values |
6366 | from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC) |
6367 | otherwise. Only one value should match, resulting in a vector |
6368 | (VEC_COND) with one data value and the rest zeros. |
6369 | In the case where the loop never made any matches, every index will |
6370 | match, resulting in a vector with all data values (which will all be |
6371 | the default value). */ |
6372 | |
6373 | /* Compare the max index vector to the vector of found indexes to find |
6374 | the position of the max value. */ |
6375 | tree vec_compare = make_ssa_name (var: index_vec_cmp_type); |
6376 | gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR, |
6377 | induction_index, |
6378 | max_index_vec); |
6379 | gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT); |
6380 | |
6381 | /* Use the compare to choose either values from the data vector or |
6382 | zero. */ |
6383 | tree vec_cond = make_ssa_name (var: vectype); |
6384 | gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, |
6385 | vec_compare, |
6386 | reduc_inputs[0], |
6387 | zero_vec); |
6388 | gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); |
6389 | |
6390 | /* Finally we need to extract the data value from the vector (VEC_COND) |
6391 | into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR |
6392 | reduction, but because this doesn't exist, we can use a MAX reduction |
6393 | instead. The data value might be signed or a float so we need to cast |
6394 | it first. |
6395 | In the case where the loop never made any matches, the data values are |
6396 | all identical, and so will reduce down correctly. */ |
6397 | |
6398 | /* Make the matched data values unsigned. */ |
6399 | tree vec_cond_cast = make_ssa_name (var: vectype_unsigned); |
6400 | tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned, |
6401 | vec_cond); |
6402 | gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast, |
6403 | VIEW_CONVERT_EXPR, |
6404 | vec_cond_cast_rhs); |
6405 | gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT); |
6406 | |
6407 | /* Reduce down to a scalar value. */ |
6408 | tree data_reduc = make_ssa_name (var: scalar_type_unsigned); |
6409 | gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX, |
6410 | 1, vec_cond_cast); |
6411 | gimple_call_set_lhs (gs: data_reduc_stmt, lhs: data_reduc); |
6412 | gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); |
6413 | |
6414 | /* Convert the reduced value back to the result type and set as the |
6415 | result. */ |
6416 | gimple_seq stmts = NULL; |
6417 | new_temp = gimple_build (seq: &stmts, code: VIEW_CONVERT_EXPR, type: scalar_type, |
6418 | ops: data_reduc); |
6419 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6420 | scalar_results.safe_push (obj: new_temp); |
6421 | } |
6422 | else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION |
6423 | && reduc_fn == IFN_LAST) |
6424 | { |
6425 | /* Condition reduction without supported IFN_REDUC_MAX. Generate |
6426 | idx = 0; |
6427 | idx_val = induction_index[0]; |
6428 | val = data_reduc[0]; |
6429 | for (idx = 0, val = init, i = 0; i < nelts; ++i) |
6430 | if (induction_index[i] > idx_val) |
6431 | val = data_reduc[i], idx_val = induction_index[i]; |
6432 | return val; */ |
6433 | |
6434 | tree data_eltype = TREE_TYPE (vectype); |
6435 | tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); |
6436 | unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); |
6437 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); |
6438 | /* Enforced by vectorizable_reduction, which ensures we have target |
6439 | support before allowing a conditional reduction on variable-length |
6440 | vectors. */ |
6441 | unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant (); |
6442 | tree idx_val = NULL_TREE, val = NULL_TREE; |
6443 | for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size) |
6444 | { |
6445 | tree old_idx_val = idx_val; |
6446 | tree old_val = val; |
6447 | idx_val = make_ssa_name (var: idx_eltype); |
6448 | epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF, |
6449 | build3 (BIT_FIELD_REF, idx_eltype, |
6450 | induction_index, |
6451 | bitsize_int (el_size), |
6452 | bitsize_int (off))); |
6453 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6454 | val = make_ssa_name (var: data_eltype); |
6455 | epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, |
6456 | build3 (BIT_FIELD_REF, |
6457 | data_eltype, |
6458 | reduc_inputs[0], |
6459 | bitsize_int (el_size), |
6460 | bitsize_int (off))); |
6461 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6462 | if (off != 0) |
6463 | { |
6464 | tree new_idx_val = idx_val; |
6465 | if (off != v_size - el_size) |
6466 | { |
6467 | new_idx_val = make_ssa_name (var: idx_eltype); |
6468 | epilog_stmt = gimple_build_assign (new_idx_val, |
6469 | MAX_EXPR, idx_val, |
6470 | old_idx_val); |
6471 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6472 | } |
6473 | tree cond = make_ssa_name (boolean_type_node); |
6474 | epilog_stmt = gimple_build_assign (cond, GT_EXPR, |
6475 | idx_val, old_idx_val); |
6476 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6477 | tree new_val = make_ssa_name (var: data_eltype); |
6478 | epilog_stmt = gimple_build_assign (new_val, COND_EXPR, |
6479 | cond, val, old_val); |
6480 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6481 | idx_val = new_idx_val; |
6482 | val = new_val; |
6483 | } |
6484 | } |
6485 | /* Convert the reduced value back to the result type and set as the |
6486 | result. */ |
6487 | gimple_seq stmts = NULL; |
6488 | val = gimple_convert (seq: &stmts, type: scalar_type, op: val); |
6489 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6490 | scalar_results.safe_push (obj: val); |
6491 | } |
6492 | |
6493 | /* 2.3 Create the reduction code, using one of the three schemes described |
6494 | above. In SLP we simply need to extract all the elements from the |
6495 | vector (without reducing them), so we use scalar shifts. */ |
6496 | else if (reduc_fn != IFN_LAST && !slp_reduc) |
6497 | { |
6498 | tree tmp; |
6499 | tree vec_elem_type; |
6500 | |
6501 | /* Case 1: Create: |
6502 | v_out2 = reduc_expr <v_out1> */ |
6503 | |
6504 | if (dump_enabled_p ()) |
6505 | dump_printf_loc (MSG_NOTE, vect_location, |
6506 | "Reduce using direct vector reduction.\n" ); |
6507 | |
6508 | gimple_seq stmts = NULL; |
6509 | vec_elem_type = TREE_TYPE (vectype); |
6510 | new_temp = gimple_build (seq: &stmts, fn: as_combined_fn (fn: reduc_fn), |
6511 | type: vec_elem_type, args: reduc_inputs[0]); |
6512 | new_temp = gimple_convert (seq: &stmts, type: scalar_type, op: new_temp); |
6513 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6514 | |
6515 | if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
6516 | && induc_val) |
6517 | { |
6518 | /* Earlier we set the initial value to be a vector if induc_val |
6519 | values. Check the result and if it is induc_val then replace |
6520 | with the original initial value, unless induc_val is |
6521 | the same as initial_def already. */ |
6522 | tree zcompare = make_ssa_name (boolean_type_node); |
6523 | epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, |
6524 | new_temp, induc_val); |
6525 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6526 | tree initial_def = reduc_info->reduc_initial_values[0]; |
6527 | tmp = make_ssa_name (var: new_scalar_dest); |
6528 | epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, |
6529 | initial_def, new_temp); |
6530 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6531 | new_temp = tmp; |
6532 | } |
6533 | |
6534 | scalar_results.safe_push (obj: new_temp); |
6535 | } |
6536 | else if (direct_slp_reduc) |
6537 | { |
6538 | /* Here we create one vector for each of the REDUC_GROUP_SIZE results, |
6539 | with the elements for other SLP statements replaced with the |
6540 | neutral value. We can then do a normal reduction on each vector. */ |
6541 | |
6542 | /* Enforced by vectorizable_reduction. */ |
6543 | gcc_assert (reduc_inputs.length () == 1); |
6544 | gcc_assert (pow2p_hwi (group_size)); |
6545 | |
6546 | gimple_seq seq = NULL; |
6547 | |
6548 | /* Build a vector {0, 1, 2, ...}, with the same number of elements |
6549 | and the same element size as VECTYPE. */ |
6550 | tree index = build_index_vector (vectype, 0, 1); |
6551 | tree index_type = TREE_TYPE (index); |
6552 | tree index_elt_type = TREE_TYPE (index_type); |
6553 | tree mask_type = truth_type_for (index_type); |
6554 | |
6555 | /* Create a vector that, for each element, identifies which of |
6556 | the REDUC_GROUP_SIZE results should use it. */ |
6557 | tree index_mask = build_int_cst (index_elt_type, group_size - 1); |
6558 | index = gimple_build (seq: &seq, code: BIT_AND_EXPR, type: index_type, ops: index, |
6559 | ops: build_vector_from_val (index_type, index_mask)); |
6560 | |
6561 | /* Get a neutral vector value. This is simply a splat of the neutral |
6562 | scalar value if we have one, otherwise the initial scalar value |
6563 | is itself a neutral value. */ |
6564 | tree vector_identity = NULL_TREE; |
6565 | tree neutral_op = NULL_TREE; |
6566 | if (slp_node) |
6567 | { |
6568 | tree initial_value = NULL_TREE; |
6569 | if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
6570 | initial_value = reduc_info->reduc_initial_values[0]; |
6571 | neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code, |
6572 | initial_value, as_initial: false); |
6573 | } |
6574 | if (neutral_op) |
6575 | vector_identity = gimple_build_vector_from_val (seq: &seq, type: vectype, |
6576 | op: neutral_op); |
6577 | for (unsigned int i = 0; i < group_size; ++i) |
6578 | { |
6579 | /* If there's no univeral neutral value, we can use the |
6580 | initial scalar value from the original PHI. This is used |
6581 | for MIN and MAX reduction, for example. */ |
6582 | if (!neutral_op) |
6583 | { |
6584 | tree scalar_value = reduc_info->reduc_initial_values[i]; |
6585 | scalar_value = gimple_convert (seq: &seq, TREE_TYPE (vectype), |
6586 | op: scalar_value); |
6587 | vector_identity = gimple_build_vector_from_val (seq: &seq, type: vectype, |
6588 | op: scalar_value); |
6589 | } |
6590 | |
6591 | /* Calculate the equivalent of: |
6592 | |
6593 | sel[j] = (index[j] == i); |
6594 | |
6595 | which selects the elements of REDUC_INPUTS[0] that should |
6596 | be included in the result. */ |
6597 | tree compare_val = build_int_cst (index_elt_type, i); |
6598 | compare_val = build_vector_from_val (index_type, compare_val); |
6599 | tree sel = gimple_build (seq: &seq, code: EQ_EXPR, type: mask_type, |
6600 | ops: index, ops: compare_val); |
6601 | |
6602 | /* Calculate the equivalent of: |
6603 | |
6604 | vec = seq ? reduc_inputs[0] : vector_identity; |
6605 | |
6606 | VEC is now suitable for a full vector reduction. */ |
6607 | tree vec = gimple_build (seq: &seq, code: VEC_COND_EXPR, type: vectype, |
6608 | ops: sel, ops: reduc_inputs[0], ops: vector_identity); |
6609 | |
6610 | /* Do the reduction and convert it to the appropriate type. */ |
6611 | tree scalar = gimple_build (seq: &seq, fn: as_combined_fn (fn: reduc_fn), |
6612 | TREE_TYPE (vectype), args: vec); |
6613 | scalar = gimple_convert (seq: &seq, type: scalar_type, op: scalar); |
6614 | scalar_results.safe_push (obj: scalar); |
6615 | } |
6616 | gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT); |
6617 | } |
6618 | else |
6619 | { |
6620 | bool reduce_with_shift; |
6621 | tree vec_temp; |
6622 | |
6623 | gcc_assert (slp_reduc || reduc_inputs.length () == 1); |
6624 | |
6625 | /* See if the target wants to do the final (shift) reduction |
6626 | in a vector mode of smaller size and first reduce upper/lower |
6627 | halves against each other. */ |
6628 | enum machine_mode mode1 = mode; |
6629 | tree stype = TREE_TYPE (vectype); |
6630 | unsigned nunits = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant (); |
6631 | unsigned nunits1 = nunits; |
6632 | if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode |
6633 | && reduc_inputs.length () == 1) |
6634 | { |
6635 | nunits1 = GET_MODE_NUNITS (mode: mode1).to_constant (); |
6636 | /* For SLP reductions we have to make sure lanes match up, but |
6637 | since we're doing individual element final reduction reducing |
6638 | vector width here is even more important. |
6639 | ??? We can also separate lanes with permutes, for the common |
6640 | case of power-of-two group-size odd/even extracts would work. */ |
6641 | if (slp_reduc && nunits != nunits1) |
6642 | { |
6643 | nunits1 = least_common_multiple (nunits1, group_size); |
6644 | gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits); |
6645 | } |
6646 | } |
6647 | if (!slp_reduc |
6648 | && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) |
6649 | nunits1 = GET_MODE_NUNITS (mode: mode1).to_constant (); |
6650 | |
6651 | tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), |
6652 | stype, nunits1); |
6653 | reduce_with_shift = have_whole_vector_shift (mode: mode1); |
6654 | if (!VECTOR_MODE_P (mode1) |
6655 | || !directly_supported_p (code, vectype1)) |
6656 | reduce_with_shift = false; |
6657 | |
6658 | /* First reduce the vector to the desired vector size we should |
6659 | do shift reduction on by combining upper and lower halves. */ |
6660 | gimple_seq stmts = NULL; |
6661 | new_temp = vect_create_partial_epilog (vec_def: reduc_inputs[0], vectype: vectype1, |
6662 | code, seq: &stmts); |
6663 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6664 | reduc_inputs[0] = new_temp; |
6665 | |
6666 | if (reduce_with_shift && !slp_reduc) |
6667 | { |
6668 | int element_bitsize = tree_to_uhwi (bitsize); |
6669 | /* Enforced by vectorizable_reduction, which disallows SLP reductions |
6670 | for variable-length vectors and also requires direct target support |
6671 | for loop reductions. */ |
6672 | int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); |
6673 | int nelements = vec_size_in_bits / element_bitsize; |
6674 | vec_perm_builder sel; |
6675 | vec_perm_indices indices; |
6676 | |
6677 | int elt_offset; |
6678 | |
6679 | tree zero_vec = build_zero_cst (vectype1); |
6680 | /* Case 2: Create: |
6681 | for (offset = nelements/2; offset >= 1; offset/=2) |
6682 | { |
6683 | Create: va' = vec_shift <va, offset> |
6684 | Create: va = vop <va, va'> |
6685 | } */ |
6686 | |
6687 | tree rhs; |
6688 | |
6689 | if (dump_enabled_p ()) |
6690 | dump_printf_loc (MSG_NOTE, vect_location, |
6691 | "Reduce using vector shifts\n" ); |
6692 | |
6693 | gimple_seq stmts = NULL; |
6694 | new_temp = gimple_convert (seq: &stmts, type: vectype1, op: new_temp); |
6695 | for (elt_offset = nelements / 2; |
6696 | elt_offset >= 1; |
6697 | elt_offset /= 2) |
6698 | { |
6699 | calc_vec_perm_mask_for_shift (offset: elt_offset, nelt: nelements, sel: &sel); |
6700 | indices.new_vector (sel, 2, nelements); |
6701 | tree mask = vect_gen_perm_mask_any (vectype1, indices); |
6702 | new_name = gimple_build (seq: &stmts, code: VEC_PERM_EXPR, type: vectype1, |
6703 | ops: new_temp, ops: zero_vec, ops: mask); |
6704 | new_temp = gimple_build (seq: &stmts, code, |
6705 | type: vectype1, ops: new_name, ops: new_temp); |
6706 | } |
6707 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6708 | |
6709 | /* 2.4 Extract the final scalar result. Create: |
6710 | s_out3 = extract_field <v_out2, bitpos> */ |
6711 | |
6712 | if (dump_enabled_p ()) |
6713 | dump_printf_loc (MSG_NOTE, vect_location, |
6714 | "extract scalar result\n" ); |
6715 | |
6716 | rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, |
6717 | bitsize, bitsize_zero_node); |
6718 | epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); |
6719 | new_temp = make_ssa_name (var: new_scalar_dest, stmt: epilog_stmt); |
6720 | gimple_assign_set_lhs (gs: epilog_stmt, lhs: new_temp); |
6721 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6722 | scalar_results.safe_push (obj: new_temp); |
6723 | } |
6724 | else |
6725 | { |
6726 | /* Case 3: Create: |
6727 | s = extract_field <v_out2, 0> |
6728 | for (offset = element_size; |
6729 | offset < vector_size; |
6730 | offset += element_size;) |
6731 | { |
6732 | Create: s' = extract_field <v_out2, offset> |
6733 | Create: s = op <s, s'> // For non SLP cases |
6734 | } */ |
6735 | |
6736 | if (dump_enabled_p ()) |
6737 | dump_printf_loc (MSG_NOTE, vect_location, |
6738 | "Reduce using scalar code.\n" ); |
6739 | |
6740 | int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); |
6741 | int element_bitsize = tree_to_uhwi (bitsize); |
6742 | tree compute_type = TREE_TYPE (vectype); |
6743 | gimple_seq stmts = NULL; |
6744 | FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp) |
6745 | { |
6746 | int bit_offset; |
6747 | new_temp = gimple_build (seq: &stmts, code: BIT_FIELD_REF, type: compute_type, |
6748 | ops: vec_temp, ops: bitsize, bitsize_zero_node); |
6749 | |
6750 | /* In SLP we don't need to apply reduction operation, so we just |
6751 | collect s' values in SCALAR_RESULTS. */ |
6752 | if (slp_reduc) |
6753 | scalar_results.safe_push (obj: new_temp); |
6754 | |
6755 | for (bit_offset = element_bitsize; |
6756 | bit_offset < vec_size_in_bits; |
6757 | bit_offset += element_bitsize) |
6758 | { |
6759 | tree bitpos = bitsize_int (bit_offset); |
6760 | new_name = gimple_build (seq: &stmts, code: BIT_FIELD_REF, |
6761 | type: compute_type, ops: vec_temp, |
6762 | ops: bitsize, ops: bitpos); |
6763 | if (slp_reduc) |
6764 | { |
6765 | /* In SLP we don't need to apply reduction operation, so |
6766 | we just collect s' values in SCALAR_RESULTS. */ |
6767 | new_temp = new_name; |
6768 | scalar_results.safe_push (obj: new_name); |
6769 | } |
6770 | else |
6771 | new_temp = gimple_build (seq: &stmts, code, type: compute_type, |
6772 | ops: new_name, ops: new_temp); |
6773 | } |
6774 | } |
6775 | |
6776 | /* The only case where we need to reduce scalar results in SLP, is |
6777 | unrolling. If the size of SCALAR_RESULTS is greater than |
6778 | REDUC_GROUP_SIZE, we reduce them combining elements modulo |
6779 | REDUC_GROUP_SIZE. */ |
6780 | if (slp_reduc) |
6781 | { |
6782 | tree res, first_res, new_res; |
6783 | |
6784 | /* Reduce multiple scalar results in case of SLP unrolling. */ |
6785 | for (j = group_size; scalar_results.iterate (ix: j, ptr: &res); |
6786 | j++) |
6787 | { |
6788 | first_res = scalar_results[j % group_size]; |
6789 | new_res = gimple_build (seq: &stmts, code, type: compute_type, |
6790 | ops: first_res, ops: res); |
6791 | scalar_results[j % group_size] = new_res; |
6792 | } |
6793 | scalar_results.truncate (size: group_size); |
6794 | for (k = 0; k < group_size; k++) |
6795 | scalar_results[k] = gimple_convert (seq: &stmts, type: scalar_type, |
6796 | op: scalar_results[k]); |
6797 | } |
6798 | else |
6799 | { |
6800 | /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ |
6801 | new_temp = gimple_convert (seq: &stmts, type: scalar_type, op: new_temp); |
6802 | scalar_results.safe_push (obj: new_temp); |
6803 | } |
6804 | |
6805 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6806 | } |
6807 | |
6808 | if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
6809 | && induc_val) |
6810 | { |
6811 | /* Earlier we set the initial value to be a vector if induc_val |
6812 | values. Check the result and if it is induc_val then replace |
6813 | with the original initial value, unless induc_val is |
6814 | the same as initial_def already. */ |
6815 | tree zcompare = make_ssa_name (boolean_type_node); |
6816 | epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp, |
6817 | induc_val); |
6818 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6819 | tree initial_def = reduc_info->reduc_initial_values[0]; |
6820 | tree tmp = make_ssa_name (var: new_scalar_dest); |
6821 | epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, |
6822 | initial_def, new_temp); |
6823 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6824 | scalar_results[0] = tmp; |
6825 | } |
6826 | } |
6827 | |
6828 | /* 2.5 Adjust the final result by the initial value of the reduction |
6829 | variable. (When such adjustment is not needed, then |
6830 | 'adjustment_def' is zero). For example, if code is PLUS we create: |
6831 | new_temp = loop_exit_def + adjustment_def */ |
6832 | |
6833 | if (adjustment_def) |
6834 | { |
6835 | gcc_assert (!slp_reduc); |
6836 | gimple_seq stmts = NULL; |
6837 | if (double_reduc) |
6838 | { |
6839 | gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); |
6840 | adjustment_def = gimple_convert (seq: &stmts, type: vectype, op: adjustment_def); |
6841 | new_temp = gimple_build (seq: &stmts, code, type: vectype, |
6842 | ops: reduc_inputs[0], ops: adjustment_def); |
6843 | } |
6844 | else |
6845 | { |
6846 | new_temp = scalar_results[0]; |
6847 | gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); |
6848 | adjustment_def = gimple_convert (seq: &stmts, TREE_TYPE (vectype), |
6849 | op: adjustment_def); |
6850 | new_temp = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: new_temp); |
6851 | new_temp = gimple_build (seq: &stmts, code, TREE_TYPE (vectype), |
6852 | ops: new_temp, ops: adjustment_def); |
6853 | new_temp = gimple_convert (seq: &stmts, type: scalar_type, op: new_temp); |
6854 | } |
6855 | |
6856 | epilog_stmt = gimple_seq_last_stmt (s: stmts); |
6857 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6858 | scalar_results[0] = new_temp; |
6859 | } |
6860 | |
6861 | /* Record this operation if it could be reused by the epilogue loop. */ |
6862 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION |
6863 | && reduc_inputs.length () == 1) |
6864 | loop_vinfo->reusable_accumulators.put (k: scalar_results[0], |
6865 | v: { .reduc_input: orig_reduc_input, .reduc_info: reduc_info }); |
6866 | |
6867 | if (double_reduc) |
6868 | loop = outer_loop; |
6869 | |
6870 | /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit |
6871 | phis with new adjusted scalar results, i.e., replace use <s_out0> |
6872 | with use <s_out4>. |
6873 | |
6874 | Transform: |
6875 | loop_exit: |
6876 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
6877 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI |
6878 | v_out2 = reduce <v_out1> |
6879 | s_out3 = extract_field <v_out2, 0> |
6880 | s_out4 = adjust_result <s_out3> |
6881 | use <s_out0> |
6882 | use <s_out0> |
6883 | |
6884 | into: |
6885 | |
6886 | loop_exit: |
6887 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
6888 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI |
6889 | v_out2 = reduce <v_out1> |
6890 | s_out3 = extract_field <v_out2, 0> |
6891 | s_out4 = adjust_result <s_out3> |
6892 | use <s_out4> |
6893 | use <s_out4> */ |
6894 | |
6895 | gcc_assert (live_out_stmts.size () == scalar_results.length ()); |
6896 | auto_vec<gimple *> phis; |
6897 | for (k = 0; k < live_out_stmts.size (); k++) |
6898 | { |
6899 | stmt_vec_info scalar_stmt_info = vect_orig_stmt (stmt_info: live_out_stmts[k]); |
6900 | scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt); |
6901 | |
6902 | /* Find the loop-closed-use at the loop exit of the original scalar |
6903 | result. (The reduction result is expected to have two immediate uses, |
6904 | one at the latch block, and one at the loop exit). For double |
6905 | reductions we are looking for exit phis of the outer loop. */ |
6906 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) |
6907 | { |
6908 | if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) |
6909 | { |
6910 | if (!is_gimple_debug (USE_STMT (use_p)) |
6911 | && gimple_bb (USE_STMT (use_p)) == loop_exit->dest) |
6912 | phis.safe_push (USE_STMT (use_p)); |
6913 | } |
6914 | else |
6915 | { |
6916 | if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI) |
6917 | { |
6918 | tree phi_res = PHI_RESULT (USE_STMT (use_p)); |
6919 | |
6920 | FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res) |
6921 | { |
6922 | if (!flow_bb_inside_loop_p (loop, |
6923 | gimple_bb (USE_STMT (phi_use_p))) |
6924 | && !is_gimple_debug (USE_STMT (phi_use_p))) |
6925 | phis.safe_push (USE_STMT (phi_use_p)); |
6926 | } |
6927 | } |
6928 | } |
6929 | } |
6930 | |
6931 | FOR_EACH_VEC_ELT (phis, i, exit_phi) |
6932 | { |
6933 | /* Replace the uses: */ |
6934 | orig_name = PHI_RESULT (exit_phi); |
6935 | |
6936 | /* Look for a single use at the target of the skip edge. */ |
6937 | if (unify_with_main_loop_p) |
6938 | { |
6939 | use_operand_p use_p; |
6940 | gimple *user; |
6941 | if (!single_imm_use (var: orig_name, use_p: &use_p, stmt: &user)) |
6942 | gcc_unreachable (); |
6943 | orig_name = gimple_get_lhs (user); |
6944 | } |
6945 | |
6946 | scalar_result = scalar_results[k]; |
6947 | FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) |
6948 | { |
6949 | FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) |
6950 | SET_USE (use_p, scalar_result); |
6951 | update_stmt (s: use_stmt); |
6952 | } |
6953 | } |
6954 | |
6955 | phis.truncate (size: 0); |
6956 | } |
6957 | } |
6958 | |
6959 | /* Return a vector of type VECTYPE that is equal to the vector select |
6960 | operation "MASK ? VEC : IDENTITY". Insert the select statements |
6961 | before GSI. */ |
6962 | |
6963 | static tree |
6964 | merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype, |
6965 | tree vec, tree identity) |
6966 | { |
6967 | tree cond = make_temp_ssa_name (type: vectype, NULL, name: "cond" ); |
6968 | gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR, |
6969 | mask, vec, identity); |
6970 | gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); |
6971 | return cond; |
6972 | } |
6973 | |
6974 | /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right |
6975 | order, starting with LHS. Insert the extraction statements before GSI and |
6976 | associate the new scalar SSA names with variable SCALAR_DEST. |
6977 | If MASK is nonzero mask the input and then operate on it unconditionally. |
6978 | Return the SSA name for the result. */ |
6979 | |
6980 | static tree |
6981 | vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, |
6982 | tree_code code, tree lhs, tree vector_rhs, |
6983 | tree mask) |
6984 | { |
6985 | tree vectype = TREE_TYPE (vector_rhs); |
6986 | tree scalar_type = TREE_TYPE (vectype); |
6987 | tree bitsize = TYPE_SIZE (scalar_type); |
6988 | unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); |
6989 | unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize); |
6990 | |
6991 | /* Re-create a VEC_COND_EXPR to mask the input here in order to be able |
6992 | to perform an unconditional element-wise reduction of it. */ |
6993 | if (mask) |
6994 | { |
6995 | tree masked_vector_rhs = make_temp_ssa_name (type: vectype, NULL, |
6996 | name: "masked_vector_rhs" ); |
6997 | tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE, |
6998 | as_initial: false); |
6999 | tree vector_identity = build_vector_from_val (vectype, neutral_op); |
7000 | gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR, |
7001 | mask, vector_rhs, vector_identity); |
7002 | gsi_insert_before (gsi, select, GSI_SAME_STMT); |
7003 | vector_rhs = masked_vector_rhs; |
7004 | } |
7005 | |
7006 | for (unsigned HOST_WIDE_INT bit_offset = 0; |
7007 | bit_offset < vec_size_in_bits; |
7008 | bit_offset += element_bitsize) |
7009 | { |
7010 | tree bitpos = bitsize_int (bit_offset); |
7011 | tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs, |
7012 | bitsize, bitpos); |
7013 | |
7014 | gassign *stmt = gimple_build_assign (scalar_dest, rhs); |
7015 | rhs = make_ssa_name (var: scalar_dest, stmt); |
7016 | gimple_assign_set_lhs (gs: stmt, lhs: rhs); |
7017 | gsi_insert_before (gsi, stmt, GSI_SAME_STMT); |
7018 | |
7019 | stmt = gimple_build_assign (scalar_dest, code, lhs, rhs); |
7020 | tree new_name = make_ssa_name (var: scalar_dest, stmt); |
7021 | gimple_assign_set_lhs (gs: stmt, lhs: new_name); |
7022 | gsi_insert_before (gsi, stmt, GSI_SAME_STMT); |
7023 | lhs = new_name; |
7024 | } |
7025 | return lhs; |
7026 | } |
7027 | |
7028 | /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the |
7029 | type of the vector input. */ |
7030 | |
7031 | static internal_fn |
7032 | get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in) |
7033 | { |
7034 | internal_fn mask_reduc_fn; |
7035 | internal_fn mask_len_reduc_fn; |
7036 | |
7037 | switch (reduc_fn) |
7038 | { |
7039 | case IFN_FOLD_LEFT_PLUS: |
7040 | mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS; |
7041 | mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS; |
7042 | break; |
7043 | |
7044 | default: |
7045 | return IFN_LAST; |
7046 | } |
7047 | |
7048 | if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in, |
7049 | OPTIMIZE_FOR_SPEED)) |
7050 | return mask_reduc_fn; |
7051 | if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in, |
7052 | OPTIMIZE_FOR_SPEED)) |
7053 | return mask_len_reduc_fn; |
7054 | return IFN_LAST; |
7055 | } |
7056 | |
/* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
   statement that sets the live-out value. REDUC_DEF_STMT is the phi
   statement. CODE is the operation performed by STMT_INFO and OPS are
   its scalar operands. REDUC_INDEX is the index of the operand in
   OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
   implements in-order reduction, or IFN_LAST if we should open-code it.
   VECTYPE_IN is the type of the vector input. MASKS specifies the masks
   that should be used to control the operation in a fully-masked loop.
   LENS specifies the control lengths to use in a length-controlled loop.
   Returns true on success; the only failure mode is a conditional
   operation inside an SLP reduction, which is rejected up front.  */

static bool
vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_info,
			       gimple_stmt_iterator *gsi,
			       gimple **vec_stmt, slp_tree slp_node,
			       gimple *reduc_def_stmt,
			       code_helper code, internal_fn reduc_fn,
			       tree *ops, int num_ops, tree vectype_in,
			       int reduc_index, vec_loop_masks *masks,
			       vec_loop_lens *lens)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);

  /* Fold-left reductions are generated one vector at a time, so only a
     single copy (or a single SLP group) is ever handled here.  */
  int ncopies;
  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype: vectype_in);

  gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
  gcc_assert (ncopies == 1);

  /* A conditional internal function wraps the scalar operation; extract
     the underlying tree code and remember that its explicit mask operand
     has to be honored below.  */
  bool is_cond_op = false;
  if (!code.is_tree_code ())
    {
      code = conditional_internal_fn_code (internal_fn (code));
      gcc_assert (code != ERROR_MARK);
      is_cond_op = true;
    }

  /* Only binary operations are supported.  */
  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);

  if (slp_node)
    {
      /* Conditional fold-left reductions are not implemented for SLP.  */
      if (is_cond_op)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "fold-left reduction on SLP not supported.\n" );
	  return false;
	}

      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
			    TYPE_VECTOR_SUBPARTS (vectype_in)));
    }

  /* The operands either come from a binary operation or an IFN_COND operation.
     The former is a gimple assign with binary rhs and the latter is a
     gimple call with four arguments.  */
  gcc_assert (num_ops == 2 || num_ops == 4);
  tree op0, opmask;
  if (!is_cond_op)
    op0 = ops[1 - reduc_index];
  else
    {
      /* For an IFN_COND operation the mask is operand 0 and the data
	 operands start at index 2.  */
      op0 = ops[2 + (1 - reduc_index)];
      opmask = ops[0];
      gcc_assert (!slp_node);
    }

  int group_size = 1;
  stmt_vec_info scalar_dest_def_info;
  auto_vec<tree> vec_oprnds0, vec_opmask;
  if (slp_node)
    {
      auto_vec<vec<tree> > vec_defs (2);
      vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
      vec_oprnds0.safe_splice (src: vec_defs[1 - reduc_index]);
      vec_defs[0].release ();
      vec_defs[1].release ();
      group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      /* The live-out value is set by the last scalar stmt in the group.  */
      scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
    }
  else
    {
      vect_get_vec_defs_for_operand (vinfo: loop_vinfo, stmt_info, 1,
				     op: op0, &vec_oprnds0);
      scalar_dest_def_info = stmt_info;

      /* For an IFN_COND_OP we also need the vector mask operand.  */
      if (is_cond_op)
	vect_get_vec_defs_for_operand (vinfo: loop_vinfo, stmt_info, 1,
				       op: opmask, &vec_opmask);
    }

  gimple *sdef = vect_orig_stmt (stmt_info: scalar_dest_def_info)->stmt;
  tree scalar_dest = gimple_get_lhs (sdef);
  tree scalar_type = TREE_TYPE (scalar_dest);
  tree reduc_var = gimple_phi_result (gs: reduc_def_stmt);

  int vec_num = vec_oprnds0.length ();
  gcc_assert (vec_num == 1 || slp_node);
  tree vec_elem_type = TREE_TYPE (vectype_out);
  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));

  /* Identity value merged into lanes that the loop mask disables.  Use
     +0.0, or -0.0 when signed zeros are honored so that merging inactive
     lanes cannot change the sign of the accumulated result; the assert
     guards against sign-dependent rounding, where no such identity
     exists.  */
  tree vector_identity = NULL_TREE;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      vector_identity = build_zero_cst (vectype_out);
      if (!HONOR_SIGNED_ZEROS (vectype_out))
	;
      else
	{
	  gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
	  vector_identity = const_unop (NEGATE_EXPR, vectype_out,
					vector_identity);
	}
    }

  /* Temporary carrying the running reduction value between the chained
     statements generated below.  */
  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
  int i;
  tree def0;
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree mask = NULL_TREE;
      tree len = NULL_TREE;
      tree bias = NULL_TREE;
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
      else if (is_cond_op)
	mask = vec_opmask[0];
      /* For a length-controlled loop fetch the active length and the
	 target's partial load/store bias; when there is no conditional
	 operation an all-ones mask stands in for the mask operand.  */
      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
	{
	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
				   i, 1);
	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
	  bias = build_int_cst (intQI_type_node, biasval);
	  if (!is_cond_op)
	    mask = build_minus_one_cst (truth_type_for (vectype_in));
	}

      /* Handle MINUS by adding the negative.  */
      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
	{
	  tree negated = make_ssa_name (var: vectype_out);
	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
	  def0 = negated;
	}

      /* If the target cannot mask the reduction itself, substitute the
	 identity value for inactive lanes before reducing.  */
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
	  && mask && mask_reduc_fn == IFN_LAST)
	def0 = merge_with_identity (gsi, mask, vectype: vectype_out, vec: def0,
				    identity: vector_identity);

      /* On the first iteration the input is simply the scalar phi
	 result, and for subsequent iterations it is the output of
	 the preceding operation.  */
      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
	{
	  if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
						   def0, mask, len, bias);
	  else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
						   def0, mask);
	  else
	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
						   def0);
	  /* For chained SLP reductions the output of the previous reduction
	     operation serves as the input of the next. For the final statement
	     the output cannot be a temporary - we reuse the original
	     scalar destination of the last statement.  */
	  if (i != vec_num - 1)
	    {
	      gimple_set_lhs (new_stmt, scalar_dest_var);
	      reduc_var = make_ssa_name (var: scalar_dest_var, stmt: new_stmt);
	      gimple_set_lhs (new_stmt, reduc_var);
	    }
	}
      else
	{
	  /* No target support: expand the reduction into a sequence of
	     scalar extract-and-accumulate statements.  */
	  reduc_var = vect_expand_fold_left (gsi, scalar_dest: scalar_dest_var,
					     code: tree_code (code), lhs: reduc_var, vector_rhs: def0,
					     mask);
	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
	  /* Remove the statement, so that we can use the same code paths
	     as for statements that we've just created.  */
	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
	  gsi_remove (&tmp_gsi, true);
	}

      /* The last generated statement replaces the original scalar stmt;
	 earlier ones are inserted as new statements.  */
      if (i == vec_num - 1)
	{
	  gimple_set_lhs (new_stmt, scalar_dest);
	  vect_finish_replace_stmt (loop_vinfo,
				    scalar_dest_def_info,
				    new_stmt);
	}
      else
	vect_finish_stmt_generation (loop_vinfo,
				     scalar_dest_def_info,
				     new_stmt, gsi);

      if (slp_node)
	slp_node->push_vec_def (def: new_stmt);
      else
	{
	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_stmt);
	  *vec_stmt = new_stmt;
	}
    }

  return true;
}
7274 | |
7275 | /* Function is_nonwrapping_integer_induction. |
7276 | |
7277 | Check if STMT_VINO (which is part of loop LOOP) both increments and |
7278 | does not cause overflow. */ |
7279 | |
7280 | static bool |
7281 | is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop) |
7282 | { |
7283 | gphi *phi = as_a <gphi *> (p: stmt_vinfo->stmt); |
7284 | tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo); |
7285 | tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo); |
7286 | tree lhs_type = TREE_TYPE (gimple_phi_result (phi)); |
7287 | widest_int ni, max_loop_value, lhs_max; |
7288 | wi::overflow_type overflow = wi::OVF_NONE; |
7289 | |
7290 | /* Make sure the loop is integer based. */ |
7291 | if (TREE_CODE (base) != INTEGER_CST |
7292 | || TREE_CODE (step) != INTEGER_CST) |
7293 | return false; |
7294 | |
7295 | /* Check that the max size of the loop will not wrap. */ |
7296 | |
7297 | if (TYPE_OVERFLOW_UNDEFINED (lhs_type)) |
7298 | return true; |
7299 | |
7300 | if (! max_stmt_executions (loop, &ni)) |
7301 | return false; |
7302 | |
7303 | max_loop_value = wi::mul (x: wi::to_widest (t: step), y: ni, TYPE_SIGN (lhs_type), |
7304 | overflow: &overflow); |
7305 | if (overflow) |
7306 | return false; |
7307 | |
7308 | max_loop_value = wi::add (x: wi::to_widest (t: base), y: max_loop_value, |
7309 | TYPE_SIGN (lhs_type), overflow: &overflow); |
7310 | if (overflow) |
7311 | return false; |
7312 | |
7313 | return (wi::min_precision (x: max_loop_value, TYPE_SIGN (lhs_type)) |
7314 | <= TYPE_PRECISION (lhs_type)); |
7315 | } |
7316 | |
7317 | /* Check if masking can be supported by inserting a conditional expression. |
7318 | CODE is the code for the operation. COND_FN is the conditional internal |
7319 | function, if it exists. VECTYPE_IN is the type of the vector input. */ |
7320 | static bool |
7321 | use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn, |
7322 | tree vectype_in) |
7323 | { |
7324 | if (cond_fn != IFN_LAST |
7325 | && direct_internal_fn_supported_p (cond_fn, vectype_in, |
7326 | OPTIMIZE_FOR_SPEED)) |
7327 | return false; |
7328 | |
7329 | if (code.is_tree_code ()) |
7330 | switch (tree_code (code)) |
7331 | { |
7332 | case DOT_PROD_EXPR: |
7333 | case SAD_EXPR: |
7334 | return true; |
7335 | |
7336 | default: |
7337 | break; |
7338 | } |
7339 | return false; |
7340 | } |
7341 | |
7342 | /* Insert a conditional expression to enable masked vectorization. CODE is the |
7343 | code for the operation. VOP is the array of operands. MASK is the loop |
7344 | mask. GSI is a statement iterator used to place the new conditional |
7345 | expression. */ |
7346 | static void |
7347 | build_vect_cond_expr (code_helper code, tree vop[3], tree mask, |
7348 | gimple_stmt_iterator *gsi) |
7349 | { |
7350 | switch (tree_code (code)) |
7351 | { |
7352 | case DOT_PROD_EXPR: |
7353 | { |
7354 | tree vectype = TREE_TYPE (vop[1]); |
7355 | tree zero = build_zero_cst (vectype); |
7356 | tree masked_op1 = make_temp_ssa_name (type: vectype, NULL, name: "masked_op1" ); |
7357 | gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, |
7358 | mask, vop[1], zero); |
7359 | gsi_insert_before (gsi, select, GSI_SAME_STMT); |
7360 | vop[1] = masked_op1; |
7361 | break; |
7362 | } |
7363 | |
7364 | case SAD_EXPR: |
7365 | { |
7366 | tree vectype = TREE_TYPE (vop[1]); |
7367 | tree masked_op1 = make_temp_ssa_name (type: vectype, NULL, name: "masked_op1" ); |
7368 | gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, |
7369 | mask, vop[1], vop[0]); |
7370 | gsi_insert_before (gsi, select, GSI_SAME_STMT); |
7371 | vop[1] = masked_op1; |
7372 | break; |
7373 | } |
7374 | |
7375 | default: |
7376 | gcc_unreachable (); |
7377 | } |
7378 | } |
7379 | |
7380 | /* Function vectorizable_reduction. |
7381 | |
7382 | Check if STMT_INFO performs a reduction operation that can be vectorized. |
7383 | If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized |
7384 | stmt to replace it, put it in VEC_STMT, and insert it at GSI. |
7385 | Return true if STMT_INFO is vectorizable in this way. |
7386 | |
7387 | This function also handles reduction idioms (patterns) that have been |
7388 | recognized in advance during vect_pattern_recog. In this case, STMT_INFO |
7389 | may be of this form: |
7390 | X = pattern_expr (arg0, arg1, ..., X) |
7391 | and its STMT_VINFO_RELATED_STMT points to the last stmt in the original |
7392 | sequence that had been detected and replaced by the pattern-stmt |
7393 | (STMT_INFO). |
7394 | |
7395 | This function also handles reduction of condition expressions, for example: |
7396 | for (int i = 0; i < N; i++) |
7397 | if (a[i] < value) |
7398 | last = a[i]; |
7399 | This is handled by vectorising the loop and creating an additional vector |
7400 | containing the loop indexes for which "a[i] < value" was true. In the |
7401 | function epilogue this is reduced to a single max value and then used to |
7402 | index into the vector of results. |
7403 | |
7404 | In some cases of reduction patterns, the type of the reduction variable X is |
7405 | different than the type of the other arguments of STMT_INFO. |
7406 | In such cases, the vectype that is used when transforming STMT_INFO into |
7407 | a vector stmt is different than the vectype that is used to determine the |
7408 | vectorization factor, because it consists of a different number of elements |
7409 | than the actual number of elements that are being operated upon in parallel. |
7410 | |
7411 | For example, consider an accumulation of shorts into an int accumulator. |
7412 | On some targets it's possible to vectorize this pattern operating on 8 |
7413 | shorts at a time (hence, the vectype for purposes of determining the |
7414 | vectorization factor should be V8HI); on the other hand, the vectype that |
7415 | is used to create the vector form is actually V4SI (the type of the result). |
7416 | |
7417 | Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that |
7418 | indicates what is the actual level of parallelism (V8HI in the example), so |
7419 | that the right vectorization factor would be derived. This vectype |
7420 | corresponds to the type of arguments to the reduction stmt, and should *NOT* |
7421 | be used to create the vectorized stmt. The right vectype for the vectorized |
7422 | stmt is obtained from the type of the result X: |
7423 | get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) |
7424 | |
7425 | This means that, contrary to "regular" reductions (or "regular" stmts in |
7426 | general), the following equation: |
7427 | STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) |
7428 | does *NOT* necessarily hold for reduction patterns. */ |
7429 | |
7430 | bool |
7431 | vectorizable_reduction (loop_vec_info loop_vinfo, |
7432 | stmt_vec_info stmt_info, slp_tree slp_node, |
7433 | slp_instance slp_node_instance, |
7434 | stmt_vector_for_cost *cost_vec) |
7435 | { |
7436 | tree vectype_in = NULL_TREE; |
7437 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
7438 | enum vect_def_type cond_reduc_dt = vect_unknown_def_type; |
7439 | stmt_vec_info cond_stmt_vinfo = NULL; |
7440 | int i; |
7441 | int ncopies; |
7442 | bool single_defuse_cycle = false; |
7443 | bool nested_cycle = false; |
7444 | bool double_reduc = false; |
7445 | int vec_num; |
7446 | tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; |
7447 | tree cond_reduc_val = NULL_TREE; |
7448 | |
7449 | /* Make sure it was already recognized as a reduction computation. */ |
7450 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def |
7451 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def |
7452 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) |
7453 | return false; |
7454 | |
7455 | /* The stmt we store reduction analysis meta on. */ |
7456 | stmt_vec_info reduc_info = info_for_reduction (vinfo: loop_vinfo, stmt_info); |
7457 | reduc_info->is_reduc_info = true; |
7458 | |
7459 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) |
7460 | { |
7461 | if (is_a <gphi *> (p: stmt_info->stmt)) |
7462 | { |
7463 | if (slp_node) |
7464 | { |
7465 | /* We eventually need to set a vector type on invariant |
7466 | arguments. */ |
7467 | unsigned j; |
7468 | slp_tree child; |
7469 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) |
7470 | if (!vect_maybe_update_slp_op_vectype |
7471 | (child, SLP_TREE_VECTYPE (slp_node))) |
7472 | { |
7473 | if (dump_enabled_p ()) |
7474 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7475 | "incompatible vector types for " |
7476 | "invariants\n" ); |
7477 | return false; |
7478 | } |
7479 | } |
7480 | /* Analysis for double-reduction is done on the outer |
7481 | loop PHI, nested cycles have no further restrictions. */ |
7482 | STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; |
7483 | } |
7484 | else |
7485 | STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; |
7486 | return true; |
7487 | } |
7488 | |
7489 | stmt_vec_info orig_stmt_of_analysis = stmt_info; |
7490 | stmt_vec_info phi_info = stmt_info; |
7491 | if (!is_a <gphi *> (p: stmt_info->stmt)) |
7492 | { |
7493 | STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; |
7494 | return true; |
7495 | } |
7496 | if (slp_node) |
7497 | { |
7498 | slp_node_instance->reduc_phis = slp_node; |
7499 | /* ??? We're leaving slp_node to point to the PHIs, we only |
7500 | need it to get at the number of vector stmts which wasn't |
7501 | yet initialized for the instance root. */ |
7502 | } |
7503 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
7504 | { |
7505 | use_operand_p use_p; |
7506 | gimple *use_stmt; |
7507 | bool res = single_imm_use (var: gimple_phi_result (gs: stmt_info->stmt), |
7508 | use_p: &use_p, stmt: &use_stmt); |
7509 | gcc_assert (res); |
7510 | phi_info = loop_vinfo->lookup_stmt (use_stmt); |
7511 | } |
7512 | |
7513 | /* PHIs should not participate in patterns. */ |
7514 | gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); |
7515 | gphi *reduc_def_phi = as_a <gphi *> (p: phi_info->stmt); |
7516 | |
7517 | /* Verify following REDUC_IDX from the latch def leads us back to the PHI |
7518 | and compute the reduction chain length. Discover the real |
7519 | reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */ |
7520 | tree reduc_def |
7521 | = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, |
7522 | loop_latch_edge |
7523 | (gimple_bb (reduc_def_phi)->loop_father)); |
7524 | unsigned reduc_chain_length = 0; |
7525 | bool only_slp_reduc_chain = true; |
7526 | stmt_info = NULL; |
7527 | slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL; |
7528 | while (reduc_def != PHI_RESULT (reduc_def_phi)) |
7529 | { |
7530 | stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); |
7531 | stmt_vec_info vdef = vect_stmt_to_vectorize (stmt_info: def); |
7532 | if (STMT_VINFO_REDUC_IDX (vdef) == -1) |
7533 | { |
7534 | if (dump_enabled_p ()) |
7535 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7536 | "reduction chain broken by patterns.\n" ); |
7537 | return false; |
7538 | } |
7539 | if (!REDUC_GROUP_FIRST_ELEMENT (vdef)) |
7540 | only_slp_reduc_chain = false; |
7541 | /* For epilogue generation live members of the chain need |
7542 | to point back to the PHI via their original stmt for |
7543 | info_for_reduction to work. For SLP we need to look at |
7544 | all lanes here - even though we only will vectorize from |
7545 | the SLP node with live lane zero the other live lanes also |
7546 | need to be identified as part of a reduction to be able |
7547 | to skip code generation for them. */ |
7548 | if (slp_for_stmt_info) |
7549 | { |
7550 | for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info)) |
7551 | if (STMT_VINFO_LIVE_P (s)) |
7552 | STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info; |
7553 | } |
7554 | else if (STMT_VINFO_LIVE_P (vdef)) |
7555 | STMT_VINFO_REDUC_DEF (def) = phi_info; |
7556 | gimple_match_op op; |
7557 | if (!gimple_extract_op (vdef->stmt, &op)) |
7558 | { |
7559 | if (dump_enabled_p ()) |
7560 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7561 | "reduction chain includes unsupported" |
7562 | " statement type.\n" ); |
7563 | return false; |
7564 | } |
7565 | if (CONVERT_EXPR_CODE_P (op.code)) |
7566 | { |
7567 | if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))) |
7568 | { |
7569 | if (dump_enabled_p ()) |
7570 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7571 | "conversion in the reduction chain.\n" ); |
7572 | return false; |
7573 | } |
7574 | } |
7575 | else if (!stmt_info) |
7576 | /* First non-conversion stmt. */ |
7577 | stmt_info = vdef; |
7578 | reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)]; |
7579 | reduc_chain_length++; |
7580 | if (!stmt_info && slp_node) |
7581 | slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; |
7582 | } |
7583 | /* PHIs should not participate in patterns. */ |
7584 | gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); |
7585 | |
7586 | if (nested_in_vect_loop_p (loop, stmt_info)) |
7587 | { |
7588 | loop = loop->inner; |
7589 | nested_cycle = true; |
7590 | } |
7591 | |
7592 | /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last |
7593 | element. */ |
7594 | if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
7595 | { |
7596 | gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info)); |
7597 | stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); |
7598 | } |
7599 | if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
7600 | gcc_assert (slp_node |
7601 | && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); |
7602 | |
7603 | /* 1. Is vectorizable reduction? */ |
7604 | /* Not supportable if the reduction variable is used in the loop, unless |
7605 | it's a reduction chain. */ |
7606 | if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer |
7607 | && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
7608 | return false; |
7609 | |
7610 | /* Reductions that are not used even in an enclosing outer-loop, |
7611 | are expected to be "live" (used out of the loop). */ |
7612 | if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope |
7613 | && !STMT_VINFO_LIVE_P (stmt_info)) |
7614 | return false; |
7615 | |
7616 | /* 2. Has this been recognized as a reduction pattern? |
7617 | |
7618 | Check if STMT represents a pattern that has been recognized |
7619 | in earlier analysis stages. For stmts that represent a pattern, |
7620 | the STMT_VINFO_RELATED_STMT field records the last stmt in |
7621 | the original sequence that constitutes the pattern. */ |
7622 | |
7623 | stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); |
7624 | if (orig_stmt_info) |
7625 | { |
7626 | gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); |
7627 | gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); |
7628 | } |
7629 | |
7630 | /* 3. Check the operands of the operation. The first operands are defined |
7631 | inside the loop body. The last operand is the reduction variable, |
7632 | which is defined by the loop-header-phi. */ |
7633 | |
7634 | tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); |
7635 | STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; |
7636 | gimple_match_op op; |
7637 | if (!gimple_extract_op (stmt_info->stmt, &op)) |
7638 | gcc_unreachable (); |
7639 | bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR |
7640 | || op.code == WIDEN_SUM_EXPR |
7641 | || op.code == SAD_EXPR); |
7642 | |
7643 | if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type) |
7644 | && !SCALAR_FLOAT_TYPE_P (op.type)) |
7645 | return false; |
7646 | |
7647 | /* Do not try to vectorize bit-precision reductions. */ |
7648 | if (!type_has_mode_precision_p (t: op.type)) |
7649 | return false; |
7650 | |
7651 | /* For lane-reducing ops we're reducing the number of reduction PHIs |
7652 | which means the only use of that may be in the lane-reducing operation. */ |
7653 | if (lane_reduc_code_p |
7654 | && reduc_chain_length != 1 |
7655 | && !only_slp_reduc_chain) |
7656 | { |
7657 | if (dump_enabled_p ()) |
7658 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7659 | "lane-reducing reduction with extra stmts.\n" ); |
7660 | return false; |
7661 | } |
7662 | |
7663 | /* All uses but the last are expected to be defined in the loop. |
7664 | The last use is the reduction variable. In case of nested cycle this |
7665 | assumption is not true: we use reduc_index to record the index of the |
7666 | reduction variable. */ |
7667 | slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops); |
7668 | tree *vectype_op = XALLOCAVEC (tree, op.num_ops); |
7669 | /* We need to skip an extra operand for COND_EXPRs with embedded |
7670 | comparison. */ |
7671 | unsigned opno_adjust = 0; |
7672 | if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0])) |
7673 | opno_adjust = 1; |
7674 | for (i = 0; i < (int) op.num_ops; i++) |
7675 | { |
7676 | /* The condition of COND_EXPR is checked in vectorizable_condition(). */ |
7677 | if (i == 0 && op.code == COND_EXPR) |
7678 | continue; |
7679 | |
7680 | stmt_vec_info def_stmt_info; |
7681 | enum vect_def_type dt; |
7682 | if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info, |
7683 | i + opno_adjust, &op.ops[i], &slp_op[i], &dt, |
7684 | &vectype_op[i], &def_stmt_info)) |
7685 | { |
7686 | if (dump_enabled_p ()) |
7687 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7688 | "use not simple.\n" ); |
7689 | return false; |
7690 | } |
7691 | if (i == STMT_VINFO_REDUC_IDX (stmt_info)) |
7692 | continue; |
7693 | |
7694 | /* For an IFN_COND_OP we might hit the reduction definition operand |
7695 | twice (once as definition, once as else). */ |
7696 | if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)]) |
7697 | continue; |
7698 | |
7699 | /* There should be only one cycle def in the stmt, the one |
7700 | leading to reduc_def. */ |
7701 | if (VECTORIZABLE_CYCLE_DEF (dt)) |
7702 | return false; |
7703 | |
7704 | if (!vectype_op[i]) |
7705 | vectype_op[i] |
7706 | = get_vectype_for_scalar_type (loop_vinfo, |
7707 | TREE_TYPE (op.ops[i]), slp_op[i]); |
7708 | |
7709 | /* To properly compute ncopies we are interested in the widest |
7710 | non-reduction input type in case we're looking at a widening |
7711 | accumulation that we later handle in vect_transform_reduction. */ |
7712 | if (lane_reduc_code_p |
7713 | && vectype_op[i] |
7714 | && (!vectype_in |
7715 | || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) |
7716 | < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i])))))) |
7717 | vectype_in = vectype_op[i]; |
7718 | |
7719 | /* Record how the non-reduction-def value of COND_EXPR is defined. |
7720 | ??? For a chain of multiple CONDs we'd have to match them up all. */ |
7721 | if (op.code == COND_EXPR && reduc_chain_length == 1) |
7722 | { |
7723 | if (dt == vect_constant_def) |
7724 | { |
7725 | cond_reduc_dt = dt; |
7726 | cond_reduc_val = op.ops[i]; |
7727 | } |
7728 | else if (dt == vect_induction_def |
7729 | && def_stmt_info |
7730 | && is_nonwrapping_integer_induction (stmt_vinfo: def_stmt_info, loop)) |
7731 | { |
7732 | cond_reduc_dt = dt; |
7733 | cond_stmt_vinfo = def_stmt_info; |
7734 | } |
7735 | } |
7736 | } |
7737 | if (!vectype_in) |
7738 | vectype_in = STMT_VINFO_VECTYPE (phi_info); |
7739 | STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; |
7740 | |
7741 | enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); |
7742 | STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; |
7743 | /* If we have a condition reduction, see if we can simplify it further. */ |
7744 | if (v_reduc_type == COND_REDUCTION) |
7745 | { |
7746 | if (slp_node) |
7747 | return false; |
7748 | |
7749 | /* When the condition uses the reduction value in the condition, fail. */ |
7750 | if (STMT_VINFO_REDUC_IDX (stmt_info) == 0) |
7751 | { |
7752 | if (dump_enabled_p ()) |
7753 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7754 | "condition depends on previous iteration\n" ); |
7755 | return false; |
7756 | } |
7757 | |
7758 | if (reduc_chain_length == 1 |
7759 | && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in, |
7760 | OPTIMIZE_FOR_SPEED) |
7761 | || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST, |
7762 | vectype_in, |
7763 | OPTIMIZE_FOR_SPEED))) |
7764 | { |
7765 | if (dump_enabled_p ()) |
7766 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7767 | "optimizing condition reduction with" |
7768 | " FOLD_EXTRACT_LAST.\n" ); |
7769 | STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION; |
7770 | } |
7771 | else if (cond_reduc_dt == vect_induction_def) |
7772 | { |
7773 | tree base |
7774 | = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); |
7775 | tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); |
7776 | |
7777 | gcc_assert (TREE_CODE (base) == INTEGER_CST |
7778 | && TREE_CODE (step) == INTEGER_CST); |
7779 | cond_reduc_val = NULL_TREE; |
7780 | enum tree_code cond_reduc_op_code = ERROR_MARK; |
7781 | tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo)); |
7782 | if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base))) |
7783 | ; |
7784 | /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR |
7785 | above base; punt if base is the minimum value of the type for |
7786 | MAX_EXPR or maximum value of the type for MIN_EXPR for now. */ |
7787 | else if (tree_int_cst_sgn (step) == -1) |
7788 | { |
7789 | cond_reduc_op_code = MIN_EXPR; |
7790 | if (tree_int_cst_sgn (base) == -1) |
7791 | cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); |
7792 | else if (tree_int_cst_lt (t1: base, |
7793 | TYPE_MAX_VALUE (TREE_TYPE (base)))) |
7794 | cond_reduc_val |
7795 | = int_const_binop (PLUS_EXPR, base, integer_one_node); |
7796 | } |
7797 | else |
7798 | { |
7799 | cond_reduc_op_code = MAX_EXPR; |
7800 | if (tree_int_cst_sgn (base) == 1) |
7801 | cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); |
7802 | else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)), |
7803 | t2: base)) |
7804 | cond_reduc_val |
7805 | = int_const_binop (MINUS_EXPR, base, integer_one_node); |
7806 | } |
7807 | if (cond_reduc_val) |
7808 | { |
7809 | if (dump_enabled_p ()) |
7810 | dump_printf_loc (MSG_NOTE, vect_location, |
7811 | "condition expression based on " |
7812 | "integer induction.\n" ); |
7813 | STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code; |
7814 | STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) |
7815 | = cond_reduc_val; |
7816 | STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION; |
7817 | } |
7818 | } |
7819 | else if (cond_reduc_dt == vect_constant_def) |
7820 | { |
7821 | enum vect_def_type cond_initial_dt; |
7822 | tree cond_initial_val = vect_phi_initial_value (phi: reduc_def_phi); |
7823 | vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); |
7824 | if (cond_initial_dt == vect_constant_def |
7825 | && types_compatible_p (TREE_TYPE (cond_initial_val), |
7826 | TREE_TYPE (cond_reduc_val))) |
7827 | { |
7828 | tree e = fold_binary (LE_EXPR, boolean_type_node, |
7829 | cond_initial_val, cond_reduc_val); |
7830 | if (e && (integer_onep (e) || integer_zerop (e))) |
7831 | { |
7832 | if (dump_enabled_p ()) |
7833 | dump_printf_loc (MSG_NOTE, vect_location, |
7834 | "condition expression based on " |
7835 | "compile time constant.\n" ); |
7836 | /* Record reduction code at analysis stage. */ |
7837 | STMT_VINFO_REDUC_CODE (reduc_info) |
7838 | = integer_onep (e) ? MAX_EXPR : MIN_EXPR; |
7839 | STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION; |
7840 | } |
7841 | } |
7842 | } |
7843 | } |
7844 | |
7845 | if (STMT_VINFO_LIVE_P (phi_info)) |
7846 | return false; |
7847 | |
7848 | if (slp_node) |
7849 | ncopies = 1; |
7850 | else |
7851 | ncopies = vect_get_num_copies (loop_vinfo, vectype: vectype_in); |
7852 | |
7853 | gcc_assert (ncopies >= 1); |
7854 | |
7855 | poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (node: vectype_out); |
7856 | |
7857 | if (nested_cycle) |
7858 | { |
7859 | gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) |
7860 | == vect_double_reduction_def); |
7861 | double_reduc = true; |
7862 | } |
7863 | |
7864 | /* 4.2. Check support for the epilog operation. |
7865 | |
7866 | If STMT represents a reduction pattern, then the type of the |
7867 | reduction variable may be different than the type of the rest |
7868 | of the arguments. For example, consider the case of accumulation |
7869 | of shorts into an int accumulator; The original code: |
7870 | S1: int_a = (int) short_a; |
7871 | orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; |
7872 | |
7873 | was replaced with: |
7874 | STMT: int_acc = widen_sum <short_a, int_acc> |
7875 | |
7876 | This means that: |
7877 | 1. The tree-code that is used to create the vector operation in the |
7878 | epilog code (that reduces the partial results) is not the |
7879 | tree-code of STMT, but is rather the tree-code of the original |
7880 | stmt from the pattern that STMT is replacing. I.e, in the example |
7881 | above we want to use 'widen_sum' in the loop, but 'plus' in the |
7882 | epilog. |
7883 | 2. The type (mode) we use to check available target support |
7884 | for the vector operation to be created in the *epilog*, is |
7885 | determined by the type of the reduction variable (in the example |
7886 | above we'd check this: optab_handler (plus_optab, vect_int_mode])). |
7887 | However the type (mode) we use to check available target support |
7888 | for the vector operation to be created *inside the loop*, is |
7889 | determined by the type of the other arguments to STMT (in the |
7890 | example we'd check this: optab_handler (widen_sum_optab, |
7891 | vect_short_mode)). |
7892 | |
7893 | This is contrary to "regular" reductions, in which the types of all |
7894 | the arguments are the same as the type of the reduction variable. |
7895 | For "regular" reductions we can therefore use the same vector type |
7896 | (and also the same tree-code) when generating the epilog code and |
7897 | when generating the code inside the loop. */ |
7898 | |
7899 | code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info); |
7900 | |
7901 | /* If conversion might have created a conditional operation like |
7902 | IFN_COND_ADD already. Use the internal code for the following checks. */ |
7903 | if (orig_code.is_internal_fn ()) |
7904 | { |
7905 | tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code)); |
7906 | orig_code = new_code != ERROR_MARK ? new_code : orig_code; |
7907 | } |
7908 | |
7909 | STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; |
7910 | |
7911 | vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); |
7912 | if (reduction_type == TREE_CODE_REDUCTION) |
7913 | { |
7914 | /* Check whether it's ok to change the order of the computation. |
7915 | Generally, when vectorizing a reduction we change the order of the |
7916 | computation. This may change the behavior of the program in some |
7917 | cases, so we need to check that this is ok. One exception is when |
7918 | vectorizing an outer-loop: the inner-loop is executed sequentially, |
7919 | and therefore vectorizing reductions in the inner-loop during |
7920 | outer-loop vectorization is safe. Likewise when we are vectorizing |
7921 | a series of reductions using SLP and the VF is one the reductions |
7922 | are performed in scalar order. */ |
7923 | if (slp_node |
7924 | && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) |
7925 | && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u)) |
7926 | ; |
7927 | else if (needs_fold_left_reduction_p (type: op.type, code: orig_code)) |
7928 | { |
7929 | /* When vectorizing a reduction chain w/o SLP the reduction PHI |
7930 | is not directy used in stmt. */ |
7931 | if (!only_slp_reduc_chain |
7932 | && reduc_chain_length != 1) |
7933 | { |
7934 | if (dump_enabled_p ()) |
7935 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7936 | "in-order reduction chain without SLP.\n" ); |
7937 | return false; |
7938 | } |
7939 | STMT_VINFO_REDUC_TYPE (reduc_info) |
7940 | = reduction_type = FOLD_LEFT_REDUCTION; |
7941 | } |
7942 | else if (!commutative_binary_op_p (orig_code, op.type) |
7943 | || !associative_binary_op_p (orig_code, op.type)) |
7944 | { |
7945 | if (dump_enabled_p ()) |
7946 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7947 | "reduction: not commutative/associative\n" ); |
7948 | return false; |
7949 | } |
7950 | } |
7951 | |
7952 | if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) |
7953 | && ncopies > 1) |
7954 | { |
7955 | if (dump_enabled_p ()) |
7956 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7957 | "multiple types in double reduction or condition " |
7958 | "reduction or fold-left reduction.\n" ); |
7959 | return false; |
7960 | } |
7961 | |
7962 | internal_fn reduc_fn = IFN_LAST; |
7963 | if (reduction_type == TREE_CODE_REDUCTION |
7964 | || reduction_type == FOLD_LEFT_REDUCTION |
7965 | || reduction_type == INTEGER_INDUC_COND_REDUCTION |
7966 | || reduction_type == CONST_COND_REDUCTION) |
7967 | { |
7968 | if (reduction_type == FOLD_LEFT_REDUCTION |
7969 | ? fold_left_reduction_fn (code: orig_code, reduc_fn: &reduc_fn) |
7970 | : reduction_fn_for_scalar_code (code: orig_code, reduc_fn: &reduc_fn)) |
7971 | { |
7972 | if (reduc_fn != IFN_LAST |
7973 | && !direct_internal_fn_supported_p (reduc_fn, vectype_out, |
7974 | OPTIMIZE_FOR_SPEED)) |
7975 | { |
7976 | if (dump_enabled_p ()) |
7977 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7978 | "reduc op not supported by target.\n" ); |
7979 | |
7980 | reduc_fn = IFN_LAST; |
7981 | } |
7982 | } |
7983 | else |
7984 | { |
7985 | if (!nested_cycle || double_reduc) |
7986 | { |
7987 | if (dump_enabled_p ()) |
7988 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7989 | "no reduc code for scalar code.\n" ); |
7990 | |
7991 | return false; |
7992 | } |
7993 | } |
7994 | } |
7995 | else if (reduction_type == COND_REDUCTION) |
7996 | { |
7997 | int scalar_precision |
7998 | = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type)); |
7999 | cr_index_scalar_type = make_unsigned_type (scalar_precision); |
8000 | cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type, |
8001 | vectype_out); |
8002 | |
8003 | if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, |
8004 | OPTIMIZE_FOR_SPEED)) |
8005 | reduc_fn = IFN_REDUC_MAX; |
8006 | } |
8007 | STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn; |
8008 | |
8009 | if (reduction_type != EXTRACT_LAST_REDUCTION |
8010 | && (!nested_cycle || double_reduc) |
8011 | && reduc_fn == IFN_LAST |
8012 | && !nunits_out.is_constant ()) |
8013 | { |
8014 | if (dump_enabled_p ()) |
8015 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8016 | "missing target support for reduction on" |
8017 | " variable-length vectors.\n" ); |
8018 | return false; |
8019 | } |
8020 | |
8021 | /* For SLP reductions, see if there is a neutral value we can use. */ |
8022 | tree neutral_op = NULL_TREE; |
8023 | if (slp_node) |
8024 | { |
8025 | tree initial_value = NULL_TREE; |
8026 | if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL) |
8027 | initial_value = vect_phi_initial_value (phi: reduc_def_phi); |
8028 | neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out), |
8029 | code: orig_code, initial_value); |
8030 | } |
8031 | |
8032 | if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) |
8033 | { |
8034 | /* We can't support in-order reductions of code such as this: |
8035 | |
8036 | for (int i = 0; i < n1; ++i) |
8037 | for (int j = 0; j < n2; ++j) |
8038 | l += a[j]; |
8039 | |
8040 | since GCC effectively transforms the loop when vectorizing: |
8041 | |
8042 | for (int i = 0; i < n1 / VF; ++i) |
8043 | for (int j = 0; j < n2; ++j) |
8044 | for (int k = 0; k < VF; ++k) |
8045 | l += a[j]; |
8046 | |
8047 | which is a reassociation of the original operation. */ |
8048 | if (dump_enabled_p ()) |
8049 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8050 | "in-order double reduction not supported.\n" ); |
8051 | |
8052 | return false; |
8053 | } |
8054 | |
8055 | if (reduction_type == FOLD_LEFT_REDUCTION |
8056 | && slp_node |
8057 | && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
8058 | { |
8059 | /* We cannot use in-order reductions in this case because there is |
8060 | an implicit reassociation of the operations involved. */ |
8061 | if (dump_enabled_p ()) |
8062 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8063 | "in-order unchained SLP reductions not supported.\n" ); |
8064 | return false; |
8065 | } |
8066 | |
8067 | /* For double reductions, and for SLP reductions with a neutral value, |
8068 | we construct a variable-length initial vector by loading a vector |
8069 | full of the neutral value and then shift-and-inserting the start |
8070 | values into the low-numbered elements. */ |
8071 | if ((double_reduc || neutral_op) |
8072 | && !nunits_out.is_constant () |
8073 | && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT, |
8074 | vectype_out, OPTIMIZE_FOR_SPEED)) |
8075 | { |
8076 | if (dump_enabled_p ()) |
8077 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8078 | "reduction on variable-length vectors requires" |
8079 | " target support for a vector-shift-and-insert" |
8080 | " operation.\n" ); |
8081 | return false; |
8082 | } |
8083 | |
8084 | /* Check extra constraints for variable-length unchained SLP reductions. */ |
8085 | if (slp_node |
8086 | && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) |
8087 | && !nunits_out.is_constant ()) |
8088 | { |
8089 | /* We checked above that we could build the initial vector when |
8090 | there's a neutral element value. Check here for the case in |
8091 | which each SLP statement has its own initial value and in which |
8092 | that value needs to be repeated for every instance of the |
8093 | statement within the initial vector. */ |
8094 | unsigned int group_size = SLP_TREE_LANES (slp_node); |
8095 | if (!neutral_op |
8096 | && !can_duplicate_and_interleave_p (loop_vinfo, group_size, |
8097 | TREE_TYPE (vectype_out))) |
8098 | { |
8099 | if (dump_enabled_p ()) |
8100 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8101 | "unsupported form of SLP reduction for" |
8102 | " variable-length vectors: cannot build" |
8103 | " initial vector.\n" ); |
8104 | return false; |
8105 | } |
8106 | /* The epilogue code relies on the number of elements being a multiple |
8107 | of the group size. The duplicate-and-interleave approach to setting |
8108 | up the initial vector does too. */ |
8109 | if (!multiple_p (a: nunits_out, b: group_size)) |
8110 | { |
8111 | if (dump_enabled_p ()) |
8112 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8113 | "unsupported form of SLP reduction for" |
8114 | " variable-length vectors: the vector size" |
8115 | " is not a multiple of the number of results.\n" ); |
8116 | return false; |
8117 | } |
8118 | } |
8119 | |
8120 | if (reduction_type == COND_REDUCTION) |
8121 | { |
8122 | widest_int ni; |
8123 | |
8124 | if (! max_loop_iterations (loop, &ni)) |
8125 | { |
8126 | if (dump_enabled_p ()) |
8127 | dump_printf_loc (MSG_NOTE, vect_location, |
8128 | "loop count not known, cannot create cond " |
8129 | "reduction.\n" ); |
8130 | return false; |
8131 | } |
8132 | /* Convert backedges to iterations. */ |
8133 | ni += 1; |
8134 | |
8135 | /* The additional index will be the same type as the condition. Check |
8136 | that the loop can fit into this less one (because we'll use up the |
8137 | zero slot for when there are no matches). */ |
8138 | tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type); |
8139 | if (wi::geu_p (x: ni, y: wi::to_widest (t: max_index))) |
8140 | { |
8141 | if (dump_enabled_p ()) |
8142 | dump_printf_loc (MSG_NOTE, vect_location, |
8143 | "loop size is greater than data size.\n" ); |
8144 | return false; |
8145 | } |
8146 | } |
8147 | |
8148 | /* In case the vectorization factor (VF) is bigger than the number |
8149 | of elements that we can fit in a vectype (nunits), we have to generate |
8150 | more than one vector stmt - i.e - we need to "unroll" the |
8151 | vector stmt by a factor VF/nunits. For more details see documentation |
8152 | in vectorizable_operation. */ |
8153 | |
8154 | /* If the reduction is used in an outer loop we need to generate |
8155 | VF intermediate results, like so (e.g. for ncopies=2): |
8156 | r0 = phi (init, r0) |
8157 | r1 = phi (init, r1) |
8158 | r0 = x0 + r0; |
8159 | r1 = x1 + r1; |
8160 | (i.e. we generate VF results in 2 registers). |
8161 | In this case we have a separate def-use cycle for each copy, and therefore |
8162 | for each copy we get the vector def for the reduction variable from the |
8163 | respective phi node created for this copy. |
8164 | |
8165 | Otherwise (the reduction is unused in the loop nest), we can combine |
8166 | together intermediate results, like so (e.g. for ncopies=2): |
8167 | r = phi (init, r) |
8168 | r = x0 + r; |
8169 | r = x1 + r; |
8170 | (i.e. we generate VF/2 results in a single register). |
8171 | In this case for each copy we get the vector def for the reduction variable |
8172 | from the vectorized reduction operation generated in the previous iteration. |
8173 | |
8174 | This only works when we see both the reduction PHI and its only consumer |
8175 | in vectorizable_reduction and there are no intermediate stmts |
8176 | participating. When unrolling we want each unrolled iteration to have its |
8177 | own reduction accumulator since one of the main goals of unrolling a |
8178 | reduction is to reduce the aggregate loop-carried latency. */ |
8179 | if (ncopies > 1 |
8180 | && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) |
8181 | && reduc_chain_length == 1 |
8182 | && loop_vinfo->suggested_unroll_factor == 1) |
8183 | single_defuse_cycle = true; |
8184 | |
8185 | if (single_defuse_cycle || lane_reduc_code_p) |
8186 | { |
8187 | gcc_assert (op.code != COND_EXPR); |
8188 | |
8189 | /* 4. Supportable by target? */ |
8190 | bool ok = true; |
8191 | |
8192 | /* 4.1. check support for the operation in the loop |
8193 | |
8194 | This isn't necessary for the lane reduction codes, since they |
8195 | can only be produced by pattern matching, and it's up to the |
8196 | pattern matcher to test for support. The main reason for |
8197 | specifically skipping this step is to avoid rechecking whether |
8198 | mixed-sign dot-products can be implemented using signed |
8199 | dot-products. */ |
8200 | machine_mode vec_mode = TYPE_MODE (vectype_in); |
8201 | if (!lane_reduc_code_p |
8202 | && !directly_supported_p (op.code, vectype_in, optab_vector)) |
8203 | { |
8204 | if (dump_enabled_p ()) |
8205 | dump_printf (MSG_NOTE, "op not supported by target.\n" ); |
8206 | if (maybe_ne (a: GET_MODE_SIZE (mode: vec_mode), UNITS_PER_WORD) |
8207 | || !vect_can_vectorize_without_simd_p (op.code)) |
8208 | ok = false; |
8209 | else |
8210 | if (dump_enabled_p ()) |
8211 | dump_printf (MSG_NOTE, "proceeding using word mode.\n" ); |
8212 | } |
8213 | |
8214 | if (vect_emulated_vector_p (vectype_in) |
8215 | && !vect_can_vectorize_without_simd_p (op.code)) |
8216 | { |
8217 | if (dump_enabled_p ()) |
8218 | dump_printf (MSG_NOTE, "using word mode not possible.\n" ); |
8219 | return false; |
8220 | } |
8221 | |
8222 | /* lane-reducing operations have to go through vect_transform_reduction. |
8223 | For the other cases try without the single cycle optimization. */ |
8224 | if (!ok) |
8225 | { |
8226 | if (lane_reduc_code_p) |
8227 | return false; |
8228 | else |
8229 | single_defuse_cycle = false; |
8230 | } |
8231 | } |
8232 | STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; |
8233 | |
8234 | /* If the reduction stmt is one of the patterns that have lane |
8235 | reduction embedded we cannot handle the case of ! single_defuse_cycle. */ |
8236 | if ((ncopies > 1 && ! single_defuse_cycle) |
8237 | && lane_reduc_code_p) |
8238 | { |
8239 | if (dump_enabled_p ()) |
8240 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8241 | "multi def-use cycle not possible for lane-reducing " |
8242 | "reduction operation\n" ); |
8243 | return false; |
8244 | } |
8245 | |
8246 | if (slp_node |
8247 | && !(!single_defuse_cycle |
8248 | && !lane_reduc_code_p |
8249 | && reduction_type != FOLD_LEFT_REDUCTION)) |
8250 | for (i = 0; i < (int) op.num_ops; i++) |
8251 | if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) |
8252 | { |
8253 | if (dump_enabled_p ()) |
8254 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8255 | "incompatible vector types for invariants\n" ); |
8256 | return false; |
8257 | } |
8258 | |
8259 | if (slp_node) |
8260 | vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
8261 | else |
8262 | vec_num = 1; |
8263 | |
8264 | vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, |
8265 | reduction_type, ncopies, cost_vec); |
8266 | /* Cost the reduction op inside the loop if transformed via |
8267 | vect_transform_reduction. Otherwise this is costed by the |
8268 | separate vectorizable_* routines. */ |
8269 | if (single_defuse_cycle || lane_reduc_code_p) |
8270 | { |
8271 | int factor = 1; |
8272 | if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info)) |
8273 | /* Three dot-products and a subtraction. */ |
8274 | factor = 4; |
8275 | record_stmt_cost (body_cost_vec: cost_vec, count: ncopies * factor, kind: vector_stmt, |
8276 | stmt_info, misalign: 0, where: vect_body); |
8277 | } |
8278 | |
8279 | if (dump_enabled_p () |
8280 | && reduction_type == FOLD_LEFT_REDUCTION) |
8281 | dump_printf_loc (MSG_NOTE, vect_location, |
8282 | "using an in-order (fold-left) reduction.\n" ); |
8283 | STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; |
8284 | /* All but single defuse-cycle optimized, lane-reducing and fold-left |
8285 | reductions go through their own vectorizable_* routines. */ |
8286 | if (!single_defuse_cycle |
8287 | && !lane_reduc_code_p |
8288 | && reduction_type != FOLD_LEFT_REDUCTION) |
8289 | { |
8290 | stmt_vec_info tem |
8291 | = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); |
8292 | if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem)) |
8293 | { |
8294 | gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem)); |
8295 | tem = REDUC_GROUP_FIRST_ELEMENT (tem); |
8296 | } |
8297 | STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def; |
8298 | STMT_VINFO_DEF_TYPE (tem) = vect_internal_def; |
8299 | } |
8300 | else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) |
8301 | { |
8302 | vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); |
8303 | vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); |
8304 | internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type); |
8305 | |
8306 | if (reduction_type != FOLD_LEFT_REDUCTION |
8307 | && !use_mask_by_cond_expr_p (code: op.code, cond_fn, vectype_in) |
8308 | && (cond_fn == IFN_LAST |
8309 | || !direct_internal_fn_supported_p (cond_fn, vectype_in, |
8310 | OPTIMIZE_FOR_SPEED))) |
8311 | { |
8312 | if (dump_enabled_p ()) |
8313 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8314 | "can't operate on partial vectors because" |
8315 | " no conditional operation is available.\n" ); |
8316 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
8317 | } |
8318 | else if (reduction_type == FOLD_LEFT_REDUCTION |
8319 | && reduc_fn == IFN_LAST |
8320 | && !expand_vec_cond_expr_p (vectype_in, |
8321 | truth_type_for (vectype_in), |
8322 | SSA_NAME)) |
8323 | { |
8324 | if (dump_enabled_p ()) |
8325 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8326 | "can't operate on partial vectors because" |
8327 | " no conditional operation is available.\n" ); |
8328 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
8329 | } |
8330 | else if (reduction_type == FOLD_LEFT_REDUCTION |
8331 | && internal_fn_mask_index (reduc_fn) == -1 |
8332 | && FLOAT_TYPE_P (vectype_in) |
8333 | && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in)) |
8334 | { |
8335 | if (dump_enabled_p ()) |
8336 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8337 | "can't operate on partial vectors because" |
8338 | " signed zeros cannot be preserved.\n" ); |
8339 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
8340 | } |
8341 | else |
8342 | { |
8343 | internal_fn mask_reduc_fn |
8344 | = get_masked_reduction_fn (reduc_fn, vectype_in); |
8345 | |
8346 | if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS) |
8347 | vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, |
8348 | vectype_in, 1); |
8349 | else |
8350 | vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, |
8351 | vectype_in, NULL); |
8352 | } |
8353 | } |
8354 | return true; |
8355 | } |
8356 | |
8357 | /* STMT_INFO is a dot-product reduction whose multiplication operands |
8358 | have different signs. Emit a sequence to emulate the operation |
8359 | using a series of signed DOT_PROD_EXPRs and return the last |
8360 | statement generated. VEC_DEST is the result of the vector operation |
8361 | and VOP lists its inputs. */ |
8362 | |
8363 | static gassign * |
8364 | vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, |
8365 | gimple_stmt_iterator *gsi, tree vec_dest, |
8366 | tree vop[3]) |
8367 | { |
8368 | tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest)); |
8369 | tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0])); |
8370 | tree narrow_elttype = TREE_TYPE (narrow_vectype); |
8371 | gimple *new_stmt; |
8372 | |
8373 | /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */ |
8374 | if (!TYPE_UNSIGNED (TREE_TYPE (vop[0]))) |
8375 | std::swap (a&: vop[0], b&: vop[1]); |
8376 | |
8377 | /* Convert all inputs to signed types. */ |
8378 | for (int i = 0; i < 3; ++i) |
8379 | if (TYPE_UNSIGNED (TREE_TYPE (vop[i]))) |
8380 | { |
8381 | tree tmp = make_ssa_name (var: signed_type_for (TREE_TYPE (vop[i]))); |
8382 | new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]); |
8383 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8384 | vop[i] = tmp; |
8385 | } |
8386 | |
8387 | /* In the comments below we assume 8-bit inputs for simplicity, |
8388 | but the approach works for any full integer type. */ |
8389 | |
8390 | /* Create a vector of -128. */ |
8391 | tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype); |
8392 | tree min_narrow = build_vector_from_val (narrow_vectype, |
8393 | min_narrow_elttype); |
8394 | |
8395 | /* Create a vector of 64. */ |
8396 | auto half_wi = wi::lrshift (x: wi::to_wide (t: min_narrow_elttype), y: 1); |
8397 | tree half_narrow = wide_int_to_tree (type: narrow_elttype, cst: half_wi); |
8398 | half_narrow = build_vector_from_val (narrow_vectype, half_narrow); |
8399 | |
8400 | /* Emit: SUB_RES = VOP[0] - 128. */ |
8401 | tree sub_res = make_ssa_name (var: narrow_vectype); |
8402 | new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow); |
8403 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8404 | |
8405 | /* Emit: |
8406 | |
8407 | STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>; |
8408 | STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>; |
8409 | STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>; |
8410 | |
8411 | on the basis that x * y == (x - 128) * y + 64 * y + 64 * y |
8412 | Doing the two 64 * y steps first allows more time to compute x. */ |
8413 | tree stage1 = make_ssa_name (var: wide_vectype); |
8414 | new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR, |
8415 | vop[1], half_narrow, vop[2]); |
8416 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8417 | |
8418 | tree stage2 = make_ssa_name (var: wide_vectype); |
8419 | new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR, |
8420 | vop[1], half_narrow, stage1); |
8421 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8422 | |
8423 | tree stage3 = make_ssa_name (var: wide_vectype); |
8424 | new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR, |
8425 | sub_res, vop[1], stage2); |
8426 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8427 | |
8428 | /* Convert STAGE3 to the reduction type. */ |
8429 | return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3); |
8430 | } |
8431 | |
8432 | /* Transform the definition stmt STMT_INFO of a reduction PHI backedge |
8433 | value. */ |
8434 | |
bool
vect_transform_reduction (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
			  gimple **vec_stmt, slp_tree slp_node)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int ncopies;
  int vec_num;

  /* Fetch the reduction descriptor recorded during analysis.  */
  stmt_vec_info reduc_info = info_for_reduction (vinfo: loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  /* For a reduction nested in an outer loop (a double reduction),
     operate relative to the inner loop.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
    }

  /* Decompose the reduction statement into its code and operands.  */
  gimple_match_op op;
  if (!gimple_extract_op (stmt_info->stmt, &op))
    gcc_unreachable ();

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable. In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable. */
  stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
  gphi *reduc_def_phi = as_a <gphi *> (p: phi_info->stmt);
  int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);

  /* With SLP the copies are expressed as multiple SLP vector stmts;
     without SLP we unroll the single stmt NCOPIES times.  */
  if (slp_node)
    {
      ncopies = 1;
      vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
    }
  else
    {
      ncopies = vect_get_num_copies (loop_vinfo, vectype: vectype_in);
      vec_num = 1;
    }

  code_helper code = canonicalize_code (op.code, op.type);
  /* The conditional variant of CODE, if any, used to apply loop masks.  */
  internal_fn cond_fn = get_conditional_internal_fn (code, op.type);

  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
  /* Whether masking is done by wrapping the op in a VEC_COND_EXPR
     rather than by calling a conditional internal function.  */
  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);

  /* Transform. */
  tree new_temp = NULL_TREE;
  auto_vec<tree> vec_oprnds0;
  auto_vec<tree> vec_oprnds1;
  auto_vec<tree> vec_oprnds2;
  tree def0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n" );

  /* FORNOW: Multiple types are not supported for condition. */
  if (code == COND_EXPR)
    gcc_assert (ncopies == 1);

  /* A binary COND_OP reduction must have the same definition and else
     value. */
  bool cond_fn_p = code.is_internal_fn ()
    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
  if (cond_fn_p)
    {
      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
		  || code == IFN_COND_MUL || code == IFN_COND_AND
		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
      gcc_assert (op.num_ops == 4
		  && (op.ops[reduc_index]
		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
    }

  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  /* In-order (fold-left) reductions are handled by their own routine,
     which preserves the scalar evaluation order.  */
  if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
      gcc_assert (code.is_tree_code () || cond_fn_p);
      return vectorize_fold_left_reduction
	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_stmt: reduc_def_phi,
	   code, reduc_fn, ops: op.ops, num_ops: op.num_ops, vectype_in,
	   reduc_index, masks, lens);
    }

  /* Only single-def-use-cycle reductions and the lane-reducing codes
     (which embed the reduction in a pattern stmt) reach this point.  */
  bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
  gcc_assert (single_defuse_cycle
	      || code == DOT_PROD_EXPR
	      || code == WIDEN_SUM_EXPR
	      || code == SAD_EXPR);

  /* Create the destination vector */
  tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  /* Get NCOPIES vector definitions for all operands except the reduction
     definition. */
  if (!cond_fn_p)
    {
      /* For a single def-use cycle the reduction operand's defs are
	 supplied separately below, so pass NULL_TREE for it here.  */
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
			 single_defuse_cycle && reduc_index == 0
			 ? NULL_TREE : op.ops[0], &vec_oprnds0,
			 single_defuse_cycle && reduc_index == 1
			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
			 op.num_ops == 3
			 && !(single_defuse_cycle && reduc_index == 2)
			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
    }
  else
    {
      /* For a conditional operation pass the truth type as mask
	 vectype. */
      gcc_assert (single_defuse_cycle
		  && (reduc_index == 1 || reduc_index == 2));
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
			 reduc_index == 1 ? NULL_TREE : op.ops[1],
			 NULL_TREE, &vec_oprnds1,
			 reduc_index == 2 ? NULL_TREE : op.ops[2],
			 NULL_TREE, &vec_oprnds2);
    }

  /* For single def-use cycles get one copy of the vectorized reduction
     definition. */
  if (single_defuse_cycle)
    {
      gcc_assert (!slp_node);
      vect_get_vec_defs_for_operand (vinfo: loop_vinfo, stmt_info, 1,
				     op: op.ops[reduc_index],
				     reduc_index == 0 ? &vec_oprnds0
				     : (reduc_index == 1 ? &vec_oprnds1
					: &vec_oprnds2));
    }

  /* A mixed-sign dot-product without direct target support is emulated
     via a sequence of signed dot-products.  */
  bool emulated_mixed_dot_prod
    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
  /* Emit one vector stmt per copy.  */
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
      if (masked_loop_p && !mask_by_cond_expr)
	{
	  /* No conditional ifns have been defined for dot-product yet. */
	  gcc_assert (code != DOT_PROD_EXPR);

	  /* Make sure that the reduction accumulator is vop[0]. */
	  if (reduc_index == 1)
	    {
	      gcc_assert (commutative_binary_op_p (code, op.type));
	      std::swap (a&: vop[0], b&: vop[1]);
	    }
	  /* Call COND_FN (MASK, ACC, VOP1, ACC): inactive lanes keep
	     the accumulator value unchanged.  */
	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					  vec_num * ncopies, vectype_in, i);
	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
						    vop[0], vop[1], vop[0]);
	  new_temp = make_ssa_name (var: vec_dest, stmt: call);
	  gimple_call_set_lhs (gs: call, lhs: new_temp);
	  gimple_call_set_nothrow (s: call, nothrow_p: true);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
	  new_stmt = call;
	}
      else
	{
	  if (op.num_ops >= 3)
	    vop[2] = vec_oprnds2[i];

	  /* Apply the loop mask by rewriting the masked operand with a
	     VEC_COND_EXPR selecting between it and a neutral value.  */
	  if (masked_loop_p && mask_by_cond_expr)
	    {
	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					      vec_num * ncopies, vectype_in, i);
	      build_vect_cond_expr (code, vop, mask, gsi);
	    }

	  if (emulated_mixed_dot_prod)
	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
						    vec_dest, vop);

	  else if (code.is_internal_fn () && !cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2]);
	  else if (code.is_internal_fn () && cond_fn_p)
	    /* A conditional reduction op takes VOP[1] as the else
	       value as well (asserted equal to the reduction def
	       above).  */
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2],
						   vop[1]);
	  else
	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
					    vop[0], vop[1], vop[2]);
	  new_temp = make_ssa_name (var: vec_dest, stmt: new_stmt);
	  gimple_set_lhs (new_stmt, new_temp);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	}

      /* Record the result; for a single def-use cycle feed this copy's
	 result in as the reduction operand of the next copy.  */
      if (slp_node)
	slp_node->push_vec_def (def: new_stmt);
      else if (single_defuse_cycle
	       && i < ncopies - 1)
	{
	  if (reduc_index == 0)
	    vec_oprnds0.safe_push (obj: gimple_get_lhs (new_stmt));
	  else if (reduc_index == 1)
	    vec_oprnds1.safe_push (obj: gimple_get_lhs (new_stmt));
	  else if (reduc_index == 2)
	    vec_oprnds2.safe_push (obj: gimple_get_lhs (new_stmt));
	}
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_stmt);
    }

  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];

  return true;
}
8657 | |
/* Transform phase of a cycle PHI: emit the vector PHI node(s) for the
   reduction or nested cycle described by STMT_INFO and set their
   loop-entry arguments.  The loop-latch arguments are filled in later
   during epilogue processing.  In the non-SLP case the first generated
   PHI is returned via *VEC_STMT.  Returns true on success.  */

bool
vect_transform_cycle_phi (loop_vec_info loop_vinfo,
                          stmt_vec_info stmt_info, gimple **vec_stmt,
                          slp_tree slp_node, slp_instance slp_node_instance)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int ncopies;
  int j;
  bool nested_cycle = false;
  int vec_num;

  /* For a PHI in the inner loop of an outer-loop vectorization the
     relevant loop is the inner one.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      nested_cycle = true;
    }

  stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  reduc_stmt_info = vect_stmt_to_vectorize (stmt_info: reduc_stmt_info);
  stmt_vec_info reduc_info = info_for_reduction (vinfo: loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  /* These reduction schemes keep the scalar PHI; nothing to emit here.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
      || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
    /* Leave the scalar phi in place.  */
    return true;

  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
  /* For a nested cycle we do not fill the above.  */
  if (!vectype_in)
    vectype_in = STMT_VINFO_VECTYPE (stmt_info);
  gcc_assert (vectype_in);

  if (slp_node)
    {
      /* The size vect_schedule_slp_instance computes is off for us.  */
      vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                                      * SLP_TREE_LANES (slp_node), vectype: vectype_in);
      ncopies = 1;
    }
  else
    {
      vec_num = 1;
      ncopies = vect_get_num_copies (loop_vinfo, vectype: vectype_in);
    }

  /* Check whether we should use a single PHI node and accumulate
     vectors to one before the backedge.  */
  if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
    ncopies = 1;

  /* Create the destination vector  */
  gphi *phi = as_a <gphi *> (p: stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (gimple_phi_result (gs: phi),
                                               vectype_out);

  /* Get the loop-entry arguments.  */
  tree vec_initial_def = NULL_TREE;
  auto_vec<tree> vec_initial_defs;
  if (slp_node)
    {
      vec_initial_defs.reserve (nelems: vec_num);
      if (nested_cycle)
        {
          /* Nested cycles take their initial values directly from the
             SLP child feeding the preheader argument.  */
          unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
          vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
                             &vec_initial_defs);
        }
      else
        {
          gcc_assert (slp_node == slp_node_instance->reduc_phis);
          vec<tree> &initial_values = reduc_info->reduc_initial_values;
          vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);

          /* A reduction chain has a single meaningful initial value.  */
          unsigned int num_phis = stmts.length ();
          if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
            num_phis = 1;
          initial_values.reserve (nelems: num_phis);
          for (unsigned int i = 0; i < num_phis; ++i)
            {
              gphi *this_phi = as_a<gphi *> (p: stmts[i]->stmt);
              initial_values.quick_push (obj: vect_phi_initial_value (phi: this_phi));
            }
          if (vec_num == 1)
            vect_find_reusable_accumulator (loop_vinfo, reduc_info);
          /* vect_find_reusable_accumulator may have emptied
             initial_values when an accumulator is reused.  */
          if (!initial_values.is_empty ())
            {
              tree initial_value
                = (num_phis == 1 ? initial_values[0] : NULL_TREE);
              code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
              tree neutral_op
                = neutral_op_for_reduction (TREE_TYPE (vectype_out),
                                            code, initial_value);
              get_initial_defs_for_reduction (loop_vinfo, reduc_info,
                                              vec_oprnds: &vec_initial_defs, number_of_vectors: vec_num,
                                              group_size: stmts.length (), neutral_op);
            }
        }
    }
  else
    {
      /* Get at the scalar def before the loop, that defines the initial
         value of the reduction variable.  */
      tree initial_def = vect_phi_initial_value (phi);
      reduc_info->reduc_initial_values.safe_push (obj: initial_def);
      /* Optimize: if initial_def is for REDUC_MAX smaller than the base
         and we can't use zero for induc_val, use initial_def.  Similarly
         for REDUC_MIN and initial_def larger than the base.  */
      if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
        {
          tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
          if (TREE_CODE (initial_def) == INTEGER_CST
              && !integer_zerop (induc_val)
              && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
                   && tree_int_cst_lt (t1: initial_def, t2: induc_val))
                  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
                      && tree_int_cst_lt (t1: induc_val, t2: initial_def))))
            {
              induc_val = initial_def;
              /* Communicate we used the initial_def to epilogue
                 generation.  */
              STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
            }
          vec_initial_def = build_vector_from_val (vectype_out, induc_val);
        }
      else if (nested_cycle)
        {
          /* Do not use an adjustment def as that case is not supported
             correctly if ncopies is not one.  */
          vect_get_vec_defs_for_operand (vinfo: loop_vinfo, reduc_stmt_info,
                                         ncopies, op: initial_def,
                                         &vec_initial_defs);
        }
      else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
               || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
        /* Fill the initial vector with the initial scalar value.  */
        vec_initial_def
          = get_initial_def_for_reduction (loop_vinfo, reduc_info: reduc_stmt_info,
                                           init_val: initial_def, neutral_op: initial_def);
      else
        {
          if (ncopies == 1)
            vect_find_reusable_accumulator (loop_vinfo, reduc_info);
          if (!reduc_info->reduc_initial_values.is_empty ())
            {
              initial_def = reduc_info->reduc_initial_values[0];
              code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
              tree neutral_op
                = neutral_op_for_reduction (TREE_TYPE (initial_def),
                                            code, initial_value: initial_def);
              gcc_assert (neutral_op);
              /* Try to simplify the vector initialization by applying an
                 adjustment after the reduction has been performed.  */
              if (!reduc_info->reused_accumulator
                  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
                  && !operand_equal_p (neutral_op, initial_def))
                {
                  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
                    = initial_def;
                  initial_def = neutral_op;
                }
              vec_initial_def
                = get_initial_def_for_reduction (loop_vinfo, reduc_info,
                                                 init_val: initial_def, neutral_op);
            }
        }
    }

  /* A single initial value is replicated for all ncopies.  */
  if (vec_initial_def)
    {
      vec_initial_defs.create (nelems: ncopies);
      for (i = 0; i < ncopies; ++i)
        vec_initial_defs.quick_push (obj: vec_initial_def);
    }

  /* When reusing an accumulator from the main loop, massage it into a
     shape compatible with this (epilogue) loop's vector type.  */
  if (auto *accumulator = reduc_info->reused_accumulator)
    {
      tree def = accumulator->reduc_input;
      if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
        {
          unsigned int nreduc;
          bool res = constant_multiple_p (a: TYPE_VECTOR_SUBPARTS
                                               (TREE_TYPE (def)),
                                          b: TYPE_VECTOR_SUBPARTS (node: vectype_out),
                                          multiple: &nreduc);
          gcc_assert (res);
          gimple_seq stmts = NULL;
          /* Reduce the single vector to a smaller one.  */
          if (nreduc != 1)
            {
              /* Perform the reduction in the appropriate type.  */
              tree rvectype = vectype_out;
              if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
                                              TREE_TYPE (TREE_TYPE (def))))
                rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
                                              TYPE_VECTOR_SUBPARTS
                                                (node: vectype_out));
              def = vect_create_partial_epilog (vec_def: def, vectype: rvectype,
                                                STMT_VINFO_REDUC_CODE
                                                  (reduc_info),
                                                seq: &stmts);
            }
          /* The epilogue loop might use a different vector mode, like
             VNx2DI vs. V2DI.  */
          if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
            {
              tree reduc_type = build_vector_type_for_mode
                (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
              def = gimple_convert (seq: &stmts, type: reduc_type, op: def);
            }
          /* Adjust the input so we pick up the partially reduced value
             for the skip edge in vect_create_epilog_for_reduction.  */
          accumulator->reduc_input = def;
          /* And the reduction could be carried out using a different sign.  */
          if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
            def = gimple_convert (seq: &stmts, type: vectype_out, op: def);
          if (loop_vinfo->main_loop_edge)
            {
              /* While we'd like to insert on the edge this will split
                 blocks and disturb bookkeeping, we also will eventually
                 need this on the skip edge.  Rely on sinking to
                 fixup optimal placement and insert in the pred.  */
              gimple_stmt_iterator gsi
                = gsi_last_bb (bb: loop_vinfo->main_loop_edge->src);
              /* Insert before a cond that eventually skips the
                 epilogue.  */
              if (!gsi_end_p (i: gsi) && stmt_ends_bb_p (gsi_stmt (i: gsi)))
                gsi_prev (i: &gsi);
              gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
            }
          else
            gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
                                              stmts);
        }
      /* Blend the accumulator into the initial defs (or select it on
         the main-loop skip edge).  */
      if (loop_vinfo->main_loop_edge)
        vec_initial_defs[0]
          = vect_get_main_loop_result (loop_vinfo, def,
                                       vec_initial_defs[0]);
      else
        vec_initial_defs.safe_push (obj: def);
    }

  /* Generate the reduction PHIs upfront.  */
  for (i = 0; i < vec_num; i++)
    {
      tree vec_init_def = vec_initial_defs[i];
      for (j = 0; j < ncopies; j++)
        {
          /* Create the reduction-phi that defines the reduction
             operand.  */
          gphi *new_phi = create_phi_node (vec_dest, loop->header);

          /* Set the loop-entry arg of the reduction-phi.  */
          if (j != 0 && nested_cycle)
            vec_init_def = vec_initial_defs[j];
          add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
                       UNKNOWN_LOCATION);

          /* The loop-latch arg is set in epilogue processing.  */

          if (slp_node)
            slp_node->push_vec_def (def: new_phi);
          else
            {
              if (j == 0)
                *vec_stmt = new_phi;
              STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_phi);
            }
        }
    }

  return true;
}
8935 | |
/* Vectorizes LC PHIs, i.e. single-argument PHI nodes (loop-closed SSA
   copies).  When VEC_STMT is NULL only applicability is checked and
   the stmt type is recorded; otherwise one vector PHI is emitted per
   vector def of the single argument.  Only vect_internal_def and
   vect_double_reduction_def PHIs are handled.  Returns true if the
   PHI was analyzed/transformed successfully.  */

bool
vectorizable_lc_phi (loop_vec_info loop_vinfo,
                     stmt_vec_info stmt_info, gimple **vec_stmt,
                     slp_tree slp_node)
{
  /* Only applies to single-argument PHIs inside a vectorized loop.  */
  if (!loop_vinfo
      || !is_a <gphi *> (p: stmt_info->stmt)
      || gimple_phi_num_args (gs: stmt_info->stmt) != 1)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      /* Deal with copies from externs or constants that disguise as
	 loop-closed PHI nodes (PR97886).  */
      if (slp_node
	  && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
						SLP_TREE_VECTYPE (slp_node)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "incompatible vector types for invariants\n");
	  return false;
	}
      STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
      return true;
    }

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree scalar_dest = gimple_phi_result (gs: stmt_info->stmt);
  basic_block bb = gimple_bb (g: stmt_info->stmt);
  /* A single-argument PHI lives in a block with a single predecessor.  */
  edge e = single_pred_edge (bb);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  auto_vec<tree> vec_oprnds;
  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
		     !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
		     gimple_phi_arg_def (gs: stmt_info->stmt, index: 0), &vec_oprnds);
  for (unsigned i = 0; i < vec_oprnds.length (); i++)
    {
      /* Create the vectorized LC PHI node.  */
      gphi *new_phi = create_phi_node (vec_dest, bb);
      add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
      if (slp_node)
	slp_node->push_vec_def (def: new_phi);
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_phi);
    }
  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];

  return true;
}
8993 | |
/* Vectorizes PHIs.  Handles SLP-only vectorization of vect_internal_def
   PHI nodes.  During analysis (VEC_STMT == NULL) verifies every SLP
   child has a compatible vector type and records the cost; during
   transform emits the vector PHI nodes, adding arguments for the
   children that are already vectorized.  Arguments for not-yet
   vectorized (backedge) children are filled in later.  Returns true
   on success.  */

bool
vectorizable_phi (vec_info *,
                  stmt_vec_info stmt_info, gimple **vec_stmt,
                  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  if (!is_a <gphi *> (p: stmt_info->stmt) || !slp_node)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
    return false;

  tree vectype = SLP_TREE_VECTYPE (slp_node);

  if (!vec_stmt) /* transformation not required.  */
    {
      slp_tree child;
      unsigned i;
      /* Validate each child: it must exist and agree with our vector
	 type.  */
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
	if (!child)
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "PHI node with unvectorized backedge def\n");
	    return false;
	  }
	else if (!vect_maybe_update_slp_op_vectype (child, vectype))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector types for invariants\n");
	    return false;
	  }
	else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
		 && !useless_type_conversion_p (vectype,
						SLP_TREE_VECTYPE (child)))
	  {
	    /* With bools we can have mask and non-mask precision vectors
	       or different non-mask precisions.  while pattern recog is
	       supposed to guarantee consistency here bugs in it can cause
	       mismatches (PR103489 and PR103800 for example).
	       Deal with them here instead of ICEing later.  */
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector type setup from "
			       "bool pattern detection\n");
	    return false;
	  }

      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      if (gimple_phi_num_args (gs: as_a <gphi *> (p: stmt_info->stmt)) > 1)
	record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
			  vector_stmt, stmt_info, vectype, 0, vect_body);
      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
      return true;
    }

  tree scalar_dest = gimple_phi_result (gs: stmt_info->stmt);
  basic_block bb = gimple_bb (g: stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  auto_vec<gphi *> new_phis;
  for (unsigned i = 0; i < gimple_phi_num_args (gs: stmt_info->stmt); ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];

      /* Skip not yet vectorized defs.  */
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	  && SLP_TREE_VEC_DEFS (child).is_empty ())
	continue;

      auto_vec<tree> vec_oprnds;
      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
      /* Lazily create the vector PHIs on the first vectorized child;
	 one PHI per vector def.  */
      if (!new_phis.exists ())
	{
	  new_phis.create (nelems: vec_oprnds.length ());
	  for (unsigned j = 0; j < vec_oprnds.length (); j++)
	    {
	      /* Create the vectorized PHI node.  */
	      new_phis.quick_push (obj: create_phi_node (vec_dest, bb));
	      slp_node->push_vec_def (def: new_phis[j]);
	    }
	}
      edge e = gimple_phi_arg_edge (phi: as_a <gphi *> (p: stmt_info->stmt), i);
      for (unsigned j = 0; j < vec_oprnds.length (); j++)
	add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
    }
  /* We should have at least one already vectorized child.  */
  gcc_assert (new_phis.exists ());

  return true;
}
9088 | |
9089 | /* Vectorizes first order recurrences. An overview of the transformation |
9090 | is described below. Suppose we have the following loop. |
9091 | |
9092 | int t = 0; |
9093 | for (int i = 0; i < n; ++i) |
9094 | { |
9095 | b[i] = a[i] - t; |
9096 | t = a[i]; |
9097 | } |
9098 | |
9099 | There is a first-order recurrence on 'a'. For this loop, the scalar IR |
9100 | looks (simplified) like: |
9101 | |
9102 | scalar.preheader: |
9103 | init = 0; |
9104 | |
9105 | scalar.body: |
9106 | i = PHI <0(scalar.preheader), i+1(scalar.body)> |
9107 | _2 = PHI <(init(scalar.preheader), <_1(scalar.body)> |
9108 | _1 = a[i] |
9109 | b[i] = _1 - _2 |
9110 | if (i < n) goto scalar.body |
9111 | |
9112 | In this example, _2 is a recurrence because it's value depends on the |
9113 | previous iteration. We vectorize this as (VF = 4) |
9114 | |
9115 | vector.preheader: |
9116 | vect_init = vect_cst(..., ..., ..., 0) |
9117 | |
9118 | vector.body |
9119 | i = PHI <0(vector.preheader), i+4(vector.body)> |
9120 | vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)> |
9121 | vect_2 = a[i, i+1, i+2, i+3]; |
9122 | vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 }) |
9123 | b[i, i+1, i+2, i+3] = vect_2 - vect_3 |
9124 | if (..) goto vector.body |
9125 | |
9126 | In this function, vectorizable_recurr, we code generate both the |
9127 | vector PHI node and the permute since those together compute the |
9128 | vectorized value of the scalar PHI. We do not yet have the |
9129 | backedge value to fill in there nor into the vec_perm. Those |
9130 | are filled in maybe_set_vectorized_backedge_value and |
9131 | vect_schedule_scc. |
9132 | |
9133 | TODO: Since the scalar loop does not have a use of the recurrence |
9134 | outside of the loop the natural way to implement peeling via |
9135 | vectorizing the live value doesn't work. For now peeling of loops |
9136 | with a recurrence is not implemented. For SLP the supported cases |
9137 | are restricted to those requiring a single vector recurrence PHI. */ |
9138 | |
9139 | bool |
9140 | vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, |
9141 | gimple **vec_stmt, slp_tree slp_node, |
9142 | stmt_vector_for_cost *cost_vec) |
9143 | { |
9144 | if (!loop_vinfo || !is_a<gphi *> (p: stmt_info->stmt)) |
9145 | return false; |
9146 | |
9147 | gphi *phi = as_a<gphi *> (p: stmt_info->stmt); |
9148 | |
9149 | /* So far we only support first-order recurrence auto-vectorization. */ |
9150 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence) |
9151 | return false; |
9152 | |
9153 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
9154 | unsigned ncopies; |
9155 | if (slp_node) |
9156 | ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
9157 | else |
9158 | ncopies = vect_get_num_copies (loop_vinfo, vectype); |
9159 | poly_int64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype); |
9160 | unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1; |
9161 | /* We need to be able to make progress with a single vector. */ |
9162 | if (maybe_gt (dist * 2, nunits)) |
9163 | { |
9164 | if (dump_enabled_p ()) |
9165 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9166 | "first order recurrence exceeds half of " |
9167 | "a vector\n" ); |
9168 | return false; |
9169 | } |
9170 | |
9171 | /* First-order recurrence autovectorization needs to handle permutation |
9172 | with indices = [nunits-1, nunits, nunits+1, ...]. */ |
9173 | vec_perm_builder sel (nunits, 1, 3); |
9174 | for (int i = 0; i < 3; ++i) |
9175 | sel.quick_push (obj: nunits - dist + i); |
9176 | vec_perm_indices indices (sel, 2, nunits); |
9177 | |
9178 | if (!vec_stmt) /* transformation not required. */ |
9179 | { |
9180 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype), |
9181 | indices)) |
9182 | return false; |
9183 | |
9184 | if (slp_node) |
9185 | { |
9186 | /* We eventually need to set a vector type on invariant |
9187 | arguments. */ |
9188 | unsigned j; |
9189 | slp_tree child; |
9190 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) |
9191 | if (!vect_maybe_update_slp_op_vectype |
9192 | (child, SLP_TREE_VECTYPE (slp_node))) |
9193 | { |
9194 | if (dump_enabled_p ()) |
9195 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9196 | "incompatible vector types for " |
9197 | "invariants\n" ); |
9198 | return false; |
9199 | } |
9200 | } |
9201 | |
9202 | /* Verify we have set up compatible types. */ |
9203 | edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo)); |
9204 | tree latch_vectype = NULL_TREE; |
9205 | if (slp_node) |
9206 | { |
9207 | slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx]; |
9208 | latch_vectype = SLP_TREE_VECTYPE (latch_def); |
9209 | } |
9210 | else |
9211 | { |
9212 | tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le); |
9213 | if (TREE_CODE (latch_def) == SSA_NAME) |
9214 | { |
9215 | stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def); |
9216 | latch_def_info = vect_stmt_to_vectorize (stmt_info: latch_def_info); |
9217 | latch_vectype = STMT_VINFO_VECTYPE (latch_def_info); |
9218 | } |
9219 | } |
9220 | if (!types_compatible_p (type1: latch_vectype, type2: vectype)) |
9221 | return false; |
9222 | |
9223 | /* The recurrence costs the initialization vector and one permute |
9224 | for each copy. */ |
9225 | unsigned prologue_cost = record_stmt_cost (body_cost_vec: cost_vec, count: 1, kind: scalar_to_vec, |
9226 | stmt_info, misalign: 0, where: vect_prologue); |
9227 | unsigned inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vector_stmt, |
9228 | stmt_info, misalign: 0, where: vect_body); |
9229 | if (dump_enabled_p ()) |
9230 | dump_printf_loc (MSG_NOTE, vect_location, |
9231 | "vectorizable_recurr: inside_cost = %d, " |
9232 | "prologue_cost = %d .\n" , inside_cost, |
9233 | prologue_cost); |
9234 | |
9235 | STMT_VINFO_TYPE (stmt_info) = recurr_info_type; |
9236 | return true; |
9237 | } |
9238 | |
9239 | edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); |
9240 | basic_block bb = gimple_bb (g: phi); |
9241 | tree = PHI_ARG_DEF_FROM_EDGE (phi, pe); |
9242 | if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader))) |
9243 | { |
9244 | gimple_seq stmts = NULL; |
9245 | preheader = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: preheader); |
9246 | gsi_insert_seq_on_edge_immediate (pe, stmts); |
9247 | } |
9248 | tree vec_init = build_vector_from_val (vectype, preheader); |
9249 | vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL); |
9250 | |
9251 | /* Create the vectorized first-order PHI node. */ |
9252 | tree vec_dest = vect_get_new_vect_var (vectype, |
9253 | vect_simple_var, "vec_recur_" ); |
9254 | gphi *new_phi = create_phi_node (vec_dest, bb); |
9255 | add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION); |
9256 | |
9257 | /* Insert shuffles the first-order recurrence autovectorization. |
9258 | result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */ |
9259 | tree perm = vect_gen_perm_mask_checked (vectype, indices); |
9260 | |
9261 | /* Insert the required permute after the latch definition. The |
9262 | second and later operands are tentative and will be updated when we have |
9263 | vectorized the latch definition. */ |
9264 | edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo)); |
9265 | gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le)); |
9266 | gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def); |
9267 | gsi_next (i: &gsi2); |
9268 | |
9269 | for (unsigned i = 0; i < ncopies; ++i) |
9270 | { |
9271 | vec_dest = make_ssa_name (var: vectype); |
9272 | gassign *vperm |
9273 | = gimple_build_assign (vec_dest, VEC_PERM_EXPR, |
9274 | i == 0 ? gimple_phi_result (gs: new_phi) : NULL, |
9275 | NULL, perm); |
9276 | vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2); |
9277 | |
9278 | if (slp_node) |
9279 | slp_node->push_vec_def (def: vperm); |
9280 | else |
9281 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: vperm); |
9282 | } |
9283 | |
9284 | if (!slp_node) |
9285 | *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; |
9286 | return true; |
9287 | } |
9288 | |
9289 | /* Return true if VECTYPE represents a vector that requires lowering |
9290 | by the vector lowering pass. */ |
9291 | |
9292 | bool |
9293 | vect_emulated_vector_p (tree vectype) |
9294 | { |
9295 | return (!VECTOR_MODE_P (TYPE_MODE (vectype)) |
9296 | && (!VECTOR_BOOLEAN_TYPE_P (vectype) |
9297 | || TYPE_PRECISION (TREE_TYPE (vectype)) != 1)); |
9298 | } |
9299 | |
9300 | /* Return true if we can emulate CODE on an integer mode representation |
9301 | of a vector. */ |
9302 | |
9303 | bool |
9304 | vect_can_vectorize_without_simd_p (tree_code code) |
9305 | { |
9306 | switch (code) |
9307 | { |
9308 | case PLUS_EXPR: |
9309 | case MINUS_EXPR: |
9310 | case NEGATE_EXPR: |
9311 | case BIT_AND_EXPR: |
9312 | case BIT_IOR_EXPR: |
9313 | case BIT_XOR_EXPR: |
9314 | case BIT_NOT_EXPR: |
9315 | return true; |
9316 | |
9317 | default: |
9318 | return false; |
9319 | } |
9320 | } |
9321 | |
9322 | /* Likewise, but taking a code_helper. */ |
9323 | |
9324 | bool |
9325 | vect_can_vectorize_without_simd_p (code_helper code) |
9326 | { |
9327 | return (code.is_tree_code () |
9328 | && vect_can_vectorize_without_simd_p (code: tree_code (code))); |
9329 | } |
9330 | |
/* Create vector init for vectorized iv.  Build (appending statements
   to STMTS) the initial vector value for a nonlinear induction with
   scalar initial value INIT_EXPR and step STEP_EXPR.  NUNITS is the
   number of vector elements, VECTYPE the vector type, and
   INDUCTION_TYPE the update operation (shift right/left, negate or
   multiply).  Returns the initial vector value.  */
static tree
vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
			       tree step_expr, poly_uint64 nunits,
			       tree vectype,
			       enum vect_induction_op_type induction_type)
{
  unsigned HOST_WIDE_INT const_nunits;
  tree vec_shift, vec_init, new_name;
  unsigned i;
  tree itype = TREE_TYPE (vectype);

  /* iv_loop is the loop to be vectorized. Create:
     vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
  new_name = gimple_convert (seq: stmts, type: itype, op: init_expr);
  switch (induction_type)
    {
    case vect_step_op_shr:
    case vect_step_op_shl:
      /* Build the Initial value from shift_expr: duplicate X, then
	 shift element I by I*S using a VEC_SERIES [0, S, 2*S, ...]
	 of shift amounts.  */
      vec_init = gimple_build_vector_from_val (seq: stmts,
					       type: vectype,
					       op: new_name);
      vec_shift = gimple_build (seq: stmts, code: VEC_SERIES_EXPR, type: vectype,
				ops: build_zero_cst (itype), ops: step_expr);
      vec_init = gimple_build (seq: stmts,
			       code: (induction_type == vect_step_op_shr
				      ? RSHIFT_EXPR : LSHIFT_EXPR),
			       type: vectype, ops: vec_init, ops: vec_shift);
      break;

    case vect_step_op_neg:
      {
	/* Blend duplicates of X and -X into [X, -X, X, -X, ...].  */
	vec_init = gimple_build_vector_from_val (seq: stmts,
						 type: vectype,
						 op: new_name);
	tree vec_neg = gimple_build (seq: stmts, code: NEGATE_EXPR,
				     type: vectype, ops: vec_init);
	/* The encoding has 2 interleaved stepped patterns.  */
	vec_perm_builder sel (nunits, 2, 3);
	sel.quick_grow (len: 6);
	for (i = 0; i < 3; i++)
	  {
	    /* Even lanes pick element I of vec_init, odd lanes pick
	       element I of vec_neg (indices >= nunits select from the
	       second permute operand).  */
	    sel[2 * i] = i;
	    sel[2 * i + 1] = i + nunits;
	  }
	vec_perm_indices indices (sel, 2, nunits);
	/* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
	   fail when vec_init is const vector. In that situation vec_perm is not
	   really needed.  */
	tree perm_mask_even
	  = vect_gen_perm_mask_any (vectype, indices);
	vec_init = gimple_build (seq: stmts, code: VEC_PERM_EXPR,
				 type: vectype,
				 ops: vec_init, ops: vec_neg,
				 ops: perm_mask_even);
      }
      break;

    case vect_step_op_mul:
      {
	/* Use unsigned mult to avoid UD integer overflow.  */
	gcc_assert (nunits.is_constant (&const_nunits));
	tree utype = unsigned_type_for (itype);
	tree uvectype = build_vector_type (utype,
					   TYPE_VECTOR_SUBPARTS (node: vectype));
	new_name = gimple_convert (seq: stmts, type: utype, op: new_name);
	vec_init = gimple_build_vector_from_val (seq: stmts,
						 type: uvectype,
						 op: new_name);
	/* Build the multiplier vector [1, S, S^2, ..., S^(nunits-1)].  */
	tree_vector_builder elts (uvectype, const_nunits, 1);
	tree elt_step = build_one_cst (utype);

	elts.quick_push (obj: elt_step);
	for (i = 1; i < const_nunits; i++)
	  {
	    /* Create: new_name_i = new_name + step_expr.  */
	    elt_step = gimple_build (seq: stmts, code: MULT_EXPR,
				     type: utype, ops: elt_step, ops: step_expr);
	    elts.quick_push (obj: elt_step);
	  }
	/* Create a vector from [new_name_0, new_name_1, ...,
	   new_name_nunits-1].  */
	tree vec_mul = gimple_build_vector (seq: stmts, builder: &elts);
	vec_init = gimple_build (seq: stmts, code: MULT_EXPR, type: uvectype,
				 ops: vec_init, ops: vec_mul);
	/* Convert back to the (possibly signed) element type.  */
	vec_init = gimple_convert (seq: stmts, type: vectype, op: vec_init);
      }
      break;

    default:
      gcc_unreachable ();
    }

  return vec_init;
}
9427 | |
9428 | /* Peel init_expr by skip_niter for induction_type. */ |
9429 | tree |
9430 | vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, |
9431 | tree skip_niters, tree step_expr, |
9432 | enum vect_induction_op_type induction_type) |
9433 | { |
9434 | gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST); |
9435 | tree type = TREE_TYPE (init_expr); |
9436 | unsigned prec = TYPE_PRECISION (type); |
9437 | switch (induction_type) |
9438 | { |
9439 | case vect_step_op_neg: |
9440 | if (TREE_INT_CST_LOW (skip_niters) % 2) |
9441 | init_expr = gimple_build (seq: stmts, code: NEGATE_EXPR, type, ops: init_expr); |
9442 | /* else no change. */ |
9443 | break; |
9444 | |
9445 | case vect_step_op_shr: |
9446 | case vect_step_op_shl: |
9447 | skip_niters = gimple_convert (seq: stmts, type, op: skip_niters); |
9448 | step_expr = gimple_build (seq: stmts, code: MULT_EXPR, type, ops: step_expr, ops: skip_niters); |
9449 | /* When shift mount >= precision, need to avoid UD. |
9450 | In the original loop, there's no UD, and according to semantic, |
9451 | init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */ |
9452 | if (!tree_fits_uhwi_p (step_expr) |
9453 | || tree_to_uhwi (step_expr) >= prec) |
9454 | { |
9455 | if (induction_type == vect_step_op_shl |
9456 | || TYPE_UNSIGNED (type)) |
9457 | init_expr = build_zero_cst (type); |
9458 | else |
9459 | init_expr = gimple_build (seq: stmts, code: RSHIFT_EXPR, type, |
9460 | ops: init_expr, |
9461 | ops: wide_int_to_tree (type, cst: prec - 1)); |
9462 | } |
9463 | else |
9464 | init_expr = gimple_build (seq: stmts, code: (induction_type == vect_step_op_shr |
9465 | ? RSHIFT_EXPR : LSHIFT_EXPR), |
9466 | type, ops: init_expr, ops: step_expr); |
9467 | break; |
9468 | |
9469 | case vect_step_op_mul: |
9470 | { |
9471 | tree utype = unsigned_type_for (type); |
9472 | init_expr = gimple_convert (seq: stmts, type: utype, op: init_expr); |
9473 | wide_int skipn = wi::to_wide (t: skip_niters); |
9474 | wide_int begin = wi::to_wide (t: step_expr); |
9475 | auto_mpz base, exp, mod, res; |
9476 | wi::to_mpz (begin, base, TYPE_SIGN (type)); |
9477 | wi::to_mpz (skipn, exp, UNSIGNED); |
9478 | mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type)); |
9479 | mpz_powm (res, base, exp, mod); |
9480 | begin = wi::from_mpz (utype, res, true); |
9481 | tree mult_expr = wide_int_to_tree (type: utype, cst: begin); |
9482 | init_expr = gimple_build (seq: stmts, code: MULT_EXPR, type: utype, |
9483 | ops: init_expr, ops: mult_expr); |
9484 | init_expr = gimple_convert (seq: stmts, type, op: init_expr); |
9485 | } |
9486 | break; |
9487 | |
9488 | default: |
9489 | gcc_unreachable (); |
9490 | } |
9491 | |
9492 | return init_expr; |
9493 | } |
9494 | |
/* Create the scalar step for one vector iteration of a nonlinear iv:
   pow (STEP_EXPR, VF) for mult induction, VF * STEP_EXPR for the shift
   inductions, and NULL for neg induction (which needs no step at all).
   Any statements needed to compute the step are appended to STMTS.  */
static tree
vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
			       poly_uint64 vf,
			       enum vect_induction_op_type induction_type)
{
  tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
  tree new_name = NULL;
  /* Step should be pow (step, vf) for mult induction. */
  if (induction_type == vect_step_op_mul)
    {
      /* Computing pow () below requires a compile-time constant VF.  */
      gcc_assert (vf.is_constant ());
      wide_int begin = wi::to_wide (t: step_expr);

      /* step^vf by repeated wide-int multiplication (wraps modulo the
	 type precision, matching the scalar loop's behavior).  */
      for (unsigned i = 0; i != vf.to_constant () - 1; i++)
	begin = wi::mul (x: begin, y: wi::to_wide (t: step_expr));

      new_name = wide_int_to_tree (TREE_TYPE (step_expr), cst: begin);
    }
  else if (induction_type == vect_step_op_neg)
    /* Do nothing. */
    ;
  else
    /* shr/shl: the shift amount accumulated over VF iterations
       is VF * step.  */
    new_name = gimple_build (seq: stmts, code: MULT_EXPR, TREE_TYPE (step_expr),
			     ops: expr, ops: step_expr);
  return new_name;
}
9522 | |
/* Broadcast the scalar step NEW_NAME into a vector of type VECTYPE and
   materialize it (in the loop preheader, via vect_init_vector).  Returns
   the vector step, or NULL for neg induction which needs no step.  */
static tree
vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
				   stmt_vec_info stmt_info,
				   tree new_name, tree vectype,
				   enum vect_induction_op_type induction_type)
{
  /* No step is needed for neg induction. */
  if (induction_type == vect_step_op_neg)
    return NULL;

  tree t = unshare_expr (new_name);
  /* The step must be a constant or an SSA name so it can be splat
     across all lanes.  */
  gcc_assert (CONSTANT_CLASS_P (new_name)
	      || TREE_CODE (new_name) == SSA_NAME);
  tree new_vec = build_vector_from_val (vectype, t);
  tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
				    new_vec, vectype, NULL);
  return vec_step;
}
9541 | |
/* Emit the in-loop update of a vectorized nonlinear iv: combine the
   current vector value INDUC_DEF with VEC_STEP according to
   INDUCTION_TYPE (multiply, shift right, shift left, or no-op for neg).
   The generated statements are appended to STMTS; the updated vector
   def is returned.  */
static tree
vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
			  tree induc_def, tree vec_step,
			  enum vect_induction_op_type induction_type)
{
  tree vec_def = induc_def;
  switch (induction_type)
    {
    case vect_step_op_mul:
      {
	/* Use unsigned mult to avoid UD integer overflow. */
	tree uvectype
	  = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
			       TYPE_VECTOR_SUBPARTS (node: vectype));
	vec_def = gimple_convert (seq: stmts, type: uvectype, op: vec_def);
	vec_step = gimple_convert (seq: stmts, type: uvectype, op: vec_step);
	vec_def = gimple_build (seq: stmts, code: MULT_EXPR, type: uvectype,
				ops: vec_def, ops: vec_step);
	/* Convert back to the (possibly signed) element type.  */
	vec_def = gimple_convert (seq: stmts, type: vectype, op: vec_def);
      }
      break;

    case vect_step_op_shr:
      vec_def = gimple_build (seq: stmts, code: RSHIFT_EXPR, type: vectype,
			      ops: vec_def, ops: vec_step);
      break;

    case vect_step_op_shl:
      vec_def = gimple_build (seq: stmts, code: LSHIFT_EXPR, type: vectype,
			      ops: vec_def, ops: vec_step);
      break;
    case vect_step_op_neg:
      /* Neg induction alternates between init and -init; the vector
	 value is invariant across iterations, so no update is needed.  */
      vec_def = induc_def;
      /* Do nothing. */
      break;
    default:
      gcc_unreachable ();
    }

  return vec_def;

}
9585 | |
/* Function vectorizable_nonlinear_induction

   Check if STMT_INFO performs a nonlinear induction computation that can be
   vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
   a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
   basic block.
   Return true if STMT_INFO is vectorizable in this way.  */

static bool
vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
				  stmt_vec_info stmt_info,
				  gimple **vec_stmt, slp_tree slp_node,
				  stmt_vector_for_cost *cost_vec)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  bool nested_in_vect_loop = false;
  class loop *iv_loop;
  tree vec_def;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vec_init, vec_step;
  tree new_name;
  gimple *new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_dest;
  tree init_expr, step_expr;
  tree niters_skip;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned i;
  gimple_stmt_iterator si;

  gphi *phi = dyn_cast <gphi *> (p: stmt_info->stmt);

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype);
  enum vect_induction_op_type induction_type
    = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);

  /* The caller (vectorizable_induction) only dispatches here for
     non-add evolutions.  */
  gcc_assert (induction_type > vect_step_op_add);

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  gcc_assert (ncopies >= 1);

  /* FORNOW. Only handle nonlinear induction in the same loop. */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "nonlinear induction in nested loop.\n" );
      return false;
    }

  iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);

  /* TODO: Support slp for nonlinear iv. There should be separate vector iv
     update for each iv and a permutation to generate wanted vector iv. */
  if (slp_node)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "SLP induction not supported for nonlinear"
			 " induction.\n" );
      return false;
    }

  if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "floating point nonlinear induction vectorization"
			 " not supported.\n" );
      return false;
    }

  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
  init_expr = vect_phi_initial_value (phi);
  gcc_assert (step_expr != NULL_TREE && init_expr != NULL
	      && TREE_CODE (step_expr) == INTEGER_CST);
  /* step_expr should be aligned with init_expr,
     i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
  step_expr = fold_convert (TREE_TYPE (vectype), step_expr);

  if (TREE_CODE (init_expr) == INTEGER_CST)
    init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
  else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
    {
      /* INIT_EXPR could be a bit_field, bail out for such case. */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "nonlinear induction vectorization failed:"
			 " component type of vectype is not a nop conversion"
			 " from type of init_expr.\n" );
      return false;
    }

  /* Per-evolution feasibility checks (target support, constant bounds).  */
  switch (induction_type)
    {
    case vect_step_op_neg:
      if (maybe_eq (a: TYPE_VECTOR_SUBPARTS (node: vectype), b: 1u))
	return false;
      if (TREE_CODE (init_expr) != INTEGER_CST
	  && TREE_CODE (init_expr) != REAL_CST)
	{
	  /* Check for backend support of NEGATE_EXPR and vec_perm. */
	  if (!directly_supported_p (NEGATE_EXPR, vectype))
	    return false;

	  /* The encoding has 2 interleaved stepped patterns. */
	  vec_perm_builder sel (nunits, 2, 3);
	  machine_mode mode = TYPE_MODE (vectype);
	  sel.quick_grow (len: 6);
	  for (i = 0; i < 3; i++)
	    {
	      sel[i * 2] = i;
	      sel[i * 2 + 1] = i + nunits;
	    }
	  vec_perm_indices indices (sel, 2, nunits);
	  if (!can_vec_perm_const_p (mode, mode, indices))
	    return false;
	}
      break;

    case vect_step_op_mul:
      {
	/* Check for backend support of MULT_EXPR. */
	if (!directly_supported_p (MULT_EXPR, vectype))
	  return false;

	/* ?? How to construct vector step for variable number vector.
	   [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
	if (!vf.is_constant ())
	  return false;
      }
      break;

    case vect_step_op_shr:
      /* Check for backend support of RSHIFT_EXPR. */
      if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
	return false;

      /* Don't shift more than type precision to avoid UD. */
      if (!tree_fits_uhwi_p (step_expr)
	  || maybe_ge (nunits * tree_to_uhwi (step_expr),
		       TYPE_PRECISION (TREE_TYPE (init_expr))))
	return false;
      break;

    case vect_step_op_shl:
      /* Check for backend support of LSHIFT_EXPR. */
      if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
	return false;

      /* Don't shift more than type precision to avoid UD. */
      if (!tree_fits_uhwi_p (step_expr)
	  || maybe_ge (nunits * tree_to_uhwi (step_expr),
		       TYPE_PRECISION (TREE_TYPE (init_expr))))
	return false;

      break;

    default:
      gcc_unreachable ();
    }

  if (!vec_stmt) /* transformation not required. */
    {
      unsigned inside_cost = 0, prologue_cost = 0;
      /* loop cost for vec_loop.  */
      inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vector_stmt,
				      stmt_info, misalign: 0, where: vect_body);

      /* Neg induction doesn't have any inside_cost: the negated vector
	 is invariant, so no in-loop update is emitted.  */
      if (induction_type == vect_step_op_neg)
	inside_cost = 0;

      /* prologue cost for vec_init and vec_step. */
      prologue_cost = record_stmt_cost (body_cost_vec: cost_vec, count: 2, kind: scalar_to_vec,
					stmt_info, misalign: 0, where: vect_prologue);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_model_induction_cost: inside_cost = %d, "
			 "prologue_cost = %d. \n" , inside_cost,
			 prologue_cost);

      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction" );
      return true;
    }

  /* Transform. */

  /* Compute a vector variable, initialized with the first VF values of
     the induction variable. E.g., for an iv with IV_PHI='X' and
     evolution S, for a vector of 4 units, we want to compute:
     [X, X + S, X + 2*S, X + 3*S]. */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n" );

  pe = loop_preheader_edge (iv_loop);
  /* Find the first insertion point in the BB. */
  basic_block bb = gimple_bb (g: phi);
  si = gsi_after_labels (bb);

  gimple_seq stmts = NULL;

  niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  /* If we are using the loop mask to "peel" for alignment then we need
     to adjust the start value here. */
  if (niters_skip != NULL_TREE)
    init_expr = vect_peel_nonlinear_iv_init (stmts: &stmts, init_expr, skip_niters: niters_skip,
					     step_expr, induction_type);

  vec_init = vect_create_nonlinear_iv_init (stmts: &stmts, init_expr,
					    step_expr, nunits, vectype,
					    induction_type);
  if (stmts)
    {
      /* Init computations belong on the preheader edge, before the loop.  */
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  stmts = NULL;
  new_name = vect_create_nonlinear_iv_step (stmts: &stmts, step_expr,
					    vf, induction_type);
  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
						new_name, vectype,
						induction_type);
  /* Create the following def-use cycle:
     loop prolog:
	vec_init = ...
	vec_step = ...
     loop:
	vec_iv = PHI <vec_init, vec_loop>
	...
	STMT
	...
	vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand. */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_" );
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop. */
  stmts = NULL;
  vec_def = vect_update_nonlinear_iv (stmts: &stmts, vectype,
				      induc_def, vec_step,
				      induction_type);

  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
  new_stmt = SSA_NAME_DEF_STMT (vec_def);

  /* Set the arguments of the phi node: */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
	       UNKNOWN_LOCATION);

  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: induction_phi);
  *vec_stmt = induction_phi;

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits. For more details see documentation
     in vectorizable_operation. */

  if (ncopies > 1)
    {
      stmts = NULL;
      /* FORNOW. This restriction should be relaxed. */
      gcc_assert (!nested_in_vect_loop);

      /* Each extra copy advances by nunits (not VF) iterations, so build
	 a step for nunits here.  */
      new_name = vect_create_nonlinear_iv_step (stmts: &stmts, step_expr,
						vf: nunits, induction_type);

      vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
						    new_name, vectype,
						    induction_type);
      vec_def = induc_def;
      for (i = 1; i < ncopies; i++)
	{
	  /* vec_i = vec_prev + vec_step. */
	  stmts = NULL;
	  vec_def = vect_update_nonlinear_iv (stmts: &stmts, vectype,
					      induc_def: vec_def, vec_step,
					      induction_type);
	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_stmt);
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "transform induction: created def-use cycle: %G%G" ,
		     (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));

  return true;
}
9900 | |
9901 | /* Function vectorizable_induction |
9902 | |
9903 | Check if STMT_INFO performs an induction computation that can be vectorized. |
9904 | If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized |
9905 | phi to replace it, put it in VEC_STMT, and add it to the same basic block. |
9906 | Return true if STMT_INFO is vectorizable in this way. */ |
9907 | |
9908 | bool |
9909 | vectorizable_induction (loop_vec_info loop_vinfo, |
9910 | stmt_vec_info stmt_info, |
9911 | gimple **vec_stmt, slp_tree slp_node, |
9912 | stmt_vector_for_cost *cost_vec) |
9913 | { |
9914 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
9915 | unsigned ncopies; |
9916 | bool nested_in_vect_loop = false; |
9917 | class loop *iv_loop; |
9918 | tree vec_def; |
9919 | edge pe = loop_preheader_edge (loop); |
9920 | basic_block new_bb; |
9921 | tree new_vec, vec_init, vec_step, t; |
9922 | tree new_name; |
9923 | gimple *new_stmt; |
9924 | gphi *induction_phi; |
9925 | tree induc_def, vec_dest; |
9926 | tree init_expr, step_expr; |
9927 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
9928 | unsigned i; |
9929 | tree expr; |
9930 | gimple_stmt_iterator si; |
9931 | enum vect_induction_op_type induction_type |
9932 | = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); |
9933 | |
9934 | gphi *phi = dyn_cast <gphi *> (p: stmt_info->stmt); |
9935 | if (!phi) |
9936 | return false; |
9937 | |
9938 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
9939 | return false; |
9940 | |
9941 | /* Make sure it was recognized as induction computation. */ |
9942 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) |
9943 | return false; |
9944 | |
9945 | /* Handle nonlinear induction in a separate place. */ |
9946 | if (induction_type != vect_step_op_add) |
9947 | return vectorizable_nonlinear_induction (loop_vinfo, stmt_info, |
9948 | vec_stmt, slp_node, cost_vec); |
9949 | |
9950 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
9951 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype); |
9952 | |
9953 | if (slp_node) |
9954 | ncopies = 1; |
9955 | else |
9956 | ncopies = vect_get_num_copies (loop_vinfo, vectype); |
9957 | gcc_assert (ncopies >= 1); |
9958 | |
9959 | /* FORNOW. These restrictions should be relaxed. */ |
9960 | if (nested_in_vect_loop_p (loop, stmt_info)) |
9961 | { |
9962 | imm_use_iterator imm_iter; |
9963 | use_operand_p use_p; |
9964 | gimple *exit_phi; |
9965 | edge latch_e; |
9966 | tree loop_arg; |
9967 | |
9968 | if (ncopies > 1) |
9969 | { |
9970 | if (dump_enabled_p ()) |
9971 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9972 | "multiple types in nested loop.\n" ); |
9973 | return false; |
9974 | } |
9975 | |
9976 | exit_phi = NULL; |
9977 | latch_e = loop_latch_edge (loop->inner); |
9978 | loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); |
9979 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) |
9980 | { |
9981 | gimple *use_stmt = USE_STMT (use_p); |
9982 | if (is_gimple_debug (gs: use_stmt)) |
9983 | continue; |
9984 | |
9985 | if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (g: use_stmt))) |
9986 | { |
9987 | exit_phi = use_stmt; |
9988 | break; |
9989 | } |
9990 | } |
9991 | if (exit_phi) |
9992 | { |
9993 | stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi); |
9994 | if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo) |
9995 | && !STMT_VINFO_LIVE_P (exit_phi_vinfo))) |
9996 | { |
9997 | if (dump_enabled_p ()) |
9998 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9999 | "inner-loop induction only used outside " |
10000 | "of the outer vectorized loop.\n" ); |
10001 | return false; |
10002 | } |
10003 | } |
10004 | |
10005 | nested_in_vect_loop = true; |
10006 | iv_loop = loop->inner; |
10007 | } |
10008 | else |
10009 | iv_loop = loop; |
10010 | gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); |
10011 | |
10012 | if (slp_node && !nunits.is_constant ()) |
10013 | { |
10014 | /* The current SLP code creates the step value element-by-element. */ |
10015 | if (dump_enabled_p ()) |
10016 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
10017 | "SLP induction not supported for variable-length" |
10018 | " vectors.\n" ); |
10019 | return false; |
10020 | } |
10021 | |
10022 | if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float) |
10023 | { |
10024 | if (dump_enabled_p ()) |
10025 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
10026 | "floating point induction vectorization disabled\n" ); |
10027 | return false; |
10028 | } |
10029 | |
10030 | step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); |
10031 | gcc_assert (step_expr != NULL_TREE); |
10032 | if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)) |
10033 | && !type_has_mode_precision_p (TREE_TYPE (step_expr))) |
10034 | { |
10035 | if (dump_enabled_p ()) |
10036 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
10037 | "bit-precision induction vectorization not " |
10038 | "supported.\n" ); |
10039 | return false; |
10040 | } |
10041 | tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype); |
10042 | |
10043 | /* Check for backend support of PLUS/MINUS_EXPR. */ |
10044 | if (!directly_supported_p (PLUS_EXPR, step_vectype) |
10045 | || !directly_supported_p (MINUS_EXPR, step_vectype)) |
10046 | return false; |
10047 | |
10048 | if (!vec_stmt) /* transformation not required. */ |
10049 | { |
10050 | unsigned inside_cost = 0, prologue_cost = 0; |
10051 | if (slp_node) |
10052 | { |
10053 | /* We eventually need to set a vector type on invariant |
10054 | arguments. */ |
10055 | unsigned j; |
10056 | slp_tree child; |
10057 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) |
10058 | if (!vect_maybe_update_slp_op_vectype |
10059 | (child, SLP_TREE_VECTYPE (slp_node))) |
10060 | { |
10061 | if (dump_enabled_p ()) |
10062 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
10063 | "incompatible vector types for " |
10064 | "invariants\n" ); |
10065 | return false; |
10066 | } |
10067 | /* loop cost for vec_loop. */ |
10068 | inside_cost |
10069 | = record_stmt_cost (body_cost_vec: cost_vec, |
10070 | SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node), |
10071 | kind: vector_stmt, stmt_info, misalign: 0, where: vect_body); |
10072 | /* prologue cost for vec_init (if not nested) and step. */ |
10073 | prologue_cost = record_stmt_cost (body_cost_vec: cost_vec, count: 1 + !nested_in_vect_loop, |
10074 | kind: scalar_to_vec, |
10075 | stmt_info, misalign: 0, where: vect_prologue); |
10076 | } |
10077 | else /* if (!slp_node) */ |
10078 | { |
10079 | /* loop cost for vec_loop. */ |
10080 | inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vector_stmt, |
10081 | stmt_info, misalign: 0, where: vect_body); |
10082 | /* prologue cost for vec_init and vec_step. */ |
10083 | prologue_cost = record_stmt_cost (body_cost_vec: cost_vec, count: 2, kind: scalar_to_vec, |
10084 | stmt_info, misalign: 0, where: vect_prologue); |
10085 | } |
10086 | if (dump_enabled_p ()) |
10087 | dump_printf_loc (MSG_NOTE, vect_location, |
10088 | "vect_model_induction_cost: inside_cost = %d, " |
10089 | "prologue_cost = %d .\n" , inside_cost, |
10090 | prologue_cost); |
10091 | |
10092 | STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; |
10093 | DUMP_VECT_SCOPE ("vectorizable_induction" ); |
10094 | return true; |
10095 | } |
10096 | |
10097 | /* Transform. */ |
10098 | |
10099 | /* Compute a vector variable, initialized with the first VF values of |
10100 | the induction variable. E.g., for an iv with IV_PHI='X' and |
10101 | evolution S, for a vector of 4 units, we want to compute: |
10102 | [X, X + S, X + 2*S, X + 3*S]. */ |
10103 | |
10104 | if (dump_enabled_p ()) |
10105 | dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n" ); |
10106 | |
10107 | pe = loop_preheader_edge (iv_loop); |
10108 | /* Find the first insertion point in the BB. */ |
10109 | basic_block bb = gimple_bb (g: phi); |
10110 | si = gsi_after_labels (bb); |
10111 | |
10112 | /* For SLP induction we have to generate several IVs as for example |
10113 | with group size 3 we need |
10114 | [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1] |
10115 | [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */ |
10116 | if (slp_node) |
10117 | { |
10118 | /* Enforced above. */ |
10119 | unsigned int const_nunits = nunits.to_constant (); |
10120 | |
10121 | /* The initial values are vectorized, but any lanes > group_size |
10122 | need adjustment. */ |
10123 | slp_tree init_node |
10124 | = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx]; |
10125 | |
10126 | /* Gather steps. Since we do not vectorize inductions as |
10127 | cycles we have to reconstruct the step from SCEV data. */ |
10128 | unsigned group_size = SLP_TREE_LANES (slp_node); |
10129 | tree *steps = XALLOCAVEC (tree, group_size); |
10130 | tree *inits = XALLOCAVEC (tree, group_size); |
10131 | stmt_vec_info phi_info; |
10132 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info) |
10133 | { |
10134 | steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); |
10135 | if (!init_node) |
10136 | inits[i] = gimple_phi_arg_def (gs: as_a<gphi *> (p: phi_info->stmt), |
10137 | index: pe->dest_idx); |
10138 | } |
10139 | |
10140 | /* Now generate the IVs. */ |
10141 | unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
10142 | gcc_assert ((const_nunits * nvects) % group_size == 0); |
10143 | unsigned nivs; |
10144 | if (nested_in_vect_loop) |
10145 | nivs = nvects; |
10146 | else |
10147 | { |
10148 | /* Compute the number of distinct IVs we need. First reduce |
10149 | group_size if it is a multiple of const_nunits so we get |
10150 | one IV for a group_size of 4 but const_nunits 2. */ |
10151 | unsigned group_sizep = group_size; |
10152 | if (group_sizep % const_nunits == 0) |
10153 | group_sizep = group_sizep / const_nunits; |
10154 | nivs = least_common_multiple (group_sizep, |
10155 | const_nunits) / const_nunits; |
10156 | } |
10157 | tree stept = TREE_TYPE (step_vectype); |
10158 | tree lupdate_mul = NULL_TREE; |
10159 | if (!nested_in_vect_loop) |
10160 | { |
10161 | /* The number of iterations covered in one vector iteration. */ |
10162 | unsigned lup_mul = (nvects * const_nunits) / group_size; |
10163 | lupdate_mul |
10164 | = build_vector_from_val (step_vectype, |
10165 | SCALAR_FLOAT_TYPE_P (stept) |
10166 | ? build_real_from_wide (stept, lup_mul, |
10167 | UNSIGNED) |
10168 | : build_int_cstu (type: stept, lup_mul)); |
10169 | } |
10170 | tree peel_mul = NULL_TREE; |
10171 | gimple_seq init_stmts = NULL; |
10172 | if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) |
10173 | { |
10174 | if (SCALAR_FLOAT_TYPE_P (stept)) |
10175 | peel_mul = gimple_build (seq: &init_stmts, code: FLOAT_EXPR, type: stept, |
10176 | LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); |
10177 | else |
10178 | peel_mul = gimple_convert (seq: &init_stmts, type: stept, |
10179 | LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); |
10180 | peel_mul = gimple_build_vector_from_val (seq: &init_stmts, |
10181 | type: step_vectype, op: peel_mul); |
10182 | } |
10183 | unsigned ivn; |
10184 | auto_vec<tree> vec_steps; |
10185 | for (ivn = 0; ivn < nivs; ++ivn) |
10186 | { |
10187 | tree_vector_builder step_elts (step_vectype, const_nunits, 1); |
10188 | tree_vector_builder init_elts (vectype, const_nunits, 1); |
10189 | tree_vector_builder mul_elts (step_vectype, const_nunits, 1); |
10190 | for (unsigned eltn = 0; eltn < const_nunits; ++eltn) |
10191 | { |
10192 | /* The scalar steps of the IVs. */ |
10193 | tree elt = steps[(ivn*const_nunits + eltn) % group_size]; |
10194 | elt = gimple_convert (seq: &init_stmts, TREE_TYPE (step_vectype), op: elt); |
10195 | step_elts.quick_push (obj: elt); |
10196 | if (!init_node) |
10197 | { |
10198 | /* The scalar inits of the IVs if not vectorized. */ |
10199 | elt = inits[(ivn*const_nunits + eltn) % group_size]; |
10200 | if (!useless_type_conversion_p (TREE_TYPE (vectype), |
10201 | TREE_TYPE (elt))) |
10202 | elt = gimple_build (seq: &init_stmts, code: VIEW_CONVERT_EXPR, |
10203 | TREE_TYPE (vectype), ops: elt); |
10204 | init_elts.quick_push (obj: elt); |
10205 | } |
10206 | /* The number of steps to add to the initial values. */ |
10207 | unsigned mul_elt = (ivn*const_nunits + eltn) / group_size; |
10208 | mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept) |
10209 | ? build_real_from_wide (stept, |
10210 | mul_elt, UNSIGNED) |
10211 | : build_int_cstu (type: stept, mul_elt)); |
10212 | } |
10213 | vec_step = gimple_build_vector (seq: &init_stmts, builder: &step_elts); |
10214 | vec_steps.safe_push (obj: vec_step); |
10215 | tree step_mul = gimple_build_vector (seq: &init_stmts, builder: &mul_elts); |
10216 | if (peel_mul) |
10217 | step_mul = gimple_build (seq: &init_stmts, code: PLUS_EXPR, type: step_vectype, |
10218 | ops: step_mul, ops: peel_mul); |
10219 | if (!init_node) |
10220 | vec_init = gimple_build_vector (seq: &init_stmts, builder: &init_elts); |
10221 | |
10222 | /* Create the induction-phi that defines the induction-operand. */ |
10223 | vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, |
10224 | "vec_iv_" ); |
10225 | induction_phi = create_phi_node (vec_dest, iv_loop->header); |
10226 | induc_def = PHI_RESULT (induction_phi); |
10227 | |
10228 | /* Create the iv update inside the loop */ |
10229 | tree up = vec_step; |
10230 | if (lupdate_mul) |
10231 | up = gimple_build (seq: &init_stmts, code: MULT_EXPR, type: step_vectype, |
10232 | ops: vec_step, ops: lupdate_mul); |
10233 | gimple_seq stmts = NULL; |
10234 | vec_def = gimple_convert (seq: &stmts, type: step_vectype, op: induc_def); |
10235 | vec_def = gimple_build (seq: &stmts, |
10236 | code: PLUS_EXPR, type: step_vectype, ops: vec_def, ops: up); |
10237 | vec_def = gimple_convert (seq: &stmts, type: vectype, op: vec_def); |
10238 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
10239 | add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
10240 | UNKNOWN_LOCATION); |
10241 | |
10242 | if (init_node) |
10243 | vec_init = vect_get_slp_vect_def (init_node, ivn); |
10244 | if (!nested_in_vect_loop |
10245 | && !integer_zerop (step_mul)) |
10246 | { |
10247 | vec_def = gimple_convert (seq: &init_stmts, type: step_vectype, op: vec_init); |
10248 | up = gimple_build (seq: &init_stmts, code: MULT_EXPR, type: step_vectype, |
10249 | ops: vec_step, ops: step_mul); |
10250 | vec_def = gimple_build (seq: &init_stmts, code: PLUS_EXPR, type: step_vectype, |
10251 | ops: vec_def, ops: up); |
10252 | vec_init = gimple_convert (seq: &init_stmts, type: vectype, op: vec_def); |
10253 | } |
10254 | |
10255 | /* Set the arguments of the phi node: */ |
10256 | add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); |
10257 | |
10258 | slp_node->push_vec_def (def: induction_phi); |
10259 | } |
10260 | if (!nested_in_vect_loop) |
10261 | { |
10262 | /* Fill up to the number of vectors we need for the whole group. */ |
10263 | nivs = least_common_multiple (group_size, |
10264 | const_nunits) / const_nunits; |
10265 | vec_steps.reserve (nelems: nivs-ivn); |
10266 | for (; ivn < nivs; ++ivn) |
10267 | { |
10268 | slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]); |
10269 | vec_steps.quick_push (obj: vec_steps[0]); |
10270 | } |
10271 | } |
10272 | |
10273 | /* Re-use IVs when we can. We are generating further vector |
10274 | stmts by adding VF' * stride to the IVs generated above. */ |
10275 | if (ivn < nvects) |
10276 | { |
10277 | unsigned vfp |
10278 | = least_common_multiple (group_size, const_nunits) / group_size; |
10279 | tree lupdate_mul |
10280 | = build_vector_from_val (step_vectype, |
10281 | SCALAR_FLOAT_TYPE_P (stept) |
10282 | ? build_real_from_wide (stept, |
10283 | vfp, UNSIGNED) |
10284 | : build_int_cstu (type: stept, vfp)); |
10285 | for (; ivn < nvects; ++ivn) |
10286 | { |
10287 | gimple *iv |
10288 | = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]); |
10289 | tree def = gimple_get_lhs (iv); |
10290 | if (ivn < 2*nivs) |
10291 | vec_steps[ivn - nivs] |
10292 | = gimple_build (seq: &init_stmts, code: MULT_EXPR, type: step_vectype, |
10293 | ops: vec_steps[ivn - nivs], ops: lupdate_mul); |
10294 | gimple_seq stmts = NULL; |
10295 | def = gimple_convert (seq: &stmts, type: step_vectype, op: def); |
10296 | def = gimple_build (seq: &stmts, code: PLUS_EXPR, type: step_vectype, |
10297 | ops: def, ops: vec_steps[ivn % nivs]); |
10298 | def = gimple_convert (seq: &stmts, type: vectype, op: def); |
10299 | if (gimple_code (g: iv) == GIMPLE_PHI) |
10300 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
10301 | else |
10302 | { |
10303 | gimple_stmt_iterator tgsi = gsi_for_stmt (iv); |
10304 | gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING); |
10305 | } |
10306 | slp_node->push_vec_def (def); |
10307 | } |
10308 | } |
10309 | |
10310 | new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts); |
10311 | gcc_assert (!new_bb); |
10312 | |
10313 | return true; |
10314 | } |
10315 | |
10316 | init_expr = vect_phi_initial_value (phi); |
10317 | |
10318 | gimple_seq stmts = NULL; |
10319 | if (!nested_in_vect_loop) |
10320 | { |
10321 | /* Convert the initial value to the IV update type. */ |
10322 | tree new_type = TREE_TYPE (step_expr); |
10323 | init_expr = gimple_convert (seq: &stmts, type: new_type, op: init_expr); |
10324 | |
10325 | /* If we are using the loop mask to "peel" for alignment then we need |
10326 | to adjust the start value here. */ |
10327 | tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); |
10328 | if (skip_niters != NULL_TREE) |
10329 | { |
10330 | if (FLOAT_TYPE_P (vectype)) |
10331 | skip_niters = gimple_build (seq: &stmts, code: FLOAT_EXPR, type: new_type, |
10332 | ops: skip_niters); |
10333 | else |
10334 | skip_niters = gimple_convert (seq: &stmts, type: new_type, op: skip_niters); |
10335 | tree skip_step = gimple_build (seq: &stmts, code: MULT_EXPR, type: new_type, |
10336 | ops: skip_niters, ops: step_expr); |
10337 | init_expr = gimple_build (seq: &stmts, code: MINUS_EXPR, type: new_type, |
10338 | ops: init_expr, ops: skip_step); |
10339 | } |
10340 | } |
10341 | |
10342 | if (stmts) |
10343 | { |
10344 | new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
10345 | gcc_assert (!new_bb); |
10346 | } |
10347 | |
10348 | /* Create the vector that holds the initial_value of the induction. */ |
10349 | if (nested_in_vect_loop) |
10350 | { |
10351 | /* iv_loop is nested in the loop to be vectorized. init_expr had already |
10352 | been created during vectorization of previous stmts. We obtain it |
10353 | from the STMT_VINFO_VEC_STMT of the defining stmt. */ |
10354 | auto_vec<tree> vec_inits; |
10355 | vect_get_vec_defs_for_operand (vinfo: loop_vinfo, stmt_info, 1, |
10356 | op: init_expr, &vec_inits); |
10357 | vec_init = vec_inits[0]; |
10358 | /* If the initial value is not of proper type, convert it. */ |
10359 | if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init))) |
10360 | { |
10361 | new_stmt |
10362 | = gimple_build_assign (vect_get_new_ssa_name (vectype, |
10363 | vect_simple_var, |
10364 | "vec_iv_" ), |
10365 | VIEW_CONVERT_EXPR, |
10366 | build1 (VIEW_CONVERT_EXPR, vectype, |
10367 | vec_init)); |
10368 | vec_init = gimple_assign_lhs (gs: new_stmt); |
10369 | new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop), |
10370 | new_stmt); |
10371 | gcc_assert (!new_bb); |
10372 | } |
10373 | } |
10374 | else |
10375 | { |
10376 | /* iv_loop is the loop to be vectorized. Create: |
10377 | vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ |
10378 | stmts = NULL; |
10379 | new_name = gimple_convert (seq: &stmts, TREE_TYPE (step_expr), op: init_expr); |
10380 | |
10381 | unsigned HOST_WIDE_INT const_nunits; |
10382 | if (nunits.is_constant (const_value: &const_nunits)) |
10383 | { |
10384 | tree_vector_builder elts (step_vectype, const_nunits, 1); |
10385 | elts.quick_push (obj: new_name); |
10386 | for (i = 1; i < const_nunits; i++) |
10387 | { |
10388 | /* Create: new_name_i = new_name + step_expr */ |
10389 | new_name = gimple_build (seq: &stmts, code: PLUS_EXPR, TREE_TYPE (new_name), |
10390 | ops: new_name, ops: step_expr); |
10391 | elts.quick_push (obj: new_name); |
10392 | } |
10393 | /* Create a vector from [new_name_0, new_name_1, ..., |
10394 | new_name_nunits-1] */ |
10395 | vec_init = gimple_build_vector (seq: &stmts, builder: &elts); |
10396 | } |
10397 | else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) |
10398 | /* Build the initial value directly from a VEC_SERIES_EXPR. */ |
10399 | vec_init = gimple_build (seq: &stmts, code: VEC_SERIES_EXPR, type: step_vectype, |
10400 | ops: new_name, ops: step_expr); |
10401 | else |
10402 | { |
10403 | /* Build: |
10404 | [base, base, base, ...] |
10405 | + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */ |
10406 | gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); |
10407 | gcc_assert (flag_associative_math); |
10408 | tree index = build_index_vector (step_vectype, 0, 1); |
10409 | tree base_vec = gimple_build_vector_from_val (seq: &stmts, type: step_vectype, |
10410 | op: new_name); |
10411 | tree step_vec = gimple_build_vector_from_val (seq: &stmts, type: step_vectype, |
10412 | op: step_expr); |
10413 | vec_init = gimple_build (seq: &stmts, code: FLOAT_EXPR, type: step_vectype, ops: index); |
10414 | vec_init = gimple_build (seq: &stmts, code: MULT_EXPR, type: step_vectype, |
10415 | ops: vec_init, ops: step_vec); |
10416 | vec_init = gimple_build (seq: &stmts, code: PLUS_EXPR, type: step_vectype, |
10417 | ops: vec_init, ops: base_vec); |
10418 | } |
10419 | vec_init = gimple_convert (seq: &stmts, type: vectype, op: vec_init); |
10420 | |
10421 | if (stmts) |
10422 | { |
10423 | new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
10424 | gcc_assert (!new_bb); |
10425 | } |
10426 | } |
10427 | |
10428 | |
10429 | /* Create the vector that holds the step of the induction. */ |
10430 | gimple_stmt_iterator *step_iv_si = NULL; |
10431 | if (nested_in_vect_loop) |
10432 | /* iv_loop is nested in the loop to be vectorized. Generate: |
10433 | vec_step = [S, S, S, S] */ |
10434 | new_name = step_expr; |
10435 | else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)) |
10436 | { |
      /* When we're using loop_len produced by SELECT_VL, the non-final
10438 | iterations are not always processing VF elements. So vectorize |
10439 | induction variable instead of |
10440 | |
10441 | _21 = vect_vec_iv_.6_22 + { VF, ... }; |
10442 | |
10443 | We should generate: |
10444 | |
10445 | _35 = .SELECT_VL (ivtmp_33, VF); |
10446 | vect_cst__22 = [vec_duplicate_expr] _35; |
10447 | _21 = vect_vec_iv_.6_22 + vect_cst__22; */ |
10448 | gcc_assert (!slp_node); |
10449 | gimple_seq seq = NULL; |
10450 | vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); |
10451 | tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0); |
10452 | expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr), |
10453 | unshare_expr (len)), |
10454 | &seq, true, NULL_TREE); |
10455 | new_name = gimple_build (seq: &seq, code: MULT_EXPR, TREE_TYPE (step_expr), ops: expr, |
10456 | ops: step_expr); |
10457 | gsi_insert_seq_before (&si, seq, GSI_SAME_STMT); |
10458 | step_iv_si = &si; |
10459 | } |
10460 | else |
10461 | { |
10462 | /* iv_loop is the loop to be vectorized. Generate: |
10463 | vec_step = [VF*S, VF*S, VF*S, VF*S] */ |
10464 | gimple_seq seq = NULL; |
10465 | if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) |
10466 | { |
10467 | expr = build_int_cst (integer_type_node, vf); |
10468 | expr = gimple_build (seq: &seq, code: FLOAT_EXPR, TREE_TYPE (step_expr), ops: expr); |
10469 | } |
10470 | else |
10471 | expr = build_int_cst (TREE_TYPE (step_expr), vf); |
10472 | new_name = gimple_build (seq: &seq, code: MULT_EXPR, TREE_TYPE (step_expr), |
10473 | ops: expr, ops: step_expr); |
10474 | if (seq) |
10475 | { |
10476 | new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); |
10477 | gcc_assert (!new_bb); |
10478 | } |
10479 | } |
10480 | |
10481 | t = unshare_expr (new_name); |
10482 | gcc_assert (CONSTANT_CLASS_P (new_name) |
10483 | || TREE_CODE (new_name) == SSA_NAME); |
10484 | new_vec = build_vector_from_val (step_vectype, t); |
10485 | vec_step = vect_init_vector (loop_vinfo, stmt_info, |
10486 | new_vec, step_vectype, step_iv_si); |
10487 | |
10488 | |
10489 | /* Create the following def-use cycle: |
10490 | loop prolog: |
10491 | vec_init = ... |
10492 | vec_step = ... |
10493 | loop: |
10494 | vec_iv = PHI <vec_init, vec_loop> |
10495 | ... |
10496 | STMT |
10497 | ... |
10498 | vec_loop = vec_iv + vec_step; */ |
10499 | |
10500 | /* Create the induction-phi that defines the induction-operand. */ |
10501 | vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_" ); |
10502 | induction_phi = create_phi_node (vec_dest, iv_loop->header); |
10503 | induc_def = PHI_RESULT (induction_phi); |
10504 | |
10505 | /* Create the iv update inside the loop */ |
10506 | stmts = NULL; |
10507 | vec_def = gimple_convert (seq: &stmts, type: step_vectype, op: induc_def); |
10508 | vec_def = gimple_build (seq: &stmts, code: PLUS_EXPR, type: step_vectype, ops: vec_def, ops: vec_step); |
10509 | vec_def = gimple_convert (seq: &stmts, type: vectype, op: vec_def); |
10510 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
10511 | new_stmt = SSA_NAME_DEF_STMT (vec_def); |
10512 | |
10513 | /* Set the arguments of the phi node: */ |
10514 | add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); |
10515 | add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
10516 | UNKNOWN_LOCATION); |
10517 | |
10518 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: induction_phi); |
10519 | *vec_stmt = induction_phi; |
10520 | |
10521 | /* In case that vectorization factor (VF) is bigger than the number |
10522 | of elements that we can fit in a vectype (nunits), we have to generate |
10523 | more than one vector stmt - i.e - we need to "unroll" the |
10524 | vector stmt by a factor VF/nunits. For more details see documentation |
10525 | in vectorizable_operation. */ |
10526 | |
10527 | if (ncopies > 1) |
10528 | { |
10529 | gimple_seq seq = NULL; |
10530 | /* FORNOW. This restriction should be relaxed. */ |
10531 | gcc_assert (!nested_in_vect_loop); |
10532 | /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */ |
10533 | gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)); |
10534 | |
10535 | /* Create the vector that holds the step of the induction. */ |
10536 | if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) |
10537 | { |
10538 | expr = build_int_cst (integer_type_node, nunits); |
10539 | expr = gimple_build (seq: &seq, code: FLOAT_EXPR, TREE_TYPE (step_expr), ops: expr); |
10540 | } |
10541 | else |
10542 | expr = build_int_cst (TREE_TYPE (step_expr), nunits); |
10543 | new_name = gimple_build (seq: &seq, code: MULT_EXPR, TREE_TYPE (step_expr), |
10544 | ops: expr, ops: step_expr); |
10545 | if (seq) |
10546 | { |
10547 | new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); |
10548 | gcc_assert (!new_bb); |
10549 | } |
10550 | |
10551 | t = unshare_expr (new_name); |
10552 | gcc_assert (CONSTANT_CLASS_P (new_name) |
10553 | || TREE_CODE (new_name) == SSA_NAME); |
10554 | new_vec = build_vector_from_val (step_vectype, t); |
10555 | vec_step = vect_init_vector (loop_vinfo, stmt_info, |
10556 | new_vec, step_vectype, NULL); |
10557 | |
10558 | vec_def = induc_def; |
10559 | for (i = 1; i < ncopies + 1; i++) |
10560 | { |
10561 | /* vec_i = vec_prev + vec_step */ |
10562 | gimple_seq stmts = NULL; |
10563 | vec_def = gimple_convert (seq: &stmts, type: step_vectype, op: vec_def); |
10564 | vec_def = gimple_build (seq: &stmts, |
10565 | code: PLUS_EXPR, type: step_vectype, ops: vec_def, ops: vec_step); |
10566 | vec_def = gimple_convert (seq: &stmts, type: vectype, op: vec_def); |
10567 | |
10568 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
10569 | if (i < ncopies) |
10570 | { |
10571 | new_stmt = SSA_NAME_DEF_STMT (vec_def); |
10572 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_stmt); |
10573 | } |
10574 | else |
10575 | { |
10576 | /* vec_1 = vec_iv + (VF/n * S) |
10577 | vec_2 = vec_1 + (VF/n * S) |
10578 | ... |
10579 | vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop |
10580 | |
10581 | vec_n is used as vec_loop to save the large step register and |
10582 | related operations. */ |
10583 | add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
10584 | UNKNOWN_LOCATION); |
10585 | } |
10586 | } |
10587 | } |
10588 | |
10589 | if (dump_enabled_p ()) |
10590 | dump_printf_loc (MSG_NOTE, vect_location, |
10591 | "transform induction: created def-use cycle: %G%G" , |
10592 | (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def)); |
10593 | |
10594 | return true; |
10595 | } |
10596 | |
/* Function vectorizable_live_operation_1.

   Helper function for vectorizable_live_operation.  Generate the code in
   EXIT_BB that extracts the scalar live lane from the vectorized value
   VEC_LHS and converts it to LHS_TYPE.  The lane is selected by BITSTART
   (a bit offset into the vector; lane width is BITSIZE), except for
   fully-masked / length-controlled loops where the last active lane is
   extracted via EXTRACT_LAST resp. VEC_EXTRACT.  Returns the extracted
   value and sets *EXIT_GSI to an iterator in EXIT_BB placed after the
   statements this function inserted.  */

static tree
vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_info, basic_block exit_bb,
			       tree vectype, int ncopies, slp_tree slp_node,
			       tree bitsize, tree bitstart, tree vec_lhs,
			       tree lhs_type, gimple_stmt_iterator *exit_gsi)
{
  /* The exit block has a single predecessor unless we are vectorizing a
     loop with early breaks (which creates merge blocks).  */
  gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));

  /* Make VEC_LHS available in EXIT_BB in loop-closed SSA form by
     materializing a PHI for it, fed by VEC_LHS on every incoming edge.  */
  tree vec_lhs_phi = copy_ssa_name (var: vec_lhs);
  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
  for (unsigned i = 0; i < gimple_phi_num_args (gs: phi); i++)
    SET_PHI_ARG_DEF (phi, i, vec_lhs);

  gimple_seq stmts = NULL;
  tree new_tree;

  /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
  if (integer_zerop (bitstart))
    {
      tree scalar_res = gimple_build (seq: &stmts, code: BIT_FIELD_REF, TREE_TYPE (vectype),
				      ops: vec_lhs_phi, ops: bitsize, ops: bitstart);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (seq: &stmts, type: lhs_type, op: scalar_res);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Emit:

	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>

	 where VEC_LHS is the vectorized live-out result, LEN is the loop
	 length of the final iteration and BIAS is the target's partial
	 load/store bias.  */
      gcc_assert (ncopies == 1 && !slp_node);
      gimple_seq tem = NULL;
      gimple_stmt_iterator gsi = gsi_last (seq&: tem);
      tree len = vect_get_loop_len (loop_vinfo, &gsi,
				    &LOOP_VINFO_LENS (loop_vinfo),
				    1, vectype, 0, 0);

      /* BIAS - 1.  */
      signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
      tree bias_minus_one
	= int_const_binop (MINUS_EXPR,
			   build_int_cst (TREE_TYPE (len), biasval),
			   build_one_cst (TREE_TYPE (len)));

      /* LAST_INDEX = LEN + (BIAS - 1).  */
      tree last_index = gimple_build (seq: &stmts, code: PLUS_EXPR, TREE_TYPE (len),
				      ops: len, ops: bias_minus_one);

      /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
      tree scalar_res
	= gimple_build (seq: &stmts, fn: CFN_VEC_EXTRACT, TREE_TYPE (vectype),
			args: vec_lhs_phi, args: last_index);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (seq: &stmts, type: lhs_type, op: scalar_res);
    }
  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Emit:

	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>

	 where VEC_LHS is the vectorized live-out result and MASK is
	 the loop mask for the final iteration.  */
      gcc_assert (!slp_node);
      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
      gimple_seq tem = NULL;
      gimple_stmt_iterator gsi = gsi_last (seq&: tem);
      tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
				      &LOOP_VINFO_MASKS (loop_vinfo),
				      1, vectype, 0);
      tree scalar_res;
      gimple_seq_add_seq (&stmts, tem);

      scalar_res = gimple_build (seq: &stmts, fn: CFN_EXTRACT_LAST, type: scalar_type,
				 args: mask, args: vec_lhs_phi);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (seq: &stmts, type: lhs_type, op: scalar_res);
    }
  else
    {
      /* Fixed non-zero lane: extract with a BIT_FIELD_REF.  For vector
	 booleans extract through an unsigned integer type of the
	 element's bit width instead of the boolean element type.  */
      tree bftype = TREE_TYPE (vectype);
      if (VECTOR_BOOLEAN_TYPE_P (vectype))
	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
				       &stmts, true, NULL_TREE);
    }

  /* Insert the extraction code right after the labels of EXIT_BB and
     hand the insertion point back to the caller.  */
  *exit_gsi = gsi_after_labels (bb: exit_bb);
  if (stmts)
    gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);

  return new_tree;
}
10701 | |
/* Function vectorizable_live_operation.

   STMT_INFO computes a value that is used outside the loop.  Check if
   it can be supported.  When VEC_STMT_P, also perform the transform:
   extract the live lane from the vectorized value and redirect the
   out-of-loop uses to it.  */

bool
vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
			     slp_tree slp_node, slp_instance slp_node_instance,
			     int slp_index, bool vec_stmt_p,
			     stmt_vector_for_cost *cost_vec)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  imm_use_iterator imm_iter;
  tree lhs, lhs_type, bitsize;
  tree vectype = (slp_node
		  ? SLP_TREE_VECTYPE (slp_node)
		  : STMT_VINFO_VECTYPE (stmt_info));
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype);
  int ncopies;
  gimple *use_stmt;
  use_operand_p use_p;
  auto_vec<tree> vec_oprnds;
  int vec_entry = 0;
  poly_uint64 vec_index = 0;

  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
	      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));

  /* If a stmt of a reduction is live, vectorize it via
     vect_create_epilog_for_reduction.  vectorizable_reduction assessed
     validity so just trigger the transform here.  */
  if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
    {
      if (!vec_stmt_p)
	return true;
      /* For SLP reductions we vectorize the epilogue for all involved stmts
	 together.  */
      if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
	return true;
      stmt_vec_info reduc_info = info_for_reduction (vinfo: loop_vinfo, stmt_info);
      gcc_assert (reduc_info->is_reduc_info);
      /* FOLD_LEFT and EXTRACT_LAST reductions already produce the scalar
	 result inside the loop; no epilogue is needed here.  */
      if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
	return true;

      if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
	  || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
	vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
					  slp_node_instance,
					  LOOP_VINFO_IV_EXIT (loop_vinfo));

      /* If early break we only have to materialize the reduction on the merge
	 block, but we have to find an alternate exit first.  */
      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
	{
	  slp_tree phis_node = slp_node ? slp_node_instance->reduc_phis : NULL;
	  /* Use the first exit that is not the IV (main) exit.  */
	  for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
	      {
		vect_create_epilog_for_reduction (loop_vinfo, stmt_info: reduc_info,
						  slp_node: phis_node, slp_node_instance,
						  loop_exit: exit);
		break;
	      }
	  if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
	    vect_create_epilog_for_reduction (loop_vinfo, stmt_info: reduc_info,
					      slp_node: phis_node, slp_node_instance,
					      LOOP_VINFO_IV_EXIT (loop_vinfo));
	}

      return true;
    }

  /* If STMT is not relevant and it is a simple assignment and its inputs are
     invariant then it can remain in place, unvectorized.  The original last
     scalar value that it computes will be used.  */
  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    {
      gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "statement is simple and uses invariant. Leaving in "
			 "place.\n" );
      return true;
    }

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);

  if (slp_node)
    {
      gcc_assert (slp_index >= 0);

      /* Get the last occurrence of the scalar index from the concatenation of
	 all the slp vectors. Calculate which slp vector it is and the index
	 within.  */
      int num_scalar = SLP_TREE_LANES (slp_node);
      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;

      /* Calculate which vector contains the result, and which lane of
	 that vector we need.  */
      if (!can_div_trunc_p (a: pos, b: nunits, quotient: &vec_entry, remainder: &vec_index))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Cannot determine which vector holds the"
			     " final result.\n" );
	  return false;
	}
    }

  if (!vec_stmt_p)
    {
      /* No transformation required.  Analysis only: decide whether the
	 live operation is compatible with partial vectors (recording the
	 required mask or length if so) and record the cost.  */
      if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  if (slp_node)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "can't operate on partial vectors "
				 "because an SLP statement is live after "
				 "the loop.\n" );
	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
	    }
	  else if (ncopies > 1)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "can't operate on partial vectors "
				 "because ncopies is greater than 1.\n" );
	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
	    }
	  else
	    {
	      gcc_assert (ncopies == 1 && !slp_node);
	      /* Prefer EXTRACT_LAST on a mask; otherwise fall back to a
		 variable-index VEC_EXTRACT using the loop length.  */
	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
						  OPTIMIZE_FOR_SPEED))
		vect_record_loop_mask (loop_vinfo,
				       &LOOP_VINFO_MASKS (loop_vinfo),
				       1, vectype, NULL);
	      else if (can_vec_extract_var_idx_p (
		TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
		vect_record_loop_len (loop_vinfo,
				      &LOOP_VINFO_LENS (loop_vinfo),
				      1, vectype, 1);
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (
		      MSG_MISSED_OPTIMIZATION, vect_location,
		      "can't operate on partial vectors "
		      "because the target doesn't support extract "
		      "last reduction.\n" );
		  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
		}
	    }
	}
      /* ???  Enable for loop costing as well.  */
      if (!loop_vinfo)
	record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
			  0, vect_epilogue);
      return true;
    }

  /* Transform phase below.  */

  /* Use the lhs of the original scalar statement.  */
  gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
		     "stmt %G" , stmt);

  lhs = gimple_get_lhs (stmt);
  lhs_type = TREE_TYPE (lhs);

  bitsize = vector_element_bits_tree (vectype);

  /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
  tree vec_lhs, vec_lhs0, bitstart;
  gimple *vec_stmt, *vec_stmt0;
  if (slp_node)
    {
      gcc_assert (!loop_vinfo
		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));

      /* Get the correct slp vectorized stmt.  */
      vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
      vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);

      /* In case we need to early break vectorize also get the first stmt.  */
      vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
      vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);

      /* Get entry to use.  */
      bitstart = bitsize_int (vec_index);
      bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
    }
  else
    {
      /* For multiple copies, get the last copy.  */
      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
      vec_lhs = gimple_get_lhs (vec_stmt);

      /* In case we need to early break vectorize also get the first stmt.  */
      vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
      vec_lhs0 = gimple_get_lhs (vec_stmt0);

      /* Get the last lane in the vector.  */
      bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
    }

  if (loop_vinfo)
    {
      /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
	 requirement, insert one phi node for it.  It looks like:
	   loop;
	 BB:
	   # lhs' = PHI <lhs>
	 ==>
	   loop;
	 BB:
	   # vec_lhs' = PHI <vec_lhs>
	   new_tree = lane_extract <vec_lhs', ...>;
	   lhs' = new_tree;  */

      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      /* Check if we have a loop where the chosen exit is not the main exit,
	 in these cases for an early break we restart the iteration the vector code
	 did.  For the live values we want the value at the start of the iteration
	 rather than at the end.  */
      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
      bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
      /* Rewrite each out-of-loop use of LHS (an LC PHI argument) to the
	 extracted lane on the corresponding exit edge; a single PHI is
	 expected per use stmt, hence the break after handling it.  */
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
	if (!is_gimple_debug (gs: use_stmt)
	    && !flow_bb_inside_loop_p (loop, gimple_bb (g: use_stmt)))
	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	    {
	      edge e = gimple_phi_arg_edge (phi: as_a <gphi *> (p: use_stmt),
					    i: phi_arg_index_from_use (use: use_p));
	      gcc_assert (loop_exit_edge_p (loop, e));
	      bool main_exit_edge = e == main_e;
	      tree tmp_vec_lhs = vec_lhs;
	      tree tmp_bitstart = bitstart;

	      /* For early exit where the exit is not in the BB that leads
		 to the latch then we're restarting the iteration in the
		 scalar loop.  So get the first live value.  */
	      if ((all_exits_as_early_p || !main_exit_edge)
		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
		{
		  tmp_vec_lhs = vec_lhs0;
		  tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
		}

	      gimple_stmt_iterator exit_gsi;
	      tree new_tree
		= vectorizable_live_operation_1 (loop_vinfo, stmt_info,
						 exit_bb: e->dest, vectype, ncopies,
						 slp_node, bitsize,
						 bitstart: tmp_bitstart, vec_lhs: tmp_vec_lhs,
						 lhs_type, exit_gsi: &exit_gsi);

	      /* Replace the LC PHI with a plain assignment from the
		 extracted lane.  */
	      auto gsi = gsi_for_stmt (use_stmt);
	      tree lhs_phi = gimple_phi_result (gs: use_stmt);
	      remove_phi_node (&gsi, false);
	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
	      break;
	    }

      /* There are no further out-of-loop uses of lhs by LC-SSA construction.  */
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
	gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
    }
  else
    {
      /* For basic-block vectorization simply insert the lane-extraction.  */
      tree bftype = TREE_TYPE (vectype);
      if (VECTOR_BOOLEAN_TYPE_P (vectype))
	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
      tree new_tree = build3 (BIT_FIELD_REF, bftype,
			      vec_lhs, bitsize, bitstart);
      gimple_seq stmts = NULL;
      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
				       &stmts, true, NULL_TREE);
      if (TREE_CODE (new_tree) == SSA_NAME
	  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
	SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
      /* Insert the extraction right after the defining vector stmt (or,
	 for a PHI def, after the labels of its block).  */
      if (is_a <gphi *> (p: vec_stmt))
	{
	  gimple_stmt_iterator si = gsi_after_labels (bb: gimple_bb (g: vec_stmt));
	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	}
      else
	{
	  gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
	  gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
	}

      /* Replace use of lhs with newly computed result.  If the use stmt is a
	 single arg PHI, just replace all uses of PHI result.  It's necessary
	 because lcssa PHI defining lhs may be before newly inserted stmt.  */
      use_operand_p use_p;
      stmt_vec_info use_stmt_info;
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
	if (!is_gimple_debug (gs: use_stmt)
	    && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
		|| !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
	  {
	    /* ???  This can happen when the live lane ends up being
	       rooted in a vector construction code-generated by an
	       external SLP node (and code-generation for that already
	       happened).  See gcc.dg/vect/bb-slp-47.c.
	       Doing this is what would happen if that vector CTOR
	       were not code-generated yet so it is not too bad.
	       ???  In fact we'd likely want to avoid this situation
	       in the first place.  */
	    if (TREE_CODE (new_tree) == SSA_NAME
		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
		&& gimple_code (g: use_stmt) != GIMPLE_PHI
		&& !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
						use_stmt))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Using original scalar computation for "
				   "live lane because use preceeds vector "
				   "def\n" );
		continue;
	      }
	    /* ???  It can also happen that we end up pulling a def into
	       a loop where replacing out-of-loop uses would require
	       a new LC SSA PHI node.  Retain the original scalar in
	       those cases as well.  PR98064.  */
	    if (TREE_CODE (new_tree) == SSA_NAME
		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
		&& (gimple_bb (g: use_stmt)->loop_father
		    != gimple_bb (g: vec_stmt)->loop_father)
		&& !flow_loop_nested_p (gimple_bb (g: vec_stmt)->loop_father,
					gimple_bb (g: use_stmt)->loop_father))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Using original scalar computation for "
				   "live lane because there is an out-of-loop "
				   "definition for it\n" );
		continue;
	      }
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_tree);
	    update_stmt (s: use_stmt);
	  }
    }

  return true;
}
11061 | |
11062 | /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */ |
11063 | |
11064 | static void |
11065 | vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info) |
11066 | { |
11067 | ssa_op_iter op_iter; |
11068 | imm_use_iterator imm_iter; |
11069 | def_operand_p def_p; |
11070 | gimple *ustmt; |
11071 | |
11072 | FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF) |
11073 | { |
11074 | FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p)) |
11075 | { |
11076 | basic_block bb; |
11077 | |
11078 | if (!is_gimple_debug (gs: ustmt)) |
11079 | continue; |
11080 | |
11081 | bb = gimple_bb (g: ustmt); |
11082 | |
11083 | if (!flow_bb_inside_loop_p (loop, bb)) |
11084 | { |
11085 | if (gimple_debug_bind_p (s: ustmt)) |
11086 | { |
11087 | if (dump_enabled_p ()) |
11088 | dump_printf_loc (MSG_NOTE, vect_location, |
11089 | "killing debug use\n" ); |
11090 | |
11091 | gimple_debug_bind_reset_value (dbg: ustmt); |
11092 | update_stmt (s: ustmt); |
11093 | } |
11094 | else |
11095 | gcc_unreachable (); |
11096 | } |
11097 | } |
11098 | } |
11099 | } |
11100 | |
11101 | /* Given loop represented by LOOP_VINFO, return true if computation of |
11102 | LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false |
11103 | otherwise. */ |
11104 | |
11105 | static bool |
11106 | loop_niters_no_overflow (loop_vec_info loop_vinfo) |
11107 | { |
11108 | /* Constant case. */ |
11109 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) |
11110 | { |
11111 | tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo); |
11112 | tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); |
11113 | |
11114 | gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST); |
11115 | gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST); |
11116 | if (wi::to_widest (t: cst_nitersm1) < wi::to_widest (t: cst_niters)) |
11117 | return true; |
11118 | } |
11119 | |
11120 | widest_int max; |
11121 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
11122 | /* Check the upper bound of loop niters. */ |
11123 | if (get_max_loop_iterations (loop, nit: &max)) |
11124 | { |
11125 | tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); |
11126 | signop sgn = TYPE_SIGN (type); |
11127 | widest_int type_max = widest_int::from (x: wi::max_value (type), sgn); |
11128 | if (max < type_max) |
11129 | return true; |
11130 | } |
11131 | return false; |
11132 | } |
11133 | |
11134 | /* Return a mask type with half the number of elements as OLD_TYPE, |
11135 | given that it should have mode NEW_MODE. */ |
11136 | |
11137 | tree |
11138 | vect_halve_mask_nunits (tree old_type, machine_mode new_mode) |
11139 | { |
11140 | poly_uint64 nunits = exact_div (a: TYPE_VECTOR_SUBPARTS (node: old_type), b: 2); |
11141 | return build_truth_vector_type_for_mode (nunits, new_mode); |
11142 | } |
11143 | |
11144 | /* Return a mask type with twice as many elements as OLD_TYPE, |
11145 | given that it should have mode NEW_MODE. */ |
11146 | |
11147 | tree |
11148 | vect_double_mask_nunits (tree old_type, machine_mode new_mode) |
11149 | { |
11150 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: old_type) * 2; |
11151 | return build_truth_vector_type_for_mode (nunits, new_mode); |
11152 | } |
11153 | |
11154 | /* Record that a fully-masked version of LOOP_VINFO would need MASKS to |
11155 | contain a sequence of NVECTORS masks that each control a vector of type |
11156 | VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND |
11157 | these vector masks with the vector version of SCALAR_MASK. */ |
11158 | |
11159 | void |
11160 | vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, |
11161 | unsigned int nvectors, tree vectype, tree scalar_mask) |
11162 | { |
11163 | gcc_assert (nvectors != 0); |
11164 | |
11165 | if (scalar_mask) |
11166 | { |
11167 | scalar_cond_masked_key cond (scalar_mask, nvectors); |
11168 | loop_vinfo->scalar_cond_masked_set.add (k: cond); |
11169 | } |
11170 | |
11171 | masks->mask_set.add (k: std::make_pair (x&: vectype, y&: nvectors)); |
11172 | } |
11173 | |
11174 | /* Given a complete set of masks MASKS, extract mask number INDEX |
11175 | for an rgroup that operates on NVECTORS vectors of type VECTYPE, |
11176 | where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI. |
11177 | |
11178 | See the comment above vec_loop_masks for more details about the mask |
11179 | arrangement. */ |
11180 | |
tree
vect_get_loop_mask (loop_vec_info loop_vinfo,
		    gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
      == vect_partial_vectors_while_ult)
    {
      /* WHILE_ULT style: one rgroup per NVECTORS, indexed NVECTORS - 1.  */
      rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
      tree mask_type = rgm->type;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (len: nvectors, exact: true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (type: mask_type, NULL, name: "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}

      tree mask = rgm->controls[index];
      if (maybe_ne (a: TYPE_VECTOR_SUBPARTS (node: mask_type),
		    b: TYPE_VECTOR_SUBPARTS (node: vectype)))
	{
	  /* A loop mask for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  In this case each sequence
	     of N elements in the loop mask will be all-zero or all-one.
	     We can then view-convert the mask so that each sequence of
	     N elements is replaced by a single element.  */
	  gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
				  TYPE_VECTOR_SUBPARTS (vectype)));
	  gimple_seq seq = NULL;
	  mask_type = truth_type_for (vectype);
	  mask = gimple_build (seq: &seq, code: VIEW_CONVERT_EXPR, type: mask_type, ops: mask);
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
      return mask;
    }
  else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	   == vect_partial_vectors_avx512)
    {
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (a: nvectors * TYPE_VECTOR_SUBPARTS (node: vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* AVX512 style: rgroups are indexed by scalars-per-iteration, not by
	 NVECTORS as in the WHILE_ULT case above.  */
      rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];

      /* The stored nV is dependent on the mask type produced.  */
      gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
			     TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
		  == rgm->factor);
      nvectors = rgm->factor;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (len: nvectors, exact: true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (type: rgm->type, NULL, name: "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}
      /* If the stored mask width already matches VECTYPE we can hand the
	 control out directly without any splitting.  */
      if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	return rgm->controls[index];

      /* Split the vector if needed.  Since we are dealing with integer mode
	 masks with AVX512 we can operate on the integer representation
	 performing the whole vector shifting.  */
      unsigned HOST_WIDE_INT factor;
      bool ok = constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: rgm->type),
				     b: TYPE_VECTOR_SUBPARTS (node: vectype), multiple: &factor);
      gcc_assert (ok);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
      tree mask_type = truth_type_for (vectype);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
      /* INDEX selects sub-part VPART of stored control VI.  */
      unsigned vi = index / factor;
      unsigned vpart = index % factor;
      tree vec = rgm->controls[vi];
      gimple_seq seq = NULL;
      vec = gimple_build (seq: &seq, code: VIEW_CONVERT_EXPR,
			  type: lang_hooks.types.type_for_mode
				(TYPE_MODE (rgm->type), 1), ops: vec);
      /* For integer mode masks simply shift the right bits into position.  */
      if (vpart != 0)
	vec = gimple_build (seq: &seq, code: RSHIFT_EXPR, TREE_TYPE (vec), ops: vec,
			    ops: build_int_cst (integer_type_node,
					     (TYPE_VECTOR_SUBPARTS (node: vectype)
					      * vpart)));
      vec = gimple_convert (seq: &seq, type: lang_hooks.types.type_for_mode
				   (TYPE_MODE (mask_type), 1), op: vec);
      vec = gimple_build (seq: &seq, code: VIEW_CONVERT_EXPR, type: mask_type, ops: vec);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
      return vec;
    }
  else
    gcc_unreachable ();
}
11293 | |
11294 | /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS |
11295 | lengths for controlling an operation on VECTYPE. The operation splits |
11296 | each element of VECTYPE into FACTOR separate subelements, measuring the |
11297 | length as a number of these subelements. */ |
11298 | |
void
vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		      unsigned int nvectors, tree vectype, unsigned int factor)
{
  gcc_assert (nvectors != 0);
  if (lens->length () < nvectors)
    lens->safe_grow_cleared (len: nvectors, exact: true);
  rgroup_controls *rgl = &(*lens)[nvectors - 1];

  /* The number of scalars per iteration, the scalar occupied bytes and
     the number of vectors are all compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (a: nvectors * TYPE_VECTOR_SUBPARTS (node: vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  /* Keep the maximum nscalars_per_iter recorded for this rgroup.  */
  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
    {
      /* For now, we only support cases in which all loads and stores fall back
	 to VnQI or none do.  */
      gcc_assert (!rgl->max_nscalars_per_iter
		  || (rgl->factor == 1 && factor == 1)
		  || (rgl->max_nscalars_per_iter * rgl->factor
		      == nscalars_per_iter * factor));
      rgl->max_nscalars_per_iter = nscalars_per_iter;
      rgl->type = vectype;
      rgl->factor = factor;
    }
}
11327 | |
/* Given a complete set of lengths LENS, extract length number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
   multiplied by the number of elements that should be processed.
   Insert any set-up statements before GSI.  */

tree
vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
		   unsigned int index, unsigned int factor)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  /* A nonzero partial load/store bias means the bias-adjusted control
     should be handed out instead of the plain one.  */
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it.  */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (len: nvectors, exact: true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (type: len_type, NULL, name: "loop_len");

	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      /* The bias-adjusted control is only created for the first
		 (and, per this assert, only) vector of the rgroup.  */
	      gcc_assert (i == 0);
	      tree adjusted_len =
		make_temp_ssa_name (type: len_type, NULL, name: "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  if (use_bias_adjusted_len)
    return rgl->bias_adjusted_ctrl;

  tree loop_len = rgl->controls[index];
  if (rgl->factor == 1 && factor == 1)
    {
      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (node: rgl->type);
      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (node: vectype);
      if (maybe_ne (a: nunits1, b: nunits2))
	{
	  /* A loop len for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  */
	  gcc_assert (multiple_p (nunits1, nunits2));
	  factor = exact_div (a: nunits1, b: nunits2).to_constant ();
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (seq: &seq, code: RDIV_EXPR, type: iv_type, ops: loop_len,
				   ops: build_int_cst (iv_type, factor));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
    }
  return loop_len;
}
11395 | |
11396 | /* Scale profiling counters by estimation for LOOP which is vectorized |
11397 | by factor VF. |
11398 | If FLAT is true, the loop we started with had unrealistically flat |
11399 | profile. */ |
11400 | |
11401 | static void |
11402 | scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat) |
11403 | { |
11404 | /* For flat profiles do not scale down proportionally by VF and only |
11405 | cap by known iteration count bounds. */ |
11406 | if (flat) |
11407 | { |
11408 | if (dump_file && (dump_flags & TDF_DETAILS)) |
11409 | fprintf (stream: dump_file, |
11410 | format: "Vectorized loop profile seems flat; not scaling iteration " |
11411 | "count down by the vectorization factor %i\n" , vf); |
11412 | scale_loop_profile (loop, profile_probability::always (), |
11413 | get_likely_max_loop_iterations_int (loop)); |
11414 | return; |
11415 | } |
11416 | /* Loop body executes VF fewer times and exit increases VF times. */ |
11417 | profile_count entry_count = loop_preheader_edge (loop)->count (); |
11418 | |
11419 | /* If we have unreliable loop profile avoid dropping entry |
11420 | count bellow header count. This can happen since loops |
11421 | has unrealistically low trip counts. */ |
11422 | while (vf > 1 |
11423 | && loop->header->count > entry_count |
11424 | && loop->header->count < entry_count * vf) |
11425 | { |
11426 | if (dump_file && (dump_flags & TDF_DETAILS)) |
11427 | fprintf (stream: dump_file, |
11428 | format: "Vectorization factor %i seems too large for profile " |
11429 | "prevoiusly believed to be consistent; reducing.\n" , vf); |
11430 | vf /= 2; |
11431 | } |
11432 | |
11433 | if (entry_count.nonzero_p ()) |
11434 | set_edge_probability_and_rescale_others |
11435 | (exit_e, |
11436 | entry_count.probability_in (overall: loop->header->count / vf)); |
11437 | /* Avoid producing very large exit probability when we do not have |
11438 | sensible profile. */ |
11439 | else if (exit_e->probability < profile_probability::always () / (vf * 2)) |
11440 | set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf); |
11441 | loop->latch->count = single_pred_edge (bb: loop->latch)->count (); |
11442 | |
11443 | scale_loop_profile (loop, profile_probability::always () / vf, |
11444 | get_likely_max_loop_iterations_int (loop)); |
11445 | } |
11446 | |
11447 | /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI |
11448 | latch edge values originally defined by it. */ |
11449 | |
static void
maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
				     stmt_vec_info def_stmt_info)
{
  /* Only SSA name definitions can feed PHI backedges.  */
  tree def = gimple_get_lhs (vect_orig_stmt (stmt_info: def_stmt_info)->stmt);
  if (!def || TREE_CODE (def) != SSA_NAME)
    return;
  stmt_vec_info phi_info;
  imm_use_iterator iter;
  use_operand_p use_p;
  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
    {
      gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
      if (!phi)
	continue;
      /* Only consider relevant loop-header PHIs that this loop_vinfo
	 knows about.  */
      if (!(gimple_bb (g: phi)->loop_father->header == gimple_bb (g: phi)
	    && (phi_info = loop_vinfo->lookup_stmt (phi))
	    && STMT_VINFO_RELEVANT_P (phi_info)))
	continue;
      loop_p loop = gimple_bb (g: phi)->loop_father;
      edge e = loop_latch_edge (loop);
      /* DEF must be the value flowing in over the latch edge.  */
      if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
	continue;

      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
	{
	  /* Add the latch arguments to the vectorized PHIs, one per
	     vector copy.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    add_phi_arg (as_a <gphi *> (p: phi_defs[i]),
			 gimple_get_lhs (latch_defs[i]), e,
			 gimple_phi_arg_location (phi, i: e->dest_idx));
	}
      else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
	{
	  /* For first order recurrences we have to update both uses of
	     the latch definition, the one in the PHI node and the one
	     in the generated VEC_PERM_EXPR.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  /* The vectorized PHI is found via the first permute's rhs1.  */
	  tree phidef = gimple_assign_rhs1 (gs: phi_defs[0]);
	  gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    {
	      gassign *perm = as_a <gassign *> (p: phi_defs[i]);
	      /* Later permutes take the previous latch def as their
		 first input.  */
	      if (i > 0)
		gimple_assign_set_rhs1 (gs: perm, rhs: gimple_get_lhs (latch_defs[i-1]));
	      gimple_assign_set_rhs2 (gs: perm, rhs: gimple_get_lhs (latch_defs[i]));
	      update_stmt (s: perm);
	    }
	  add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
		       gimple_phi_arg_location (phi, i: e->dest_idx));
	}
    }
}
11509 | |
11510 | /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI. |
11511 | When vectorizing STMT_INFO as a store, set *SEEN_STORE to its |
11512 | stmt_vec_info. */ |
11513 | |
static bool
vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing statement: %G", stmt_info->stmt);

  /* Debug binds for non-live defs will not survive vectorization;
     kill out-of-loop ones up front.  */
  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
    vect_loop_kill_debug_uses (loop, stmt_info);

  if (!STMT_VINFO_RELEVANT_P (stmt_info)
      && !STMT_VINFO_LIVE_P (stmt_info))
    {
      /* An irrelevant IFN_MASK_CALL is still reported through *SEEN_STORE.
	 NOTE(review): presumably this makes the caller remove the dead
	 call like a replaced store — confirm against vect_transform_loop.  */
      if (is_gimple_call (gs: stmt_info->stmt)
	  && gimple_call_internal_p (gs: stmt_info->stmt, fn: IFN_MASK_CALL))
	{
	  gcc_assert (!gimple_call_lhs (stmt_info->stmt));
	  *seen_store = stmt_info;
	  return false;
	}
      return false;
    }

  if (STMT_VINFO_VECTYPE (stmt_info))
    {
      poly_uint64 nunits
	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
      if (!STMT_SLP_TYPE (stmt_info)
	  && maybe_ne (a: nunits, b: vf)
	  && dump_enabled_p ())
	/* For SLP VF is set according to unrolling factor, and not
	   to vector size, hence for SLP this print is not valid.  */
	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
    }

  /* Pure SLP statements have already been vectorized.  We still need
     to apply loop vectorization to hybrid SLP statements.  */
  if (PURE_SLP_STMT (stmt_info))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");

  /* A true return from vect_transform_stmt indicates a store was
     vectorized; record it for the caller.  */
  if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
    *seen_store = stmt_info;

  return true;
}
11566 | |
11567 | /* Helper function to pass to simplify_replace_tree to enable replacing tree's |
11568 | in the hash_map with its corresponding values. */ |
11569 | |
11570 | static tree |
11571 | find_in_mapping (tree t, void *context) |
11572 | { |
11573 | hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context; |
11574 | |
11575 | tree *value = mapping->get (k: t); |
11576 | return value ? *value : t; |
11577 | } |
11578 | |
/* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
   original loop that has now been vectorized.

   The inits of the data_references need to be advanced with the number of
   iterations of the main loop.  This has been computed in vect_do_peeling and
   is stored in parameter ADVANCE.  We first restore the data_references
   initial offset with the values recorded in ORIG_DRS_INIT.

   Since the loop_vec_info of this EPILOGUE was constructed for the original
   loop, its stmt_vec_infos all point to the original statements.  These need
   to be updated to point to their corresponding copies as well as the SSA_NAMES
   in their PATTERN_DEF_SEQs and RELATED_STMTs.

   The data_reference's connections also need to be updated.  Their
   corresponding dr_vec_info need to be reconnected to the EPILOGUE's
   stmt_vec_infos, their statements need to point to their corresponding copy,
   if they are gather loads or scatter stores then their reference needs to be
   updated to point to its corresponding copy.  */

static void
update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
{
  loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (loop: epilogue);
  auto_vec<gimple *> stmt_worklist;
  hash_map<tree,tree> mapping;
  gimple *orig_stmt, *new_stmt;
  gimple_stmt_iterator epilogue_gsi;
  gphi_iterator epilogue_phi_gsi;
  stmt_vec_info stmt_vinfo = NULL, related_vinfo;
  basic_block *epilogue_bbs = get_loop_body (epilogue);
  unsigned i;

  /* Replace the stale BB array with the epilogue's own blocks.  */
  free (LOOP_VINFO_BBS (epilogue_vinfo));
  LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;

  /* Advance data_reference's with the number of iterations of the previous
     loop and its prologue.  */
  vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);


  /* The EPILOGUE loop is a copy of the original loop so they share the same
     gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
     point to the copied statements.  We also create a mapping of all LHS' in
     the original loop and all the LHS' in the EPILOGUE and create worklists to
     update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
  for (unsigned i = 0; i < epilogue->num_nodes; ++i)
    {
      for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
	   !gsi_end_p (i: epilogue_phi_gsi); gsi_next (i: &epilogue_phi_gsi))
	{
	  new_stmt = epilogue_phi_gsi.phi ();

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (g: new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  mapping.put (k: gimple_phi_result (gs: orig_stmt),
		       v: gimple_phi_result (gs: new_stmt));
	  /* PHI nodes can not have patterns or related statements.  */
	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
	}

      for (epilogue_gsi = gsi_start_bb (bb: epilogue_bbs[i]);
	   !gsi_end_p (i: epilogue_gsi); gsi_next (i: &epilogue_gsi))
	{
	  new_stmt = gsi_stmt (i: epilogue_gsi);
	  if (is_gimple_debug (gs: new_stmt))
	    continue;

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (g: new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
	    mapping.put (k: old_lhs, v: gimple_get_lhs (new_stmt));

	  /* Pattern def sequence statements need their operands remapped
	     later; queue them.  */
	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
	    {
	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
	      for (gimple_stmt_iterator gsi = gsi_start (seq);
		   !gsi_end_p (i: gsi); gsi_next (i: &gsi))
		stmt_worklist.safe_push (obj: gsi_stmt (i: gsi));
	    }

	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
	    {
	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
	      stmt_worklist.safe_push (obj: stmt);
	      /* Set BB such that the assert in
		 'get_initial_def_for_reduction' is able to determine that
		 the BB of the related stmt is inside this loop.  */
	      gimple_set_bb (stmt,
			     gimple_bb (g: new_stmt));
	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
	      gcc_assert (related_vinfo == NULL
			  || related_vinfo == stmt_vinfo);
	    }
	}
    }

  /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
     using the original main loop and thus need to be updated to refer to the
     cloned variables used in the epilogue.  */
  for (unsigned i = 0; i < stmt_worklist.length (); ++i)
    {
      gimple *stmt = stmt_worklist[i];
      tree *new_op;

      for (unsigned j = 1; j < gimple_num_ops (gs: stmt); ++j)
	{
	  tree op = gimple_op (gs: stmt, i: j);
	  if ((new_op = mapping.get(k: op)))
	    gimple_set_op (gs: stmt, i: j, op: *new_op)<![CDATA[;]]>
	  else
	    {
	      /* PR92429: The last argument of simplify_replace_tree disables
	         folding when replacing arguments.  This is required as
	         otherwise you might end up with different statements than the
	         ones analyzed in vect_loop_analyze, leading to different
	         vectorization.  */
	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
					  &find_in_mapping, &mapping, do_fold: false);
	      gimple_set_op (gs: stmt, i: j, op);
	    }
	}
    }

  struct data_reference *dr;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      orig_stmt = DR_STMT (dr);
      gcc_assert (gimple_uid (orig_stmt) > 0);
      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (g: orig_stmt) - 1];
      /* Data references for gather loads and scatter stores do not use the
	 updated offset we set using ADVANCE.  Instead we have to make sure the
	 reference in the data references point to the corresponding copy of
	 the original in the epilogue.  Make sure to update both
	 gather/scatters recognized by dataref analysis and also other
	 refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
      auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_info: stmt_vinfo);
      if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
	  || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
	{
	  DR_REF (dr)
	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	  DR_BASE_ADDRESS (dr)
	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	}
      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
      stmt_vinfo->dr_aux.stmt = stmt_vinfo;
    }

  /* The cached dataref copy is stale now; rebuild it from the updated
     datarefs.  */
  epilogue_vinfo->shared->datarefs_copy.release ();
  epilogue_vinfo->shared->save_datarefs ();
}
11745 | |
/* When vectorizing early break statements instructions that happen before
   the early break in the current BB need to be moved to after the early
   break.  This function deals with that and assumes that any validity
   checks have already been performed.

   While moving the instructions if it encounters a VUSE or VDEF it then
   corrects the VUSES as it moves the statements along.  The statements are
   inserted at the destination block recorded in
   LOOP_VINFO_EARLY_BRK_DEST_BB.  */

static void
move_early_exit_stmts (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("move_early_exit_stmts");

  if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
    return;

  /* Move all stmts that need moving.  */
  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (bb: dest_bb);

  tree last_seen_vuse = NULL_TREE;
  for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
    {
      /* We have to update crossed degenerate virtual PHIs.  Simply
	 elide them.  */
      if (gphi *vphi = dyn_cast <gphi *> (p: stmt))
	{
	  tree vdef = gimple_phi_result (gs: vphi);
	  tree vuse = gimple_phi_arg_def (gs: vphi, index: 0);
	  imm_use_iterator iter;
	  use_operand_p use_p;
	  gimple *use_stmt;
	  /* Redirect all uses of the PHI result to its single incoming
	     virtual operand before removing the PHI.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
		SET_USE (use_p, vuse);
	    }
	  auto gsi = gsi_for_stmt (stmt);
	  remove_phi_node (&gsi, true);
	  last_seen_vuse = vuse;
	  continue;
	}

      /* Check to see if statement is still required for vect or has been
	 elided.  */
      auto stmt_info = loop_vinfo->lookup_stmt (stmt);
      if (!stmt_info)
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);

      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
      gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
      last_seen_vuse = gimple_vuse (g: stmt);
    }

  /* Update all the stmts with their new reaching VUSES.  */
  for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "updating vuse to %T for load %G",
			 last_seen_vuse, p);
      gimple_set_vuse (g: p, vuse: last_seen_vuse);
      update_stmt (s: p);
    }

  /* And update the LC PHIs on exits.  */
  for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
      if (gphi *phi = get_virtual_phi (e->dest))
	SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
}
11821 | |
11822 | /* Function vect_transform_loop. |
11823 | |
11824 | The analysis phase has determined that the loop is vectorizable. |
11825 | Vectorize the loop - created vectorized stmts to replace the scalar |
11826 | stmts in the loop, and update the loop exit condition. |
11827 | Returns scalar epilogue loop if any. */ |
11828 | |
11829 | class loop * |
11830 | vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) |
11831 | { |
11832 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
11833 | class loop *epilogue = NULL; |
11834 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
11835 | int nbbs = loop->num_nodes; |
11836 | int i; |
11837 | tree niters_vector = NULL_TREE; |
11838 | tree step_vector = NULL_TREE; |
11839 | tree niters_vector_mult_vf = NULL_TREE; |
11840 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
11841 | unsigned int lowest_vf = constant_lower_bound (a: vf); |
11842 | gimple *stmt; |
11843 | bool check_profitability = false; |
11844 | unsigned int th; |
11845 | bool flat = maybe_flat_loop_profile (loop); |
11846 | |
11847 | DUMP_VECT_SCOPE ("vec_transform_loop" ); |
11848 | |
11849 | loop_vinfo->shared->check_datarefs (); |
11850 | |
11851 | /* Use the more conservative vectorization threshold. If the number |
11852 | of iterations is constant assume the cost check has been performed |
11853 | by our caller. If the threshold makes all loops profitable that |
11854 | run at least the (estimated) vectorization factor number of times |
11855 | checking is pointless, too. */ |
11856 | th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); |
11857 | if (vect_apply_runtime_profitability_check_p (loop_vinfo)) |
11858 | { |
11859 | if (dump_enabled_p ()) |
11860 | dump_printf_loc (MSG_NOTE, vect_location, |
11861 | "Profitability threshold is %d loop iterations.\n" , |
11862 | th); |
11863 | check_profitability = true; |
11864 | } |
11865 | |
11866 | /* Make sure there exists a single-predecessor exit bb. Do this before |
11867 | versioning. */ |
11868 | edge e = LOOP_VINFO_IV_EXIT (loop_vinfo); |
11869 | if (! single_pred_p (bb: e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
11870 | { |
11871 | split_loop_exit_edge (e, true); |
11872 | if (dump_enabled_p ()) |
11873 | dump_printf (MSG_NOTE, "split exit edge\n" ); |
11874 | } |
11875 | |
11876 | /* Version the loop first, if required, so the profitability check |
11877 | comes first. */ |
11878 | |
11879 | if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
11880 | { |
11881 | class loop *sloop |
11882 | = vect_loop_versioning (loop_vinfo, loop_vectorized_call); |
11883 | sloop->force_vectorize = false; |
11884 | check_profitability = false; |
11885 | } |
11886 | |
11887 | /* Make sure there exists a single-predecessor exit bb also on the |
11888 | scalar loop copy. Do this after versioning but before peeling |
11889 | so CFG structure is fine for both scalar and if-converted loop |
11890 | to make slpeel_duplicate_current_defs_from_edges face matched |
11891 | loop closed PHI nodes on the exit. */ |
11892 | if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) |
11893 | { |
11894 | e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo); |
11895 | if (! single_pred_p (bb: e->dest)) |
11896 | { |
11897 | split_loop_exit_edge (e, true); |
11898 | if (dump_enabled_p ()) |
11899 | dump_printf (MSG_NOTE, "split exit edge of scalar loop\n" ); |
11900 | } |
11901 | } |
11902 | |
11903 | tree niters = vect_build_loop_niters (loop_vinfo); |
11904 | LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; |
11905 | tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); |
11906 | bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); |
11907 | tree advance; |
11908 | drs_init_vec orig_drs_init; |
11909 | |
11910 | epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, |
11911 | &step_vector, &niters_vector_mult_vf, th, |
11912 | check_profitability, niters_no_overflow, |
11913 | &advance); |
11914 | if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo) |
11915 | && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ()) |
11916 | { |
11917 | /* Ifcvt duplicates loop preheader, loop body and produces an basic |
11918 | block after loop exit. We need to scale all that. */ |
11919 | basic_block |
11920 | = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src; |
11921 | preheader->count |
11922 | = preheader->count.apply_probability |
11923 | (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); |
11924 | scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo), |
11925 | LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); |
11926 | LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count; |
11927 | } |
11928 | |
11929 | if (niters_vector == NULL_TREE) |
11930 | { |
11931 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
11932 | && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
11933 | && known_eq (lowest_vf, vf)) |
11934 | { |
11935 | niters_vector |
11936 | = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), |
11937 | LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf); |
11938 | step_vector = build_one_cst (TREE_TYPE (niters)); |
11939 | } |
11940 | else if (vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
11941 | vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector, |
11942 | &step_vector, niters_no_overflow); |
11943 | else |
11944 | /* vect_do_peeling subtracted the number of peeled prologue |
11945 | iterations from LOOP_VINFO_NITERS. */ |
11946 | vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo), |
11947 | &niters_vector, &step_vector, |
11948 | niters_no_overflow); |
11949 | } |
11950 | |
11951 | /* 1) Make sure the loop header has exactly two entries |
11952 | 2) Make sure we have a preheader basic block. */ |
11953 | |
11954 | gcc_assert (EDGE_COUNT (loop->header->preds) == 2); |
11955 | |
11956 | split_edge (loop_preheader_edge (loop)); |
11957 | |
11958 | if (vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
11959 | /* This will deal with any possible peeling. */ |
11960 | vect_prepare_for_masked_peels (loop_vinfo); |
11961 | |
11962 | /* Handle any code motion that we need to for early-break vectorization after |
11963 | we've done peeling but just before we start vectorizing. */ |
11964 | if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
11965 | move_early_exit_stmts (loop_vinfo); |
11966 | |
11967 | /* Schedule the SLP instances first, then handle loop vectorization |
11968 | below. */ |
11969 | if (!loop_vinfo->slp_instances.is_empty ()) |
11970 | { |
11971 | DUMP_VECT_SCOPE ("scheduling SLP instances" ); |
11972 | vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); |
11973 | } |
11974 | |
11975 | /* FORNOW: the vectorizer supports only loops which body consist |
11976 | of one basic block (header + empty latch). When the vectorizer will |
11977 | support more involved loop forms, the order by which the BBs are |
11978 | traversed need to be reconsidered. */ |
11979 | |
11980 | for (i = 0; i < nbbs; i++) |
11981 | { |
11982 | basic_block bb = bbs[i]; |
11983 | stmt_vec_info stmt_info; |
11984 | |
11985 | for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si); |
11986 | gsi_next (i: &si)) |
11987 | { |
11988 | gphi *phi = si.phi (); |
11989 | if (dump_enabled_p ()) |
11990 | dump_printf_loc (MSG_NOTE, vect_location, |
11991 | "------>vectorizing phi: %G" , (gimple *) phi); |
11992 | stmt_info = loop_vinfo->lookup_stmt (phi); |
11993 | if (!stmt_info) |
11994 | continue; |
11995 | |
11996 | if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) |
11997 | vect_loop_kill_debug_uses (loop, stmt_info); |
11998 | |
11999 | if (!STMT_VINFO_RELEVANT_P (stmt_info) |
12000 | && !STMT_VINFO_LIVE_P (stmt_info)) |
12001 | continue; |
12002 | |
12003 | if (STMT_VINFO_VECTYPE (stmt_info) |
12004 | && (maybe_ne |
12005 | (a: TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), b: vf)) |
12006 | && dump_enabled_p ()) |
12007 | dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n" ); |
12008 | |
12009 | if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def |
12010 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
12011 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def |
12012 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle |
12013 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence |
12014 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def) |
12015 | && ! PURE_SLP_STMT (stmt_info)) |
12016 | { |
12017 | if (dump_enabled_p ()) |
12018 | dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n" ); |
12019 | vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL); |
12020 | } |
12021 | } |
12022 | |
12023 | for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si); |
12024 | gsi_next (i: &si)) |
12025 | { |
12026 | gphi *phi = si.phi (); |
12027 | stmt_info = loop_vinfo->lookup_stmt (phi); |
12028 | if (!stmt_info) |
12029 | continue; |
12030 | |
12031 | if (!STMT_VINFO_RELEVANT_P (stmt_info) |
12032 | && !STMT_VINFO_LIVE_P (stmt_info)) |
12033 | continue; |
12034 | |
12035 | if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def |
12036 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
12037 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def |
12038 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle |
12039 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def |
12040 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence) |
12041 | && ! PURE_SLP_STMT (stmt_info)) |
12042 | maybe_set_vectorized_backedge_value (loop_vinfo, def_stmt_info: stmt_info); |
12043 | } |
12044 | |
12045 | for (gimple_stmt_iterator si = gsi_start_bb (bb); |
12046 | !gsi_end_p (i: si);) |
12047 | { |
12048 | stmt = gsi_stmt (i: si); |
12049 | /* During vectorization remove existing clobber stmts. */ |
12050 | if (gimple_clobber_p (s: stmt)) |
12051 | { |
12052 | unlink_stmt_vdef (stmt); |
12053 | gsi_remove (&si, true); |
12054 | release_defs (stmt); |
12055 | } |
12056 | else |
12057 | { |
12058 | /* Ignore vector stmts created in the outer loop. */ |
12059 | stmt_info = loop_vinfo->lookup_stmt (stmt); |
12060 | |
12061 | /* vector stmts created in the outer-loop during vectorization of |
12062 | stmts in an inner-loop may not have a stmt_info, and do not |
12063 | need to be vectorized. */ |
12064 | stmt_vec_info seen_store = NULL; |
12065 | if (stmt_info) |
12066 | { |
12067 | if (STMT_VINFO_IN_PATTERN_P (stmt_info)) |
12068 | { |
12069 | gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); |
12070 | for (gimple_stmt_iterator subsi = gsi_start (seq&: def_seq); |
12071 | !gsi_end_p (i: subsi); gsi_next (i: &subsi)) |
12072 | { |
12073 | stmt_vec_info pat_stmt_info |
12074 | = loop_vinfo->lookup_stmt (gsi_stmt (i: subsi)); |
12075 | vect_transform_loop_stmt (loop_vinfo, stmt_info: pat_stmt_info, |
12076 | gsi: &si, seen_store: &seen_store); |
12077 | } |
12078 | stmt_vec_info pat_stmt_info |
12079 | = STMT_VINFO_RELATED_STMT (stmt_info); |
12080 | if (vect_transform_loop_stmt (loop_vinfo, stmt_info: pat_stmt_info, |
12081 | gsi: &si, seen_store: &seen_store)) |
12082 | maybe_set_vectorized_backedge_value (loop_vinfo, |
12083 | def_stmt_info: pat_stmt_info); |
12084 | } |
12085 | else |
12086 | { |
12087 | if (vect_transform_loop_stmt (loop_vinfo, stmt_info, gsi: &si, |
12088 | seen_store: &seen_store)) |
12089 | maybe_set_vectorized_backedge_value (loop_vinfo, |
12090 | def_stmt_info: stmt_info); |
12091 | } |
12092 | } |
12093 | gsi_next (i: &si); |
12094 | if (seen_store) |
12095 | { |
12096 | if (STMT_VINFO_GROUPED_ACCESS (seen_store)) |
12097 | /* Interleaving. If IS_STORE is TRUE, the |
12098 | vectorization of the interleaving chain was |
12099 | completed - free all the stores in the chain. */ |
12100 | vect_remove_stores (loop_vinfo, |
12101 | DR_GROUP_FIRST_ELEMENT (seen_store)); |
12102 | else |
12103 | /* Free the attached stmt_vec_info and remove the stmt. */ |
12104 | loop_vinfo->remove_stmt (stmt_info); |
12105 | } |
12106 | } |
12107 | } |
12108 | |
12109 | /* Stub out scalar statements that must not survive vectorization. |
12110 | Doing this here helps with grouped statements, or statements that |
12111 | are involved in patterns. */ |
12112 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb); |
12113 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
12114 | { |
12115 | gcall *call = dyn_cast <gcall *> (p: gsi_stmt (i: gsi)); |
12116 | if (!call || !gimple_call_internal_p (gs: call)) |
12117 | continue; |
12118 | internal_fn ifn = gimple_call_internal_fn (gs: call); |
12119 | if (ifn == IFN_MASK_LOAD) |
12120 | { |
12121 | tree lhs = gimple_get_lhs (call); |
12122 | if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) |
12123 | { |
12124 | tree zero = build_zero_cst (TREE_TYPE (lhs)); |
12125 | gimple *new_stmt = gimple_build_assign (lhs, zero); |
12126 | gsi_replace (&gsi, new_stmt, true); |
12127 | } |
12128 | } |
12129 | else if (conditional_internal_fn_code (ifn) != ERROR_MARK) |
12130 | { |
12131 | tree lhs = gimple_get_lhs (call); |
12132 | if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) |
12133 | { |
12134 | tree else_arg |
12135 | = gimple_call_arg (gs: call, index: gimple_call_num_args (gs: call) - 1); |
12136 | gimple *new_stmt = gimple_build_assign (lhs, else_arg); |
12137 | gsi_replace (&gsi, new_stmt, true); |
12138 | } |
12139 | } |
12140 | } |
12141 | } /* BBs in loop */ |
12142 | |
12143 | /* The vectorization factor is always > 1, so if we use an IV increment of 1. |
12144 | a zero NITERS becomes a nonzero NITERS_VECTOR. */ |
12145 | if (integer_onep (step_vector)) |
12146 | niters_no_overflow = true; |
12147 | vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo, |
12148 | niters_vector, step_vector, niters_vector_mult_vf, |
12149 | !niters_no_overflow); |
12150 | |
12151 | unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
12152 | |
12153 | /* True if the final iteration might not handle a full vector's |
12154 | worth of scalar iterations. */ |
12155 | bool final_iter_may_be_partial |
12156 | = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
12157 | || LOOP_VINFO_EARLY_BREAKS (loop_vinfo); |
12158 | |
12159 | /* +1 to convert latch counts to loop iteration counts. */ |
12160 | int bias_for_lowest = 1; |
12161 | |
12162 | /* When we are peeling for gaps then we take away one scalar iteration |
12163 | from the vector loop. Thus we can adjust the upper bound by one |
12164 | scalar iteration. But only when we know the bound applies to the |
12165 | IV exit test which might not be true when we have multiple exits. */ |
12166 | if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
12167 | bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0; |
12168 | |
12169 | int bias_for_assumed = bias_for_lowest; |
12170 | int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
12171 | if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
12172 | { |
12173 | /* When the amount of peeling is known at compile time, the first |
12174 | iteration will have exactly alignment_npeels active elements. |
12175 | In the worst case it will have at least one. */ |
12176 | int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1); |
12177 | bias_for_lowest += lowest_vf - min_first_active; |
12178 | bias_for_assumed += assumed_vf - min_first_active; |
12179 | } |
12180 | /* In these calculations the "- 1" converts loop iteration counts |
12181 | back to latch counts. */ |
12182 | if (loop->any_upper_bound) |
12183 | { |
12184 | loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); |
12185 | loop->nb_iterations_upper_bound |
12186 | = (final_iter_may_be_partial |
12187 | ? wi::udiv_ceil (x: loop->nb_iterations_upper_bound + bias_for_lowest, |
12188 | y: lowest_vf) - 1 |
12189 | : wi::udiv_floor (x: loop->nb_iterations_upper_bound + bias_for_lowest, |
12190 | y: lowest_vf) - 1); |
12191 | if (main_vinfo |
12192 | /* Both peeling for alignment and peeling for gaps can end up |
12193 | with the scalar epilogue running for more than VF-1 iterations. */ |
12194 | && !main_vinfo->peeling_for_alignment |
12195 | && !main_vinfo->peeling_for_gaps) |
12196 | { |
12197 | unsigned int bound; |
12198 | poly_uint64 main_iters |
12199 | = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo), |
12200 | LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo)); |
12201 | main_iters |
12202 | = upper_bound (a: main_iters, |
12203 | LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo)); |
12204 | if (can_div_away_from_zero_p (a: main_iters, |
12205 | LOOP_VINFO_VECT_FACTOR (loop_vinfo), |
12206 | quotient: &bound)) |
12207 | loop->nb_iterations_upper_bound |
12208 | = wi::umin (x: (bound_wide_int) (bound - 1), |
12209 | y: loop->nb_iterations_upper_bound); |
12210 | } |
12211 | } |
12212 | if (loop->any_likely_upper_bound) |
12213 | loop->nb_iterations_likely_upper_bound |
12214 | = (final_iter_may_be_partial |
12215 | ? wi::udiv_ceil (x: loop->nb_iterations_likely_upper_bound |
12216 | + bias_for_lowest, y: lowest_vf) - 1 |
12217 | : wi::udiv_floor (x: loop->nb_iterations_likely_upper_bound |
12218 | + bias_for_lowest, y: lowest_vf) - 1); |
12219 | if (loop->any_estimate) |
12220 | loop->nb_iterations_estimate |
12221 | = (final_iter_may_be_partial |
12222 | ? wi::udiv_ceil (x: loop->nb_iterations_estimate + bias_for_assumed, |
12223 | y: assumed_vf) - 1 |
12224 | : wi::udiv_floor (x: loop->nb_iterations_estimate + bias_for_assumed, |
12225 | y: assumed_vf) - 1); |
12226 | scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), |
12227 | vf: assumed_vf, flat); |
12228 | |
12229 | if (dump_enabled_p ()) |
12230 | { |
12231 | if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) |
12232 | { |
12233 | dump_printf_loc (MSG_NOTE, vect_location, |
12234 | "LOOP VECTORIZED\n" ); |
12235 | if (loop->inner) |
12236 | dump_printf_loc (MSG_NOTE, vect_location, |
12237 | "OUTER LOOP VECTORIZED\n" ); |
12238 | dump_printf (MSG_NOTE, "\n" ); |
12239 | } |
12240 | else |
12241 | dump_printf_loc (MSG_NOTE, vect_location, |
12242 | "LOOP EPILOGUE VECTORIZED (MODE=%s)\n" , |
12243 | GET_MODE_NAME (loop_vinfo->vector_mode)); |
12244 | } |
12245 | |
12246 | /* Loops vectorized with a variable factor won't benefit from |
12247 | unrolling/peeling. */ |
12248 | if (!vf.is_constant ()) |
12249 | { |
12250 | loop->unroll = 1; |
12251 | if (dump_enabled_p ()) |
12252 | dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to" |
12253 | " variable-length vectorization factor\n" ); |
12254 | } |
12255 | /* Free SLP instances here because otherwise stmt reference counting |
12256 | won't work. */ |
12257 | slp_instance instance; |
12258 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) |
12259 | vect_free_slp_instance (instance); |
12260 | LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); |
12261 | /* Clear-up safelen field since its value is invalid after vectorization |
12262 | since vectorized loop can have loop-carried dependencies. */ |
12263 | loop->safelen = 0; |
12264 | |
12265 | if (epilogue) |
12266 | { |
12267 | update_epilogue_loop_vinfo (epilogue, advance); |
12268 | |
12269 | epilogue->simduid = loop->simduid; |
12270 | epilogue->force_vectorize = loop->force_vectorize; |
12271 | epilogue->dont_vectorize = false; |
12272 | } |
12273 | |
12274 | return epilogue; |
12275 | } |
12276 | |
12277 | /* The code below is trying to perform simple optimization - revert |
12278 | if-conversion for masked stores, i.e. if the mask of a store is zero |
12279 | do not perform it and all stored value producers also if possible. |
12280 | For example, |
12281 | for (i=0; i<n; i++) |
12282 | if (c[i]) |
12283 | { |
12284 | p1[i] += 1; |
12285 | p2[i] = p3[i] +2; |
12286 | } |
12287 | this transformation will produce the following semi-hammock: |
12288 | |
12289 | if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 }) |
12290 | { |
12291 | vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165); |
12292 | vect__12.22_172 = vect__11.19_170 + vect_cst__171; |
12293 | MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172); |
12294 | vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165); |
12295 | vect__19.28_184 = vect__18.25_182 + vect_cst__183; |
12296 | MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184); |
12297 | } |
12298 | */ |
12299 | |
void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi);
	   gsi_next (i: &gsi))
	{
	  stmt = gsi_stmt (i: gsi);
	  if (gimple_call_internal_p (gs: stmt, fn: IFN_MASK_STORE))
	    worklist.safe_push (obj: stmt);
	}
    }

  free (ptr: bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  Process each one: guard it (and, where
     possible, the statements producing its stored value) by a test of
     the mask against zero.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (gs: last, index: 2);
      bb = gimple_bb (g: last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::likely ();
      e->probability = efalse->probability.invert ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores." ,
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
	 */
      vdef = gimple_vdef (g: last);
      new_vdef = make_ssa_name (var: gimple_vop (cfun), stmt: last);
      gimple_set_vdef (g: last, vdef: new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (i: &gsi);
	  gsi_to = gsi_start_bb (bb: store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (bb: store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G" , last);
	  /* Move all stored value producers if possible.  Walk backwards
	     from the store; each producer is only moved when every
	     non-debug use is already inside STORE_BB.  */
	  while (!gsi_end_p (i: gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gs: gsi_stmt (i: gsi)))
		{
		  gsi_prev (i: &gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (i: gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (g: stmt1)
		  || gimple_has_volatile_ops (stmt: stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (i: &gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (var: lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (gs: use_stmt))
		    continue;
		  if (gimple_bb (g: use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      /* A producer reading memory must read the same memory state
		 as the store we are sinking, otherwise moving it past
		 intervening stores would be wrong.  */
	      if (gimple_vuse (g: stmt1)
		  && gimple_vuse (g: stmt1) != gimple_vuse (g: last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G" , stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (i: &gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.
	     Only chain the next worklist entry when it is the stmt we
	     stopped at, i.e. immediately adjacent, with the same mask.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (gs: worklist.last (), index: 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      /* Close the virtual-operand PHI: the value on the skip edge is the
	 memory state before the first sunk store.  */
      add_phi_arg (phi, gimple_vuse (g: last_store), e, UNKNOWN_LOCATION);
    }
}
12487 | |
12488 | /* Decide whether it is possible to use a zero-based induction variable |
12489 | when vectorizing LOOP_VINFO with partial vectors. If it is, return |
12490 | the value that the induction variable must be able to hold in order |
12491 | to ensure that the rgroups eventually have no active vector elements. |
12492 | Return -1 otherwise. */ |
12493 | |
12494 | widest_int |
12495 | vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo) |
12496 | { |
12497 | tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); |
12498 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
12499 | unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo); |
12500 | |
12501 | /* Calculate the value that the induction variable must be able |
12502 | to hit in order to ensure that we end the loop with an all-false mask. |
12503 | This involves adding the maximum number of inactive trailing scalar |
12504 | iterations. */ |
12505 | widest_int iv_limit = -1; |
12506 | if (max_loop_iterations (loop, &iv_limit)) |
12507 | { |
12508 | if (niters_skip) |
12509 | { |
12510 | /* Add the maximum number of skipped iterations to the |
12511 | maximum iteration count. */ |
12512 | if (TREE_CODE (niters_skip) == INTEGER_CST) |
12513 | iv_limit += wi::to_widest (t: niters_skip); |
12514 | else |
12515 | iv_limit += max_vf - 1; |
12516 | } |
12517 | else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) |
12518 | /* Make a conservatively-correct assumption. */ |
12519 | iv_limit += max_vf - 1; |
12520 | |
12521 | /* IV_LIMIT is the maximum number of latch iterations, which is also |
12522 | the maximum in-range IV value. Round this value down to the previous |
12523 | vector alignment boundary and then add an extra full iteration. */ |
12524 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
12525 | iv_limit = (iv_limit & -(int) known_alignment (a: vf)) + max_vf; |
12526 | } |
12527 | return iv_limit; |
12528 | } |
12529 | |
12530 | /* For the given rgroup_controls RGC, check whether an induction variable |
12531 | would ever hit a value that produces a set of all-false masks or zero |
12532 | lengths before wrapping around. Return true if it's possible to wrap |
12533 | around before hitting the desirable value, otherwise return false. */ |
12534 | |
12535 | bool |
12536 | vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc) |
12537 | { |
12538 | widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo); |
12539 | |
12540 | if (iv_limit == -1) |
12541 | return true; |
12542 | |
12543 | tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); |
12544 | unsigned int compare_precision = TYPE_PRECISION (compare_type); |
12545 | unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor; |
12546 | |
12547 | if (wi::min_precision (x: iv_limit * nitems, sgn: UNSIGNED) > compare_precision) |
12548 | return true; |
12549 | |
12550 | return false; |
12551 | } |
12552 | |