1 | /* Data References Analysis and Manipulation Utilities for Vectorization. |
2 | Copyright (C) 2003-2024 Free Software Foundation, Inc. |
3 | Contributed by Dorit Naishlos <dorit@il.ibm.com> |
4 | and Ira Rosen <irar@il.ibm.com> |
5 | |
6 | This file is part of GCC. |
7 | |
8 | GCC is free software; you can redistribute it and/or modify it under |
9 | the terms of the GNU General Public License as published by the Free |
10 | Software Foundation; either version 3, or (at your option) any later |
11 | version. |
12 | |
13 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or |
15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
16 | for more details. |
17 | |
18 | You should have received a copy of the GNU General Public License |
19 | along with GCC; see the file COPYING3. If not see |
20 | <http://www.gnu.org/licenses/>. */ |
21 | |
22 | #include "config.h" |
23 | #include "system.h" |
24 | #include "coretypes.h" |
25 | #include "backend.h" |
26 | #include "target.h" |
27 | #include "rtl.h" |
28 | #include "tree.h" |
29 | #include "gimple.h" |
30 | #include "predict.h" |
31 | #include "memmodel.h" |
32 | #include "tm_p.h" |
33 | #include "ssa.h" |
34 | #include "optabs-tree.h" |
35 | #include "cgraph.h" |
36 | #include "dumpfile.h" |
37 | #include "alias.h" |
38 | #include "fold-const.h" |
39 | #include "stor-layout.h" |
40 | #include "tree-eh.h" |
41 | #include "gimplify.h" |
42 | #include "gimple-iterator.h" |
43 | #include "gimplify-me.h" |
44 | #include "tree-ssa-loop-ivopts.h" |
45 | #include "tree-ssa-loop-manip.h" |
46 | #include "tree-ssa-loop.h" |
47 | #include "cfgloop.h" |
48 | #include "tree-scalar-evolution.h" |
49 | #include "tree-vectorizer.h" |
50 | #include "expr.h" |
51 | #include "builtins.h" |
52 | #include "tree-cfg.h" |
53 | #include "tree-hash-traits.h" |
54 | #include "vec-perm-indices.h" |
55 | #include "internal-fn.h" |
56 | #include "gimple-fold.h" |
57 | |
58 | /* Return true if load- or store-lanes optab OPTAB is implemented for |
59 | COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ |
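/* As an illustrative sketch (the exact array modes and instructions are
   target specific): for COUNT == 2 and a V4SI VECTYPE this asks whether the
   target provides an array mode holding two V4SI vectors and a matching
   load-lanes/store-lanes optab entry converting between that array mode and
   V4SI; if not, the group cannot use the lanes-based access scheme.  */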
60 | |
61 | static bool |
62 | vect_lanes_optab_supported_p (const char *name, convert_optab optab, |
63 | tree vectype, unsigned HOST_WIDE_INT count) |
64 | { |
65 | machine_mode mode, array_mode; |
66 | bool limit_p; |
67 | |
68 | mode = TYPE_MODE (vectype); |
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 | { |
75 | if (dump_enabled_p ()) |
76 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
77 | "no array mode for %s[%wu]\n" , |
78 | GET_MODE_NAME (mode), count); |
79 | return false; |
80 | } |
81 | } |
82 | |
  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 | { |
85 | if (dump_enabled_p ()) |
86 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
87 | "cannot use %s<%s><%s>\n" , name, |
88 | GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); |
89 | return false; |
90 | } |
91 | |
92 | if (dump_enabled_p ()) |
93 | dump_printf_loc (MSG_NOTE, vect_location, |
94 | "can use %s<%s><%s>\n" , name, GET_MODE_NAME (array_mode), |
95 | GET_MODE_NAME (mode)); |
96 | |
97 | return true; |
98 | } |
99 | |
100 | /* Helper function to identify a simd clone call. If this is a call to a |
101 | function with simd clones then return the corresponding cgraph_node, |
102 | otherwise return NULL. */ |
103 | |
104 | static cgraph_node* |
105 | simd_clone_call_p (gimple *stmt) |
106 | { |
  gcall *call = dyn_cast <gcall *> (stmt);
108 | if (!call) |
109 | return NULL; |
110 | |
111 | tree fndecl = NULL_TREE; |
  if (gimple_call_internal_p (call, IFN_MASK_CALL))
    fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
  else
    fndecl = gimple_call_fndecl (stmt);
116 | |
117 | if (fndecl == NULL_TREE) |
118 | return NULL; |
119 | |
  cgraph_node *node = cgraph_node::get (fndecl);
121 | if (node && node->simd_clones != NULL) |
122 | return node; |
123 | |
124 | return NULL; |
125 | } |
126 | |
127 | |
128 | |
129 | /* Return the smallest scalar part of STMT_INFO. |
130 | This is used to determine the vectype of the stmt. We generally set the |
131 | vectype according to the type of the result (lhs). For stmts whose |
132 | result-type is different than the type of the arguments (e.g., demotion, |
133 | promotion), vectype will be reset appropriately (later). Note that we have |
134 | to visit the smallest datatype in this function, because that determines the |
135 | VF. If the smallest datatype in the loop is present only as the rhs of a |
136 | promotion operation - we'd miss it. |
137 | Such a case, where a variable of this datatype does not appear in the lhs |
138 | anywhere in the loop, can only occur if it's an invariant: e.g.: |
139 | 'int_x = (int) short_inv', which we'd expect to have been optimized away by |
140 | invariant motion. However, we cannot rely on invariant motion to always |
141 | take invariants out of the loop, and so in the case of promotion we also |
142 | have to check the rhs. |
   SCALAR_TYPE is the caller's current candidate (e.g. the type of the data
   reference); the function returns the smallest scalar type found among it
   and the statement's operands.  */
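/* A small example of why the rhs matters: for 'int_y = (int) short_x' the
   lhs is 4 bytes but the rhs is 2 bytes, so SHORT is the smallest scalar
   type in the statement and (for a fixed vector size) it, not INT, is what
   determines the vectorization factor.  */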
145 | |
146 | tree |
147 | vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type) |
148 | { |
149 | HOST_WIDE_INT lhs, rhs; |
150 | |
151 | /* During the analysis phase, this function is called on arbitrary |
152 | statements that might not have scalar results. */ |
153 | if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type))) |
154 | return scalar_type; |
155 | |
156 | lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); |
157 | |
  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 | if (assign) |
160 | { |
161 | scalar_type = TREE_TYPE (gimple_assign_lhs (assign)); |
      if (gimple_assign_cast_p (assign)
          || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
168 | { |
169 | tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign)); |
170 | |
171 | rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); |
172 | if (rhs < lhs) |
173 | scalar_type = rhs_type; |
174 | } |
175 | } |
  else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
177 | { |
178 | auto clone = node->simd_clones->simdclone; |
179 | for (unsigned int i = 0; i < clone->nargs; ++i) |
180 | { |
181 | if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR) |
182 | { |
183 | tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type); |
184 | rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type)); |
185 | if (rhs < lhs) |
186 | { |
187 | scalar_type = arg_scalar_type; |
188 | lhs = rhs; |
189 | } |
190 | } |
191 | } |
192 | } |
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
194 | { |
195 | unsigned int i = 0; |
      if (gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
199 | if (internal_load_fn_p (ifn)) |
200 | /* For loads the LHS type does the trick. */ |
201 | i = ~0U; |
202 | else if (internal_store_fn_p (ifn)) |
203 | { |
            /* For stores use the type of the stored value.  */
205 | i = internal_fn_stored_value_index (ifn); |
206 | scalar_type = TREE_TYPE (gimple_call_arg (call, i)); |
207 | i = ~0U; |
208 | } |
209 | else if (internal_fn_mask_index (ifn) == 0) |
210 | i = 1; |
211 | } |
      if (i < gimple_call_num_args (call))
213 | { |
214 | tree rhs_type = TREE_TYPE (gimple_call_arg (call, i)); |
215 | if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type))) |
216 | { |
217 | rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); |
218 | if (rhs < lhs) |
219 | scalar_type = rhs_type; |
220 | } |
221 | } |
222 | } |
223 | |
224 | return scalar_type; |
225 | } |
226 | |
227 | |
228 | /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be |
229 | tested at run-time. Return TRUE if DDR was successfully inserted. |
230 | Return false if versioning is not supported. */ |
231 | |
232 | static opt_result |
233 | vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) |
234 | { |
235 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
236 | |
237 | if ((unsigned) param_vect_max_version_for_alias_checks == 0) |
    return opt_result::failure_at (vect_location,
                                   "will not create alias checks, as"
                                   " --param vect-max-version-for-alias-checks"
                                   " == 0\n");
242 | |
243 | opt_result res |
244 | = runtime_alias_check_p (ddr, loop, |
245 | optimize_loop_nest_for_speed_p (loop)); |
246 | if (!res) |
247 | return res; |
248 | |
  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 | return opt_result::success (); |
251 | } |
252 | |
253 | /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */ |
254 | |
255 | static void |
256 | vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value) |
257 | { |
258 | const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); |
259 | for (unsigned int i = 0; i < checks.length(); ++i) |
260 | if (checks[i] == value) |
261 | return; |
262 | |
263 | if (dump_enabled_p ()) |
264 | dump_printf_loc (MSG_NOTE, vect_location, |
265 | "need run-time check that %T is nonzero\n" , |
266 | value); |
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
268 | } |
269 | |
270 | /* Return true if we know that the order of vectorized DR_INFO_A and |
271 | vectorized DR_INFO_B will be the same as the order of DR_INFO_A and |
272 | DR_INFO_B. At least one of the accesses is a write. */ |
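/* Concretely (a sketch): given
     ... = a[i];     <-- load group member 1
     a[i] = ...;     <-- single store
     ... = a[i+1];   <-- load group member 2
   the vectorized group load is emitted at the position of the first scalar
   load, i.e. before the store.  For member 1 that matches the scalar order,
   so the function returns true; for member 2, which scalar-wise follows the
   store, the vectorized order is reversed and the function returns false.  */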
273 | |
274 | static bool |
275 | vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b) |
276 | { |
277 | stmt_vec_info stmtinfo_a = dr_info_a->stmt; |
278 | stmt_vec_info stmtinfo_b = dr_info_b->stmt; |
279 | |
280 | /* Single statements are always kept in their original order. */ |
281 | if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) |
282 | && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) |
283 | return true; |
284 | |
285 | /* If there is a loop invariant read involved we might vectorize it in |
     the prologue, breaking scalar order with respect to the in-loop store.  */
287 | if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr))) |
288 | || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr)))) |
289 | return false; |
290 | |
291 | /* STMT_A and STMT_B belong to overlapping groups. All loads are |
292 | emitted at the position of the first scalar load. |
293 | Stores in a group are emitted at the position of the last scalar store. |
294 | Compute that position and check whether the resulting order matches |
295 | the current one. */ |
296 | stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a); |
297 | if (il_a) |
298 | { |
299 | if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a))) |
300 | for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s; |
301 | s = DR_GROUP_NEXT_ELEMENT (s)) |
          il_a = get_later_stmt (il_a, s);
      else /* DR_IS_READ */
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          if (get_later_stmt (il_a, s) == il_a)
307 | il_a = s; |
308 | } |
309 | else |
310 | il_a = stmtinfo_a; |
311 | stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b); |
312 | if (il_b) |
313 | { |
314 | if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b))) |
315 | for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s; |
316 | s = DR_GROUP_NEXT_ELEMENT (s)) |
          il_b = get_later_stmt (il_b, s);
      else /* DR_IS_READ */
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          if (get_later_stmt (il_b, s) == il_b)
322 | il_b = s; |
323 | } |
324 | else |
325 | il_b = stmtinfo_b; |
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
328 | } |
329 | |
330 | /* A subroutine of vect_analyze_data_ref_dependence. Handle |
331 | DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence |
332 | distances. These distances are conservatively correct but they don't |
333 | reflect a guaranteed dependence. |
334 | |
335 | Return true if this function does all the work necessary to avoid |
336 | an alias or false if the caller should use the dependence distances |
337 | to limit the vectorization factor in the usual way. LOOP_DEPTH is |
338 | the depth of the loop described by LOOP_VINFO and the other arguments |
339 | are as for vect_analyze_data_ref_dependence. */ |
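/* For instance (a sketch), with a conservative dependence distance of 4 but a
   user assertion such as '#pragma omp simd safelen(16)' on the loop, the code
   below treats up to 16 consecutive iterations as independent instead of
   clamping the vectorization factor to 4; otherwise it prefers to emit a
   runtime alias check rather than limit the VF.  */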
340 | |
341 | static bool |
342 | vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr, |
343 | loop_vec_info loop_vinfo, |
344 | int loop_depth, unsigned int *max_vf) |
345 | { |
346 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
347 | for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr)) |
348 | { |
349 | int dist = dist_v[loop_depth]; |
350 | if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr))) |
351 | { |
352 | /* If the user asserted safelen >= DIST consecutive iterations |
353 | can be executed concurrently, assume independence. |
354 | |
355 | ??? An alternative would be to add the alias check even |
356 | in this case, and vectorize the fallback loop with the |
357 | maximum VF set to safelen. However, if the user has |
358 | explicitly given a length, it's less likely that that |
359 | would be a win. */ |
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
361 | { |
362 | if ((unsigned int) loop->safelen < *max_vf) |
363 | *max_vf = loop->safelen; |
364 | LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; |
365 | continue; |
366 | } |
367 | |
368 | /* For dependence distances of 2 or more, we have the option |
369 | of limiting VF or checking for an alias at runtime. |
370 | Prefer to check at runtime if we can, to avoid limiting |
371 | the VF unnecessarily when the bases are in fact independent. |
372 | |
373 | Note that the alias checks will be removed if the VF ends up |
374 | being small enough. */ |
375 | dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr)); |
376 | dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); |
377 | return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt) |
378 | && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt) |
379 | && vect_mark_for_runtime_alias_test (ddr, loop_vinfo)); |
380 | } |
381 | } |
382 | return true; |
383 | } |
384 | |
385 | |
386 | /* Function vect_analyze_data_ref_dependence. |
387 | |
388 | FIXME: I needed to change the sense of the returned flag. |
389 | |
390 | Return FALSE if there (might) exist a dependence between a memory-reference |
391 | DRA and a memory-reference DRB. When versioning for alias may check a |
392 | dependence at run-time, return TRUE. Adjust *MAX_VF according to |
393 | the data dependence. */ |
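/* As a worked example (a sketch): for a loop containing 'a[i+3] = a[i] + 1'
   the dependence distance is 3.  Vectorizing with VF 4 would load a[i..i+3]
   before storing a[i+3..i+6], so the read of a[i+3] would see a stale value;
   the code below therefore clamps *MAX_VF to 3 for such distances.  */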
394 | |
395 | static opt_result |
396 | vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, |
397 | loop_vec_info loop_vinfo, |
398 | unsigned int *max_vf) |
399 | { |
400 | unsigned int i; |
401 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
402 | struct data_reference *dra = DDR_A (ddr); |
403 | struct data_reference *drb = DDR_B (ddr); |
404 | dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra); |
405 | dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb); |
406 | stmt_vec_info stmtinfo_a = dr_info_a->stmt; |
407 | stmt_vec_info stmtinfo_b = dr_info_b->stmt; |
408 | lambda_vector dist_v; |
409 | unsigned int loop_depth; |
410 | |
411 | /* If user asserted safelen consecutive iterations can be |
412 | executed concurrently, assume independence. */ |
413 | auto apply_safelen = [&]() |
414 | { |
415 | if (loop->safelen >= 2) |
416 | { |
417 | if ((unsigned int) loop->safelen < *max_vf) |
418 | *max_vf = loop->safelen; |
419 | LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; |
420 | return true; |
421 | } |
422 | return false; |
423 | }; |
424 | |
425 | /* In loop analysis all data references should be vectorizable. */ |
426 | if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) |
427 | || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) |
428 | gcc_unreachable (); |
429 | |
430 | /* Independent data accesses. */ |
431 | if (DDR_ARE_DEPENDENT (ddr) == chrec_known) |
432 | return opt_result::success (); |
433 | |
434 | if (dra == drb |
435 | || (DR_IS_READ (dra) && DR_IS_READ (drb))) |
436 | return opt_result::success (); |
437 | |
438 | /* We do not have to consider dependences between accesses that belong |
439 | to the same group, unless the stride could be smaller than the |
440 | group size. */ |
441 | if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a) |
442 | && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a) |
443 | == DR_GROUP_FIRST_ELEMENT (stmtinfo_b)) |
444 | && !STMT_VINFO_STRIDED_P (stmtinfo_a)) |
445 | return opt_result::success (); |
446 | |
447 | /* Even if we have an anti-dependence then, as the vectorized loop covers at |
448 | least two scalar iterations, there is always also a true dependence. |
449 | As the vectorizer does not re-order loads and stores we can ignore |
450 | the anti-dependence if TBAA can disambiguate both DRs similar to the |
451 | case with known negative distance anti-dependences (positive |
452 | distance anti-dependences would violate TBAA constraints). */ |
453 | if (((DR_IS_READ (dra) && DR_IS_WRITE (drb)) |
454 | || (DR_IS_WRITE (dra) && DR_IS_READ (drb))) |
455 | && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)), |
456 | get_alias_set (DR_REF (drb)))) |
457 | return opt_result::success (); |
458 | |
459 | if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) |
460 | || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) |
461 | { |
462 | if (apply_safelen ()) |
463 | return opt_result::success (); |
464 | |
      return opt_result::failure_at
        (stmtinfo_a->stmt,
         "possible alias involving gather/scatter between %T and %T\n",
         DR_REF (dra), DR_REF (drb));
469 | } |
470 | |
471 | /* Unknown data dependence. */ |
472 | if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) |
473 | { |
474 | if (apply_safelen ()) |
475 | return opt_result::success (); |
476 | |
477 | if (dump_enabled_p ()) |
478 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt, |
479 | "versioning for alias required: " |
480 | "can't determine dependence between %T and %T\n" , |
481 | DR_REF (dra), DR_REF (drb)); |
482 | |
483 | /* Add to list of ddrs that need to be tested at run-time. */ |
484 | return vect_mark_for_runtime_alias_test (ddr, loop_vinfo); |
485 | } |
486 | |
487 | /* Known data dependence. */ |
488 | if (DDR_NUM_DIST_VECTS (ddr) == 0) |
489 | { |
490 | if (apply_safelen ()) |
491 | return opt_result::success (); |
492 | |
493 | if (dump_enabled_p ()) |
494 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt, |
495 | "versioning for alias required: " |
496 | "bad dist vector for %T and %T\n" , |
497 | DR_REF (dra), DR_REF (drb)); |
498 | /* Add to list of ddrs that need to be tested at run-time. */ |
499 | return vect_mark_for_runtime_alias_test (ddr, loop_vinfo); |
500 | } |
501 | |
  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
503 | |
504 | if (DDR_COULD_BE_INDEPENDENT_P (ddr) |
505 | && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo, |
506 | loop_depth, max_vf)) |
507 | return opt_result::success (); |
508 | |
509 | FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) |
510 | { |
511 | int dist = dist_v[loop_depth]; |
512 | |
513 | if (dump_enabled_p ()) |
514 | dump_printf_loc (MSG_NOTE, vect_location, |
515 | "dependence distance = %d.\n" , dist); |
516 | |
517 | if (dist == 0) |
518 | { |
519 | if (dump_enabled_p ()) |
520 | dump_printf_loc (MSG_NOTE, vect_location, |
521 | "dependence distance == 0 between %T and %T\n" , |
522 | DR_REF (dra), DR_REF (drb)); |
523 | |
524 | /* When we perform grouped accesses and perform implicit CSE |
525 | by detecting equal accesses and doing disambiguation with |
526 | runtime alias tests like for |
527 | .. = a[i]; |
528 | .. = a[i+1]; |
529 | a[i] = ..; |
530 | a[i+1] = ..; |
531 | *p = ..; |
532 | .. = a[i]; |
533 | .. = a[i+1]; |
534 | where we will end up loading { a[i], a[i+1] } once, make |
535 | sure that inserting group loads before the first load and |
536 | stores after the last store will do the right thing. |
537 | Similar for groups like |
538 | a[i] = ...; |
539 | ... = a[i]; |
540 | a[i+1] = ...; |
541 | where loads from the group interleave with the store. */ |
542 | if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b)) |
            return opt_result::failure_at (stmtinfo_a->stmt,
                                           "READ_WRITE dependence"
                                           " in interleaving.\n");
546 | |
547 | if (loop->safelen < 2) |
548 | { |
549 | tree indicator = dr_zero_step_indicator (dra); |
550 | if (!indicator || integer_zerop (indicator)) |
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "access also has a zero step\n");
              else if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
555 | } |
556 | continue; |
557 | } |
558 | |
559 | if (dist > 0 && DDR_REVERSED_P (ddr)) |
560 | { |
561 | /* If DDR_REVERSED_P the order of the data-refs in DDR was |
562 | reversed (to make distance vector positive), and the actual |
563 | distance is negative. */ |
564 | if (dump_enabled_p ()) |
565 | dump_printf_loc (MSG_NOTE, vect_location, |
566 | "dependence distance negative.\n" ); |
567 | /* When doing outer loop vectorization, we need to check if there is |
568 | a backward dependence at the inner loop level if the dependence |
569 | at the outer loop is reversed. See PR81740. */ |
          if (nested_in_vect_loop_p (loop, stmtinfo_a)
              || nested_in_vect_loop_p (loop, stmtinfo_b))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "not vectorized, dependence "
                                               "between data-refs %T and %T\n",
                                               DR_REF (dra), DR_REF (drb));
580 | } |
581 | /* Record a negative dependence distance to later limit the |
582 | amount of stmt copying / unrolling we can perform. |
583 | Only need to handle read-after-write dependence. */ |
584 | if (DR_IS_READ (drb) |
585 | && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 |
586 | || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) |
587 | STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; |
588 | continue; |
589 | } |
590 | |
      unsigned int abs_dist = abs (dist);
592 | if (abs_dist >= 2 && abs_dist < *max_vf) |
593 | { |
594 | /* The dependence distance requires reduction of the maximal |
595 | vectorization factor. */ |
596 | *max_vf = abs_dist; |
597 | if (dump_enabled_p ()) |
598 | dump_printf_loc (MSG_NOTE, vect_location, |
599 | "adjusting maximal vectorization factor to %i\n" , |
600 | *max_vf); |
601 | } |
602 | |
603 | if (abs_dist >= *max_vf) |
604 | { |
605 | /* Dependence distance does not create dependence, as far as |
606 | vectorization is concerned, in this case. */ |
607 | if (dump_enabled_p ()) |
608 | dump_printf_loc (MSG_NOTE, vect_location, |
609 | "dependence distance >= VF.\n" ); |
610 | continue; |
611 | } |
612 | |
      return opt_result::failure_at (stmtinfo_a->stmt,
                                     "not vectorized, possible dependence "
                                     "between data-refs %T and %T\n",
                                     DR_REF (dra), DR_REF (drb));
617 | } |
618 | |
619 | return opt_result::success (); |
620 | } |
621 | |
622 | /* Function vect_analyze_early_break_dependences. |
623 | |
624 | Examine all the data references in the loop and make sure that if we have |
625 | multiple exits that we are able to safely move stores such that they become |
626 | safe for vectorization. The function also calculates the place where to move |
627 | the instructions to and computes what the new vUSE chain should be. |
628 | |
629 | This works in tandem with the CFG that will be produced by |
630 | slpeel_tree_duplicate_loop_to_edge_cfg later on. |
631 | |
   This function tries to validate whether early break vectorization is
   possible for the current instruction sequence.  Returns true if
   possible, otherwise false.
635 | |
636 | Requirements: |
637 | - Any memory access must be to a fixed size buffer. |
638 | - There must not be any loads and stores to the same object. |
639 | - Multiple loads are allowed as long as they don't alias. |
640 | |
641 | NOTE: |
   This implementation is very conservative.  Any overlapping loads/stores
   that take place before the early break statement are rejected, aside from
   WAR dependencies.
645 | |
646 | i.e.: |
647 | |
648 | a[i] = 8 |
649 | c = a[i] |
650 | if (b[i]) |
651 | ... |
652 | |
653 | is not allowed, but |
654 | |
655 | c = a[i] |
656 | a[i] = 8 |
657 | if (b[i]) |
658 | ... |
659 | |
   is allowed, which is the common case.  */
661 | |
662 | static opt_result |
663 | vect_analyze_early_break_dependences (loop_vec_info loop_vinfo) |
664 | { |
665 | DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences" ); |
666 | |
667 | /* List of all load data references found during traversal. */ |
668 | auto_vec<data_reference *> bases; |
669 | basic_block dest_bb = NULL; |
670 | |
671 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
672 | class loop *loop_nest = loop_outer (loop); |
673 | |
674 | if (dump_enabled_p ()) |
675 | dump_printf_loc (MSG_NOTE, vect_location, |
676 | "loop contains multiple exits, analyzing" |
677 | " statement dependencies.\n" ); |
678 | |
679 | if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) |
680 | if (dump_enabled_p ()) |
681 | dump_printf_loc (MSG_NOTE, vect_location, |
682 | "alternate exit has been chosen as main exit.\n" ); |
683 | |
684 | /* Since we don't support general control flow, the location we'll move the |
685 | side-effects to is always the latch connected exit. When we support |
686 | general control flow we can do better but for now this is fine. Move |
687 | side-effects to the in-loop destination of the last early exit. For the |
688 | PEELED case we move the side-effects to the latch block as this is |
689 | guaranteed to be the last block to be executed when a vector iteration |
690 | finished. */ |
691 | if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) |
692 | dest_bb = loop->latch; |
693 | else |
    dest_bb = single_pred (loop->latch);
695 | |
696 | /* We start looking from dest_bb, for the non-PEELED case we don't want to |
697 | move any stores already present, but we do want to read and validate the |
698 | loads. */ |
699 | basic_block bb = dest_bb; |
700 | |
701 | /* We move stores across all loads to the beginning of dest_bb, so |
702 | the first block processed below doesn't need dependence checking. */ |
703 | bool check_deps = false; |
704 | |
705 | do |
706 | { |
707 | gimple_stmt_iterator gsi = gsi_last_bb (bb); |
708 | |
709 | /* Now analyze all the remaining statements and try to determine which |
710 | instructions are allowed/needed to be moved. */ |
      while (!gsi_end_p (gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          gsi_prev (&gsi);
          if (is_gimple_debug (stmt))
716 | continue; |
717 | |
718 | stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt); |
719 | auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo); |
720 | if (!dr_ref) |
721 | continue; |
722 | |
723 | /* We know everything below dest_bb is safe since we know we |
724 | had a full vector iteration when reaching it. Either by |
725 | the loop entry / IV exit test being last or because this |
726 | is the loop latch itself. */ |
727 | if (!check_deps) |
728 | continue; |
729 | |
          /* Check whether vector accesses to the object will be within
             bounds.  The object size must be a constant, or we must assume
             the loop will be versioned or its niters bounded by VF so the
             accesses stay within range.  We only need to check the reads,
             since writes are moved to a safe place where, if we get there,
             we know they are safe to perform.  */
735 | if (DR_IS_READ (dr_ref) |
736 | && !ref_within_array_bound (stmt, DR_REF (dr_ref))) |
737 | { |
738 | if (dump_enabled_p ()) |
739 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
740 | "early breaks not supported: vectorization " |
741 | "would %s beyond size of obj.\n" , |
742 | DR_IS_READ (dr_ref) ? "read" : "write" ); |
              return opt_result::failure_at (stmt,
                                             "can't safely apply code motion to "
                                             "dependencies of %G to vectorize "
                                             "the early exit.\n", stmt);
747 | } |
748 | |
749 | if (DR_IS_READ (dr_ref)) |
            bases.safe_push (dr_ref);
751 | else if (DR_IS_WRITE (dr_ref)) |
752 | { |
753 | /* We are moving writes down in the CFG. To be sure that this |
754 | is valid after vectorization we have to check all the loads |
755 | we are sinking the stores past to see if any of them may |
756 | alias or are the same object. |
757 | |
758 | Same objects will not be an issue because unless the store |
759 | is marked volatile the value can be forwarded. If the |
760 | store is marked volatile we don't vectorize the loop |
761 | anyway. |
762 | |
763 | That leaves the check for aliasing. We don't really need |
764 | to care about the stores aliasing with each other since the |
765 | stores are moved in order so the effects are still observed |
766 | correctly. This leaves the check for WAR dependencies |
767 | which we would be introducing here if the DR can alias. |
768 | The check is quadratic in loads/stores but I have not found |
769 | a better API to do this. I believe all loads and stores |
770 | must be checked. We also must check them when we |
771 | encountered the store, since we don't care about loads past |
772 | the store. */ |
773 | |
774 | for (auto dr_read : bases) |
775 | if (dr_may_alias_p (dr_ref, dr_read, loop_nest)) |
776 | { |
777 | if (dump_enabled_p ()) |
778 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, |
779 | vect_location, |
780 | "early breaks not supported: " |
781 | "overlapping loads and stores " |
782 | "found before the break " |
783 | "statement.\n" ); |
784 | |
                  return opt_result::failure_at (stmt,
                             "can't safely apply code motion to dependencies"
                             " to vectorize the early exit. %G may alias with"
                             " %G\n", stmt, dr_read->stmt);
789 | } |
790 | } |
791 | |
          if (gimple_vdef (stmt))
793 | { |
794 | if (dump_enabled_p ()) |
795 | dump_printf_loc (MSG_NOTE, vect_location, |
796 | "==> recording stmt %G" , stmt); |
797 | |
              LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
799 | } |
          else if (gimple_vuse (stmt))
            {
              LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
803 | if (dump_enabled_p ()) |
804 | dump_printf_loc (MSG_NOTE, vect_location, |
805 | "marked statement for vUSE update: %G" , stmt); |
806 | } |
807 | } |
808 | |
809 | if (!single_pred_p (bb)) |
810 | { |
811 | gcc_assert (bb == loop->header); |
812 | break; |
813 | } |
814 | |
815 | /* If we possibly sink through a virtual PHI make sure to elide that. */ |
816 | if (gphi *vphi = get_virtual_phi (bb)) |
        LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (vphi);
818 | |
819 | /* All earlier blocks need dependence checking. */ |
820 | check_deps = true; |
821 | bb = single_pred (bb); |
822 | } |
823 | while (1); |
824 | |
825 | /* We don't allow outer -> inner loop transitions which should have been |
826 | trapped already during loop form analysis. */ |
827 | gcc_assert (dest_bb->loop_father == loop); |
828 | |
  /* Check that the destination block we picked has only one predecessor.
     To relax this we would have to take special care when moving the
     statements.  We don't currently support such control flow, and this
     check also simplifies how we handle labels that may be present anywhere
     in the IL, ensuring they are not significant for the CFG.  */
  if (!single_pred (dest_bb))
    return opt_result::failure_at (vect_location,
                                   "chosen loop exit block (BB %d) does not have a "
                                   "single predecessor which is currently not "
                                   "supported for early break vectorization.\n",
                                   dest_bb->index);
840 | |
841 | LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb; |
842 | |
843 | if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ()) |
844 | { |
845 | /* All uses shall be updated to that of the first load. Entries are |
846 | stored in reverse order. */ |
847 | tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ()); |
848 | for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo)) |
849 | { |
850 | if (dump_enabled_p ()) |
851 | dump_printf_loc (MSG_NOTE, vect_location, |
852 | "will update use: %T, mem_ref: %G" , vuse, g); |
853 | } |
854 | } |
855 | |
856 | if (dump_enabled_p ()) |
857 | dump_printf_loc (MSG_NOTE, vect_location, |
858 | "recorded statements to be moved to BB %d\n" , |
859 | LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index); |
860 | |
861 | return opt_result::success (); |
862 | } |
863 | |
864 | /* Function vect_analyze_data_ref_dependences. |
865 | |
866 | Examine all the data references in the loop, and make sure there do not |
867 | exist any data dependences between them. Set *MAX_VF according to |
868 | the maximum vectorization factor the data dependences allow. */ |
869 | |
870 | opt_result |
871 | vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, |
872 | unsigned int *max_vf) |
873 | { |
874 | unsigned int i; |
875 | struct data_dependence_relation *ddr; |
876 | |
877 | DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences" ); |
878 | |
879 | if (!LOOP_VINFO_DDRS (loop_vinfo).exists ()) |
880 | { |
881 | LOOP_VINFO_DDRS (loop_vinfo) |
882 | .create (LOOP_VINFO_DATAREFS (loop_vinfo).length () |
883 | * LOOP_VINFO_DATAREFS (loop_vinfo).length ()); |
884 | /* We do not need read-read dependences. */ |
885 | bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo), |
886 | &LOOP_VINFO_DDRS (loop_vinfo), |
887 | LOOP_VINFO_LOOP_NEST (loop_vinfo), |
888 | false); |
889 | gcc_assert (res); |
890 | } |
891 | |
892 | LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true; |
893 | |
894 | /* For epilogues we either have no aliases or alias versioning |
895 | was applied to original loop. Therefore we may just get max_vf |
896 | using VF of original loop. */ |
897 | if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) |
898 | *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo); |
899 | else |
900 | FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr) |
901 | { |
902 | opt_result res |
903 | = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf); |
904 | if (!res) |
905 | return res; |
906 | } |
907 | |
  /* If we have early break statements in the loop, check to see if they
     are of a form we can vectorize.  */
910 | if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) |
911 | return vect_analyze_early_break_dependences (loop_vinfo); |
912 | |
913 | return opt_result::success (); |
914 | } |
915 | |
916 | |
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  When versioning for alias
   may check a dependence at run-time, return FALSE.  */
923 | |
924 | static bool |
925 | vect_slp_analyze_data_ref_dependence (vec_info *vinfo, |
926 | struct data_dependence_relation *ddr) |
927 | { |
928 | struct data_reference *dra = DDR_A (ddr); |
929 | struct data_reference *drb = DDR_B (ddr); |
930 | dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); |
931 | dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); |
932 | |
933 | /* We need to check dependences of statements marked as unvectorizable |
934 | as well, they still can prohibit vectorization. */ |
935 | |
936 | /* Independent data accesses. */ |
937 | if (DDR_ARE_DEPENDENT (ddr) == chrec_known) |
938 | return false; |
939 | |
940 | if (dra == drb) |
941 | return false; |
942 | |
943 | /* Read-read is OK. */ |
944 | if (DR_IS_READ (dra) && DR_IS_READ (drb)) |
945 | return false; |
946 | |
947 | /* If dra and drb are part of the same interleaving chain consider |
948 | them independent. */ |
949 | if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt) |
950 | && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt) |
951 | == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt))) |
952 | return false; |
953 | |
954 | /* Unknown data dependence. */ |
955 | if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) |
956 | { |
957 | if (dump_enabled_p ()) |
958 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
959 | "can't determine dependence between %T and %T\n" , |
960 | DR_REF (dra), DR_REF (drb)); |
961 | } |
962 | else if (dump_enabled_p ()) |
963 | dump_printf_loc (MSG_NOTE, vect_location, |
964 | "determined dependence between %T and %T\n" , |
965 | DR_REF (dra), DR_REF (drb)); |
966 | |
967 | return true; |
968 | } |
969 | |
970 | |
971 | /* Analyze dependences involved in the transform of a store SLP NODE. */ |
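/* For example (a sketch): for an SLP store group
     a[0] = x;
     *p = 1;
     a[1] = y;
   the vectorized store is emitted at the position of the last scalar store,
   so 'a[0] = x' is effectively sunk past '*p = 1'; the walk below checks
   that such intervening statements cannot alias the stored location.  */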
972 | |
973 | static bool |
974 | vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node) |
975 | { |
976 | /* This walks over all stmts involved in the SLP store done |
977 | in NODE verifying we can sink them up to the last stmt in the |
978 | group. */ |
979 | stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node); |
980 | gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info))); |
981 | |
982 | for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k) |
983 | { |
984 | stmt_vec_info access_info |
985 | = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]); |
986 | if (access_info == last_access_info) |
987 | continue; |
988 | data_reference *dr_a = STMT_VINFO_DATA_REF (access_info); |
989 | ao_ref ref; |
990 | bool ref_initialized_p = false; |
991 | for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt); |
           gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt))
996 | continue; |
997 | |
998 | /* If we couldn't record a (single) data reference for this |
999 | stmt we have to resort to the alias oracle. */ |
1000 | stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt); |
1001 | data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info); |
1002 | if (!dr_b) |
1003 | { |
1004 | /* We are moving a store - this means |
1005 | we cannot use TBAA for disambiguation. */ |
1006 | if (!ref_initialized_p) |
1007 | ao_ref_init (&ref, DR_REF (dr_a)); |
1008 | if (stmt_may_clobber_ref_p_1 (stmt, &ref, false) |
1009 | || ref_maybe_used_by_stmt_p (stmt, &ref, false)) |
1010 | return false; |
1011 | continue; |
1012 | } |
1013 | |
1014 | gcc_assert (!gimple_visited_p (stmt)); |
1015 | |
1016 | ddr_p ddr = initialize_data_dependence_relation (dr_a, |
1017 | dr_b, vNULL); |
1018 | bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr); |
1019 | free_dependence_relation (ddr); |
1020 | if (dependent) |
1021 | return false; |
1022 | } |
1023 | } |
1024 | return true; |
1025 | } |
1026 | |
1027 | /* Analyze dependences involved in the transform of a load SLP NODE. STORES |
1028 | contain the vector of scalar stores of this instance if we are |
1029 | disambiguating the loads. */ |
1030 | |
1031 | static bool |
1032 | vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node, |
1033 | vec<stmt_vec_info> stores, |
1034 | stmt_vec_info last_store_info) |
1035 | { |
1036 | /* This walks over all stmts involved in the SLP load done |
1037 | in NODE verifying we can hoist them up to the first stmt in the |
1038 | group. */ |
1039 | stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node); |
1040 | gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info))); |
1041 | |
1042 | for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k) |
1043 | { |
1044 | stmt_vec_info access_info |
1045 | = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]); |
1046 | if (access_info == first_access_info) |
1047 | continue; |
1048 | data_reference *dr_a = STMT_VINFO_DATA_REF (access_info); |
1049 | ao_ref ref; |
1050 | bool ref_initialized_p = false; |
1051 | hash_set<stmt_vec_info> grp_visited; |
1052 | for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt); |
           gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vdef (stmt))
1057 | continue; |
1058 | |
1059 | stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt); |
1060 | |
1061 | /* If we run into a store of this same instance (we've just |
1062 | marked those) then delay dependence checking until we run |
1063 | into the last store because this is where it will have |
1064 | been sunk to (and we verified that we can do that already). */ |
1065 | if (gimple_visited_p (stmt)) |
1066 | { |
1067 | if (stmt_info != last_store_info) |
1068 | continue; |
1069 | |
1070 | for (stmt_vec_info &store_info : stores) |
1071 | { |
1072 | data_reference *store_dr = STMT_VINFO_DATA_REF (store_info); |
1073 | ddr_p ddr = initialize_data_dependence_relation |
1074 | (dr_a, store_dr, vNULL); |
1075 | bool dependent |
1076 | = vect_slp_analyze_data_ref_dependence (vinfo, ddr); |
1077 | free_dependence_relation (ddr); |
1078 | if (dependent) |
1079 | return false; |
1080 | } |
1081 | continue; |
1082 | } |
1083 | |
1084 | auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool |
1085 | { |
1086 | /* We are hoisting a load - this means we can use TBAA for |
1087 | disambiguation. */ |
1088 | if (!ref_initialized_p) |
1089 | ao_ref_init (&ref, DR_REF (dr_a)); |
1090 | if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true)) |
1091 | { |
1092 | /* If we couldn't record a (single) data reference for this |
1093 | stmt we have to give up now. */ |
1094 | data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info); |
1095 | if (!dr_b) |
1096 | return false; |
1097 | ddr_p ddr = initialize_data_dependence_relation (dr_a, |
1098 | dr_b, vNULL); |
1099 | bool dependent |
1100 | = vect_slp_analyze_data_ref_dependence (vinfo, ddr); |
1101 | free_dependence_relation (ddr); |
1102 | if (dependent) |
1103 | return false; |
1104 | } |
1105 | /* No dependence. */ |
1106 | return true; |
1107 | }; |
1108 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
1109 | { |
1110 | /* When we run into a store group we have to honor |
1111 | that earlier stores might be moved here. We don't |
1112 | know exactly which and where to since we lack a |
1113 | back-mapping from DR to SLP node, so assume all |
1114 | earlier stores are sunk here. It's enough to |
1115 | consider the last stmt of a group for this. |
1116 | ??? Both this and the fact that we disregard that |
1117 | the conflicting instance might be removed later |
1118 | is overly conservative. */ |
1119 | if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info))) |
1120 | for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info); |
1121 | store_info != NULL; |
1122 | store_info = DR_GROUP_NEXT_ELEMENT (store_info)) |
1123 | if ((store_info == stmt_info |
                  || get_later_stmt (store_info, stmt_info) == stmt_info)
1125 | && !check_hoist (store_info)) |
1126 | return false; |
1127 | } |
1128 | else |
1129 | { |
1130 | if (!check_hoist (stmt_info)) |
1131 | return false; |
1132 | } |
1133 | } |
1134 | } |
1135 | return true; |
1136 | } |
1137 | |
1138 | |
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure there
   do not exist any data dependences between them that would prevent
   vectorization.  */
1144 | |
1145 | bool |
1146 | vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance) |
1147 | { |
1148 | DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence" ); |
1149 | |
1150 | /* The stores of this instance are at the root of the SLP tree. */ |
1151 | slp_tree store = NULL; |
1152 | if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store) |
1153 | store = SLP_INSTANCE_TREE (instance); |
1154 | |
1155 | /* Verify we can sink stores to the vectorized stmt insert location. */ |
1156 | stmt_vec_info last_store_info = NULL; |
1157 | if (store) |
1158 | { |
      if (! vect_slp_analyze_store_dependences (vinfo, store))
1160 | return false; |
1161 | |
1162 | /* Mark stores in this instance and remember the last one. */ |
1163 | last_store_info = vect_find_last_scalar_stmt_in_slp (store); |
1164 | for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k) |
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1166 | } |
1167 | |
1168 | bool res = true; |
1169 | |
1170 | /* Verify we can sink loads to the vectorized stmt insert location, |
1171 | special-casing stores of this instance. */ |
1172 | for (slp_tree &load : SLP_INSTANCE_LOADS (instance)) |
    if (! vect_slp_analyze_load_dependences (vinfo, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store_info))
1177 | { |
1178 | res = false; |
1179 | break; |
1180 | } |
1181 | |
1182 | /* Unset the visited flag. */ |
1183 | if (store) |
1184 | for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k) |
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1186 | |
1187 | return res; |
1188 | } |
1189 | |
1190 | /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET |
1191 | applied. */ |
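/* A small worked example (a sketch): with a 16-byte target alignment, if the
   first element of a group has misalignment 8 and DR_INFO accesses the
   element 4 bytes further in (DIFF == 4) with OFFSET 0, the function returns
   (8 + 4 + 0) % 16 == 12.  */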
1192 | |
1193 | int |
1194 | dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset) |
1195 | { |
1196 | HOST_WIDE_INT diff = 0; |
1197 | /* Alignment is only analyzed for the first element of a DR group, |
1198 | use that but adjust misalignment by the offset of the access. */ |
1199 | if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)) |
1200 | { |
1201 | dr_vec_info *first_dr |
1202 | = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt)); |
1203 | /* vect_analyze_data_ref_accesses guarantees that DR_INIT are |
1204 | INTEGER_CSTs and the first element in the group has the lowest |
1205 | address. */ |
1206 | diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr)) |
1207 | - TREE_INT_CST_LOW (DR_INIT (first_dr->dr))); |
1208 | gcc_assert (diff >= 0); |
1209 | dr_info = first_dr; |
1210 | } |
1211 | |
1212 | int misalign = dr_info->misalignment; |
1213 | gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED); |
1214 | if (misalign == DR_MISALIGNMENT_UNKNOWN) |
1215 | return misalign; |
1216 | |
1217 | /* If the access is only aligned for a vector type with smaller alignment |
1218 | requirement the access has unknown misalignment. */ |
  if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
                targetm.vectorize.preferred_vector_alignment (vectype)))
1221 | return DR_MISALIGNMENT_UNKNOWN; |
1222 | |
1223 | /* Apply the offset from the DR group start and the externally supplied |
1224 | offset which can for example result from a negative stride access. */ |
1225 | poly_int64 misalignment = misalign + diff + offset; |
1226 | |
1227 | /* vect_compute_data_ref_alignment will have ensured that target_alignment |
1228 | is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */ |
1229 | unsigned HOST_WIDE_INT target_alignment_c |
1230 | = dr_info->target_alignment.to_constant (); |
  if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1232 | return DR_MISALIGNMENT_UNKNOWN; |
1233 | return misalign; |
1234 | } |
1235 | |
1236 | /* Record the base alignment guarantee given by DRB, which occurs |
1237 | in STMT_INFO. */ |
1238 | |
1239 | static void |
1240 | vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info, |
1241 | innermost_loop_behavior *drb) |
1242 | { |
1243 | bool existed; |
1244 | std::pair<stmt_vec_info, innermost_loop_behavior *> &entry |
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1246 | if (!existed || entry.second->base_alignment < drb->base_alignment) |
1247 | { |
      entry = std::make_pair (stmt_info, drb);
1249 | if (dump_enabled_p ()) |
1250 | dump_printf_loc (MSG_NOTE, vect_location, |
1251 | "recording new base alignment for %T\n" |
1252 | " alignment: %d\n" |
1253 | " misalignment: %d\n" |
1254 | " based on: %G" , |
1255 | drb->base_address, |
1256 | drb->base_alignment, |
1257 | drb->base_misalignment, |
1258 | stmt_info->stmt); |
1259 | } |
1260 | } |
1261 | |
1262 | /* If the region we're going to vectorize is reached, all unconditional |
1263 | data references occur at least once. We can therefore pool the base |
1264 | alignment guarantees from each unconditional reference. Do this by |
1265 | going through all the data references in VINFO and checking whether |
1266 | the containing statement makes the reference unconditionally. If so, |
1267 | record the alignment of the base address in VINFO so that it can be |
1268 | used for all other references with the same base. */ |
1269 | |
1270 | void |
1271 | vect_record_base_alignments (vec_info *vinfo) |
1272 | { |
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1274 | class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL; |
1275 | for (data_reference *dr : vinfo->shared->datarefs) |
1276 | { |
1277 | dr_vec_info *dr_info = vinfo->lookup_dr (dr); |
1278 | stmt_vec_info stmt_info = dr_info->stmt; |
1279 | if (!DR_IS_CONDITIONAL_IN_STMT (dr) |
1280 | && STMT_VINFO_VECTORIZABLE (stmt_info) |
1281 | && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) |
1282 | { |
          vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1284 | |
1285 | /* If DR is nested in the loop that is being vectorized, we can also |
1286 | record the alignment of the base wrt the outer loop. */ |
1287 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
            vect_record_base_alignment
              (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1290 | } |
1291 | } |
1292 | } |
1293 | |
1294 | /* Function vect_compute_data_ref_alignment |
1295 | |
1296 | Compute the misalignment of the data reference DR_INFO when vectorizing |
1297 | with VECTYPE. |
1298 | |
1299 | Output: |
1300 | 1. initialized misalignment info for DR_INFO |
1301 | |
1302 | FOR NOW: No analysis is actually performed. Misalignment is calculated |
1303 | only for trivial cases. TODO. */ |
1304 | |
1305 | static void |
1306 | vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info, |
1307 | tree vectype) |
1308 | { |
1309 | stmt_vec_info stmt_info = dr_info->stmt; |
1310 | vec_base_alignments *base_alignments = &vinfo->base_alignments; |
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1312 | class loop *loop = NULL; |
1313 | tree ref = DR_REF (dr_info->dr); |
1314 | |
1315 | if (dump_enabled_p ()) |
1316 | dump_printf_loc (MSG_NOTE, vect_location, |
1317 | "vect_compute_data_ref_alignment:\n" ); |
1318 | |
1319 | if (loop_vinfo) |
1320 | loop = LOOP_VINFO_LOOP (loop_vinfo); |
1321 | |
1322 | /* Initialize misalignment to unknown. */ |
1323 | SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN); |
1324 | |
1325 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) |
1326 | return; |
1327 | |
1328 | innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info); |
1329 | bool step_preserves_misalignment_p; |
1330 | |
1331 | poly_uint64 vector_alignment |
    = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1333 | BITS_PER_UNIT); |
1334 | SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment); |
1335 | |
1336 | /* If the main loop has peeled for alignment we have no way of knowing |
1337 | whether the data accesses in the epilogues are aligned. We can't at |
1338 | compile time answer the question whether we have entered the main loop or |
1339 | not. Fixes PR 92351. */ |
1340 | if (loop_vinfo) |
1341 | { |
1342 | loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); |
1343 | if (orig_loop_vinfo |
1344 | && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0) |
1345 | return; |
1346 | } |
1347 | |
1348 | unsigned HOST_WIDE_INT vect_align_c; |
  if (!vector_alignment.is_constant (&vect_align_c))
1350 | return; |
1351 | |
1352 | /* No step for BB vectorization. */ |
1353 | if (!loop) |
1354 | { |
1355 | gcc_assert (integer_zerop (drb->step)); |
1356 | step_preserves_misalignment_p = true; |
1357 | } |
1358 | |
1359 | /* In case the dataref is in an inner-loop of the loop that is being |
1360 | vectorized (LOOP), we use the base and misalignment information |
1361 | relative to the outer-loop (LOOP). This is ok only if the misalignment |
1362 | stays the same throughout the execution of the inner-loop, which is why |
1363 | we have to check that the stride of the dataref in the inner-loop evenly |
1364 | divides by the vector alignment. */ |
1365 | else if (nested_in_vect_loop_p (loop, stmt_info)) |
1366 | { |
1367 | step_preserves_misalignment_p |
1368 | = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0; |
1369 | |
1370 | if (dump_enabled_p ()) |
1371 | { |
1372 | if (step_preserves_misalignment_p) |
1373 | dump_printf_loc (MSG_NOTE, vect_location, |
1374 | "inner step divides the vector alignment.\n" ); |
1375 | else |
1376 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1377 | "inner step doesn't divide the vector" |
1378 | " alignment.\n" ); |
1379 | } |
1380 | } |
1381 | |
1382 | /* Similarly we can only use base and misalignment information relative to |
1383 | an innermost loop if the misalignment stays the same throughout the |
1384 | execution of the loop. As above, this is the case if the stride of |
1385 | the dataref evenly divides by the alignment. */ |
1386 | else |
1387 | { |
1388 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
1389 | step_preserves_misalignment_p |
        = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1391 | |
1392 | if (!step_preserves_misalignment_p && dump_enabled_p ()) |
1393 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1394 | "step doesn't divide the vector alignment.\n" ); |
1395 | } |
1396 | |
1397 | unsigned int base_alignment = drb->base_alignment; |
1398 | unsigned int base_misalignment = drb->base_misalignment; |
1399 | |
1400 | /* Calculate the maximum of the pooled base address alignment and the |
1401 | alignment that we can compute for DR itself. */ |
1402 | std::pair<stmt_vec_info, innermost_loop_behavior *> *entry |
    = base_alignments->get (drb->base_address);
1404 | if (entry |
1405 | && base_alignment < (*entry).second->base_alignment |
1406 | && (loop_vinfo |
          || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
                              gimple_bb (entry->first->stmt))
              && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
                  || (entry->first->dr_aux.group <= dr_info->group)))))
1411 | { |
1412 | base_alignment = entry->second->base_alignment; |
1413 | base_misalignment = entry->second->base_misalignment; |
1414 | } |
1415 | |
1416 | if (drb->offset_alignment < vect_align_c |
1417 | || !step_preserves_misalignment_p |
1418 | /* We need to know whether the step wrt the vectorized loop is |
1419 | negative when computing the starting misalignment below. */ |
1420 | || TREE_CODE (drb->step) != INTEGER_CST) |
1421 | { |
1422 | if (dump_enabled_p ()) |
1423 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1424 | "Unknown alignment for access: %T\n" , ref); |
1425 | return; |
1426 | } |
1427 | |
1428 | if (base_alignment < vect_align_c) |
1429 | { |
1430 | unsigned int max_alignment; |
1431 | tree base = get_base_for_alignment (drb->base_address, &max_alignment); |
1432 | if (max_alignment < vect_align_c |
1433 | || !vect_can_force_dr_alignment_p (base, |
1434 | vect_align_c * BITS_PER_UNIT)) |
1435 | { |
1436 | if (dump_enabled_p ()) |
1437 | dump_printf_loc (MSG_NOTE, vect_location, |
1438 | "can't force alignment of ref: %T\n" , ref); |
1439 | return; |
1440 | } |
1441 | |
1442 | /* Force the alignment of the decl. |
1443 | NOTE: This is the only change to the code we make during |
1444 | the analysis phase, before deciding to vectorize the loop. */ |
1445 | if (dump_enabled_p ()) |
1446 | dump_printf_loc (MSG_NOTE, vect_location, |
1447 | "force alignment of %T\n" , ref); |
1448 | |
1449 | dr_info->base_decl = base; |
1450 | dr_info->base_misaligned = true; |
1451 | base_misalignment = 0; |
1452 | } |
1453 | poly_int64 misalignment |
1454 | = base_misalignment + wi::to_poly_offset (t: drb->init).force_shwi (); |
1455 | |
1456 | unsigned int const_misalignment; |
1457 | if (!known_misalignment (value: misalignment, align: vect_align_c, misalign: &const_misalignment)) |
1458 | { |
1459 | if (dump_enabled_p ()) |
1460 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1461 | "Non-constant misalignment for access: %T\n" , ref); |
1462 | return; |
1463 | } |
1464 | |
1465 | SET_DR_MISALIGNMENT (dr_info, const_misalignment); |
1466 | |
1467 | if (dump_enabled_p ()) |
1468 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1469 | "misalign = %d bytes of ref %T\n" , |
1470 | const_misalignment, ref); |
1471 | |
1472 | return; |
1473 | } |
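
/* Illustrative note (hypothetical numbers, not taken from the code above):
   with a vector alignment of 16 bytes, a pooled base misalignment of 8 and
   a DR_INIT of 20, the computation above yields

     misalignment = 8 + 20 = 28,   const_misalignment = 28 % 16 = 12

   so the access is recorded as sitting 12 bytes past a 16-byte boundary.  */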
1474 | |
1475 | /* Return whether DR_INFO, which is related to DR_PEEL_INFO in |
1476 | that it only differs in DR_INIT, is aligned if DR_PEEL_INFO |
1477 | is made aligned via peeling. */ |
1478 | |
1479 | static bool |
1480 | vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info, |
1481 | dr_vec_info *dr_peel_info) |
1482 | { |
1483 | if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info), |
1484 | DR_TARGET_ALIGNMENT (dr_info))) |
1485 | { |
1486 | poly_offset_int diff |
1487 | = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr)) |
1488 | - wi::to_poly_offset (DR_INIT (dr_info->dr))); |
1489 | if (known_eq (diff, 0) |
1490 | || multiple_p (a: diff, DR_TARGET_ALIGNMENT (dr_info))) |
1491 | return true; |
1492 | } |
1493 | return false; |
1494 | } |
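
/* Worked example for the checks above (hypothetical values): if
   DR_PEEL_INFO has target alignment 32 and DR_INFO has target alignment 16,
   the first test holds since 32 is a multiple of 16.  With DR_INITs of 48
   and 0 the difference is 48, a multiple of 16, so aligning DR_PEEL_INFO by
   peeling also aligns DR_INFO.  With a DR_INIT of 40 instead, the
   difference 8 is not a multiple of 16 and we conservatively return false.  */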
1495 | |
1496 | /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made |
1497 | aligned via peeling. */ |
1498 | |
1499 | static bool |
1500 | vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info, |
1501 | dr_vec_info *dr_peel_info) |
1502 | { |
1503 | if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr), |
1504 | DR_BASE_ADDRESS (dr_peel_info->dr), flags: 0) |
1505 | || !operand_equal_p (DR_OFFSET (dr_info->dr), |
1506 | DR_OFFSET (dr_peel_info->dr), flags: 0) |
1507 | || !operand_equal_p (DR_STEP (dr_info->dr), |
1508 | DR_STEP (dr_peel_info->dr), flags: 0)) |
1509 | return false; |
1510 | |
1511 | return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info); |
1512 | } |
1513 | |
1514 | /* Compute the value for dr_info->misalign so that the access appears |
1515 | aligned. This is used by peeling to compensate for the offset that
1516 | dr_misalignment applies when the step is negative. */
1517 | |
1518 | int |
1519 | vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info) |
1520 | { |
1521 | if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0) |
1522 | return 0; |
1523 | |
1524 | tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); |
1525 | poly_int64 misalignment |
1526 | = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1) |
1527 | * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))); |
1528 | |
1529 | unsigned HOST_WIDE_INT target_alignment_c; |
1530 | int misalign; |
1531 | if (!dr_info->target_alignment.is_constant (const_value: &target_alignment_c) |
1532 | || !known_misalignment (value: misalignment, align: target_alignment_c, misalign: &misalign)) |
1533 | return DR_MISALIGNMENT_UNKNOWN; |
1534 | return misalign; |
1535 | } |
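
/* Illustration (hypothetical type and alignment): for a negative step with
   a 4-element vector of 4-byte ints and a constant target alignment of 16,
   the value returned is

     (4 - 1) * 4 = 12,   12 % 16 = 12

   which compensates for the element offset that dr_misalignment applies to
   negative-step accesses; for a non-negative step the function returns 0.  */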
1536 | |
1537 | /* Function vect_update_misalignment_for_peel. |
1538 | Sets DR_INFO's misalignment |
1539 | - to 0 if it has the same alignment as DR_PEEL_INFO, |
1540 | - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1541 | - to -1 (unknown) otherwise. |
1542 | |
1543 | DR_INFO - the data reference whose misalignment is to be adjusted. |
1544 | DR_PEEL_INFO - the data reference whose misalignment is being made |
1545 | zero in the vector loop by the peel. |
1546 | NPEEL - the number of iterations in the peel loop if the misalignment |
1547 | of DR_PEEL_INFO is known at compile time. */ |
1548 | |
1549 | static void |
1550 | vect_update_misalignment_for_peel (dr_vec_info *dr_info, |
1551 | dr_vec_info *dr_peel_info, int npeel) |
1552 | { |
1553 | /* If dr_info will be aligned whenever dr_peel_info is, then mark it so. */
1554 | if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info)) |
1555 | { |
1556 | SET_DR_MISALIGNMENT (dr_info, |
1557 | vect_dr_misalign_for_aligned_access (dr_peel_info)); |
1558 | return; |
1559 | } |
1560 | |
1561 | unsigned HOST_WIDE_INT alignment; |
1562 | if (DR_TARGET_ALIGNMENT (dr_info).is_constant (const_value: &alignment) |
1563 | && known_alignment_for_access_p (dr_info, |
1564 | STMT_VINFO_VECTYPE (dr_info->stmt)) |
1565 | && known_alignment_for_access_p (dr_info: dr_peel_info, |
1566 | STMT_VINFO_VECTYPE (dr_peel_info->stmt))) |
1567 | { |
1568 | int misal = dr_info->misalignment; |
1569 | misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr)); |
1570 | misal &= alignment - 1; |
1571 | set_dr_misalignment (dr_info, val: misal); |
1572 | return; |
1573 | } |
1574 | |
1575 | if (dump_enabled_p ()) |
1576 | dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \ |
1577 | "to unknown (-1).\n" ); |
1578 | SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN); |
1579 | } |
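
/* Small worked instance of the NPEEL update above (hypothetical numbers):
   with a constant target alignment of 16, a DR_STEP of 4 bytes, a current
   misalignment of 4 and NPEEL = 3, the new misalignment is

     (4 + 3 * 4) & (16 - 1) = 16 & 15 = 0

   i.e. this peel happens to align the reference as well; with NPEEL = 2 it
   would become (4 + 2 * 4) & 15 = 12.  */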
1580 | |
1581 | /* Return true if alignment is relevant for DR_INFO. */ |
1582 | |
1583 | static bool |
1584 | vect_relevant_for_alignment_p (dr_vec_info *dr_info) |
1585 | { |
1586 | stmt_vec_info stmt_info = dr_info->stmt; |
1587 | |
1588 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
1589 | return false; |
1590 | |
1591 | /* For interleaving, only the alignment of the first access matters. */ |
1592 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info) |
1593 | && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) |
1594 | return false; |
1595 | |
1596 | /* Scatter-gather and invariant accesses continue to address individual |
1597 | scalars, so vector-level alignment is irrelevant. */ |
1598 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) |
1599 | || integer_zerop (DR_STEP (dr_info->dr))) |
1600 | return false; |
1601 | |
1602 | /* Strided accesses perform only component accesses, alignment is |
1603 | irrelevant for them. */ |
1604 | if (STMT_VINFO_STRIDED_P (stmt_info) |
1605 | && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
1606 | return false; |
1607 | |
1608 | return true; |
1609 | } |
1610 | |
1611 | /* Given a memory reference EXP, return whether its alignment is less
1612 | than its size. */ |
1613 | |
1614 | static bool |
1615 | not_size_aligned (tree exp) |
1616 | { |
1617 | if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp)))) |
1618 | return true; |
1619 | |
1620 | return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp))) |
1621 | > get_object_alignment (exp)); |
1622 | } |
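
/* For example (hypothetical case): a 4-byte int field of a struct declared
   with __attribute__((packed)) usually gets an object alignment of only
   1 byte, which is smaller than its 4-byte size, so it is treated as a
   packed (not size-aligned) reference.  */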
1623 | |
1624 | /* Function vector_alignment_reachable_p |
1625 | |
1626 | Return true if vector alignment for DR_INFO is reachable by peeling |
1627 | a few loop iterations. Return false otherwise. */ |
1628 | |
1629 | static bool |
1630 | vector_alignment_reachable_p (dr_vec_info *dr_info) |
1631 | { |
1632 | stmt_vec_info stmt_info = dr_info->stmt; |
1633 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
1634 | |
1635 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
1636 | { |
1637 | /* For interleaved access we peel only if the number of iterations in
1638 | the prologue loop ({VF - misalignment}) is a multiple of the
1639 | number of interleaved accesses (the group size).
1640 | int elem_size, mis_in_elements; |
1641 | |
1642 | /* FORNOW: handle only known alignment. */ |
1643 | if (!known_alignment_for_access_p (dr_info, vectype)) |
1644 | return false; |
1645 | |
1646 | poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (node: vectype); |
1647 | poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype)); |
1648 | elem_size = vector_element_size (vector_size, nelements); |
1649 | mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size; |
1650 | |
1651 | if (!multiple_p (a: nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info))) |
1652 | return false; |
1653 | } |
1654 | |
1655 | /* If the misalignment is known at compile time then allow peeling
1656 | only if natural alignment is reachable through peeling. */ |
1657 | if (known_alignment_for_access_p (dr_info, vectype) |
1658 | && !aligned_access_p (dr_info, vectype)) |
1659 | { |
1660 | HOST_WIDE_INT elmsize = |
1661 | int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); |
1662 | if (dump_enabled_p ()) |
1663 | { |
1664 | dump_printf_loc (MSG_NOTE, vect_location, |
1665 | "data size = %wd. misalignment = %d.\n" , elmsize, |
1666 | dr_misalignment (dr_info, vectype)); |
1667 | } |
1668 | if (dr_misalignment (dr_info, vectype) % elmsize) |
1669 | { |
1670 | if (dump_enabled_p ()) |
1671 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1672 | "data size does not divide the misalignment.\n" ); |
1673 | return false; |
1674 | } |
1675 | } |
1676 | |
1677 | if (!known_alignment_for_access_p (dr_info, vectype)) |
1678 | { |
1679 | tree type = TREE_TYPE (DR_REF (dr_info->dr)); |
1680 | bool is_packed = not_size_aligned (DR_REF (dr_info->dr)); |
1681 | if (dump_enabled_p ()) |
1682 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1683 | "Unknown misalignment, %snaturally aligned\n" , |
1684 | is_packed ? "not " : "" ); |
1685 | return targetm.vectorize.vector_alignment_reachable (type, is_packed); |
1686 | } |
1687 | |
1688 | return true; |
1689 | } |
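
/* Illustration of the divisibility check above (hypothetical numbers):
   with 4-byte elements, a known misalignment of 8 bytes can be removed by
   peeling two scalar iterations, whereas a misalignment of 3 bytes can
   never be removed by peeling whole iterations, because every iteration
   advances the address by a multiple of the 4-byte element size.  */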
1690 | |
1691 | |
1692 | /* Calculate the cost of the memory access represented by DR_INFO. */ |
1693 | |
1694 | static void |
1695 | vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info, |
1696 | dr_alignment_support alignment_support_scheme, |
1697 | int misalignment, |
1698 | unsigned int *inside_cost, |
1699 | unsigned int *outside_cost, |
1700 | stmt_vector_for_cost *body_cost_vec, |
1701 | stmt_vector_for_cost *prologue_cost_vec) |
1702 | { |
1703 | stmt_vec_info stmt_info = dr_info->stmt; |
1704 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo); |
1705 | int ncopies; |
1706 | |
1707 | if (PURE_SLP_STMT (stmt_info)) |
1708 | ncopies = 1; |
1709 | else |
1710 | ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info)); |
1711 | |
1712 | if (DR_IS_READ (dr_info->dr)) |
1713 | vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme, |
1714 | misalignment, true, inside_cost, |
1715 | outside_cost, prologue_cost_vec, body_cost_vec, false); |
1716 | else |
1717 | vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme, |
1718 | misalignment, inside_cost, body_cost_vec); |
1719 | |
1720 | if (dump_enabled_p ()) |
1721 | dump_printf_loc (MSG_NOTE, vect_location, |
1722 | "vect_get_data_access_cost: inside_cost = %d, " |
1723 | "outside_cost = %d.\n" , *inside_cost, *outside_cost); |
1724 | } |
1725 | |
1726 | |
1727 | typedef struct _vect_peel_info |
1728 | { |
1729 | dr_vec_info *dr_info; |
1730 | int npeel; |
1731 | unsigned int count; |
1732 | } *vect_peel_info; |
1733 | |
1734 | typedef struct _vect_peel_extended_info |
1735 | { |
1736 | vec_info *vinfo; |
1737 | struct _vect_peel_info peel_info; |
1738 | unsigned int inside_cost; |
1739 | unsigned int outside_cost; |
1740 | } *vect_peel_extended_info; |
1741 | |
1742 | |
1743 | /* Peeling hashtable helpers. */ |
1744 | |
1745 | struct peel_info_hasher : free_ptr_hash <_vect_peel_info> |
1746 | { |
1747 | static inline hashval_t hash (const _vect_peel_info *); |
1748 | static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); |
1749 | }; |
1750 | |
1751 | inline hashval_t |
1752 | peel_info_hasher::hash (const _vect_peel_info *peel_info) |
1753 | { |
1754 | return (hashval_t) peel_info->npeel; |
1755 | } |
1756 | |
1757 | inline bool |
1758 | peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) |
1759 | { |
1760 | return (a->npeel == b->npeel); |
1761 | } |
1762 | |
1763 | |
1764 | /* Insert DR_INFO into peeling hash table with NPEEL as key. */ |
1765 | |
1766 | static void |
1767 | vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, |
1768 | loop_vec_info loop_vinfo, dr_vec_info *dr_info, |
1769 | int npeel, bool supportable_if_not_aligned) |
1770 | { |
1771 | struct _vect_peel_info elem, *slot; |
1772 | _vect_peel_info **new_slot; |
1773 | |
1774 | elem.npeel = npeel; |
1775 | slot = peeling_htab->find (value: &elem); |
1776 | if (slot) |
1777 | slot->count++; |
1778 | else |
1779 | { |
1780 | slot = XNEW (struct _vect_peel_info); |
1781 | slot->npeel = npeel; |
1782 | slot->dr_info = dr_info; |
1783 | slot->count = 1; |
1784 | new_slot = peeling_htab->find_slot (value: slot, insert: INSERT); |
1785 | *new_slot = slot; |
1786 | } |
1787 | |
1788 | /* If this DR is not supportable when its misalignment is unknown, bias
1789 | this slot's count so it is preferred when the cost model is disabled. */
1790 | if (!supportable_if_not_aligned |
1791 | && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
1792 | slot->count += VECT_MAX_COST; |
1793 | } |
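
/* Sketch of how the table is used (hypothetical DRs): inserting DR A with
   NPEEL 2 creates the slot { npeel = 2, count = 1 }; inserting DR B with
   the same NPEEL merely bumps the count to 2.  When the cost model is
   disabled and a DR is not supportable while unaligned, the VECT_MAX_COST
   bias above makes its slot win in vect_peeling_hash_get_most_frequent.  */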
1794 | |
1795 | |
1796 | /* Traverse the peeling hash table to find the peeling option that aligns
1797 | the maximum number of data accesses. */
1798 | |
1799 | int |
1800 | vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, |
1801 | _vect_peel_extended_info *max) |
1802 | { |
1803 | vect_peel_info elem = *slot; |
1804 | |
1805 | if (elem->count > max->peel_info.count |
1806 | || (elem->count == max->peel_info.count |
1807 | && max->peel_info.npeel > elem->npeel)) |
1808 | { |
1809 | max->peel_info.npeel = elem->npeel; |
1810 | max->peel_info.count = elem->count; |
1811 | max->peel_info.dr_info = elem->dr_info; |
1812 | } |
1813 | |
1814 | return 1; |
1815 | } |
1816 | |
1817 | /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1818 | data access costs for all data refs. DR0_INFO is the data reference
1819 | the peeling is done for, or NULL if no peeling is performed; references
1820 | whose misalignment becomes zero because of that peel are costed as aligned. */
1821 | |
1822 | static void |
1823 | vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo, |
1824 | dr_vec_info *dr0_info, |
1825 | unsigned int *inside_cost, |
1826 | unsigned int *outside_cost, |
1827 | stmt_vector_for_cost *body_cost_vec, |
1828 | stmt_vector_for_cost *prologue_cost_vec, |
1829 | unsigned int npeel) |
1830 | { |
1831 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
1832 | |
1833 | bool dr0_alignment_known_p |
1834 | = (dr0_info |
1835 | && known_alignment_for_access_p (dr_info: dr0_info, |
1836 | STMT_VINFO_VECTYPE (dr0_info->stmt))); |
1837 | |
1838 | for (data_reference *dr : datarefs) |
1839 | { |
1840 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
1841 | if (!vect_relevant_for_alignment_p (dr_info)) |
1842 | continue; |
1843 | |
1844 | tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); |
1845 | dr_alignment_support alignment_support_scheme; |
1846 | int misalignment; |
1847 | unsigned HOST_WIDE_INT alignment; |
1848 | |
1849 | bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr), |
1850 | size_zero_node) < 0; |
1851 | poly_int64 off = 0; |
1852 | if (negative) |
1853 | off = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1) |
1854 | * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))); |
1855 | |
1856 | if (npeel == 0) |
1857 | misalignment = dr_misalignment (dr_info, vectype, offset: off); |
1858 | else if (dr_info == dr0_info |
1859 | || vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info: dr0_info)) |
1860 | misalignment = 0; |
1861 | else if (!dr0_alignment_known_p |
1862 | || !known_alignment_for_access_p (dr_info, vectype) |
1863 | || !DR_TARGET_ALIGNMENT (dr_info).is_constant (const_value: &alignment)) |
1864 | misalignment = DR_MISALIGNMENT_UNKNOWN; |
1865 | else |
1866 | { |
1867 | misalignment = dr_misalignment (dr_info, vectype, offset: off); |
1868 | misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr)); |
1869 | misalignment &= alignment - 1; |
1870 | } |
1871 | alignment_support_scheme |
1872 | = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype, |
1873 | misalignment); |
1874 | |
1875 | vect_get_data_access_cost (vinfo: loop_vinfo, dr_info, |
1876 | alignment_support_scheme, misalignment, |
1877 | inside_cost, outside_cost, |
1878 | body_cost_vec, prologue_cost_vec); |
1879 | } |
1880 | } |
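
/* Example of the classification above (hypothetical values): when costing
   a peel of NPEEL = 3 for DR0, DR0 itself and every DR accepted by
   vect_dr_aligned_if_peeled_dr_is are costed as aligned; a DR with known
   misalignment 4, a 4-byte step and target alignment 16 is costed with
   misalignment (4 + 3 * 4) & 15 = 0; a DR whose alignment cannot be
   determined at compile time is costed with DR_MISALIGNMENT_UNKNOWN.  */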
1881 | |
1882 | /* Traverse peeling hash table and calculate cost for each peeling option. |
1883 | Find the one with the lowest cost. */ |
1884 | |
1885 | int |
1886 | vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, |
1887 | _vect_peel_extended_info *min) |
1888 | { |
1889 | vect_peel_info elem = *slot; |
1890 | int dummy; |
1891 | unsigned int inside_cost = 0, outside_cost = 0; |
1892 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: min->vinfo); |
1893 | stmt_vector_for_cost prologue_cost_vec, body_cost_vec, |
1894 | epilogue_cost_vec; |
1895 | |
1896 | prologue_cost_vec.create (nelems: 2); |
1897 | body_cost_vec.create (nelems: 2); |
1898 | epilogue_cost_vec.create (nelems: 2); |
1899 | |
1900 | vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info: elem->dr_info, inside_cost: &inside_cost, |
1901 | outside_cost: &outside_cost, body_cost_vec: &body_cost_vec, |
1902 | prologue_cost_vec: &prologue_cost_vec, npeel: elem->npeel); |
1903 | |
1904 | body_cost_vec.release (); |
1905 | |
1906 | outside_cost += vect_get_known_peeling_cost |
1907 | (loop_vinfo, elem->npeel, &dummy, |
1908 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), |
1909 | &prologue_cost_vec, &epilogue_cost_vec); |
1910 | |
1911 | /* Prologue and epilogue costs are added to the target model later. |
1912 | These costs depend only on the scalar iteration cost, the |
1913 | number of peeling iterations finally chosen, and the number of |
1914 | misaligned statements. So discard the information found here. */ |
1915 | prologue_cost_vec.release (); |
1916 | epilogue_cost_vec.release (); |
1917 | |
1918 | if (inside_cost < min->inside_cost |
1919 | || (inside_cost == min->inside_cost |
1920 | && outside_cost < min->outside_cost)) |
1921 | { |
1922 | min->inside_cost = inside_cost; |
1923 | min->outside_cost = outside_cost; |
1924 | min->peel_info.dr_info = elem->dr_info; |
1925 | min->peel_info.npeel = elem->npeel; |
1926 | min->peel_info.count = elem->count; |
1927 | } |
1928 | |
1929 | return 1; |
1930 | } |
1931 | |
1932 | |
1933 | /* Choose best peeling option by traversing peeling hash table and either |
1934 | choosing an option with the lowest cost (if cost model is enabled) or the |
1935 | option that aligns as many accesses as possible. */ |
1936 | |
1937 | static struct _vect_peel_extended_info |
1938 | vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, |
1939 | loop_vec_info loop_vinfo) |
1940 | { |
1941 | struct _vect_peel_extended_info res; |
1942 | |
1943 | res.peel_info.dr_info = NULL; |
1944 | res.vinfo = loop_vinfo; |
1945 | |
1946 | if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
1947 | { |
1948 | res.inside_cost = INT_MAX; |
1949 | res.outside_cost = INT_MAX; |
1950 | peeling_htab->traverse <_vect_peel_extended_info *, |
1951 | vect_peeling_hash_get_lowest_cost> (argument: &res); |
1952 | } |
1953 | else |
1954 | { |
1955 | res.peel_info.count = 0; |
1956 | peeling_htab->traverse <_vect_peel_extended_info *, |
1957 | vect_peeling_hash_get_most_frequent> (argument: &res); |
1958 | res.inside_cost = 0; |
1959 | res.outside_cost = 0; |
1960 | } |
1961 | |
1962 | return res; |
1963 | } |
1964 | |
1965 | /* Return true if the new peeling NPEEL is supported. */ |
1966 | |
1967 | static bool |
1968 | vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info, |
1969 | unsigned npeel) |
1970 | { |
1971 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
1972 | enum dr_alignment_support supportable_dr_alignment; |
1973 | |
1974 | bool dr0_alignment_known_p |
1975 | = known_alignment_for_access_p (dr_info: dr0_info, |
1976 | STMT_VINFO_VECTYPE (dr0_info->stmt)); |
1977 | |
1978 | /* Ensure that all data refs can be vectorized after the peel. */ |
1979 | for (data_reference *dr : datarefs) |
1980 | { |
1981 | if (dr == dr0_info->dr) |
1982 | continue; |
1983 | |
1984 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
1985 | if (!vect_relevant_for_alignment_p (dr_info) |
1986 | || vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info: dr0_info)) |
1987 | continue; |
1988 | |
1989 | tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); |
1990 | int misalignment; |
1991 | unsigned HOST_WIDE_INT alignment; |
1992 | if (!dr0_alignment_known_p |
1993 | || !known_alignment_for_access_p (dr_info, vectype) |
1994 | || !DR_TARGET_ALIGNMENT (dr_info).is_constant (const_value: &alignment)) |
1995 | misalignment = DR_MISALIGNMENT_UNKNOWN; |
1996 | else |
1997 | { |
1998 | misalignment = dr_misalignment (dr_info, vectype); |
1999 | misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr)); |
2000 | misalignment &= alignment - 1; |
2001 | } |
2002 | supportable_dr_alignment |
2003 | = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype, |
2004 | misalignment); |
2005 | if (supportable_dr_alignment == dr_unaligned_unsupported) |
2006 | return false; |
2007 | } |
2008 | |
2009 | return true; |
2010 | } |
2011 | |
2012 | /* Compare two data-references DRA and DRB to group them into chunks |
2013 | with related alignment. */ |
2014 | |
2015 | static int |
2016 | dr_align_group_sort_cmp (const void *dra_, const void *drb_) |
2017 | { |
2018 | data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); |
2019 | data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); |
2020 | int cmp; |
2021 | |
2022 | /* Stabilize sort. */ |
2023 | if (dra == drb) |
2024 | return 0; |
2025 | |
2026 | /* Ordering of DRs according to base. */ |
2027 | cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), |
2028 | DR_BASE_ADDRESS (drb)); |
2029 | if (cmp != 0) |
2030 | return cmp; |
2031 | |
2032 | /* And according to DR_OFFSET. */ |
2033 | cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); |
2034 | if (cmp != 0) |
2035 | return cmp; |
2036 | |
2037 | /* And according to DR_STEP. */
2038 | cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); |
2039 | if (cmp != 0) |
2040 | return cmp; |
2041 | |
2042 | /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2043 | cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); |
2044 | if (cmp == 0) |
2045 | return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; |
2046 | return cmp; |
2047 | } |
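
/* As an illustration (hypothetical references): after qsorting with this
   comparator, accesses like a[i], a[i+1] and a[i+4] (same base, offset and
   step, differing only in DR_INIT) become adjacent and form one subgroup,
   while accesses to b[i] end up in a separate subgroup; ties on all keys
   fall back to the statement UID so the sort stays stable.  */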
2048 | |
2049 | /* Function vect_enhance_data_refs_alignment |
2050 | |
2051 | This pass will use loop versioning and loop peeling in order to enhance |
2052 | the alignment of data references in the loop. |
2053 | |
2054 | FOR NOW: we assume that whatever versioning/peeling takes place, only the |
2055 | original loop is to be vectorized. Any other loops that are created by |
2056 | the transformations performed in this pass are not supposed to be
2057 | vectorized. This restriction will be relaxed. |
2058 | |
2059 | This pass will require a cost model to guide it whether to apply peeling |
2060 | or versioning or a combination of the two. For example, the scheme that |
2061 | Intel uses when given a loop with several memory accesses is as follows:
2062 | choose one memory access ('p') whose alignment you want to force by doing
2063 | peeling. Then, either (1) generate a loop in which 'p' is aligned and all |
2064 | other accesses are not necessarily aligned, or (2) use loop versioning to |
2065 | generate one loop in which all accesses are aligned, and another loop in |
2066 | which only 'p' is necessarily aligned. |
2067 | |
2068 | ("Automatic Intra-Register Vectorization for the Intel Architecture", |
2069 | Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2070 | Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) |
2071 | |
2072 | Devising a cost model is the most critical aspect of this work. It will |
2073 | guide us on which access to peel for, whether to use loop versioning, how |
2074 | many versions to create, etc. The cost model will probably consist of |
2075 | generic considerations as well as target specific considerations (on |
2076 | powerpc for example, misaligned stores are more painful than misaligned |
2077 | loads). |
2078 | |
2079 | Here are the general steps involved in alignment enhancements: |
2080 | |
2081 | -- original loop, before alignment analysis: |
2082 | for (i=0; i<N; i++){ |
2083 | x = q[i]; # DR_MISALIGNMENT(q) = unknown |
2084 | p[i] = y; # DR_MISALIGNMENT(p) = unknown |
2085 | } |
2086 | |
2087 | -- After vect_compute_data_refs_alignment: |
2088 | for (i=0; i<N; i++){ |
2089 | x = q[i]; # DR_MISALIGNMENT(q) = 3 |
2090 | p[i] = y; # DR_MISALIGNMENT(p) = unknown |
2091 | } |
2092 | |
2093 | -- Possibility 1: we do loop versioning: |
2094 | if (p is aligned) { |
2095 | for (i=0; i<N; i++){ # loop 1A |
2096 | x = q[i]; # DR_MISALIGNMENT(q) = 3 |
2097 | p[i] = y; # DR_MISALIGNMENT(p) = 0 |
2098 | } |
2099 | } |
2100 | else { |
2101 | for (i=0; i<N; i++){ # loop 1B |
2102 | x = q[i]; # DR_MISALIGNMENT(q) = 3 |
2103 | p[i] = y; # DR_MISALIGNMENT(p) = unaligned |
2104 | } |
2105 | } |
2106 | |
2107 | -- Possibility 2: we do loop peeling: |
2108 | for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). |
2109 | x = q[i]; |
2110 | p[i] = y; |
2111 | } |
2112 | for (i = 3; i < N; i++){ # loop 2A |
2113 | x = q[i]; # DR_MISALIGNMENT(q) = 0 |
2114 | p[i] = y; # DR_MISALIGNMENT(p) = unknown |
2115 | } |
2116 | |
2117 | -- Possibility 3: combination of loop peeling and versioning: |
2118 | for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). |
2119 | x = q[i]; |
2120 | p[i] = y; |
2121 | } |
2122 | if (p is aligned) { |
2123 | for (i = 3; i<N; i++){ # loop 3A |
2124 | x = q[i]; # DR_MISALIGNMENT(q) = 0 |
2125 | p[i] = y; # DR_MISALIGNMENT(p) = 0 |
2126 | } |
2127 | } |
2128 | else { |
2129 | for (i = 3; i<N; i++){ # loop 3B |
2130 | x = q[i]; # DR_MISALIGNMENT(q) = 0 |
2131 | p[i] = y; # DR_MISALIGNMENT(p) = unaligned |
2132 | } |
2133 | } |
2134 | |
2135 | These loops are later passed to loop_transform to be vectorized. The |
2136 | vectorizer will use the alignment information to guide the transformation |
2137 | (whether to generate regular loads/stores, or with special handling for |
2138 | misalignment). */ |
2139 | |
2140 | opt_result |
2141 | vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) |
2142 | { |
2143 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
2144 | dr_vec_info *first_store = NULL; |
2145 | dr_vec_info *dr0_info = NULL; |
2146 | struct data_reference *dr; |
2147 | unsigned int i; |
2148 | bool do_peeling = false; |
2149 | bool do_versioning = false; |
2150 | unsigned int npeel = 0; |
2151 | bool one_misalignment_known = false; |
2152 | bool one_misalignment_unknown = false; |
2153 | bool one_dr_unsupportable = false; |
2154 | dr_vec_info *unsupportable_dr_info = NULL; |
2155 | unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0; |
2156 | hash_table<peel_info_hasher> peeling_htab (1); |
2157 | |
2158 | DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment" ); |
2159 | |
2160 | /* Reset data so we can safely be called multiple times. */ |
2161 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (size: 0); |
2162 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; |
2163 | |
2164 | if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ()) |
2165 | return opt_result::success (); |
2166 | |
2167 | /* Sort the vector of datarefs so DRs that have the same or dependent |
2168 | alignment are next to each other. */ |
2169 | auto_vec<data_reference_p> datarefs |
2170 | = LOOP_VINFO_DATAREFS (loop_vinfo).copy (); |
2171 | datarefs.qsort (dr_align_group_sort_cmp); |
2172 | |
2173 | /* For each dataref, compute how many other DRs become aligned when we
2174 | peel until that dataref itself is aligned. */
2175 | auto_vec<unsigned> n_same_align_refs (datarefs.length ()); |
2176 | n_same_align_refs.quick_grow_cleared (len: datarefs.length ()); |
2177 | unsigned i0; |
2178 | for (i0 = 0; i0 < datarefs.length (); ++i0) |
2179 | if (DR_BASE_ADDRESS (datarefs[i0])) |
2180 | break; |
2181 | for (i = i0 + 1; i <= datarefs.length (); ++i) |
2182 | { |
2183 | if (i == datarefs.length () |
2184 | || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]), |
2185 | DR_BASE_ADDRESS (datarefs[i]), flags: 0) |
2186 | || !operand_equal_p (DR_OFFSET (datarefs[i0]), |
2187 | DR_OFFSET (datarefs[i]), flags: 0) |
2188 | || !operand_equal_p (DR_STEP (datarefs[i0]), |
2189 | DR_STEP (datarefs[i]), flags: 0)) |
2190 | { |
2191 | /* The subgroup [i0, i-1] now only differs in DR_INIT and |
2192 | possibly DR_TARGET_ALIGNMENT. Still the whole subgroup |
2193 | will get known misalignment if we align one of the refs |
2194 | with the largest DR_TARGET_ALIGNMENT. */ |
2195 | for (unsigned j = i0; j < i; ++j) |
2196 | { |
2197 | dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]); |
2198 | for (unsigned k = i0; k < i; ++k) |
2199 | { |
2200 | if (k == j) |
2201 | continue; |
2202 | dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]); |
2203 | if (vect_dr_aligned_if_related_peeled_dr_is (dr_info: dr_infok, |
2204 | dr_peel_info: dr_infoj)) |
2205 | n_same_align_refs[j]++; |
2206 | } |
2207 | } |
2208 | i0 = i; |
2209 | } |
2210 | } |
2211 | |
2212 | /* While cost model enhancements are expected in the future, the high level |
2213 | view of the code at this time is as follows: |
2214 | |
2215 | A) If there is a misaligned access then see if peeling to align |
2216 | this access can make all data references satisfy |
2217 | vect_supportable_dr_alignment. If so, update data structures |
2218 | as needed and return true. |
2219 | |
2220 | B) If peeling wasn't possible and there is a data reference with an |
2221 | unknown misalignment that does not satisfy vect_supportable_dr_alignment |
2222 | then see if loop versioning checks can be used to make all data |
2223 | references satisfy vect_supportable_dr_alignment. If so, update |
2224 | data structures as needed and return true. |
2225 | |
2226 | C) If neither peeling nor versioning were successful then return false if |
2227 | any data reference does not satisfy vect_supportable_dr_alignment. |
2228 | |
2229 | D) Return true (all data references satisfy vect_supportable_dr_alignment). |
2230 | |
2231 | Note, Possibility 3 above (which is peeling and versioning together) is not |
2232 | being done at this time. */ |
2233 | |
2234 | /* (1) Peeling to force alignment. */ |
2235 | |
2236 | /* (1.1) Decide whether to perform peeling, and how many iterations to peel: |
2237 | Considerations: |
2238 | + How many accesses will become aligned due to the peeling |
2239 | - How many accesses will become unaligned due to the peeling, |
2240 | and the cost of misaligned accesses. |
2241 | - The cost of peeling (the extra runtime checks, the increase |
2242 | in code size). */ |
2243 | |
2244 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
2245 | { |
2246 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
2247 | if (!vect_relevant_for_alignment_p (dr_info)) |
2248 | continue; |
2249 | |
2250 | stmt_vec_info stmt_info = dr_info->stmt; |
2251 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
2252 | do_peeling = vector_alignment_reachable_p (dr_info); |
2253 | if (do_peeling) |
2254 | { |
2255 | if (known_alignment_for_access_p (dr_info, vectype)) |
2256 | { |
2257 | unsigned int npeel_tmp = 0; |
2258 | bool negative = tree_int_cst_compare (DR_STEP (dr), |
2259 | size_zero_node) < 0; |
2260 | |
2261 | /* If known_alignment_for_access_p then we have set |
2262 | DR_MISALIGNMENT which is only done if we know it at compile
2263 | time, so it is safe to assume target alignment is constant. |
2264 | */ |
2265 | unsigned int target_align = |
2266 | DR_TARGET_ALIGNMENT (dr_info).to_constant (); |
2267 | unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info); |
2268 | poly_int64 off = 0; |
2269 | if (negative) |
2270 | off = (TYPE_VECTOR_SUBPARTS (node: vectype) - 1) * -dr_size; |
2271 | unsigned int mis = dr_misalignment (dr_info, vectype, offset: off); |
2272 | mis = negative ? mis : -mis; |
2273 | if (mis != 0) |
2274 | npeel_tmp = (mis & (target_align - 1)) / dr_size; |
2275 | |
2276 | /* For multiple types, it is possible that the bigger type access |
2277 | will have more than one peeling option. E.g., a loop with two |
2278 | types: one of size (vector size / 4), and the other one of |
2279 | size (vector size / 8). The vectorization factor will be 8. If both
2280 | accesses are misaligned by 3, the first one needs one scalar |
2281 | iteration to be aligned, and the second one needs 5. But the |
2282 | first one will be aligned also by peeling 5 scalar |
2283 | iterations, and in that case both accesses will be aligned. |
2284 | Hence, except for the immediate peeling amount, we also want |
2285 | to try to add full vector size, while we don't exceed |
2286 | vectorization factor. |
2287 | We do this automatically for cost model, since we calculate |
2288 | cost for every peeling option. */ |
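/* Concretely (hypothetical numbers): with a target alignment of 16, 4-byte
   elements and an initial peeling amount of 1, and with nscalars below
   ending up as VF = 8, the loop below records the candidates npeel = 1
   and npeel = 5; the next candidate, 9, already exceeds the vectorization
   factor and is not tried.  */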
2289 | poly_uint64 nscalars = npeel_tmp; |
2290 | if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
2291 | { |
2292 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2293 | nscalars = (STMT_SLP_TYPE (stmt_info) |
2294 | ? vf * DR_GROUP_SIZE (stmt_info) : vf); |
2295 | } |
2296 | |
2297 | /* Save info about DR in the hash table. Also include peeling |
2298 | amounts according to the explanation above. Indicate |
2299 | the alignment status when the ref is not aligned. |
2300 | ??? Rather than using unknown alignment here we should |
2301 | prune all entries from the peeling hashtable which cause |
2302 | DRs to be not supported. */ |
2303 | bool supportable_if_not_aligned |
2304 | = vect_supportable_dr_alignment |
2305 | (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN); |
2306 | while (known_le (npeel_tmp, nscalars)) |
2307 | { |
2308 | vect_peeling_hash_insert (peeling_htab: &peeling_htab, loop_vinfo, |
2309 | dr_info, npeel: npeel_tmp, |
2310 | supportable_if_not_aligned); |
2311 | npeel_tmp += MAX (1, target_align / dr_size); |
2312 | } |
2313 | |
2314 | one_misalignment_known = true; |
2315 | } |
2316 | else |
2317 | { |
2318 | /* If we don't know any misalignment values, we prefer |
2319 | peeling for the data-ref that has the maximum number of data-refs
2320 | with the same alignment, unless the target prefers to align
2321 | stores over loads. */
2322 | unsigned same_align_drs = n_same_align_refs[i]; |
2323 | if (!dr0_info |
2324 | || dr0_same_align_drs < same_align_drs) |
2325 | { |
2326 | dr0_same_align_drs = same_align_drs; |
2327 | dr0_info = dr_info; |
2328 | } |
2329 | /* For data-refs with the same number of related |
2330 | accesses prefer the one where the misalign |
2331 | computation will be invariant in the outermost loop. */ |
2332 | else if (dr0_same_align_drs == same_align_drs) |
2333 | { |
2334 | class loop *ivloop0, *ivloop; |
2335 | ivloop0 = outermost_invariant_loop_for_expr |
2336 | (loop, DR_BASE_ADDRESS (dr0_info->dr)); |
2337 | ivloop = outermost_invariant_loop_for_expr |
2338 | (loop, DR_BASE_ADDRESS (dr)); |
2339 | if ((ivloop && !ivloop0) |
2340 | || (ivloop && ivloop0 |
2341 | && flow_loop_nested_p (ivloop, ivloop0))) |
2342 | dr0_info = dr_info; |
2343 | } |
2344 | |
2345 | one_misalignment_unknown = true; |
2346 | |
2347 | /* Check for data refs with unsupportable alignment that |
2348 | can be peeled. */ |
2349 | enum dr_alignment_support supportable_dr_alignment |
2350 | = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype, |
2351 | DR_MISALIGNMENT_UNKNOWN); |
2352 | if (supportable_dr_alignment == dr_unaligned_unsupported) |
2353 | { |
2354 | one_dr_unsupportable = true; |
2355 | unsupportable_dr_info = dr_info; |
2356 | } |
2357 | |
2358 | if (!first_store && DR_IS_WRITE (dr)) |
2359 | { |
2360 | first_store = dr_info; |
2361 | first_store_same_align_drs = same_align_drs; |
2362 | } |
2363 | } |
2364 | } |
2365 | else |
2366 | { |
2367 | if (!aligned_access_p (dr_info, vectype)) |
2368 | { |
2369 | if (dump_enabled_p ()) |
2370 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2371 | "vector alignment may not be reachable\n" ); |
2372 | break; |
2373 | } |
2374 | } |
2375 | } |
2376 | |
2377 | /* Check if we can possibly peel the loop. */ |
2378 | if (!vect_can_advance_ivs_p (loop_vinfo) |
2379 | || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), |
2380 | loop_preheader_edge (loop)) |
2381 | || loop->inner |
2382 | || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) |
2383 | do_peeling = false; |
2384 | |
2385 | struct _vect_peel_extended_info peel_for_known_alignment; |
2386 | struct _vect_peel_extended_info peel_for_unknown_alignment; |
2387 | struct _vect_peel_extended_info best_peel; |
2388 | |
2389 | peel_for_unknown_alignment.inside_cost = INT_MAX; |
2390 | peel_for_unknown_alignment.outside_cost = INT_MAX; |
2391 | peel_for_unknown_alignment.peel_info.count = 0; |
2392 | |
2393 | if (do_peeling |
2394 | && one_misalignment_unknown) |
2395 | { |
2396 | /* Check if the target prefers to align stores over loads, i.e., if
2397 | misaligned stores are more expensive than misaligned loads (taking |
2398 | drs with same alignment into account). */ |
2399 | unsigned int load_inside_cost = 0; |
2400 | unsigned int load_outside_cost = 0; |
2401 | unsigned int store_inside_cost = 0; |
2402 | unsigned int store_outside_cost = 0; |
2403 | unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; |
2404 | |
2405 | stmt_vector_for_cost dummy; |
2406 | dummy.create (nelems: 2); |
2407 | vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info, |
2408 | inside_cost: &load_inside_cost, |
2409 | outside_cost: &load_outside_cost, |
2410 | body_cost_vec: &dummy, prologue_cost_vec: &dummy, npeel: estimated_npeels); |
2411 | dummy.release (); |
2412 | |
2413 | if (first_store) |
2414 | { |
2415 | dummy.create (nelems: 2); |
2416 | vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info: first_store, |
2417 | inside_cost: &store_inside_cost, |
2418 | outside_cost: &store_outside_cost, |
2419 | body_cost_vec: &dummy, prologue_cost_vec: &dummy, |
2420 | npeel: estimated_npeels); |
2421 | dummy.release (); |
2422 | } |
2423 | else |
2424 | { |
2425 | store_inside_cost = INT_MAX; |
2426 | store_outside_cost = INT_MAX; |
2427 | } |
2428 | |
2429 | if (load_inside_cost > store_inside_cost |
2430 | || (load_inside_cost == store_inside_cost |
2431 | && load_outside_cost > store_outside_cost)) |
2432 | { |
2433 | dr0_info = first_store; |
2434 | dr0_same_align_drs = first_store_same_align_drs; |
2435 | peel_for_unknown_alignment.inside_cost = store_inside_cost; |
2436 | peel_for_unknown_alignment.outside_cost = store_outside_cost; |
2437 | } |
2438 | else |
2439 | { |
2440 | peel_for_unknown_alignment.inside_cost = load_inside_cost; |
2441 | peel_for_unknown_alignment.outside_cost = load_outside_cost; |
2442 | } |
2443 | |
2444 | stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; |
2445 | prologue_cost_vec.create (nelems: 2); |
2446 | epilogue_cost_vec.create (nelems: 2); |
2447 | |
2448 | int dummy2; |
2449 | peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost |
2450 | (loop_vinfo, estimated_npeels, &dummy2, |
2451 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), |
2452 | &prologue_cost_vec, &epilogue_cost_vec); |
2453 | |
2454 | prologue_cost_vec.release (); |
2455 | epilogue_cost_vec.release (); |
2456 | |
2457 | peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1; |
2458 | } |
2459 | |
2460 | peel_for_unknown_alignment.peel_info.npeel = 0; |
2461 | peel_for_unknown_alignment.peel_info.dr_info = dr0_info; |
2462 | |
2463 | best_peel = peel_for_unknown_alignment; |
2464 | |
2465 | peel_for_known_alignment.inside_cost = INT_MAX; |
2466 | peel_for_known_alignment.outside_cost = INT_MAX; |
2467 | peel_for_known_alignment.peel_info.count = 0; |
2468 | peel_for_known_alignment.peel_info.dr_info = NULL; |
2469 | |
2470 | if (do_peeling && one_misalignment_known) |
2471 | { |
2472 | /* Peeling is possible, but there is no data access that is not supported |
2473 | unless aligned. So we try to choose the best possible peeling from |
2474 | the hash table. */ |
2475 | peel_for_known_alignment = vect_peeling_hash_choose_best_peeling |
2476 | (peeling_htab: &peeling_htab, loop_vinfo); |
2477 | } |
2478 | |
2479 | /* Compare costs of peeling for known and unknown alignment. */ |
2480 | if (peel_for_known_alignment.peel_info.dr_info != NULL |
2481 | && peel_for_unknown_alignment.inside_cost |
2482 | >= peel_for_known_alignment.inside_cost) |
2483 | { |
2484 | best_peel = peel_for_known_alignment; |
2485 | |
2486 | /* If the best peeling for known alignment has NPEEL == 0, perform no |
2487 | peeling at all except if there is an unsupportable dr that we can |
2488 | align. */ |
2489 | if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) |
2490 | do_peeling = false; |
2491 | } |
2492 | |
2493 | /* If there is an unsupportable data ref, prefer this over all choices so far |
2494 | since we'd have to discard a chosen peeling except when it accidentally |
2495 | aligned the unsupportable data ref. */ |
2496 | if (one_dr_unsupportable) |
2497 | dr0_info = unsupportable_dr_info; |
2498 | else if (do_peeling) |
2499 | { |
2500 | /* Calculate the penalty for no peeling, i.e. leaving everything as-is. |
2501 | TODO: Use nopeel_outside_cost or get rid of it? */ |
2502 | unsigned nopeel_inside_cost = 0; |
2503 | unsigned nopeel_outside_cost = 0; |
2504 | |
2505 | stmt_vector_for_cost dummy; |
2506 | dummy.create (nelems: 2); |
2507 | vect_get_peeling_costs_all_drs (loop_vinfo, NULL, inside_cost: &nopeel_inside_cost, |
2508 | outside_cost: &nopeel_outside_cost, body_cost_vec: &dummy, prologue_cost_vec: &dummy, npeel: 0); |
2509 | dummy.release (); |
2510 | |
2511 | /* Add epilogue costs. As we do not peel for alignment here, no prologue |
2512 | costs will be recorded. */ |
2513 | stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; |
2514 | prologue_cost_vec.create (nelems: 2); |
2515 | epilogue_cost_vec.create (nelems: 2); |
2516 | |
2517 | int dummy2; |
2518 | nopeel_outside_cost += vect_get_known_peeling_cost |
2519 | (loop_vinfo, 0, &dummy2, |
2520 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), |
2521 | &prologue_cost_vec, &epilogue_cost_vec); |
2522 | |
2523 | prologue_cost_vec.release (); |
2524 | epilogue_cost_vec.release (); |
2525 | |
2526 | npeel = best_peel.peel_info.npeel; |
2527 | dr0_info = best_peel.peel_info.dr_info; |
2528 | |
2529 | /* If not peeling is no more expensive than the best peeling found
2530 | so far, don't perform any peeling. */
2531 | if (nopeel_inside_cost <= best_peel.inside_cost) |
2532 | do_peeling = false; |
2533 | } |
2534 | |
2535 | if (do_peeling) |
2536 | { |
2537 | stmt_vec_info stmt_info = dr0_info->stmt; |
2538 | if (known_alignment_for_access_p (dr_info: dr0_info, |
2539 | STMT_VINFO_VECTYPE (stmt_info))) |
2540 | { |
2541 | bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr), |
2542 | size_zero_node) < 0; |
2543 | if (!npeel) |
2544 | { |
2545 | /* Since it's known at compile time, compute the number of |
2546 | iterations in the peeled loop (the peeling factor) for use in |
2547 | updating DR_MISALIGNMENT values. The peeling factor is the |
2548 | vectorization factor minus the misalignment as an element |
2549 | count. */ |
2550 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
2551 | poly_int64 off = 0; |
2552 | if (negative) |
2553 | off = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1) |
2554 | * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))); |
2555 | unsigned int mis |
2556 | = dr_misalignment (dr_info: dr0_info, vectype, offset: off); |
2557 | mis = negative ? mis : -mis; |
2558 | /* If known_alignment_for_access_p then we have set |
2559 | DR_MISALIGNMENT which is only done if we know it at compile
2560 | time, so it is safe to assume target alignment is constant. |
2561 | */ |
2562 | unsigned int target_align = |
2563 | DR_TARGET_ALIGNMENT (dr0_info).to_constant (); |
2564 | npeel = ((mis & (target_align - 1)) |
2565 | / vect_get_scalar_dr_size (dr_info: dr0_info)); |
2566 | } |
2567 | |
2568 | /* For interleaved data access every iteration accesses all the |
2569 | members of the group, therefore we divide the number of iterations |
2570 | by the group size. */ |
2571 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
2572 | npeel /= DR_GROUP_SIZE (stmt_info); |
2573 | |
2574 | if (dump_enabled_p ()) |
2575 | dump_printf_loc (MSG_NOTE, vect_location, |
2576 | "Try peeling by %d\n" , npeel); |
2577 | } |
2578 | |
2579 | /* Ensure that all datarefs can be vectorized after the peel. */ |
2580 | if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel)) |
2581 | do_peeling = false; |
2582 | |
2583 | /* Check if all datarefs are supportable and log. */ |
2584 | if (do_peeling |
2585 | && npeel == 0 |
2586 | && known_alignment_for_access_p (dr_info: dr0_info, |
2587 | STMT_VINFO_VECTYPE (stmt_info))) |
2588 | return opt_result::success (); |
2589 | |
2590 | /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ |
2591 | if (do_peeling) |
2592 | { |
2593 | unsigned max_allowed_peel |
2594 | = param_vect_max_peeling_for_alignment; |
2595 | if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP) |
2596 | max_allowed_peel = 0; |
2597 | if (max_allowed_peel != (unsigned)-1) |
2598 | { |
2599 | unsigned max_peel = npeel; |
2600 | if (max_peel == 0) |
2601 | { |
2602 | poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info); |
2603 | unsigned HOST_WIDE_INT target_align_c; |
2604 | if (target_align.is_constant (const_value: &target_align_c)) |
2605 | max_peel = |
2606 | target_align_c / vect_get_scalar_dr_size (dr_info: dr0_info) - 1; |
2607 | else |
2608 | { |
2609 | do_peeling = false; |
2610 | if (dump_enabled_p ()) |
2611 | dump_printf_loc (MSG_NOTE, vect_location, |
2612 | "Disable peeling, max peels set and vector" |
2613 | " alignment unknown\n" ); |
2614 | } |
2615 | } |
2616 | if (max_peel > max_allowed_peel) |
2617 | { |
2618 | do_peeling = false; |
2619 | if (dump_enabled_p ()) |
2620 | dump_printf_loc (MSG_NOTE, vect_location, |
2621 | "Disable peeling, max peels reached: %d\n" , max_peel); |
2622 | } |
2623 | } |
2624 | } |
2625 | |
2626 | /* Cost model #2 - if peeling may result in a remaining loop not |
2627 | iterating enough to be vectorized then do not peel. Since this |
2628 | is a cost heuristic rather than a correctness decision, use the |
2629 | most likely runtime value for variable vectorization factors. */ |
2630 | if (do_peeling |
2631 | && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) |
2632 | { |
2633 | unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
2634 | unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel; |
2635 | if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) |
2636 | < assumed_vf + max_peel) |
2637 | do_peeling = false; |
2638 | } |
2639 | |
2640 | if (do_peeling) |
2641 | { |
2642 | /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. |
2643 | If the misalignment of DR_i is identical to that of dr0 then set |
2644 | DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and |
2645 | dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) |
2646 | by the peeling factor times the element size of DR_i (MOD the |
2647 | vectorization factor times the size). Otherwise, the |
2648 | misalignment of DR_i must be set to unknown. */ |
2649 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
2650 | if (dr != dr0_info->dr) |
2651 | { |
2652 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
2653 | if (!vect_relevant_for_alignment_p (dr_info)) |
2654 | continue; |
2655 | |
2656 | vect_update_misalignment_for_peel (dr_info, dr_peel_info: dr0_info, npeel); |
2657 | } |
2658 | |
2659 | LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info; |
2660 | if (npeel) |
2661 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; |
2662 | else |
2663 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1; |
2664 | SET_DR_MISALIGNMENT (dr0_info, |
2665 | vect_dr_misalign_for_aligned_access (dr0_info)); |
2666 | if (dump_enabled_p ()) |
2667 | { |
2668 | dump_printf_loc (MSG_NOTE, vect_location, |
2669 | "Alignment of access forced using peeling.\n" ); |
2670 | dump_printf_loc (MSG_NOTE, vect_location, |
2671 | "Peeling for alignment will be applied.\n" ); |
2672 | } |
2673 | |
2674 | /* The inside-loop cost will be accounted for in vectorizable_load |
2675 | and vectorizable_store correctly with adjusted alignments. |
2676 | Drop the body_cost_vec on the floor here. */
2677 | return opt_result::success (); |
2678 | } |
2679 | } |
2680 | |
2681 | /* (2) Versioning to force alignment. */ |
2682 | |
2683 | /* Try versioning if: |
2684 | 1) optimize loop for speed and the cost-model is not cheap |
2685 | 2) there is at least one unsupported misaligned data ref with an unknown |
2686 | misalignment, and |
2687 | 3) all misaligned data refs with a known misalignment are supported, and |
2688 | 4) the number of runtime alignment checks is within reason. */ |
2689 | |
2690 | do_versioning |
2691 | = (optimize_loop_nest_for_speed_p (loop) |
2692 | && !loop->inner /* FORNOW */ |
2693 | && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP); |
2694 | |
2695 | if (do_versioning) |
2696 | { |
2697 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
2698 | { |
2699 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
2700 | if (!vect_relevant_for_alignment_p (dr_info)) |
2701 | continue; |
2702 | |
2703 | stmt_vec_info stmt_info = dr_info->stmt; |
2704 | if (STMT_VINFO_STRIDED_P (stmt_info)) |
2705 | { |
2706 | do_versioning = false; |
2707 | break; |
2708 | } |
2709 | |
2710 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
2711 | bool negative = tree_int_cst_compare (DR_STEP (dr), |
2712 | size_zero_node) < 0; |
2713 | poly_int64 off = 0; |
2714 | if (negative) |
2715 | off = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1) |
2716 | * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))); |
2717 | int misalignment; |
2718 | if ((misalignment = dr_misalignment (dr_info, vectype, offset: off)) == 0) |
2719 | continue; |
2720 | |
2721 | enum dr_alignment_support supportable_dr_alignment |
2722 | = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype, |
2723 | misalignment); |
2724 | if (supportable_dr_alignment == dr_unaligned_unsupported) |
2725 | { |
2726 | if (misalignment != DR_MISALIGNMENT_UNKNOWN |
2727 | || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () |
2728 | >= (unsigned) param_vect_max_version_for_alignment_checks)) |
2729 | { |
2730 | do_versioning = false; |
2731 | break; |
2732 | } |
2733 | |
2734 | /* At present we don't support versioning for alignment |
2735 | with variable VF, since there's no guarantee that the |
2736 | VF is a power of two. We could relax this if we added |
2737 | a way of enforcing a power-of-two size. */ |
2738 | unsigned HOST_WIDE_INT size; |
2739 | if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (const_value: &size)) |
2740 | { |
2741 | do_versioning = false; |
2742 | break; |
2743 | } |
2744 | |
2745 | /* Forcing alignment in the first iteration is no good if |
2746 | we don't keep it across iterations. For now, just disable |
2747 | versioning in this case. |
2748 | ?? We could actually unroll the loop to achieve the required |
2749 | overall step alignment, and forcing the alignment could be |
2750 | done by doing some iterations of the non-vectorized loop. */ |
2751 | if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) |
2752 | * DR_STEP_ALIGNMENT (dr), |
2753 | DR_TARGET_ALIGNMENT (dr_info))) |
2754 | { |
2755 | do_versioning = false; |
2756 | break; |
2757 | } |
2758 | |
2759 | /* The rightmost bits of an aligned address must be zeros. |
2760 | Construct the mask needed for this test. For example, |
2761 | GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the |
2762 | mask must be 15 = 0xf. */ |
2763 | int mask = size - 1; |
2764 | |
2765 | /* FORNOW: use the same mask to test all potentially unaligned |
2766 | references in the loop. */ |
2767 | if (LOOP_VINFO_PTR_MASK (loop_vinfo) |
2768 | && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask) |
2769 | { |
2770 | do_versioning = false; |
2771 | break; |
2772 | } |
2773 | |
2774 | LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; |
2775 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (obj: stmt_info); |
2776 | } |
2777 | } |
2778 | |
2779 | /* Versioning requires at least one misaligned data reference. */ |
2780 | if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) |
2781 | do_versioning = false; |
2782 | else if (!do_versioning) |
2783 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (size: 0); |
2784 | } |
2785 | |
2786 | if (do_versioning) |
2787 | { |
2788 | const vec<stmt_vec_info> &may_misalign_stmts |
2789 | = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); |
2790 | stmt_vec_info stmt_info; |
2791 | |
2792 | /* It can now be assumed that the data references in the statements |
2793 | in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version |
2794 | of the loop being vectorized. */ |
2795 | FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info) |
2796 | { |
2797 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
2798 | SET_DR_MISALIGNMENT (dr_info, |
2799 | vect_dr_misalign_for_aligned_access (dr_info)); |
2800 | if (dump_enabled_p ()) |
2801 | dump_printf_loc (MSG_NOTE, vect_location, |
2802 | "Alignment of access forced using versioning.\n" ); |
2803 | } |
2804 | |
2805 | if (dump_enabled_p ()) |
2806 | dump_printf_loc (MSG_NOTE, vect_location, |
2807 | "Versioning for alignment will be applied.\n" ); |
2808 | |
2809 | /* Peeling and versioning can't be done together at this time. */ |
2810 | gcc_assert (! (do_peeling && do_versioning)); |
2811 | |
2812 | return opt_result::success (); |
2813 | } |
2814 | |
2815 | /* This point is reached if neither peeling nor versioning is being done. */ |
2816 | gcc_assert (! (do_peeling || do_versioning)); |
2817 | |
2818 | return opt_result::success (); |
2819 | } |
2820 | |
2821 | |
2822 | /* Function vect_analyze_data_refs_alignment |
2823 | |
2824 | Analyze the alignment of the data-references in the loop. |
2825 | Return FALSE if a data reference is found that cannot be vectorized. */ |
2826 | |
2827 | opt_result |
2828 | vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo) |
2829 | { |
2830 | DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment" ); |
2831 | |
2832 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
2833 | struct data_reference *dr; |
2834 | unsigned int i; |
2835 | |
2836 | vect_record_base_alignments (vinfo: loop_vinfo); |
2837 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
2838 | { |
2839 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
2840 | if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)) |
2841 | { |
2842 | if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt) |
2843 | && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt) |
2844 | continue; |
2845 | vect_compute_data_ref_alignment (vinfo: loop_vinfo, dr_info, |
2846 | STMT_VINFO_VECTYPE (dr_info->stmt)); |
2847 | } |
2848 | } |
2849 | |
2850 | return opt_result::success (); |
2851 | } |
2852 | |
2853 | |
2854 | /* Analyze alignment of DRs of stmts in NODE. */ |
2855 | |
2856 | static bool |
2857 | vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node) |
2858 | { |
2859 | /* Alignment is maintained in the first element of the group. */ |
2860 | stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
2861 | first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info); |
2862 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info); |
2863 | tree vectype = SLP_TREE_VECTYPE (node); |
2864 | poly_uint64 vector_alignment |
2865 | = exact_div (a: targetm.vectorize.preferred_vector_alignment (vectype), |
2866 | BITS_PER_UNIT); |
2867 | if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED) |
2868 | vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node)); |
2869 | /* Re-analyze alignment when we're facing a vectorization with a bigger |
2870 | alignment requirement. */ |
2871 | else if (known_lt (dr_info->target_alignment, vector_alignment)) |
2872 | { |
2873 | poly_uint64 old_target_alignment = dr_info->target_alignment; |
2874 | int old_misalignment = dr_info->misalignment; |
2875 | vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node)); |
2876 | /* But keep knowledge about a smaller alignment. */ |
2877 | if (old_misalignment != DR_MISALIGNMENT_UNKNOWN |
2878 | && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN) |
2879 | { |
2880 | dr_info->target_alignment = old_target_alignment; |
2881 | dr_info->misalignment = old_misalignment; |
2882 | } |
2883 | } |
2884 |   /* If we ever face unordered target alignments the first one analyzed wins
2885 |      and the other will become unknown in dr_misalignment.  */
2886 | return true; |
2887 | } |
2888 | |
2889 | /* Function vect_slp_analyze_instance_alignment |
2890 | |
2891 | Analyze the alignment of the data-references in the SLP instance. |
2892 | Return FALSE if a data reference is found that cannot be vectorized. */ |
2893 | |
2894 | bool |
2895 | vect_slp_analyze_instance_alignment (vec_info *vinfo, |
2896 | slp_instance instance) |
2897 | { |
2898 | DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment" ); |
2899 | |
2900 | slp_tree node; |
2901 | unsigned i; |
2902 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) |
2903 | if (! vect_slp_analyze_node_alignment (vinfo, node)) |
2904 | return false; |
2905 | |
2906 | if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store |
2907 | && ! vect_slp_analyze_node_alignment |
2908 | (vinfo, SLP_INSTANCE_TREE (instance))) |
2909 | return false; |
2910 | |
2911 | return true; |
2912 | } |
2913 | |
2914 | |
2915 | /* Analyze groups of accesses: check that DR_INFO belongs to a group of |
2916 | accesses of legal size, step, etc. Detect gaps, single element |
2917 | interleaving, and other special cases. Set grouped access info. |
2918 | Collect groups of strided stores for further use in SLP analysis. |
2919 | Worker for vect_analyze_group_access. */ |
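
/* As a purely illustrative (hypothetical) example, for the loop

     for (i = 0; i < n; i++)
       sum += a[3 * i];

   the step is three elements, so GROUPSIZE is 3 and the load below is
   recorded as single element interleaving with DR_GROUP_GAP of 2,
   i.e. two unused elements follow each accessed element.  */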
2920 | |
2921 | static bool |
2922 | vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info) |
2923 | { |
2924 | data_reference *dr = dr_info->dr; |
2925 | tree step = DR_STEP (dr); |
2926 | tree scalar_type = TREE_TYPE (DR_REF (dr)); |
2927 | HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); |
2928 | stmt_vec_info stmt_info = dr_info->stmt; |
2929 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo); |
2930 | bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo); |
2931 | HOST_WIDE_INT dr_step = -1; |
2932 | HOST_WIDE_INT groupsize, last_accessed_element = 1; |
2933 | bool slp_impossible = false; |
2934 | |
2935 | /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the |
2936 | size of the interleaving group (including gaps). */ |
2937 | if (tree_fits_shwi_p (step)) |
2938 | { |
2939 | dr_step = tree_to_shwi (step); |
2940 | /* Check that STEP is a multiple of type size. Otherwise there is |
2941 | a non-element-sized gap at the end of the group which we |
2942 | cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE. |
2943 | ??? As we can handle non-constant step fine here we should |
2944 | simply remove uses of DR_GROUP_GAP between the last and first |
2945 | element and instead rely on DR_STEP. DR_GROUP_SIZE then would |
2946 | simply not include that gap. */ |
2947 | if ((dr_step % type_size) != 0) |
2948 | { |
2949 | if (dump_enabled_p ()) |
2950 | dump_printf_loc (MSG_NOTE, vect_location, |
2951 | "Step %T is not a multiple of the element size" |
2952 | " for %T\n" , |
2953 | step, DR_REF (dr)); |
2954 | return false; |
2955 | } |
2956 | groupsize = absu_hwi (x: dr_step) / type_size; |
2957 | } |
2958 | else |
2959 | groupsize = 0; |
2960 | |
2961 |   /* A non-consecutive access is possible only as part of interleaving.  */
2962 | if (!DR_GROUP_FIRST_ELEMENT (stmt_info)) |
2963 | { |
2964 |       /* Check if this DR is part of an interleaving group of which it is
2965 |          the single element accessed in the loop.  */
2966 | |
2967 | /* Gaps are supported only for loads. STEP must be a multiple of the type |
2968 | size. */ |
2969 | if (DR_IS_READ (dr) |
2970 | && (dr_step % type_size) == 0 |
2971 | && groupsize > 0 |
2972 | /* This could be UINT_MAX but as we are generating code in a very |
2973 | inefficient way we have to cap earlier. |
2974 | See PR91403 for example. */ |
2975 | && groupsize <= 4096) |
2976 | { |
2977 | DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info; |
2978 | DR_GROUP_SIZE (stmt_info) = groupsize; |
2979 | DR_GROUP_GAP (stmt_info) = groupsize - 1; |
2980 | if (dump_enabled_p ()) |
2981 | dump_printf_loc (MSG_NOTE, vect_location, |
2982 | "Detected single element interleaving %T" |
2983 | " step %T\n" , |
2984 | DR_REF (dr), step); |
2985 | |
2986 | return true; |
2987 | } |
2988 | |
2989 | if (dump_enabled_p ()) |
2990 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2991 | "not consecutive access %G" , stmt_info->stmt); |
2992 | |
2993 | if (bb_vinfo) |
2994 | { |
2995 | /* Mark the statement as unvectorizable. */ |
2996 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; |
2997 | return true; |
2998 | } |
2999 | |
3000 | if (dump_enabled_p ()) |
3001 | dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n" ); |
3002 | STMT_VINFO_STRIDED_P (stmt_info) = true; |
3003 | return true; |
3004 | } |
3005 | |
3006 | if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info) |
3007 | { |
3008 | /* First stmt in the interleaving chain. Check the chain. */ |
3009 | stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); |
3010 | struct data_reference *data_ref = dr; |
3011 | unsigned int count = 1; |
3012 | tree prev_init = DR_INIT (data_ref); |
3013 | HOST_WIDE_INT diff, gaps = 0; |
3014 | |
3015 | /* By construction, all group members have INTEGER_CST DR_INITs. */ |
3016 | while (next) |
3017 | { |
3018 | /* We never have the same DR multiple times. */ |
3019 | gcc_assert (tree_int_cst_compare (DR_INIT (data_ref), |
3020 | DR_INIT (STMT_VINFO_DATA_REF (next))) != 0); |
3021 | |
3022 | data_ref = STMT_VINFO_DATA_REF (next); |
3023 | |
3024 | /* All group members have the same STEP by construction. */ |
3025 | gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); |
3026 | |
3027 | /* Check that the distance between two accesses is equal to the type |
3028 | size. Otherwise, we have gaps. */ |
3029 | diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) |
3030 | - TREE_INT_CST_LOW (prev_init)) / type_size; |
3031 | if (diff < 1 || diff > UINT_MAX) |
3032 | { |
3033 | /* For artificial testcases with array accesses with large |
3034 | constant indices we can run into overflow issues which |
3035 | can end up fooling the groupsize constraint below so |
3036 | check the individual gaps (which are represented as |
3037 | unsigned int) as well. */ |
3038 | if (dump_enabled_p ()) |
3039 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3040 | "interleaved access with gap larger " |
3041 | "than representable\n" ); |
3042 | return false; |
3043 | } |
3044 | if (diff != 1) |
3045 | { |
3046 | /* FORNOW: SLP of accesses with gaps is not supported. */ |
3047 | slp_impossible = true; |
3048 | if (DR_IS_WRITE (data_ref)) |
3049 | { |
3050 | if (dump_enabled_p ()) |
3051 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3052 | "interleaved store with gaps\n" ); |
3053 | return false; |
3054 | } |
3055 | |
3056 | gaps += diff - 1; |
3057 | } |
3058 | |
3059 | last_accessed_element += diff; |
3060 | |
3061 | /* Store the gap from the previous member of the group. If there is no |
3062 | gap in the access, DR_GROUP_GAP is always 1. */ |
3063 | DR_GROUP_GAP (next) = diff; |
3064 | |
3065 | prev_init = DR_INIT (data_ref); |
3066 | next = DR_GROUP_NEXT_ELEMENT (next); |
3067 | /* Count the number of data-refs in the chain. */ |
3068 | count++; |
3069 | } |
3070 | |
3071 | if (groupsize == 0) |
3072 | groupsize = count + gaps; |
3073 | |
3074 | /* This could be UINT_MAX but as we are generating code in a very |
3075 | inefficient way we have to cap earlier. See PR78699 for example. */ |
3076 | if (groupsize > 4096) |
3077 | { |
3078 | if (dump_enabled_p ()) |
3079 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3080 | "group is too large\n" ); |
3081 | return false; |
3082 | } |
3083 | |
3084 | /* Check that the size of the interleaving is equal to count for stores, |
3085 | i.e., that there are no gaps. */ |
3086 | if (groupsize != count |
3087 | && !DR_IS_READ (dr)) |
3088 | { |
3089 | groupsize = count; |
3090 | STMT_VINFO_STRIDED_P (stmt_info) = true; |
3091 | } |
3092 | |
3093 | /* If there is a gap after the last load in the group it is the |
3094 | difference between the groupsize and the last accessed |
3095 | element. |
3096 | When there is no gap, this difference should be 0. */ |
3097 | DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element; |
3098 | |
3099 | DR_GROUP_SIZE (stmt_info) = groupsize; |
3100 | if (dump_enabled_p ()) |
3101 | { |
3102 | dump_printf_loc (MSG_NOTE, vect_location, |
3103 | "Detected interleaving " ); |
3104 | if (DR_IS_READ (dr)) |
3105 | dump_printf (MSG_NOTE, "load " ); |
3106 | else if (STMT_VINFO_STRIDED_P (stmt_info)) |
3107 | dump_printf (MSG_NOTE, "strided store " ); |
3108 | else |
3109 | dump_printf (MSG_NOTE, "store " ); |
3110 | dump_printf (MSG_NOTE, "of size %u\n" , |
3111 | (unsigned)groupsize); |
3112 | dump_printf_loc (MSG_NOTE, vect_location, "\t%G" , stmt_info->stmt); |
3113 | next = DR_GROUP_NEXT_ELEMENT (stmt_info); |
3114 | while (next) |
3115 | { |
3116 | if (DR_GROUP_GAP (next) != 1) |
3117 | dump_printf_loc (MSG_NOTE, vect_location, |
3118 | "\t<gap of %d elements>\n" , |
3119 | DR_GROUP_GAP (next) - 1); |
3120 | dump_printf_loc (MSG_NOTE, vect_location, "\t%G" , next->stmt); |
3121 | next = DR_GROUP_NEXT_ELEMENT (next); |
3122 | } |
3123 | if (DR_GROUP_GAP (stmt_info) != 0) |
3124 | dump_printf_loc (MSG_NOTE, vect_location, |
3125 | "\t<gap of %d elements>\n" , |
3126 | DR_GROUP_GAP (stmt_info)); |
3127 | } |
3128 | |
3129 |       /* SLP: create an SLP data structure for every interleaving group of
3130 |          stores for further analysis in vect_analyze_slp.  */
3131 | if (DR_IS_WRITE (dr) && !slp_impossible) |
3132 | { |
3133 | if (loop_vinfo) |
3134 | LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (obj: stmt_info); |
3135 | if (bb_vinfo) |
3136 | BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (obj: stmt_info); |
3137 | } |
3138 | } |
3139 | |
3140 | return true; |
3141 | } |
3142 | |
3143 | /* Analyze groups of accesses: check that DR_INFO belongs to a group of |
3144 | accesses of legal size, step, etc. Detect gaps, single element |
3145 | interleaving, and other special cases. Set grouped access info. |
3146 | Collect groups of strided stores for further use in SLP analysis. */ |
3147 | |
3148 | static bool |
3149 | vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info) |
3150 | { |
3151 | if (!vect_analyze_group_access_1 (vinfo, dr_info)) |
3152 | { |
3153 | /* Dissolve the group if present. */ |
3154 | stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt); |
3155 | while (stmt_info) |
3156 | { |
3157 | stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); |
3158 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
3159 | DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL; |
3160 | stmt_info = next; |
3161 | } |
3162 | return false; |
3163 | } |
3164 | return true; |
3165 | } |
3166 | |
3167 | /* Analyze the access pattern of the data-reference DR_INFO. |
3168 | In case of non-consecutive accesses call vect_analyze_group_access() to |
3169 | analyze groups of accesses. */ |
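
/* For example (purely illustrative): with 4-byte elements, a load from
   a[i] has DR_STEP 4 and is treated as a simple consecutive access below,
   whereas a load from a[2 * i] has DR_STEP 8 and is handed to
   vect_analyze_group_access to look for an interleaving group.  */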
3170 | |
3171 | static bool |
3172 | vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info) |
3173 | { |
3174 | data_reference *dr = dr_info->dr; |
3175 | tree step = DR_STEP (dr); |
3176 | tree scalar_type = TREE_TYPE (DR_REF (dr)); |
3177 | stmt_vec_info stmt_info = dr_info->stmt; |
3178 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo); |
3179 | class loop *loop = NULL; |
3180 | |
3181 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) |
3182 | return true; |
3183 | |
3184 | if (loop_vinfo) |
3185 | loop = LOOP_VINFO_LOOP (loop_vinfo); |
3186 | |
3187 | if (loop_vinfo && !step) |
3188 | { |
3189 | if (dump_enabled_p ()) |
3190 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3191 | "bad data-ref access in loop\n" ); |
3192 | return false; |
3193 | } |
3194 | |
3195 | /* Allow loads with zero step in inner-loop vectorization. */ |
3196 | if (loop_vinfo && integer_zerop (step)) |
3197 | { |
3198 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
3199 | if (!nested_in_vect_loop_p (loop, stmt_info)) |
3200 | return DR_IS_READ (dr); |
3201 | /* Allow references with zero step for outer loops marked |
3202 | with pragma omp simd only - it guarantees absence of |
3203 | loop-carried dependencies between inner loop iterations. */ |
3204 | if (loop->safelen < 2) |
3205 | { |
3206 | if (dump_enabled_p ()) |
3207 | dump_printf_loc (MSG_NOTE, vect_location, |
3208 | "zero step in inner loop of nest\n" ); |
3209 | return false; |
3210 | } |
3211 | } |
3212 | |
3213 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
3214 | { |
3215 | /* Interleaved accesses are not yet supported within outer-loop |
3216 | vectorization for references in the inner-loop. */ |
3217 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
3218 | |
3219 | /* For the rest of the analysis we use the outer-loop step. */ |
3220 | step = STMT_VINFO_DR_STEP (stmt_info); |
3221 | if (integer_zerop (step)) |
3222 | { |
3223 | if (dump_enabled_p ()) |
3224 | dump_printf_loc (MSG_NOTE, vect_location, |
3225 | "zero step in outer loop.\n" ); |
3226 | return DR_IS_READ (dr); |
3227 | } |
3228 | } |
3229 | |
3230 | /* Consecutive? */ |
3231 | if (TREE_CODE (step) == INTEGER_CST) |
3232 | { |
3233 | HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); |
3234 | if (!tree_int_cst_compare (t1: step, TYPE_SIZE_UNIT (scalar_type)) |
3235 | || (dr_step < 0 |
3236 | && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) |
3237 | { |
3238 | /* Mark that it is not interleaving. */ |
3239 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
3240 | return true; |
3241 | } |
3242 | } |
3243 | |
3244 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
3245 | { |
3246 | if (dump_enabled_p ()) |
3247 | dump_printf_loc (MSG_NOTE, vect_location, |
3248 | "grouped access in outer loop.\n" ); |
3249 | return false; |
3250 | } |
3251 | |
3252 | |
3253 | /* Assume this is a DR handled by non-constant strided load case. */ |
3254 | if (TREE_CODE (step) != INTEGER_CST) |
3255 | return (STMT_VINFO_STRIDED_P (stmt_info) |
3256 | && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) |
3257 | || vect_analyze_group_access (vinfo, dr_info))); |
3258 | |
3259 | /* Not consecutive access - check if it's a part of interleaving group. */ |
3260 | return vect_analyze_group_access (vinfo, dr_info); |
3261 | } |
3262 | |
3263 | /* Comparison function to sort data-references DRA and DRB into chunks
3264 |    suitable for grouping.  */
3265 | |
3266 | static int |
3267 | dr_group_sort_cmp (const void *dra_, const void *drb_) |
3268 | { |
3269 | dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_); |
3270 | dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_); |
3271 | data_reference_p dra = dra_info->dr; |
3272 | data_reference_p drb = drb_info->dr; |
3273 | int cmp; |
3274 | |
3275 | /* Stabilize sort. */ |
3276 | if (dra == drb) |
3277 | return 0; |
3278 | |
3279 |   /* DRs with different group IDs never belong to the same group.  */
3280 | if (dra_info->group != drb_info->group) |
3281 | return dra_info->group < drb_info->group ? -1 : 1; |
3282 | |
3283 | /* Ordering of DRs according to base. */ |
3284 | cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), |
3285 | DR_BASE_ADDRESS (drb)); |
3286 | if (cmp != 0) |
3287 | return cmp; |
3288 | |
3289 | /* And according to DR_OFFSET. */ |
3290 | cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); |
3291 | if (cmp != 0) |
3292 | return cmp; |
3293 | |
3294 | /* Put reads before writes. */ |
3295 | if (DR_IS_READ (dra) != DR_IS_READ (drb)) |
3296 | return DR_IS_READ (dra) ? -1 : 1; |
3297 | |
3298 |   /* Then sort by access size.  */
3299 | cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), |
3300 | TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); |
3301 | if (cmp != 0) |
3302 | return cmp; |
3303 | |
3304 |   /* And then by step.  */
3305 | cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); |
3306 | if (cmp != 0) |
3307 | return cmp; |
3308 | |
3309 |   /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
3310 | cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); |
3311 | if (cmp == 0) |
3312 | return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; |
3313 | return cmp; |
3314 | } |
3315 | |
3316 | /* If OP is the result of a conversion, return the unconverted value, |
3317 | otherwise return null. */ |
3318 | |
3319 | static tree |
3320 | strip_conversion (tree op) |
3321 | { |
3322 | if (TREE_CODE (op) != SSA_NAME) |
3323 | return NULL_TREE; |
3324 | gimple *stmt = SSA_NAME_DEF_STMT (op); |
3325 | if (!is_gimple_assign (gs: stmt) |
3326 | || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) |
3327 | return NULL_TREE; |
3328 | return gimple_assign_rhs1 (gs: stmt); |
3329 | } |
3330 | |
3331 | /* Return true if vectorizable_* routines can handle statements STMT1_INFO |
3332 | and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can |
3333 | be grouped in SLP mode. */ |
3334 | |
3335 | static bool |
3336 | can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info, |
3337 | bool allow_slp_p) |
3338 | { |
3339 | if (gimple_assign_single_p (gs: stmt1_info->stmt)) |
3340 | return gimple_assign_single_p (gs: stmt2_info->stmt); |
3341 | |
3342 | gcall *call1 = dyn_cast <gcall *> (p: stmt1_info->stmt); |
3343 | if (call1 && gimple_call_internal_p (gs: call1)) |
3344 | { |
3345 | /* Check for two masked loads or two masked stores. */ |
3346 | gcall *call2 = dyn_cast <gcall *> (p: stmt2_info->stmt); |
3347 | if (!call2 || !gimple_call_internal_p (gs: call2)) |
3348 | return false; |
3349 | internal_fn ifn = gimple_call_internal_fn (gs: call1); |
3350 | if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE) |
3351 | return false; |
3352 | if (ifn != gimple_call_internal_fn (gs: call2)) |
3353 | return false; |
3354 | |
3355 | /* Check that the masks are the same. Cope with casts of masks, |
3356 | like those created by build_mask_conversion. */ |
3357 | tree mask1 = gimple_call_arg (gs: call1, index: 2); |
3358 | tree mask2 = gimple_call_arg (gs: call2, index: 2); |
3359 | if (!operand_equal_p (mask1, mask2, flags: 0) && !allow_slp_p) |
3360 | { |
3361 | mask1 = strip_conversion (op: mask1); |
3362 | if (!mask1) |
3363 | return false; |
3364 | mask2 = strip_conversion (op: mask2); |
3365 | if (!mask2) |
3366 | return false; |
3367 | if (!operand_equal_p (mask1, mask2, flags: 0)) |
3368 | return false; |
3369 | } |
3370 | return true; |
3371 | } |
3372 | |
3373 | return false; |
3374 | } |
3375 | |
3376 | /* Function vect_analyze_data_ref_accesses. |
3377 | |
3378 | Analyze the access pattern of all the data references in the loop. |
3379 | |
3380 | FORNOW: the only access pattern that is considered vectorizable is a |
3381 | simple step 1 (consecutive) access. |
3382 | |
3383 | FORNOW: handle only arrays and pointer accesses. */ |
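
/* As a hypothetical illustration, the two stores in

     for (i = 0; i < n; i++)
       {
         p[2 * i] = x;
         p[2 * i + 1] = y;
       }

   share base, offset, step and element size and differ only in DR_INIT,
   so the code below links them into one interleaving chain with the
   first store as DR_GROUP_FIRST_ELEMENT of both.  */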
3384 | |
3385 | opt_result |
3386 | vect_analyze_data_ref_accesses (vec_info *vinfo, |
3387 | vec<int> *dataref_groups) |
3388 | { |
3389 | unsigned int i; |
3390 | vec<data_reference_p> datarefs = vinfo->shared->datarefs; |
3391 | |
3392 | DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses" ); |
3393 | |
3394 | if (datarefs.is_empty ()) |
3395 | return opt_result::success (); |
3396 | |
3397 |   /* Sort the array of datarefs to make building the interleaving chains
3398 |      linear.  Don't modify the original vector's order; it is needed for
3399 |      determining what dependencies are reversed.  */
3400 | vec<dr_vec_info *> datarefs_copy; |
3401 | datarefs_copy.create (nelems: datarefs.length ()); |
3402 | for (unsigned i = 0; i < datarefs.length (); i++) |
3403 | { |
3404 | dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]); |
3405 | /* If the caller computed DR grouping use that, otherwise group by |
3406 | basic blocks. */ |
3407 | if (dataref_groups) |
3408 | dr_info->group = (*dataref_groups)[i]; |
3409 | else |
3410 | dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index; |
3411 | datarefs_copy.quick_push (obj: dr_info); |
3412 | } |
3413 | datarefs_copy.qsort (dr_group_sort_cmp); |
3414 | hash_set<stmt_vec_info> to_fixup; |
3415 | |
3416 | /* Build the interleaving chains. */ |
3417 | for (i = 0; i < datarefs_copy.length () - 1;) |
3418 | { |
3419 | dr_vec_info *dr_info_a = datarefs_copy[i]; |
3420 | data_reference_p dra = dr_info_a->dr; |
3421 | int dra_group_id = dr_info_a->group; |
3422 | stmt_vec_info stmtinfo_a = dr_info_a->stmt; |
3423 | stmt_vec_info lastinfo = NULL; |
3424 | if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) |
3425 | || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)) |
3426 | { |
3427 | ++i; |
3428 | continue; |
3429 | } |
3430 | for (i = i + 1; i < datarefs_copy.length (); ++i) |
3431 | { |
3432 | dr_vec_info *dr_info_b = datarefs_copy[i]; |
3433 | data_reference_p drb = dr_info_b->dr; |
3434 | int drb_group_id = dr_info_b->group; |
3435 | stmt_vec_info stmtinfo_b = dr_info_b->stmt; |
3436 | if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b) |
3437 | || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) |
3438 | break; |
3439 | |
3440 |       /* ??? Imperfect sorting (non-compatible types, non-modulo
3441 |          accesses, same accesses) can lead to a group being artificially
3442 |          split here as we don't just skip over those.  If it really
3443 |          matters we can push those to a worklist and re-iterate
3444 |          over them.  Then we can just skip ahead to the next DR here.  */
3445 | |
3446 | /* DRs in a different DR group should not be put into the same |
3447 | interleaving group. */ |
3448 | if (dra_group_id != drb_group_id) |
3449 | break; |
3450 | |
3451 | /* Check that the data-refs have same first location (except init) |
3452 | and they are both either store or load (not load and store, |
3453 | not masked loads or stores). */ |
3454 | if (DR_IS_READ (dra) != DR_IS_READ (drb) |
3455 | || data_ref_compare_tree (DR_BASE_ADDRESS (dra), |
3456 | DR_BASE_ADDRESS (drb)) != 0 |
3457 | || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 |
3458 | || !can_group_stmts_p (stmt1_info: stmtinfo_a, stmt2_info: stmtinfo_b, allow_slp_p: true)) |
3459 | break; |
3460 | |
3461 | /* Check that the data-refs have the same constant size. */ |
3462 | tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); |
3463 | tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); |
3464 | if (!tree_fits_uhwi_p (sza) |
3465 | || !tree_fits_uhwi_p (szb) |
3466 | || !tree_int_cst_equal (sza, szb)) |
3467 | break; |
3468 | |
3469 | /* Check that the data-refs have the same step. */ |
3470 | if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0) |
3471 | break; |
3472 | |
3473 | /* Check the types are compatible. |
3474 | ??? We don't distinguish this during sorting. */ |
3475 | if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), |
3476 | TREE_TYPE (DR_REF (drb)))) |
3477 | break; |
3478 | |
3479 | /* Check that the DR_INITs are compile-time constants. */ |
3480 | if (!tree_fits_shwi_p (DR_INIT (dra)) |
3481 | || !tree_fits_shwi_p (DR_INIT (drb))) |
3482 | break; |
3483 | |
3484 | /* Different .GOMP_SIMD_LANE calls still give the same lane, |
3485 | just hold extra information. */ |
3486 | if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a) |
3487 | && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b) |
3488 | && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0) |
3489 | break; |
3490 | |
3491 | /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */ |
3492 | HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); |
3493 | HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); |
3494 | HOST_WIDE_INT init_prev |
3495 | = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr)); |
3496 | gcc_assert (init_a <= init_b |
3497 | && init_a <= init_prev |
3498 | && init_prev <= init_b); |
3499 | |
3500 | /* Do not place the same access in the interleaving chain twice. */ |
3501 | if (init_b == init_prev) |
3502 | { |
3503 | gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr)) |
3504 | < gimple_uid (DR_STMT (drb))); |
3505 | /* Simply link in duplicates and fix up the chain below. */ |
3506 | } |
3507 | else |
3508 | { |
3509 | /* If init_b == init_a + the size of the type * k, we have an |
3510 | interleaving, and DRA is accessed before DRB. */ |
3511 | unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); |
3512 | if (type_size_a == 0 |
3513 | || (((unsigned HOST_WIDE_INT)init_b - init_a) |
3514 | % type_size_a != 0)) |
3515 | break; |
3516 | |
3517 | /* If we have a store, the accesses are adjacent. This splits |
3518 | groups into chunks we support (we don't support vectorization |
3519 | of stores with gaps). */ |
3520 | if (!DR_IS_READ (dra) |
3521 | && (((unsigned HOST_WIDE_INT)init_b - init_prev) |
3522 | != type_size_a)) |
3523 | break; |
3524 | |
3525 |           /* If the step (when constant and nonzero) is smaller than the
3526 |              difference between the data-refs' inits this splits groups
3527 |              into suitable sizes.  */
3528 | if (tree_fits_shwi_p (DR_STEP (dra))) |
3529 | { |
3530 | unsigned HOST_WIDE_INT step |
3531 | = absu_hwi (x: tree_to_shwi (DR_STEP (dra))); |
3532 | if (step != 0 |
3533 | && step <= ((unsigned HOST_WIDE_INT)init_b - init_a)) |
3534 | break; |
3535 | } |
3536 | } |
3537 | |
3538 | if (dump_enabled_p ()) |
3539 | dump_printf_loc (MSG_NOTE, vect_location, |
3540 | DR_IS_READ (dra) |
3541 | ? "Detected interleaving load %T and %T\n" |
3542 | : "Detected interleaving store %T and %T\n" , |
3543 | DR_REF (dra), DR_REF (drb)); |
3544 | |
3545 | /* Link the found element into the group list. */ |
3546 | if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) |
3547 | { |
3548 | DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a; |
3549 | lastinfo = stmtinfo_a; |
3550 | } |
3551 | DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a; |
3552 | DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b; |
3553 | lastinfo = stmtinfo_b; |
3554 | |
3555 | STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a) |
3556 | = !can_group_stmts_p (stmt1_info: stmtinfo_a, stmt2_info: stmtinfo_b, allow_slp_p: false); |
3557 | |
3558 | if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)) |
3559 | dump_printf_loc (MSG_NOTE, vect_location, |
3560 | "Load suitable for SLP vectorization only.\n" ); |
3561 | |
3562 | if (init_b == init_prev |
3563 | && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) |
3564 | && dump_enabled_p ()) |
3565 | dump_printf_loc (MSG_NOTE, vect_location, |
3566 | "Queuing group with duplicate access for fixup\n" ); |
3567 | } |
3568 | } |
3569 | |
3570 |   /* Fixup groups with duplicate entries by splitting them.  */
3571 | while (1) |
3572 | { |
3573 | hash_set<stmt_vec_info>::iterator it = to_fixup.begin (); |
3574 | if (!(it != to_fixup.end ())) |
3575 | break; |
3576 | stmt_vec_info grp = *it; |
3577 | to_fixup.remove (k: grp); |
3578 | |
3579 | /* Find the earliest duplicate group member. */ |
3580 | unsigned first_duplicate = -1u; |
3581 | stmt_vec_info next, g = grp; |
3582 | while ((next = DR_GROUP_NEXT_ELEMENT (g))) |
3583 | { |
3584 | if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr), |
3585 | DR_INIT (STMT_VINFO_DR_INFO (g)->dr)) |
3586 | && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate) |
3587 | first_duplicate = gimple_uid (STMT_VINFO_STMT (next)); |
3588 | g = next; |
3589 | } |
3590 | if (first_duplicate == -1U) |
3591 | continue; |
3592 | |
3593 | /* Then move all stmts after the first duplicate to a new group. |
3594 | Note this is a heuristic but one with the property that *it |
3595 | is fixed up completely. */ |
3596 | g = grp; |
3597 | stmt_vec_info newgroup = NULL, ng = grp; |
3598 | while ((next = DR_GROUP_NEXT_ELEMENT (g))) |
3599 | { |
3600 | if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate) |
3601 | { |
3602 | DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next); |
3603 | if (!newgroup) |
3604 | newgroup = next; |
3605 | else |
3606 | DR_GROUP_NEXT_ELEMENT (ng) = next; |
3607 | ng = next; |
3608 | DR_GROUP_FIRST_ELEMENT (ng) = newgroup; |
3609 | } |
3610 | else |
3611 | g = DR_GROUP_NEXT_ELEMENT (g); |
3612 | } |
3613 | DR_GROUP_NEXT_ELEMENT (ng) = NULL; |
3614 | |
3615 | /* Fixup the new group which still may contain duplicates. */ |
3616 | to_fixup.add (k: newgroup); |
3617 | } |
3618 | |
3619 | dr_vec_info *dr_info; |
3620 | FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info) |
3621 | { |
3622 | if (STMT_VINFO_VECTORIZABLE (dr_info->stmt) |
3623 | && !vect_analyze_data_ref_access (vinfo, dr_info)) |
3624 | { |
3625 | if (dump_enabled_p ()) |
3626 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3627 | "not vectorized: complicated access pattern.\n" ); |
3628 | |
3629 | if (is_a <bb_vec_info> (p: vinfo)) |
3630 | { |
3631 | /* Mark the statement as not vectorizable. */ |
3632 | STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false; |
3633 | continue; |
3634 | } |
3635 | else |
3636 | { |
3637 | datarefs_copy.release (); |
3638 | return opt_result::failure_at (loc: dr_info->stmt->stmt, |
3639 | fmt: "not vectorized:" |
3640 | " complicated access pattern.\n" ); |
3641 | } |
3642 | } |
3643 | } |
3644 | |
3645 | datarefs_copy.release (); |
3646 | return opt_result::success (); |
3647 | } |
3648 | |
3649 | /* Function vect_vfa_segment_size. |
3650 | |
3651 | Input: |
3652 | DR_INFO: The data reference. |
3653 | LENGTH_FACTOR: segment length to consider. |
3654 | |
3655 | Return a value suitable for the dr_with_seg_len::seg_len field. |
3656 | This is the "distance travelled" by the pointer from the first |
3657 | iteration in the segment to the last. Note that it does not include |
3658 | the size of the access; in effect it only describes the first byte. */ |
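
/* For instance, with a DR_STEP of 4 bytes and a LENGTH_FACTOR of 8
   iterations (illustrative numbers), the segment length is
   (8 - 1) * 4 = 28 bytes: the pointer in the last iteration of the
   segment is 28 bytes past the pointer in the first one.  */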
3659 | |
3660 | static tree |
3661 | vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor) |
3662 | { |
3663 | length_factor = size_binop (MINUS_EXPR, |
3664 | fold_convert (sizetype, length_factor), |
3665 | size_one_node); |
3666 | return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)), |
3667 | length_factor); |
3668 | } |
3669 | |
3670 | /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)), |
3671 | gives the worst-case number of bytes covered by the segment. */ |
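
/* E.g. (hypothetical numbers) for a group of four 4-byte loads with a
   gap of one element at the end of the group, the access size is
   4 * (4 - 1) = 12 bytes.  */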
3672 | |
3673 | static unsigned HOST_WIDE_INT |
3674 | vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info) |
3675 | { |
3676 | stmt_vec_info stmt_vinfo = dr_info->stmt; |
3677 | tree ref_type = TREE_TYPE (DR_REF (dr_info->dr)); |
3678 | unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); |
3679 | unsigned HOST_WIDE_INT access_size = ref_size; |
3680 | if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo)) |
3681 | { |
3682 | gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo); |
3683 | access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo); |
3684 | } |
3685 | tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); |
3686 | int misalignment; |
3687 | if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists () |
3688 | && ((misalignment = dr_misalignment (dr_info, vectype)), true) |
3689 | && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment) |
3690 | == dr_explicit_realign_optimized)) |
3691 | { |
3692 | /* We might access a full vector's worth. */ |
3693 | access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; |
3694 | } |
3695 | return access_size; |
3696 | } |
3697 | |
3698 | /* Get the minimum alignment for all the scalar accesses that DR_INFO |
3699 | describes. */ |
3700 | |
3701 | static unsigned int |
3702 | vect_vfa_align (dr_vec_info *dr_info) |
3703 | { |
3704 | return dr_alignment (dr: dr_info->dr); |
3705 | } |
3706 | |
3707 | /* Function vect_compile_time_alias.
3708 | |
3709 | Given data references A and B with equal base and offset, see whether |
3710 | the alias relation can be decided at compilation time. Return 1 if |
3711 | it can and the references alias, 0 if it can and the references do |
3712 | not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A, |
3713 | SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent |
3714 | of dr_with_seg_len::{seg_len,access_size} for A and B. */ |
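
/* A worked illustration with positive steps: if DR_INIT (A) is 0 with
   segment length 16 and access size 4, and DR_INIT (B) is 32 with the
   same lengths, the byte ranges [0, 20) and [32, 52) cannot overlap and
   0 is returned; had DR_INIT (B) been 8 instead, the ranges would be
   known to overlap and 1 would be returned.  */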
3715 | |
3716 | static int |
3717 | vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b, |
3718 | tree segment_length_a, tree segment_length_b, |
3719 | unsigned HOST_WIDE_INT access_size_a, |
3720 | unsigned HOST_WIDE_INT access_size_b) |
3721 | { |
3722 | poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr)); |
3723 | poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr)); |
3724 | poly_uint64 const_length_a; |
3725 | poly_uint64 const_length_b; |
3726 | |
3727 | /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT |
3728 | bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of |
3729 | [a, a+12) */ |
3730 | if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0) |
3731 | { |
3732 | const_length_a = (-wi::to_poly_wide (t: segment_length_a)).force_uhwi (); |
3733 | offset_a -= const_length_a; |
3734 | } |
3735 | else |
3736 | const_length_a = tree_to_poly_uint64 (segment_length_a); |
3737 | if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0) |
3738 | { |
3739 | const_length_b = (-wi::to_poly_wide (t: segment_length_b)).force_uhwi (); |
3740 | offset_b -= const_length_b; |
3741 | } |
3742 | else |
3743 | const_length_b = tree_to_poly_uint64 (segment_length_b); |
3744 | |
3745 | const_length_a += access_size_a; |
3746 | const_length_b += access_size_b; |
3747 | |
3748 | if (ranges_known_overlap_p (pos1: offset_a, size1: const_length_a, |
3749 | pos2: offset_b, size2: const_length_b)) |
3750 | return 1; |
3751 | |
3752 | if (!ranges_maybe_overlap_p (pos1: offset_a, size1: const_length_a, |
3753 | pos2: offset_b, size2: const_length_b)) |
3754 | return 0; |
3755 | |
3756 | return -1; |
3757 | } |
3758 | |
3759 | /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH |
3760 | in DDR is >= VF. */ |
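
/* For instance (illustrative): with distance vectors {0} and {8} at
   LOOP_DEPTH and a VF of 4, every nonzero distance is at least the VF,
   so vectorizing cannot violate the dependence and true is returned.  */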
3761 | |
3762 | static bool |
3763 | dependence_distance_ge_vf (data_dependence_relation *ddr, |
3764 | unsigned int loop_depth, poly_uint64 vf) |
3765 | { |
3766 | if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE |
3767 | || DDR_NUM_DIST_VECTS (ddr) == 0) |
3768 | return false; |
3769 | |
3770 | /* If the dependence is exact, we should have limited the VF instead. */ |
3771 | gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr)); |
3772 | |
3773 | unsigned int i; |
3774 | lambda_vector dist_v; |
3775 | FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) |
3776 | { |
3777 | HOST_WIDE_INT dist = dist_v[loop_depth]; |
3778 | if (dist != 0 |
3779 | && !(dist > 0 && DDR_REVERSED_P (ddr)) |
3780 | && maybe_lt (a: (unsigned HOST_WIDE_INT) abs_hwi (x: dist), b: vf)) |
3781 | return false; |
3782 | } |
3783 | |
3784 | if (dump_enabled_p ()) |
3785 | dump_printf_loc (MSG_NOTE, vect_location, |
3786 | "dependence distance between %T and %T is >= VF\n" , |
3787 | DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr))); |
3788 | |
3789 | return true; |
3790 | } |
3791 | |
3792 | /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */ |
3793 | |
3794 | static void |
3795 | dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound) |
3796 | { |
3797 | dump_printf (dump_kind, "%s (%T) >= " , |
3798 | lower_bound.unsigned_p ? "unsigned" : "abs" , |
3799 | lower_bound.expr); |
3800 | dump_dec (dump_kind, lower_bound.min_value); |
3801 | } |
3802 | |
3803 | /* Record that the vectorized loop requires the vec_lower_bound described |
3804 | by EXPR, UNSIGNED_P and MIN_VALUE. */ |
3805 | |
3806 | static void |
3807 | vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, |
3808 | poly_uint64 min_value) |
3809 | { |
3810 | vec<vec_lower_bound> &lower_bounds |
3811 | = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); |
3812 | for (unsigned int i = 0; i < lower_bounds.length (); ++i) |
3813 | if (operand_equal_p (lower_bounds[i].expr, expr, flags: 0)) |
3814 | { |
3815 | unsigned_p &= lower_bounds[i].unsigned_p; |
3816 | min_value = upper_bound (a: lower_bounds[i].min_value, b: min_value); |
3817 | if (lower_bounds[i].unsigned_p != unsigned_p |
3818 | || maybe_lt (a: lower_bounds[i].min_value, b: min_value)) |
3819 | { |
3820 | lower_bounds[i].unsigned_p = unsigned_p; |
3821 | lower_bounds[i].min_value = min_value; |
3822 | if (dump_enabled_p ()) |
3823 | { |
3824 | dump_printf_loc (MSG_NOTE, vect_location, |
3825 | "updating run-time check to " ); |
3826 | dump_lower_bound (dump_kind: MSG_NOTE, lower_bound: lower_bounds[i]); |
3827 | dump_printf (MSG_NOTE, "\n" ); |
3828 | } |
3829 | } |
3830 | return; |
3831 | } |
3832 | |
3833 | vec_lower_bound lower_bound (expr, unsigned_p, min_value); |
3834 | if (dump_enabled_p ()) |
3835 | { |
3836 | dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that " ); |
3837 | dump_lower_bound (dump_kind: MSG_NOTE, lower_bound); |
3838 | dump_printf (MSG_NOTE, "\n" ); |
3839 | } |
3840 | LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (obj: lower_bound); |
3841 | } |
3842 | |
3843 | /* Return true if it's unlikely that the step of the vectorized form of DR_INFO |
3844 | will span fewer than GAP bytes. */ |
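
/* E.g. (hypothetical numbers) with an estimated VF of 4, a group size
   of 2 and 4-byte scalar accesses, gaps of up to 4 * 2 * 4 = 32 bytes
   are considered small.  */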
3845 | |
3846 | static bool |
3847 | vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info, |
3848 | poly_int64 gap) |
3849 | { |
3850 | stmt_vec_info stmt_info = dr_info->stmt; |
3851 | HOST_WIDE_INT count |
3852 | = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); |
3853 | if (DR_GROUP_FIRST_ELEMENT (stmt_info)) |
3854 | count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info)); |
3855 | return (estimated_poly_value (x: gap) |
3856 | <= count * vect_get_scalar_dr_size (dr_info)); |
3857 | } |
3858 | |
3859 | /* Return true if we know that there is no alias between DR_INFO_A and |
3860 | DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N. |
3861 | When returning true, set *LOWER_BOUND_OUT to this N. */ |
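
/* For example (hypothetical): for accesses a[i * s] and a[i * s + 1]
   with 4-byte elements, INIT_A is 0, INIT_B is 4 and each scalar access
   spans 4 bytes, so *LOWER_BOUND_OUT becomes 4 + 4 - 0 = 8, i.e. the
   accesses cannot alias once abs (DR_STEP) is at least 8 bytes.  */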
3862 | |
3863 | static bool |
3864 | vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b, |
3865 | poly_uint64 *lower_bound_out) |
3866 | { |
3867 | /* Check that there is a constant gap of known sign between DR_A |
3868 | and DR_B. */ |
3869 | data_reference *dr_a = dr_info_a->dr; |
3870 | data_reference *dr_b = dr_info_b->dr; |
3871 | poly_int64 init_a, init_b; |
3872 | if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), flags: 0) |
3873 | || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), flags: 0) |
3874 | || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), flags: 0) |
3875 | || !poly_int_tree_p (DR_INIT (dr_a), value: &init_a) |
3876 | || !poly_int_tree_p (DR_INIT (dr_b), value: &init_b) |
3877 | || !ordered_p (a: init_a, b: init_b)) |
3878 | return false; |
3879 | |
3880 | /* Sort DR_A and DR_B by the address they access. */ |
3881 | if (maybe_lt (a: init_b, b: init_a)) |
3882 | { |
3883 | std::swap (a&: init_a, b&: init_b); |
3884 | std::swap (a&: dr_info_a, b&: dr_info_b); |
3885 | std::swap (a&: dr_a, b&: dr_b); |
3886 | } |
3887 | |
3888 | /* If the two accesses could be dependent within a scalar iteration, |
3889 | make sure that we'd retain their order. */ |
3890 | if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b) |
3891 | && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b)) |
3892 | return false; |
3893 | |
3894 | /* There is no alias if abs (DR_STEP) is greater than or equal to |
3895 | the bytes spanned by the combination of the two accesses. */ |
3896 | *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info: dr_info_b) - init_a; |
3897 | return true; |
3898 | } |
3899 | |
3900 | /* Function vect_prune_runtime_alias_test_list. |
3901 | |
3902 | Prune a list of ddrs to be tested at run-time by versioning for alias. |
3903 | Merge several alias checks into one if possible. |
3904 |    Return FALSE if the resulting list of ddrs is longer than allowed by
3905 |    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3906 | |
3907 | opt_result |
3908 | vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) |
3909 | { |
3910 | typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; |
3911 | hash_set <tree_pair_hash> compared_objects; |
3912 | |
3913 | const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); |
3914 | vec<dr_with_seg_len_pair_t> &comp_alias_ddrs |
3915 | = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); |
3916 | const vec<vec_object_pair> &check_unequal_addrs |
3917 | = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); |
3918 | poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
3919 | tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); |
3920 | |
3921 | ddr_p ddr; |
3922 | unsigned int i; |
3923 | tree length_factor; |
3924 | |
3925 | DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list" ); |
3926 | |
3927 | /* Step values are irrelevant for aliasing if the number of vector |
3928 | iterations is equal to the number of scalar iterations (which can |
3929 | happen for fully-SLP loops). */ |
3930 | bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U); |
3931 | |
3932 | if (!vf_one_p) |
3933 | { |
3934 | /* Convert the checks for nonzero steps into bound tests. */ |
3935 | tree value; |
3936 | FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value) |
3937 | vect_check_lower_bound (loop_vinfo, expr: value, unsigned_p: true, min_value: 1); |
3938 | } |
3939 | |
3940 | if (may_alias_ddrs.is_empty ()) |
3941 | return opt_result::success (); |
3942 | |
3943 | comp_alias_ddrs.create (nelems: may_alias_ddrs.length ()); |
3944 | |
3945 | unsigned int loop_depth |
3946 | = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num, |
3947 | LOOP_VINFO_LOOP_NEST (loop_vinfo)); |
3948 | |
3949 | /* First, we collect all data ref pairs for aliasing checks. */ |
3950 | FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) |
3951 | { |
3952 | poly_uint64 lower_bound; |
3953 | tree segment_length_a, segment_length_b; |
3954 | unsigned HOST_WIDE_INT access_size_a, access_size_b; |
3955 | unsigned int align_a, align_b; |
3956 | |
3957 | /* Ignore the alias if the VF we chose ended up being no greater |
3958 | than the dependence distance. */ |
3959 | if (dependence_distance_ge_vf (ddr, loop_depth, vf: vect_factor)) |
3960 | continue; |
3961 | |
3962 | if (DDR_OBJECT_A (ddr)) |
3963 | { |
3964 | vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr)); |
3965 | if (!compared_objects.add (k: new_pair)) |
3966 | { |
3967 | if (dump_enabled_p ()) |
3968 | dump_printf_loc (MSG_NOTE, vect_location, |
3969 | "checking that %T and %T" |
3970 | " have different addresses\n" , |
3971 | new_pair.first, new_pair.second); |
3972 | LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (obj: new_pair); |
3973 | } |
3974 | continue; |
3975 | } |
3976 | |
3977 | dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr)); |
3978 | stmt_vec_info stmt_info_a = dr_info_a->stmt; |
3979 | |
3980 | dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); |
3981 | stmt_vec_info stmt_info_b = dr_info_b->stmt; |
3982 | |
3983 | bool preserves_scalar_order_p |
3984 | = vect_preserves_scalar_order_p (dr_info_a, dr_info_b); |
3985 | bool ignore_step_p |
3986 | = (vf_one_p |
3987 | && (preserves_scalar_order_p |
3988 | || operand_equal_p (DR_STEP (dr_info_a->dr), |
3989 | DR_STEP (dr_info_b->dr)))); |
3990 | |
3991 | /* Skip the pair if inter-iteration dependencies are irrelevant |
3992 | and intra-iteration dependencies are guaranteed to be honored. */ |
3993 | if (ignore_step_p |
3994 | && (preserves_scalar_order_p |
3995 | || vectorizable_with_step_bound_p (dr_info_a, dr_info_b, |
3996 | lower_bound_out: &lower_bound))) |
3997 | { |
3998 | if (dump_enabled_p ()) |
3999 | dump_printf_loc (MSG_NOTE, vect_location, |
4000 | "no need for alias check between " |
4001 | "%T and %T when VF is 1\n" , |
4002 | DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); |
4003 | continue; |
4004 | } |
4005 | |
4006 | /* See whether we can handle the alias using a bounds check on |
4007 | the step, and whether that's likely to be the best approach. |
4008 | (It might not be, for example, if the minimum step is much larger |
4009 | than the number of bytes handled by one vector iteration.) */ |
4010 | if (!ignore_step_p |
4011 | && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST |
4012 | && vectorizable_with_step_bound_p (dr_info_a, dr_info_b, |
4013 | lower_bound_out: &lower_bound) |
4014 | && (vect_small_gap_p (loop_vinfo, dr_info: dr_info_a, gap: lower_bound) |
4015 | || vect_small_gap_p (loop_vinfo, dr_info: dr_info_b, gap: lower_bound))) |
4016 | { |
4017 | bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr); |
4018 | if (dump_enabled_p ()) |
4019 | { |
4020 | dump_printf_loc (MSG_NOTE, vect_location, "no alias between " |
4021 | "%T and %T when the step %T is outside " , |
4022 | DR_REF (dr_info_a->dr), |
4023 | DR_REF (dr_info_b->dr), |
4024 | DR_STEP (dr_info_a->dr)); |
4025 | if (unsigned_p) |
4026 | dump_printf (MSG_NOTE, "[0" ); |
4027 | else |
4028 | { |
4029 | dump_printf (MSG_NOTE, "(" ); |
4030 | dump_dec (MSG_NOTE, poly_int64 (-lower_bound)); |
4031 | } |
4032 | dump_printf (MSG_NOTE, ", " ); |
4033 | dump_dec (MSG_NOTE, lower_bound); |
4034 | dump_printf (MSG_NOTE, ")\n" ); |
4035 | } |
4036 | vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr), |
4037 | unsigned_p, min_value: lower_bound); |
4038 | continue; |
4039 | } |
4040 | |
4041 | stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a); |
4042 | if (dr_group_first_a) |
4043 | { |
4044 | stmt_info_a = dr_group_first_a; |
4045 | dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a); |
4046 | } |
4047 | |
4048 | stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b); |
4049 | if (dr_group_first_b) |
4050 | { |
4051 | stmt_info_b = dr_group_first_b; |
4052 | dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b); |
4053 | } |
4054 | |
4055 | if (ignore_step_p) |
4056 | { |
4057 | segment_length_a = size_zero_node; |
4058 | segment_length_b = size_zero_node; |
4059 | } |
4060 | else |
4061 | { |
4062 | if (!operand_equal_p (DR_STEP (dr_info_a->dr), |
4063 | DR_STEP (dr_info_b->dr), flags: 0)) |
4064 | length_factor = scalar_loop_iters; |
4065 | else |
4066 | length_factor = size_int (vect_factor); |
4067 | segment_length_a = vect_vfa_segment_size (dr_info: dr_info_a, length_factor); |
4068 | segment_length_b = vect_vfa_segment_size (dr_info: dr_info_b, length_factor); |
4069 | } |
4070 | access_size_a = vect_vfa_access_size (vinfo: loop_vinfo, dr_info: dr_info_a); |
4071 | access_size_b = vect_vfa_access_size (vinfo: loop_vinfo, dr_info: dr_info_b); |
4072 | align_a = vect_vfa_align (dr_info: dr_info_a); |
4073 | align_b = vect_vfa_align (dr_info: dr_info_b); |
4074 | |
4075 | /* See whether the alias is known at compilation time. */ |
4076 | if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr), |
4077 | DR_BASE_ADDRESS (dr_info_b->dr), flags: 0) |
4078 | && operand_equal_p (DR_OFFSET (dr_info_a->dr), |
4079 | DR_OFFSET (dr_info_b->dr), flags: 0) |
4080 | && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST |
4081 | && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST |
4082 | && poly_int_tree_p (t: segment_length_a) |
4083 | && poly_int_tree_p (t: segment_length_b)) |
4084 | { |
4085 | int res = vect_compile_time_alias (a: dr_info_a, b: dr_info_b, |
4086 | segment_length_a, |
4087 | segment_length_b, |
4088 | access_size_a, |
4089 | access_size_b); |
4090 | if (res >= 0 && dump_enabled_p ()) |
4091 | { |
4092 | dump_printf_loc (MSG_NOTE, vect_location, |
4093 | "can tell at compile time that %T and %T" , |
4094 | DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); |
4095 | if (res == 0) |
4096 | dump_printf (MSG_NOTE, " do not alias\n" ); |
4097 | else |
4098 | dump_printf (MSG_NOTE, " alias\n" ); |
4099 | } |
4100 | |
4101 | if (res == 0) |
4102 | continue; |
4103 | |
4104 | if (res == 1) |
4105 | return opt_result::failure_at (loc: stmt_info_b->stmt, |
4106 | fmt: "not vectorized:" |
4107 | " compilation time alias: %G%G" , |
4108 | stmt_info_a->stmt, |
4109 | stmt_info_b->stmt); |
4110 | } |
4111 | |
4112 | dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a, |
4113 | access_size_a, align_a); |
4114 | dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b, |
4115 | access_size_b, align_b); |
4116 | /* Canonicalize the order to be the one that's needed for accurate |
4117 | RAW, WAR and WAW flags, in cases where the data references are |
4118 | well-ordered. The order doesn't really matter otherwise, |
4119 | but we might as well be consistent. */ |
4120 | if (get_later_stmt (stmt1_info: stmt_info_a, stmt2_info: stmt_info_b) == stmt_info_a) |
4121 | std::swap (a&: dr_a, b&: dr_b); |
4122 | |
4123 | dr_with_seg_len_pair_t dr_with_seg_len_pair |
4124 | (dr_a, dr_b, (preserves_scalar_order_p |
4125 | ? dr_with_seg_len_pair_t::WELL_ORDERED |
4126 | : dr_with_seg_len_pair_t::REORDERED)); |
4127 | |
4128 | comp_alias_ddrs.safe_push (obj: dr_with_seg_len_pair); |
4129 | } |
4130 | |
4131 | prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor); |
4132 | |
4133 | unsigned int count = (comp_alias_ddrs.length () |
4134 | + check_unequal_addrs.length ()); |
4135 | |
4136 | if (count |
4137 | && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) |
4138 | == VECT_COST_MODEL_VERY_CHEAP)) |
4139 | return opt_result::failure_at |
4140 | (loc: vect_location, fmt: "would need a runtime alias check\n" ); |
4141 | |
4142 | if (dump_enabled_p ()) |
4143 | dump_printf_loc (MSG_NOTE, vect_location, |
4144 | "improved number of alias checks from %d to %d\n" , |
4145 | may_alias_ddrs.length (), count); |
4146 | unsigned limit = param_vect_max_version_for_alias_checks; |
4147 | if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP) |
4148 | limit = param_vect_max_version_for_alias_checks * 6 / 10; |
4149 | if (count > limit) |
4150 | return opt_result::failure_at |
4151 | (loc: vect_location, |
4152 | fmt: "number of versioning for alias run-time tests exceeds %d " |
4153 | "(--param vect-max-version-for-alias-checks)\n" , limit); |
4154 | |
4155 | return opt_result::success (); |
4156 | } |
4157 | |
4158 | /* Check whether we can use an internal function for a gather load |
4159 | or scatter store. READ_P is true for loads and false for stores. |
4160 | MASKED_P is true if the load or store is conditional. MEMORY_TYPE is |
4161 | the type of the memory elements being loaded or stored. OFFSET_TYPE |
4162 | is the type of the offset that is being applied to the invariant |
4163 | base address. SCALE is the amount by which the offset should |
4164 | be multiplied *after* it has been converted to address width. |
4165 | |
4166 | Return true if the function is supported, storing the function id in |
4167 | *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */ |
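
/* For example (illustration only), for a gather like

     for (i = 0; i < n; i++)
       x[i] = y[idx[i]];

   with 32-bit IDX elements, if the target only provides gathers with
   64-bit offsets then the loop below widens OFFSET_TYPE from 32 to 64
   bits and retries before giving up once the offset precision reaches
   both the pointer size and the element width.  */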
4168 | |
4169 | bool |
4170 | vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p, |
4171 | tree vectype, tree memory_type, tree offset_type, |
4172 | int scale, internal_fn *ifn_out, |
4173 | tree *offset_vectype_out) |
4174 | { |
4175 | unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type)); |
4176 | unsigned int element_bits = vector_element_bits (vectype); |
4177 | if (element_bits != memory_bits) |
4178 | /* For now the vector elements must be the same width as the |
4179 | memory elements. */ |
4180 | return false; |
4181 | |
4182 | /* Work out which function we need. */ |
4183 | internal_fn ifn, alt_ifn, alt_ifn2; |
4184 | if (read_p) |
4185 | { |
4186 | ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; |
4187 | alt_ifn = IFN_MASK_GATHER_LOAD; |
4188 |       /* When the target supports MASK_LEN_GATHER_LOAD, we always
4189 |          use MASK_LEN_GATHER_LOAD regardless of whether len and
4190 |          mask are valid or not.  */
4191 | alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD; |
4192 | } |
4193 | else |
4194 | { |
4195 | ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; |
4196 | alt_ifn = IFN_MASK_SCATTER_STORE; |
4197 |       /* When the target supports MASK_LEN_SCATTER_STORE, we always
4198 |          use MASK_LEN_SCATTER_STORE regardless of whether len and
4199 |          mask are valid or not.  */
4200 | alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE; |
4201 | } |
4202 | |
4203 | for (;;) |
4204 | { |
4205 | tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type); |
4206 | if (!offset_vectype) |
4207 | return false; |
4208 | |
4209 | /* Test whether the target supports this combination. */ |
4210 | if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, |
4211 | offset_vectype, scale)) |
4212 | { |
4213 | *ifn_out = ifn; |
4214 | *offset_vectype_out = offset_vectype; |
4215 | return true; |
4216 | } |
4217 | else if (!masked_p |
4218 | && internal_gather_scatter_fn_supported_p (alt_ifn, vectype, |
4219 | memory_type, |
4220 | offset_vectype, |
4221 | scale)) |
4222 | { |
4223 | *ifn_out = alt_ifn; |
4224 | *offset_vectype_out = offset_vectype; |
4225 | return true; |
4226 | } |
4227 | else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype, |
4228 | memory_type, |
4229 | offset_vectype, scale)) |
4230 | { |
4231 | *ifn_out = alt_ifn2; |
4232 | *offset_vectype_out = offset_vectype; |
4233 | return true; |
4234 | } |
4235 | |
4236 | if (TYPE_PRECISION (offset_type) >= POINTER_SIZE |
4237 | && TYPE_PRECISION (offset_type) >= element_bits) |
4238 | return false; |
4239 | |
4240 | offset_type = build_nonstandard_integer_type |
4241 | (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type)); |
4242 | } |
4243 | } |
4244 | |
4245 | /* STMT_INFO is a call to an internal gather load or scatter store function. |
4246 | Describe the operation in INFO. */ |
4247 | |
4248 | static void |
4249 | vect_describe_gather_scatter_call (stmt_vec_info stmt_info, |
4250 | gather_scatter_info *info) |
4251 | { |
4252 | gcall *call = as_a <gcall *> (p: stmt_info->stmt); |
4253 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
4254 | data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
4255 | |
4256 | info->ifn = gimple_call_internal_fn (gs: call); |
4257 | info->decl = NULL_TREE; |
4258 | info->base = gimple_call_arg (gs: call, index: 0); |
4259 | info->offset = gimple_call_arg (gs: call, index: 1); |
4260 | info->offset_dt = vect_unknown_def_type; |
4261 | info->offset_vectype = NULL_TREE; |
4262 | info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); |
4263 | info->element_type = TREE_TYPE (vectype); |
4264 | info->memory_type = TREE_TYPE (DR_REF (dr)); |
4265 | } |
4266 | |
4267 | /* Return true if a non-affine read or write in STMT_INFO is suitable for a |
4268 | gather load or scatter store. Describe the operation in *INFO if so. */ |
4269 | |
4270 | bool |
4271 | vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, |
4272 | gather_scatter_info *info) |
4273 | { |
4274 | HOST_WIDE_INT scale = 1; |
4275 | poly_int64 pbitpos, pbitsize; |
4276 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
4277 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
4278 | tree offtype = NULL_TREE; |
4279 | tree decl = NULL_TREE, base, off; |
4280 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
4281 | tree memory_type = TREE_TYPE (DR_REF (dr)); |
4282 | machine_mode pmode; |
4283 | int punsignedp, reversep, pvolatilep = 0; |
4284 | internal_fn ifn; |
4285 | tree offset_vectype; |
4286 | bool masked_p = false; |
4287 | |
4288 | /* See whether this is already a call to a gather/scatter internal function. |
4289 | If not, see whether it's a masked load or store. */ |
4290 | gcall *call = dyn_cast <gcall *> (p: stmt_info->stmt); |
4291 | if (call && gimple_call_internal_p (gs: call)) |
4292 | { |
4293 | ifn = gimple_call_internal_fn (gs: call); |
4294 | if (internal_gather_scatter_fn_p (ifn)) |
4295 | { |
4296 | vect_describe_gather_scatter_call (stmt_info, info); |
4297 | return true; |
4298 | } |
4299 | masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE); |
4300 | } |
4301 | |
4302 | /* True if we should aim to use internal functions rather than |
4303 | built-in functions. */ |
4304 | bool use_ifn_p = (DR_IS_READ (dr) |
4305 | ? supports_vec_gather_load_p (TYPE_MODE (vectype)) |
4306 | : supports_vec_scatter_store_p (TYPE_MODE (vectype))); |
4307 | |
4308 | base = DR_REF (dr); |
4309 | /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, |
4310 | see if we can use the def stmt of the address. */ |
4311 | if (masked_p |
4312 | && TREE_CODE (base) == MEM_REF |
4313 | && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME |
4314 | && integer_zerop (TREE_OPERAND (base, 1)) |
4315 | && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) |
4316 | { |
4317 | gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); |
4318 | if (is_gimple_assign (gs: def_stmt) |
4319 | && gimple_assign_rhs_code (gs: def_stmt) == ADDR_EXPR) |
4320 | base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); |
4321 | } |
4322 | |
4323 | /* The gather and scatter builtins need address of the form |
4324 | loop_invariant + vector * {1, 2, 4, 8} |
4325 | or |
4326 | loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. |
4327 | Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture |
4328 | of loop invariants/SSA_NAMEs defined in the loop, with casts, |
4329 | multiplications and additions in it. To get a vector, we need |
4330 | a single SSA_NAME that will be defined in the loop and will |
4331 | contain everything that is not loop invariant and that can be |
4332 | vectorized. The following code attempts to find such a preexisting |
4333 | SSA_NAME OFF and put the loop invariants into a tree BASE |
4334 | that can be gimplified before the loop. */ |
4335 | base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, |
4336 | &punsignedp, &reversep, &pvolatilep); |
4337 | if (reversep) |
4338 | return false; |
4339 | |
4340 | /* PR 107346. Packed structs can have fields at offsets that are not |
4341 | multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */ |
4342 | if (!multiple_p (a: pbitpos, BITS_PER_UNIT)) |
4343 | return false; |
4344 | |
4345 | /* We need to be able to form an address to the base which for example |
4346 | isn't possible for hard registers. */ |
4347 | if (may_be_nonaddressable_p (expr: base)) |
4348 | return false; |
4349 | |
4350 | poly_int64 pbytepos = exact_div (a: pbitpos, BITS_PER_UNIT); |
4351 | |
4352 | if (TREE_CODE (base) == MEM_REF) |
4353 | { |
4354 | if (!integer_zerop (TREE_OPERAND (base, 1))) |
4355 | { |
4356 | if (off == NULL_TREE) |
4357 | off = wide_int_to_tree (sizetype, cst: mem_ref_offset (base)); |
4358 | else |
4359 | off = size_binop (PLUS_EXPR, off, |
4360 | fold_convert (sizetype, TREE_OPERAND (base, 1))); |
4361 | } |
4362 | base = TREE_OPERAND (base, 0); |
4363 | } |
4364 | else |
4365 | base = build_fold_addr_expr (base); |
4366 | |
4367 | if (off == NULL_TREE) |
4368 | off = size_zero_node; |
4369 | |
4370 | /* If base is not loop invariant then, if off is 0, we start with just |
4371 | the constant offset in the loop invariant BASE and continue with base |
4372 | as OFF; otherwise give up. |
4373 | We could handle that case by gimplifying the addition of base + off |
4374 | into some SSA_NAME and using that as off, but for now punt. */ |
4375 | if (!expr_invariant_in_loop_p (loop, base)) |
4376 | { |
4377 | if (!integer_zerop (off)) |
4378 | return false; |
4379 | off = base; |
4380 | base = size_int (pbytepos); |
4381 | } |
4382 | /* Otherwise put base + constant offset into the loop invariant BASE |
4383 | and continue with OFF. */ |
4384 | else |
4385 | { |
4386 | base = fold_convert (sizetype, base); |
4387 | base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); |
4388 | } |
4389 | |
4390 | /* OFF at this point may be either a SSA_NAME or some tree expression |
4391 | from get_inner_reference. Try to peel off loop invariants from it |
4392 | into BASE as long as possible. */ |
4393 | STRIP_NOPS (off); |
4394 | while (offtype == NULL_TREE) |
4395 | { |
4396 | enum tree_code code; |
4397 | tree op0, op1, add = NULL_TREE; |
4398 | |
4399 | if (TREE_CODE (off) == SSA_NAME) |
4400 | { |
4401 | gimple *def_stmt = SSA_NAME_DEF_STMT (off); |
4402 | |
4403 | if (expr_invariant_in_loop_p (loop, off)) |
4404 | return false; |
4405 | |
4406 | if (gimple_code (g: def_stmt) != GIMPLE_ASSIGN) |
4407 | break; |
4408 | |
4409 | op0 = gimple_assign_rhs1 (gs: def_stmt); |
4410 | code = gimple_assign_rhs_code (gs: def_stmt); |
4411 | op1 = gimple_assign_rhs2 (gs: def_stmt); |
4412 | } |
4413 | else |
4414 | { |
4415 | if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) |
4416 | return false; |
4417 | code = TREE_CODE (off); |
4418 | extract_ops_from_tree (expr: off, code: &code, op0: &op0, op1: &op1); |
4419 | } |
4420 | switch (code) |
4421 | { |
4422 | case POINTER_PLUS_EXPR: |
4423 | case PLUS_EXPR: |
4424 | if (expr_invariant_in_loop_p (loop, op0)) |
4425 | { |
4426 | add = op0; |
4427 | off = op1; |
4428 | do_add: |
4429 | add = fold_convert (sizetype, add); |
4430 | if (scale != 1) |
4431 | add = size_binop (MULT_EXPR, add, size_int (scale)); |
4432 | base = size_binop (PLUS_EXPR, base, add); |
4433 | continue; |
4434 | } |
4435 | if (expr_invariant_in_loop_p (loop, op1)) |
4436 | { |
4437 | add = op1; |
4438 | off = op0; |
4439 | goto do_add; |
4440 | } |
4441 | break; |
4442 | case MINUS_EXPR: |
4443 | if (expr_invariant_in_loop_p (loop, op1)) |
4444 | { |
4445 | add = fold_convert (sizetype, op1); |
4446 | add = size_binop (MINUS_EXPR, size_zero_node, add); |
4447 | off = op0; |
4448 | goto do_add; |
4449 | } |
4450 | break; |
4451 | case MULT_EXPR: |
4452 | if (scale == 1 && tree_fits_shwi_p (op1)) |
4453 | { |
4454 | int new_scale = tree_to_shwi (op1); |
4455 | /* Only treat this as a scaling operation if the target |
4456 | supports it for at least some offset type. */ |
4457 | if (use_ifn_p |
4458 | && !vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr), |
4459 | masked_p, vectype, memory_type, |
4460 | signed_char_type_node, |
4461 | scale: new_scale, ifn_out: &ifn, |
4462 | offset_vectype_out: &offset_vectype) |
4463 | && !vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr), |
4464 | masked_p, vectype, memory_type, |
4465 | unsigned_char_type_node, |
4466 | scale: new_scale, ifn_out: &ifn, |
4467 | offset_vectype_out: &offset_vectype)) |
4468 | break; |
4469 | scale = new_scale; |
4470 | off = op0; |
4471 | continue; |
4472 | } |
4473 | break; |
4474 | case SSA_NAME: |
4475 | off = op0; |
4476 | continue; |
4477 | CASE_CONVERT: |
4478 | if (!POINTER_TYPE_P (TREE_TYPE (op0)) |
4479 | && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) |
4480 | break; |
4481 | |
4482 | /* Don't include the conversion if the target is happy with |
4483 | the current offset type. */ |
4484 | if (use_ifn_p |
4485 | && TREE_CODE (off) == SSA_NAME |
4486 | && !POINTER_TYPE_P (TREE_TYPE (off)) |
4487 | && vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr), |
4488 | masked_p, vectype, memory_type, |
4489 | TREE_TYPE (off), scale, ifn_out: &ifn, |
4490 | offset_vectype_out: &offset_vectype)) |
4491 | break; |
4492 | |
4493 | if (TYPE_PRECISION (TREE_TYPE (op0)) |
4494 | == TYPE_PRECISION (TREE_TYPE (off))) |
4495 | { |
4496 | off = op0; |
4497 | continue; |
4498 | } |
4499 | |
4500 | /* Include the conversion if it is widening and either we're using |
4501 | the IFN path, or the target can handle the converted-from |
4502 | offset, or the current size is not already the same as the |
4503 | data vector element size. */ |
4504 | if ((TYPE_PRECISION (TREE_TYPE (op0)) |
4505 | < TYPE_PRECISION (TREE_TYPE (off))) |
4506 | && (use_ifn_p |
4507 | || (DR_IS_READ (dr) |
4508 | ? (targetm.vectorize.builtin_gather |
4509 | && targetm.vectorize.builtin_gather (vectype, |
4510 | TREE_TYPE (op0), |
4511 | scale)) |
4512 | : (targetm.vectorize.builtin_scatter |
4513 | && targetm.vectorize.builtin_scatter (vectype, |
4514 | TREE_TYPE (op0), |
4515 | scale))) |
4516 | || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)), |
4517 | TYPE_SIZE (TREE_TYPE (vectype)), flags: 0))) |
4518 | { |
4519 | off = op0; |
4520 | offtype = TREE_TYPE (off); |
4521 | STRIP_NOPS (off); |
4522 | continue; |
4523 | } |
4524 | break; |
4525 | default: |
4526 | break; |
4527 | } |
4528 | break; |
4529 | } |
4530 | |
4531 | /* If at the end OFF still isn't a SSA_NAME or isn't |
4532 | defined in the loop, punt. */ |
4533 | if (TREE_CODE (off) != SSA_NAME |
4534 | || expr_invariant_in_loop_p (loop, off)) |
4535 | return false; |
4536 | |
4537 | if (offtype == NULL_TREE) |
4538 | offtype = TREE_TYPE (off); |
4539 | |
4540 | if (use_ifn_p) |
4541 | { |
4542 | if (!vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr), masked_p, |
4543 | vectype, memory_type, offset_type: offtype, scale, |
4544 | ifn_out: &ifn, offset_vectype_out: &offset_vectype)) |
4545 | ifn = IFN_LAST; |
4546 | decl = NULL_TREE; |
4547 | } |
4548 | else |
4549 | { |
4550 | if (DR_IS_READ (dr)) |
4551 | { |
4552 | if (targetm.vectorize.builtin_gather) |
4553 | decl = targetm.vectorize.builtin_gather (vectype, offtype, scale); |
4554 | } |
4555 | else |
4556 | { |
4557 | if (targetm.vectorize.builtin_scatter) |
4558 | decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); |
4559 | } |
4560 | ifn = IFN_LAST; |
4561 | /* The offset vector type will be read from DECL when needed. */ |
4562 | offset_vectype = NULL_TREE; |
4563 | } |
4564 | |
4565 | info->ifn = ifn; |
4566 | info->decl = decl; |
4567 | info->base = base; |
4568 | info->offset = off; |
4569 | info->offset_dt = vect_unknown_def_type; |
4570 | info->offset_vectype = offset_vectype; |
4571 | info->scale = scale; |
4572 | info->element_type = TREE_TYPE (vectype); |
4573 | info->memory_type = memory_type; |
4574 | return true; |
4575 | } |
4576 | |
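/* Illustrative example (hypothetical code, not taken from the GCC
   testsuite) of the decomposition vect_check_gather_scatter performs:
   for the non-affine read in

     void g (double *restrict out, double *data, int *idx, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] = data[idx[i]];
     }

   the access data[idx[i]] would typically be described with BASE being
   the loop-invariant pointer "data", OFF the SSA_NAME holding (a possibly
   widened copy of) idx[i], SCALE sizeof (double) == 8, and element and
   memory type double - i.e. the loop_invariant + vector * {1, 2, 4, 8}
   form required by the gather and scatter builtins.  */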
4577 | /* Find the data references in STMT, analyze them with respect to LOOP and |
4578 | append them to DATAREFS. Return a failure result if the datarefs in this |
4579 | stmt cannot be handled. */ |
4580 | |
4581 | opt_result |
4582 | vect_find_stmt_data_reference (loop_p loop, gimple *stmt, |
4583 | vec<data_reference_p> *datarefs, |
4584 | vec<int> *dataref_groups, int group_id) |
4585 | { |
4586 | /* We can ignore clobbers for dataref analysis - they are removed during |
4587 | loop vectorization and BB vectorization checks dependences with a |
4588 | stmt walk. */ |
4589 | if (gimple_clobber_p (s: stmt)) |
4590 | return opt_result::success (); |
4591 | |
4592 | if (gimple_has_volatile_ops (stmt)) |
4593 | return opt_result::failure_at (loc: stmt, fmt: "not vectorized: volatile type: %G" , |
4594 | stmt); |
4595 | |
4596 | if (stmt_can_throw_internal (cfun, stmt)) |
4597 | return opt_result::failure_at (loc: stmt, |
4598 | fmt: "not vectorized:" |
4599 | " statement can throw an exception: %G" , |
4600 | stmt); |
4601 | |
4602 | auto_vec<data_reference_p, 2> refs; |
4603 | opt_result res = find_data_references_in_stmt (loop, stmt, &refs); |
4604 | if (!res) |
4605 | return res; |
4606 | |
4607 | if (refs.is_empty ()) |
4608 | return opt_result::success (); |
4609 | |
4610 | if (refs.length () > 1) |
4611 | { |
4612 | while (!refs.is_empty ()) |
4613 | free_data_ref (refs.pop ()); |
4614 | return opt_result::failure_at (loc: stmt, |
4615 | fmt: "not vectorized: more than one " |
4616 | "data ref in stmt: %G" , stmt); |
4617 | } |
4618 | |
4619 | data_reference_p dr = refs.pop (); |
4620 | if (gcall *call = dyn_cast <gcall *> (p: stmt)) |
4621 | if (!gimple_call_internal_p (gs: call) |
4622 | || (gimple_call_internal_fn (gs: call) != IFN_MASK_LOAD |
4623 | && gimple_call_internal_fn (gs: call) != IFN_MASK_STORE)) |
4624 | { |
4625 | free_data_ref (dr); |
4626 | return opt_result::failure_at (loc: stmt, |
4627 | fmt: "not vectorized: dr in a call %G" , stmt); |
4628 | } |
4629 | |
4630 | if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF |
4631 | && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) |
4632 | { |
4633 | free_data_ref (dr); |
4634 | return opt_result::failure_at (loc: stmt, |
4635 | fmt: "not vectorized:" |
4636 | " statement is an unsupported" |
4637 | " bitfield access %G" , stmt); |
4638 | } |
4639 | |
4640 | if (DR_BASE_ADDRESS (dr) |
4641 | && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) |
4642 | { |
4643 | free_data_ref (dr); |
4644 | return opt_result::failure_at (loc: stmt, |
4645 | fmt: "not vectorized:" |
4646 | " base addr of dr is a constant\n" ); |
4647 | } |
4648 | |
4649 | /* Check whether this may be a SIMD lane access and adjust the |
4650 | DR to make it easier for us to handle it. */ |
4651 | if (loop |
4652 | && loop->simduid |
4653 | && (!DR_BASE_ADDRESS (dr) |
4654 | || !DR_OFFSET (dr) |
4655 | || !DR_INIT (dr) |
4656 | || !DR_STEP (dr))) |
4657 | { |
4658 | struct data_reference *newdr |
4659 | = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt, |
4660 | DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr)); |
4661 | if (DR_BASE_ADDRESS (newdr) |
4662 | && DR_OFFSET (newdr) |
4663 | && DR_INIT (newdr) |
4664 | && DR_STEP (newdr) |
4665 | && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST |
4666 | && integer_zerop (DR_STEP (newdr))) |
4667 | { |
4668 | tree base_address = DR_BASE_ADDRESS (newdr); |
4669 | tree off = DR_OFFSET (newdr); |
4670 | tree step = ssize_int (1); |
4671 | if (integer_zerop (off) |
4672 | && TREE_CODE (base_address) == POINTER_PLUS_EXPR) |
4673 | { |
4674 | off = TREE_OPERAND (base_address, 1); |
4675 | base_address = TREE_OPERAND (base_address, 0); |
4676 | } |
4677 | STRIP_NOPS (off); |
4678 | if (TREE_CODE (off) == MULT_EXPR |
4679 | && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) |
4680 | { |
4681 | step = TREE_OPERAND (off, 1); |
4682 | off = TREE_OPERAND (off, 0); |
4683 | STRIP_NOPS (off); |
4684 | } |
4685 | if (CONVERT_EXPR_P (off) |
4686 | && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0))) |
4687 | < TYPE_PRECISION (TREE_TYPE (off)))) |
4688 | off = TREE_OPERAND (off, 0); |
4689 | if (TREE_CODE (off) == SSA_NAME) |
4690 | { |
4691 | gimple *def = SSA_NAME_DEF_STMT (off); |
4692 | /* Look through widening conversion. */ |
4693 | if (is_gimple_assign (gs: def) |
4694 | && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def))) |
4695 | { |
4696 | tree rhs1 = gimple_assign_rhs1 (gs: def); |
4697 | if (TREE_CODE (rhs1) == SSA_NAME |
4698 | && INTEGRAL_TYPE_P (TREE_TYPE (rhs1)) |
4699 | && (TYPE_PRECISION (TREE_TYPE (off)) |
4700 | > TYPE_PRECISION (TREE_TYPE (rhs1)))) |
4701 | def = SSA_NAME_DEF_STMT (rhs1); |
4702 | } |
4703 | if (is_gimple_call (gs: def) |
4704 | && gimple_call_internal_p (gs: def) |
4705 | && (gimple_call_internal_fn (gs: def) == IFN_GOMP_SIMD_LANE)) |
4706 | { |
4707 | tree arg = gimple_call_arg (gs: def, index: 0); |
4708 | tree reft = TREE_TYPE (DR_REF (newdr)); |
4709 | gcc_assert (TREE_CODE (arg) == SSA_NAME); |
4710 | arg = SSA_NAME_VAR (arg); |
4711 | if (arg == loop->simduid |
4712 | /* For now. */ |
4713 | && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step)) |
4714 | { |
4715 | DR_BASE_ADDRESS (newdr) = base_address; |
4716 | DR_OFFSET (newdr) = ssize_int (0); |
4717 | DR_STEP (newdr) = step; |
4718 | DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT; |
4719 | DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step); |
4720 | /* Mark as simd-lane access. */ |
4721 | tree arg2 = gimple_call_arg (gs: def, index: 1); |
4722 | newdr->aux = (void *) (-1 - tree_to_uhwi (arg2)); |
4723 | free_data_ref (dr); |
4724 | datarefs->safe_push (obj: newdr); |
4725 | if (dataref_groups) |
4726 | dataref_groups->safe_push (obj: group_id); |
4727 | return opt_result::success (); |
4728 | } |
4729 | } |
4730 | } |
4731 | } |
4732 | free_data_ref (newdr); |
4733 | } |
4734 | |
4735 | datarefs->safe_push (obj: dr); |
4736 | if (dataref_groups) |
4737 | dataref_groups->safe_push (obj: group_id); |
4738 | return opt_result::success (); |
4739 | } |
4740 | |
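/* Illustrative example (hypothetical code, not taken from the GCC
   testsuite) of a SIMD-lane access as detected above: privatized
   variables in an OpenMP simd loop are expanded into per-lane arrays
   indexed by the lane number produced by IFN_GOMP_SIMD_LANE, e.g.

     #pragma omp simd
     for (int i = 0; i < n; i++)
       {
         float tmp;        // privatized: becomes a per-lane array element
         tmp = a[i] * b[i];
         c[i] = tmp + 1.0f;
       }

   Such a reference initially fails the standard DR analysis, but once the
   IFN_GOMP_SIMD_LANE pattern is recognized the DR is rewritten with a zero
   offset and a step equal to the element size, so later phases can treat
   it like an ordinary linear access.  */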
4741 | /* Function vect_analyze_data_refs. |
4742 | |
4743 | Find all the data references in the loop or basic block. |
4744 | |
4745 | The general structure of the analysis of data refs in the vectorizer is as |
4746 | follows: |
4747 | 1- vect_analyze_data_refs(loop/bb): call |
4748 | compute_data_dependences_for_loop/bb to find and analyze all data-refs |
4749 | in the loop/bb and their dependences. |
4750 | 2- vect_analyze_dependences(): apply dependence testing using ddrs. |
4751 | 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. |
4752 | 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. |
4753 | |
4754 | */ |
4755 | |
4756 | opt_result |
4757 | vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) |
4758 | { |
4759 | class loop *loop = NULL; |
4760 | unsigned int i; |
4761 | struct data_reference *dr; |
4762 | tree scalar_type; |
4763 | |
4764 | DUMP_VECT_SCOPE ("vect_analyze_data_refs" ); |
4765 | |
4766 | if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo)) |
4767 | loop = LOOP_VINFO_LOOP (loop_vinfo); |
4768 | |
4769 | /* Go through the data-refs, check that the analysis succeeded. Update |
4770 | pointer from stmt_vec_info struct to DR and vectype. */ |
4771 | |
4772 | vec<data_reference_p> datarefs = vinfo->shared->datarefs; |
4773 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
4774 | { |
4775 | enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; |
4776 | poly_uint64 vf; |
4777 | |
4778 | gcc_assert (DR_REF (dr)); |
4779 | stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr)); |
4780 | gcc_assert (!stmt_info->dr_aux.dr); |
4781 | stmt_info->dr_aux.dr = dr; |
4782 | stmt_info->dr_aux.stmt = stmt_info; |
4783 | |
4784 | /* Check that analysis of the data-ref succeeded. */ |
4785 | if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) |
4786 | || !DR_STEP (dr)) |
4787 | { |
4788 | bool maybe_gather |
4789 | = DR_IS_READ (dr) |
4790 | && !TREE_THIS_VOLATILE (DR_REF (dr)); |
4791 | bool maybe_scatter |
4792 | = DR_IS_WRITE (dr) |
4793 | && !TREE_THIS_VOLATILE (DR_REF (dr)); |
4794 | |
4795 | /* If target supports vector gather loads or scatter stores, |
4796 | see if they can be used. */ |
4797 | if (is_a <loop_vec_info> (p: vinfo) |
4798 | && !nested_in_vect_loop_p (loop, stmt_info)) |
4799 | { |
4800 | if (maybe_gather || maybe_scatter) |
4801 | { |
4802 | if (maybe_gather) |
4803 | gatherscatter = GATHER; |
4804 | else |
4805 | gatherscatter = SCATTER; |
4806 | } |
4807 | } |
4808 | |
4809 | if (gatherscatter == SG_NONE) |
4810 | { |
4811 | if (dump_enabled_p ()) |
4812 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4813 | "not vectorized: data ref analysis " |
4814 | "failed %G" , stmt_info->stmt); |
4815 | if (is_a <bb_vec_info> (p: vinfo)) |
4816 | { |
4817 | /* In BB vectorization the ref can still participate |
4818 | in dependence analysis, we just can't vectorize it. */ |
4819 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; |
4820 | continue; |
4821 | } |
4822 | return opt_result::failure_at (loc: stmt_info->stmt, |
4823 | fmt: "not vectorized:" |
4824 | " data ref analysis failed: %G" , |
4825 | stmt_info->stmt); |
4826 | } |
4827 | } |
4828 | |
4829 | /* See if this was detected as SIMD lane access. */ |
4830 | if (dr->aux == (void *)-1 |
4831 | || dr->aux == (void *)-2 |
4832 | || dr->aux == (void *)-3 |
4833 | || dr->aux == (void *)-4) |
4834 | { |
4835 | if (nested_in_vect_loop_p (loop, stmt_info)) |
4836 | return opt_result::failure_at (loc: stmt_info->stmt, |
4837 | fmt: "not vectorized:" |
4838 | " data ref analysis failed: %G" , |
4839 | stmt_info->stmt); |
4840 | STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) |
4841 | = -(uintptr_t) dr->aux; |
4842 | } |
4843 | |
4844 | tree base = get_base_address (DR_REF (dr)); |
4845 | if (base && VAR_P (base) && DECL_NONALIASED (base)) |
4846 | { |
4847 | if (dump_enabled_p ()) |
4848 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4849 | "not vectorized: base object not addressable " |
4850 | "for stmt: %G" , stmt_info->stmt); |
4851 | if (is_a <bb_vec_info> (p: vinfo)) |
4852 | { |
4853 | /* In BB vectorization the ref can still participate |
4854 | in dependence analysis, we just can't vectorize it. */ |
4855 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; |
4856 | continue; |
4857 | } |
4858 | return opt_result::failure_at (loc: stmt_info->stmt, |
4859 | fmt: "not vectorized: base object not" |
4860 | " addressable for stmt: %G" , |
4861 | stmt_info->stmt); |
4862 | } |
4863 | |
4864 | if (is_a <loop_vec_info> (p: vinfo) |
4865 | && DR_STEP (dr) |
4866 | && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) |
4867 | { |
4868 | if (nested_in_vect_loop_p (loop, stmt_info)) |
4869 | return opt_result::failure_at (loc: stmt_info->stmt, |
4870 | fmt: "not vectorized: " |
4871 | "not suitable for strided load %G" , |
4872 | stmt_info->stmt); |
4873 | STMT_VINFO_STRIDED_P (stmt_info) = true; |
4874 | } |
4875 | |
4876 | /* Update DR field in stmt_vec_info struct. */ |
4877 | |
4878 | /* If the dataref is in an inner-loop of the loop that is considered for |
4879 | vectorization, we also want to analyze the access relative to |
4880 | the outer-loop (DR contains information only relative to the |
4881 | inner-most enclosing loop). We do that by building a reference to the |
4882 | first location accessed by the inner-loop, and analyze it relative to |
4883 | the outer-loop. */ |
4884 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
4885 | { |
4886 | /* Build a reference to the first location accessed by the |
4887 | inner loop: *(BASE + INIT + OFFSET). By construction, |
4888 | this address must be invariant in the inner loop, so we |
4889 | can consider it as being used in the outer loop. */ |
4890 | tree base = unshare_expr (DR_BASE_ADDRESS (dr)); |
4891 | tree offset = unshare_expr (DR_OFFSET (dr)); |
4892 | tree init = unshare_expr (DR_INIT (dr)); |
4893 | tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), |
4894 | init, offset); |
4895 | tree init_addr = fold_build_pointer_plus (base, init_offset); |
4896 | tree init_ref = build_fold_indirect_ref (init_addr); |
4897 | |
4898 | if (dump_enabled_p ()) |
4899 | dump_printf_loc (MSG_NOTE, vect_location, |
4900 | "analyze in outer loop: %T\n" , init_ref); |
4901 | |
4902 | opt_result res |
4903 | = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info), |
4904 | init_ref, loop, stmt_info->stmt); |
4905 | if (!res) |
4906 | /* dr_analyze_innermost already explained the failure. */ |
4907 | return res; |
4908 | |
4909 | if (dump_enabled_p ()) |
4910 | dump_printf_loc (MSG_NOTE, vect_location, |
4911 | "\touter base_address: %T\n" |
4912 | "\touter offset from base address: %T\n" |
4913 | "\touter constant offset from base address: %T\n" |
4914 | "\touter step: %T\n" |
4915 | "\touter base alignment: %d\n\n" |
4916 | "\touter base misalignment: %d\n" |
4917 | "\touter offset alignment: %d\n" |
4918 | "\touter step alignment: %d\n" , |
4919 | STMT_VINFO_DR_BASE_ADDRESS (stmt_info), |
4920 | STMT_VINFO_DR_OFFSET (stmt_info), |
4921 | STMT_VINFO_DR_INIT (stmt_info), |
4922 | STMT_VINFO_DR_STEP (stmt_info), |
4923 | STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info), |
4924 | STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info), |
4925 | STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info), |
4926 | STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info)); |
4927 | } |
4928 | |
4929 | /* Set vectype for STMT. */ |
4930 | scalar_type = TREE_TYPE (DR_REF (dr)); |
4931 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); |
4932 | if (!vectype) |
4933 | { |
4934 | if (dump_enabled_p ()) |
4935 | { |
4936 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4937 | "not vectorized: no vectype for stmt: %G" , |
4938 | stmt_info->stmt); |
4939 | dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: " ); |
4940 | dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, |
4941 | scalar_type); |
4942 | dump_printf (MSG_MISSED_OPTIMIZATION, "\n" ); |
4943 | } |
4944 | |
4945 | if (is_a <bb_vec_info> (p: vinfo)) |
4946 | { |
4947 | /* No vector type is fine, the ref can still participate |
4948 | in dependence analysis, we just can't vectorize it. */ |
4949 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; |
4950 | continue; |
4951 | } |
4952 | if (fatal) |
4953 | *fatal = false; |
4954 | return opt_result::failure_at (loc: stmt_info->stmt, |
4955 | fmt: "not vectorized:" |
4956 | " no vectype for stmt: %G" |
4957 | " scalar_type: %T\n" , |
4958 | stmt_info->stmt, scalar_type); |
4959 | } |
4960 | else |
4961 | { |
4962 | if (dump_enabled_p ()) |
4963 | dump_printf_loc (MSG_NOTE, vect_location, |
4964 | "got vectype for stmt: %G%T\n" , |
4965 | stmt_info->stmt, vectype); |
4966 | } |
4967 | |
4968 | /* Adjust the minimal vectorization factor according to the |
4969 | vector type. */ |
4970 | vf = TYPE_VECTOR_SUBPARTS (node: vectype); |
4971 | *min_vf = upper_bound (a: *min_vf, b: vf); |
4972 | |
4973 | /* Leave the BB vectorizer to pick the vector type later, based on |
4974 | the final dataref group size and SLP node size. */ |
4975 | if (is_a <loop_vec_info> (p: vinfo)) |
4976 | STMT_VINFO_VECTYPE (stmt_info) = vectype; |
4977 | |
4978 | if (gatherscatter != SG_NONE) |
4979 | { |
4980 | gather_scatter_info gs_info; |
4981 | if (!vect_check_gather_scatter (stmt_info, |
4982 | loop_vinfo: as_a <loop_vec_info> (p: vinfo), |
4983 | info: &gs_info) |
4984 | || !get_vectype_for_scalar_type (vinfo, |
4985 | TREE_TYPE (gs_info.offset))) |
4986 | { |
4987 | if (fatal) |
4988 | *fatal = false; |
4989 | return opt_result::failure_at |
4990 | (loc: stmt_info->stmt, |
4991 | fmt: (gatherscatter == GATHER) |
4992 | ? "not vectorized: not suitable for gather load %G" |
4993 | : "not vectorized: not suitable for scatter store %G" , |
4994 | stmt_info->stmt); |
4995 | } |
4996 | STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; |
4997 | } |
4998 | } |
4999 | |
5000 | /* We used to stop processing and prune the list here. Verify we no |
5001 | longer need to. */ |
5002 | gcc_assert (i == datarefs.length ()); |
5003 | |
5004 | return opt_result::success (); |
5005 | } |
5006 | |
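/* Illustrative classification (hypothetical code, not taken from the GCC
   testsuite) of the accesses handled above, assuming s is loop invariant
   but not a compile-time constant:

     for (int i = 0; i < n; i++)
       {
         x += a[i];      // affine access, analyzed as a normal DR
         y += a[i * s];  // non-constant DR_STEP -> STMT_VINFO_STRIDED_P
         z += a[b[i]];   // DR analysis fails -> GATHER (or SCATTER when
                         // written), if vect_check_gather_scatter agrees
       }
*/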
5007 | |
5008 | /* Function vect_get_new_vect_var. |
5009 | |
5010 | Returns a new variable of type TYPE. The current naming scheme uses a |
5011 | prefix that depends on VAR_KIND ("vect", "vectp", "stmp" or "mask") |
5012 | for vectorizer-generated variables, and appends "_" followed by NAME |
5013 | to that prefix if NAME is provided. */ |
5014 | |
5015 | tree |
5016 | vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) |
5017 | { |
5018 | const char *prefix; |
5019 | tree new_vect_var; |
5020 | |
5021 | switch (var_kind) |
5022 | { |
5023 | case vect_simple_var: |
5024 | prefix = "vect" ; |
5025 | break; |
5026 | case vect_scalar_var: |
5027 | prefix = "stmp" ; |
5028 | break; |
5029 | case vect_mask_var: |
5030 | prefix = "mask" ; |
5031 | break; |
5032 | case vect_pointer_var: |
5033 | prefix = "vectp" ; |
5034 | break; |
5035 | default: |
5036 | gcc_unreachable (); |
5037 | } |
5038 | |
5039 | if (name) |
5040 | { |
5041 | char* tmp = concat (prefix, "_" , name, NULL); |
5042 | new_vect_var = create_tmp_reg (type, tmp); |
5043 | free (ptr: tmp); |
5044 | } |
5045 | else |
5046 | new_vect_var = create_tmp_reg (type, prefix); |
5047 | |
5048 | return new_vect_var; |
5049 | } |
5050 | |
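/* For illustration only (the names and numeric suffixes below are made up;
   the real ones are assigned by the temporary and SSA machinery): for a
   load from an array "in", the -fdump-tree-vect dumps would typically show
   variables along the lines of

     vectp_in.7    // vect_pointer_var: pointer to the vector type
     vect__4.12    // vect_simple_var:  a vector temporary
     stmp_sum.3    // vect_scalar_var:  a scalar temporary
     mask__5.9     // vect_mask_var:    a vector mask temporary

   i.e. the VAR_KIND prefix, "_", and the scalar name when one exists.  */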
5051 | /* Like vect_get_new_vect_var but return an SSA name. */ |
5052 | |
5053 | tree |
5054 | vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) |
5055 | { |
5056 | const char *prefix; |
5057 | tree new_vect_var; |
5058 | |
5059 | switch (var_kind) |
5060 | { |
5061 | case vect_simple_var: |
5062 | prefix = "vect" ; |
5063 | break; |
5064 | case vect_scalar_var: |
5065 | prefix = "stmp" ; |
5066 | break; |
5067 | case vect_pointer_var: |
5068 | prefix = "vectp" ; |
5069 | break; |
5070 | default: |
5071 | gcc_unreachable (); |
5072 | } |
5073 | |
5074 | if (name) |
5075 | { |
5076 | char* tmp = concat (prefix, "_" , name, NULL); |
5077 | new_vect_var = make_temp_ssa_name (type, NULL, name: tmp); |
5078 | free (ptr: tmp); |
5079 | } |
5080 | else |
5081 | new_vect_var = make_temp_ssa_name (type, NULL, name: prefix); |
5082 | |
5083 | return new_vect_var; |
5084 | } |
5085 | |
5086 | /* Duplicate points-to info on NAME from DR_INFO. */ |
5087 | |
5088 | static void |
5089 | vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info) |
5090 | { |
5091 | duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr)); |
5092 | /* DR_PTR_INFO is for a base SSA name, not including constant or |
5093 | variable offsets in the ref so its alignment info does not apply. */ |
5094 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); |
5095 | } |
5096 | |
5097 | /* Function vect_create_addr_base_for_vector_ref. |
5098 | |
5099 | Create an expression that computes the address of the first memory location |
5100 | that will be accessed for a data reference. |
5101 | |
5102 | Input: |
5103 | STMT_INFO: The statement containing the data reference. |
5104 | NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. |
5105 | OFFSET: Optional. If supplied, it is added to the initial address. |
5106 | LOOP: Specify relative to which loop-nest should the address be computed. |
5107 | For example, when the dataref is in an inner-loop nested in an |
5108 | outer-loop that is now being vectorized, LOOP can be either the |
5109 | outer-loop, or the inner-loop. The first memory location accessed |
5110 | by the following dataref ('in' points to short): |
5111 | |
5112 | for (i=0; i<N; i++) |
5113 | for (j=0; j<M; j++) |
5114 | s += in[i+j] |
5115 | |
5116 | is as follows: |
5117 | if LOOP=i_loop: &in (relative to i_loop) |
5118 | if LOOP=j_loop: &in+i*2B (relative to j_loop) |
5119 | |
5120 | Output: |
5121 | 1. Return an SSA_NAME whose value is the address of the memory location of |
5122 | the first vector of the data reference. |
5123 | 2. If new_stmt_list is not NULL_TREE after return then the caller must insert |
5124 | these statement(s) which define the returned SSA_NAME. |
5125 | |
5126 | FORNOW: We are only handling array accesses with step 1. */ |
5127 | |
5128 | tree |
5129 | vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info, |
5130 | gimple_seq *new_stmt_list, |
5131 | tree offset) |
5132 | { |
5133 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
5134 | struct data_reference *dr = dr_info->dr; |
5135 | const char *base_name; |
5136 | tree addr_base; |
5137 | tree dest; |
5138 | gimple_seq seq = NULL; |
5139 | tree vect_ptr_type; |
5140 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo); |
5141 | innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info); |
5142 | |
5143 | tree data_ref_base = unshare_expr (drb->base_address); |
5144 | tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, check_outer: true)); |
5145 | tree init = unshare_expr (drb->init); |
5146 | |
5147 | if (loop_vinfo) |
5148 | base_name = get_name (data_ref_base); |
5149 | else |
5150 | { |
5151 | base_offset = ssize_int (0); |
5152 | init = ssize_int (0); |
5153 | base_name = get_name (DR_REF (dr)); |
5154 | } |
5155 | |
5156 | /* Create base_offset */ |
5157 | base_offset = size_binop (PLUS_EXPR, |
5158 | fold_convert (sizetype, base_offset), |
5159 | fold_convert (sizetype, init)); |
5160 | |
5161 | if (offset) |
5162 | { |
5163 | offset = fold_convert (sizetype, offset); |
5164 | base_offset = fold_build2 (PLUS_EXPR, sizetype, |
5165 | base_offset, offset); |
5166 | } |
5167 | |
5168 | /* base + base_offset */ |
5169 | if (loop_vinfo) |
5170 | addr_base = fold_build_pointer_plus (data_ref_base, base_offset); |
5171 | else |
5172 | addr_base = build1 (ADDR_EXPR, |
5173 | build_pointer_type (TREE_TYPE (DR_REF (dr))), |
5174 | /* Strip zero offset components since we don't need |
5175 | them and they can confuse late diagnostics if |
5176 | we CSE them wrongly. See PR106904 for example. */ |
5177 | unshare_expr (strip_zero_offset_components |
5178 | (DR_REF (dr)))); |
5179 | |
5180 | vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr))); |
5181 | dest = vect_get_new_vect_var (type: vect_ptr_type, var_kind: vect_pointer_var, name: base_name); |
5182 | addr_base = force_gimple_operand (addr_base, &seq, true, dest); |
5183 | gimple_seq_add_seq (new_stmt_list, seq); |
5184 | |
5185 | if (DR_PTR_INFO (dr) |
5186 | && TREE_CODE (addr_base) == SSA_NAME |
5187 | /* We should only duplicate pointer info to newly created SSA names. */ |
5188 | && SSA_NAME_VAR (addr_base) == dest) |
5189 | { |
5190 | gcc_assert (!SSA_NAME_PTR_INFO (addr_base)); |
5191 | vect_duplicate_ssa_name_ptr_info (name: addr_base, dr_info); |
5192 | } |
5193 | |
5194 | if (dump_enabled_p ()) |
5195 | dump_printf_loc (MSG_NOTE, vect_location, "created %T\n" , addr_base); |
5196 | |
5197 | return addr_base; |
5198 | } |
5199 | |
5200 | |
5201 | /* Function vect_create_data_ref_ptr. |
5202 | |
5203 | Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first |
5204 | location accessed in the loop by STMT_INFO, along with the def-use update |
5205 | chain to appropriately advance the pointer through the loop iterations. |
5206 | Also set aliasing information for the pointer. This pointer is used by |
5207 | the callers to this function to create a memory reference expression for |
5208 | vector load/store access. |
5209 | |
5210 | Input: |
5211 | 1. STMT_INFO: a stmt that references memory. Expected to be of the form |
5212 | GIMPLE_ASSIGN <name, data-ref> or |
5213 | GIMPLE_ASSIGN <data-ref, name>. |
5214 | 2. AGGR_TYPE: the type of the reference, which should be either a vector |
5215 | or an array. |
5216 | 3. AT_LOOP: the loop where the vector memref is to be created. |
5217 | 4. OFFSET (optional): a byte offset to be added to the initial address |
5218 | accessed by the data-ref in STMT_INFO. |
5219 | 5. BSI: location where the new stmts are to be placed if there is no loop |
5220 | 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain |
5221 | pointing to the initial address. |
5222 | 7. IV_STEP (optional, defaults to NULL): the amount that should be added |
5223 | to the IV during each iteration of the loop. NULL says to move |
5224 | by one copy of AGGR_TYPE up or down, depending on the step of the |
5225 | data reference. |
5226 | |
5227 | Output: |
5228 | 1. Declare a new ptr to vector_type, and have it point to the base of the |
5229 | data reference (initial address accessed by the data reference). |
5230 | For example, for vector of type V8HI, the following code is generated: |
5231 | |
5232 | v8hi *ap; |
5233 | ap = (v8hi *)initial_address; |
5234 | |
5235 | if OFFSET is not supplied: |
5236 | initial_address = &a[init]; |
5237 | if OFFSET is supplied: |
5238 | initial_address = &a[init] + OFFSET; |
5241 | |
5242 | Return the initial_address in INITIAL_ADDRESS. |
5243 | |
5244 | 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also |
5245 | update the pointer in each iteration of the loop. |
5246 | |
5247 | Return the increment stmt that updates the pointer in PTR_INCR. |
5248 | |
5249 | 3. Return the pointer. */ |
5250 | |
5251 | tree |
5252 | vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info, |
5253 | tree aggr_type, class loop *at_loop, tree offset, |
5254 | tree *initial_address, gimple_stmt_iterator *gsi, |
5255 | gimple **ptr_incr, bool only_init, |
5256 | tree iv_step) |
5257 | { |
5258 | const char *base_name; |
5259 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo); |
5260 | class loop *loop = NULL; |
5261 | bool nested_in_vect_loop = false; |
5262 | class loop *containing_loop = NULL; |
5263 | tree aggr_ptr_type; |
5264 | tree aggr_ptr; |
5265 | tree new_temp; |
5266 | gimple_seq new_stmt_list = NULL; |
5267 | edge pe = NULL; |
5268 | basic_block new_bb; |
5269 | tree aggr_ptr_init; |
5270 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
5271 | struct data_reference *dr = dr_info->dr; |
5272 | tree aptr; |
5273 | gimple_stmt_iterator incr_gsi; |
5274 | bool insert_after; |
5275 | tree indx_before_incr, indx_after_incr; |
5276 | gimple *incr; |
5277 | bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo); |
5278 | |
5279 | gcc_assert (iv_step != NULL_TREE |
5280 | || TREE_CODE (aggr_type) == ARRAY_TYPE |
5281 | || TREE_CODE (aggr_type) == VECTOR_TYPE); |
5282 | |
5283 | if (loop_vinfo) |
5284 | { |
5285 | loop = LOOP_VINFO_LOOP (loop_vinfo); |
5286 | nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info); |
5287 | containing_loop = (gimple_bb (g: stmt_info->stmt))->loop_father; |
5288 | pe = loop_preheader_edge (loop); |
5289 | } |
5290 | else |
5291 | { |
5292 | gcc_assert (bb_vinfo); |
5293 | only_init = true; |
5294 | *ptr_incr = NULL; |
5295 | } |
5296 | |
5297 | /* Create an expression for the first address accessed by this load |
5298 | in LOOP. */ |
5299 | base_name = get_name (DR_BASE_ADDRESS (dr)); |
5300 | |
5301 | if (dump_enabled_p ()) |
5302 | { |
5303 | tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); |
5304 | dump_printf_loc (MSG_NOTE, vect_location, |
5305 | "create %s-pointer variable to type: %T" , |
5306 | get_tree_code_name (TREE_CODE (aggr_type)), |
5307 | aggr_type); |
5308 | if (TREE_CODE (dr_base_type) == ARRAY_TYPE) |
5309 | dump_printf (MSG_NOTE, " vectorizing an array ref: " ); |
5310 | else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) |
5311 | dump_printf (MSG_NOTE, " vectorizing a vector ref: " ); |
5312 | else if (TREE_CODE (dr_base_type) == RECORD_TYPE) |
5313 | dump_printf (MSG_NOTE, " vectorizing a record based array ref: " ); |
5314 | else |
5315 | dump_printf (MSG_NOTE, " vectorizing a pointer ref: " ); |
5316 | dump_printf (MSG_NOTE, "%T\n" , DR_BASE_OBJECT (dr)); |
5317 | } |
5318 | |
5319 | /* (1) Create the new aggregate-pointer variable. |
5320 | Vector and array types inherit the alias set of their component |
5321 | type by default so we need to use a ref-all pointer if the data |
5322 | reference does not conflict with the created aggregated data |
5323 | reference because it is not addressable. */ |
5324 | bool need_ref_all = false; |
5325 | if (!alias_sets_conflict_p (get_alias_set (aggr_type), |
5326 | get_alias_set (DR_REF (dr)))) |
5327 | need_ref_all = true; |
5328 | /* Likewise for any of the data references in the stmt group. */ |
5329 | else if (DR_GROUP_SIZE (stmt_info) > 1) |
5330 | { |
5331 | stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info); |
5332 | do |
5333 | { |
5334 | struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); |
5335 | if (!alias_sets_conflict_p (get_alias_set (aggr_type), |
5336 | get_alias_set (DR_REF (sdr)))) |
5337 | { |
5338 | need_ref_all = true; |
5339 | break; |
5340 | } |
5341 | sinfo = DR_GROUP_NEXT_ELEMENT (sinfo); |
5342 | } |
5343 | while (sinfo); |
5344 | } |
5345 | aggr_ptr_type = build_pointer_type_for_mode (aggr_type, VOIDmode, |
5346 | need_ref_all); |
5347 | aggr_ptr = vect_get_new_vect_var (type: aggr_ptr_type, var_kind: vect_pointer_var, name: base_name); |
5348 | |
5349 | |
5350 | /* Note: If the dataref is in an inner-loop nested in LOOP, and we are |
5351 | vectorizing LOOP (i.e., outer-loop vectorization), we need to create two |
5352 | def-use update cycles for the pointer: one relative to the outer-loop |
5353 | (LOOP), which is what steps (3) and (4) below do. The other is relative |
5354 | to the inner-loop (which is the inner-most loop containing the dataref), |
5355 | and this is done by step (5) below. |
5356 | |
5357 | When vectorizing inner-most loops, the vectorized loop (LOOP) is also the |
5358 | inner-most loop, and so steps (3),(4) work the same, and step (5) is |
5359 | redundant. Steps (3),(4) create the following: |
5360 | |
5361 | vp0 = &base_addr; |
5362 | LOOP: vp1 = phi(vp0,vp2) |
5363 | ... |
5364 | ... |
5365 | vp2 = vp1 + step |
5366 | goto LOOP |
5367 | |
5368 | If there is an inner-loop nested in loop, then step (5) will also be |
5369 | applied, and an additional update in the inner-loop will be created: |
5370 | |
5371 | vp0 = &base_addr; |
5372 | LOOP: vp1 = phi(vp0,vp2) |
5373 | ... |
5374 | inner: vp3 = phi(vp1,vp4) |
5375 | vp4 = vp3 + inner_step |
5376 | if () goto inner |
5377 | ... |
5378 | vp2 = vp1 + step |
5379 | if () goto LOOP */ |
5380 | |
5381 | /* (2) Calculate the initial address of the aggregate-pointer, and set |
5382 | the aggregate-pointer to point to it before the loop. */ |
5383 | |
5384 | /* Create: (&(base[init_val]+offset) in the loop preheader. */ |
5385 | |
5386 | new_temp = vect_create_addr_base_for_vector_ref (vinfo, |
5387 | stmt_info, new_stmt_list: &new_stmt_list, |
5388 | offset); |
5389 | if (new_stmt_list) |
5390 | { |
5391 | if (pe) |
5392 | { |
5393 | new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); |
5394 | gcc_assert (!new_bb); |
5395 | } |
5396 | else |
5397 | gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); |
5398 | } |
5399 | |
5400 | *initial_address = new_temp; |
5401 | aggr_ptr_init = new_temp; |
5402 | |
5403 | /* (3) Handle the updating of the aggregate-pointer inside the loop. |
5404 | This is needed when ONLY_INIT is false, and also when AT_LOOP is the |
5405 | inner-loop nested in LOOP (during outer-loop vectorization). */ |
5406 | |
5407 | /* No update in loop is required. */ |
5408 | if (only_init && (!loop_vinfo || at_loop == loop)) |
5409 | aptr = aggr_ptr_init; |
5410 | else |
5411 | { |
5412 | /* Accesses to invariant addresses should be handled specially |
5413 | by the caller. */ |
5414 | tree step = vect_dr_behavior (vinfo, dr_info)->step; |
5415 | gcc_assert (!integer_zerop (step)); |
5416 | |
5417 | if (iv_step == NULL_TREE) |
5418 | { |
5419 | /* The step of the aggregate pointer is the type size, |
5420 | negated for downward accesses. */ |
5421 | iv_step = TYPE_SIZE_UNIT (aggr_type); |
5422 | if (tree_int_cst_sgn (step) == -1) |
5423 | iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); |
5424 | } |
5425 | |
5426 | standard_iv_increment_position (loop, &incr_gsi, &insert_after); |
5427 | |
5428 | create_iv (aggr_ptr_init, PLUS_EXPR, |
5429 | fold_convert (aggr_ptr_type, iv_step), |
5430 | aggr_ptr, loop, &incr_gsi, insert_after, |
5431 | &indx_before_incr, &indx_after_incr); |
5432 | incr = gsi_stmt (i: incr_gsi); |
5433 | |
5434 | /* Copy the points-to information if it exists. */ |
5435 | if (DR_PTR_INFO (dr)) |
5436 | { |
5437 | vect_duplicate_ssa_name_ptr_info (name: indx_before_incr, dr_info); |
5438 | vect_duplicate_ssa_name_ptr_info (name: indx_after_incr, dr_info); |
5439 | } |
5440 | if (ptr_incr) |
5441 | *ptr_incr = incr; |
5442 | |
5443 | aptr = indx_before_incr; |
5444 | } |
5445 | |
5446 | if (!nested_in_vect_loop || only_init) |
5447 | return aptr; |
5448 | |
5449 | |
5450 | /* (4) Handle the updating of the aggregate-pointer inside the inner-loop |
5451 | nested in LOOP, if it exists. */ |
5452 | |
5453 | gcc_assert (nested_in_vect_loop); |
5454 | if (!only_init) |
5455 | { |
5456 | standard_iv_increment_position (containing_loop, &incr_gsi, |
5457 | &insert_after); |
5458 | create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)), |
5459 | aggr_ptr, containing_loop, &incr_gsi, insert_after, |
5460 | &indx_before_incr, &indx_after_incr); |
5461 | incr = gsi_stmt (i: incr_gsi); |
5462 | |
5463 | /* Copy the points-to information if it exists. */ |
5464 | if (DR_PTR_INFO (dr)) |
5465 | { |
5466 | vect_duplicate_ssa_name_ptr_info (name: indx_before_incr, dr_info); |
5467 | vect_duplicate_ssa_name_ptr_info (name: indx_after_incr, dr_info); |
5468 | } |
5469 | if (ptr_incr) |
5470 | *ptr_incr = incr; |
5471 | |
5472 | return indx_before_incr; |
5473 | } |
5474 | else |
5475 | gcc_unreachable (); |
5476 | } |
5477 | |
5478 | |
5479 | /* Function bump_vector_ptr |
5480 | |
5481 | Increment a pointer (to a vector type) by vector-size. If requested, |
5482 | i.e. if PTR_INCR is given, then also connect the new increment stmt |
5483 | to the existing def-use update-chain of the pointer, by modifying |
5484 | the PTR_INCR as illustrated below: |
5485 | |
5486 | The pointer def-use update-chain before this function: |
5487 | DATAREF_PTR = phi (p_0, p_2) |
5488 | .... |
5489 | PTR_INCR: p_2 = DATAREF_PTR + step |
5490 | |
5491 | The pointer def-use update-chain after this function: |
5492 | DATAREF_PTR = phi (p_0, p_2) |
5493 | .... |
5494 | NEW_DATAREF_PTR = DATAREF_PTR + BUMP |
5495 | .... |
5496 | PTR_INCR: p_2 = NEW_DATAREF_PTR + step |
5497 | |
5498 | Input: |
5499 | DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated |
5500 | in the loop. |
5501 | PTR_INCR - optional. The stmt that updates the pointer in each iteration of |
5502 | the loop. The increment amount across iterations is expected |
5503 | to be vector_size. |
5504 | BSI - location where the new update stmt is to be placed. |
5505 | STMT_INFO - the original scalar memory-access stmt that is being vectorized. |
5506 | BUMP - optional. The offset by which to bump the pointer. If not given, |
5507 | the offset is assumed to be vector_size. |
5508 | |
5509 | Output: Return NEW_DATAREF_PTR as illustrated above. |
5510 | |
5511 | */ |
5512 | |
5513 | tree |
5514 | bump_vector_ptr (vec_info *vinfo, |
5515 | tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, |
5516 | stmt_vec_info stmt_info, tree bump) |
5517 | { |
5518 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
5519 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
5520 | tree update = TYPE_SIZE_UNIT (vectype); |
5521 | gimple *incr_stmt; |
5522 | ssa_op_iter iter; |
5523 | use_operand_p use_p; |
5524 | tree new_dataref_ptr; |
5525 | |
5526 | if (bump) |
5527 | update = bump; |
5528 | |
5529 | if (TREE_CODE (dataref_ptr) == SSA_NAME) |
5530 | new_dataref_ptr = copy_ssa_name (var: dataref_ptr); |
5531 | else if (is_gimple_min_invariant (dataref_ptr)) |
5532 | /* When possible avoid emitting a separate increment stmt that will |
5533 | force the addressed object to become addressable. */ |
5534 | return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr), |
5535 | fold_build2 (MEM_REF, |
5536 | TREE_TYPE (TREE_TYPE (dataref_ptr)), |
5537 | dataref_ptr, |
5538 | fold_convert (ptr_type_node, update))); |
5539 | else |
5540 | new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); |
5541 | incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, |
5542 | dataref_ptr, update); |
5543 | vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi); |
5544 | /* Fold the increment to avoid building excessive use-def chains, which |
5545 | lead to compile-time issues for passes run before the next forwprop |
5546 | pass, which would otherwise perform this folding as well. */ |
5547 | gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt); |
5548 | if (fold_stmt (&fold_gsi, follow_all_ssa_edges)) |
5549 | { |
5550 | incr_stmt = gsi_stmt (i: fold_gsi); |
5551 | update_stmt (s: incr_stmt); |
5552 | } |
5553 | |
5554 | /* Copy the points-to information if it exists. */ |
5555 | if (DR_PTR_INFO (dr)) |
5556 | { |
5557 | duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); |
5558 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); |
5559 | } |
5560 | |
5561 | if (!ptr_incr) |
5562 | return new_dataref_ptr; |
5563 | |
5564 | /* Update the vector-pointer's cross-iteration increment. */ |
5565 | FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) |
5566 | { |
5567 | tree use = USE_FROM_PTR (use_p); |
5568 | |
5569 | if (use == dataref_ptr) |
5570 | SET_USE (use_p, new_dataref_ptr); |
5571 | else |
5572 | gcc_assert (operand_equal_p (use, update, 0)); |
5573 | } |
5574 | |
5575 | return new_dataref_ptr; |
5576 | } |
5577 | |
5578 | |
5579 | /* Copy memory reference info such as base/clique from the SRC reference |
5580 | to the DEST MEM_REF. */ |
5581 | |
5582 | void |
5583 | vect_copy_ref_info (tree dest, tree src) |
5584 | { |
5585 | if (TREE_CODE (dest) != MEM_REF) |
5586 | return; |
5587 | |
5588 | tree src_base = src; |
5589 | while (handled_component_p (t: src_base)) |
5590 | src_base = TREE_OPERAND (src_base, 0); |
5591 | if (TREE_CODE (src_base) != MEM_REF |
5592 | && TREE_CODE (src_base) != TARGET_MEM_REF) |
5593 | return; |
5594 | |
5595 | MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base); |
5596 | MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base); |
5597 | } |
5598 | |
5599 | |
5600 | /* Function vect_create_destination_var. |
5601 | |
5602 | Create a new temporary of type VECTYPE. */ |
5603 | |
5604 | tree |
5605 | vect_create_destination_var (tree scalar_dest, tree vectype) |
5606 | { |
5607 | tree vec_dest; |
5608 | const char *name; |
5609 | char *new_name; |
5610 | tree type; |
5611 | enum vect_var_kind kind; |
5612 | |
5613 | kind = vectype |
5614 | ? VECTOR_BOOLEAN_TYPE_P (vectype) |
5615 | ? vect_mask_var |
5616 | : vect_simple_var |
5617 | : vect_scalar_var; |
5618 | type = vectype ? vectype : TREE_TYPE (scalar_dest); |
5619 | |
5620 | gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); |
5621 | |
5622 | name = get_name (scalar_dest); |
5623 | if (name) |
5624 | new_name = xasprintf ("%s_%u" , name, SSA_NAME_VERSION (scalar_dest)); |
5625 | else |
5626 | new_name = xasprintf ("_%u" , SSA_NAME_VERSION (scalar_dest)); |
5627 | vec_dest = vect_get_new_vect_var (type, var_kind: kind, name: new_name); |
5628 | free (ptr: new_name); |
5629 | |
5630 | return vec_dest; |
5631 | } |
5632 | |
5633 | /* Function vect_grouped_store_supported. |
5634 | |
5635 | Returns TRUE if interleave high and interleave low permutations |
5636 | are supported, and FALSE otherwise. */ |
5637 | |
5638 | bool |
5639 | vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) |
5640 | { |
5641 | machine_mode mode = TYPE_MODE (vectype); |
5642 | |
5643 | /* vect_permute_store_chain requires the group size to be equal to 3 or |
5644 | be a power of two. */ |
5645 | if (count != 3 && exact_log2 (x: count) == -1) |
5646 | { |
5647 | if (dump_enabled_p ()) |
5648 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
5649 | "the size of the group of accesses" |
5650 | " is not a power of 2 or not equal to 3\n" ); |
5651 | return false; |
5652 | } |
5653 | |
5654 | /* Check that the permutation is supported. */ |
5655 | if (VECTOR_MODE_P (mode)) |
5656 | { |
5657 | unsigned int i; |
5658 | if (count == 3) |
5659 | { |
5660 | unsigned int j0 = 0, j1 = 0, j2 = 0; |
5661 | unsigned int i, j; |
5662 | |
5663 | unsigned int nelt; |
5664 | if (!GET_MODE_NUNITS (mode).is_constant (const_value: &nelt)) |
5665 | { |
5666 | if (dump_enabled_p ()) |
5667 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
5668 | "cannot handle groups of 3 stores for" |
5669 | " variable-length vectors\n" ); |
5670 | return false; |
5671 | } |
5672 | |
5673 | vec_perm_builder sel (nelt, nelt, 1); |
5674 | sel.quick_grow (len: nelt); |
5675 | vec_perm_indices indices; |
5676 | for (j = 0; j < 3; j++) |
5677 | { |
5678 | int nelt0 = ((3 - j) * nelt) % 3; |
5679 | int nelt1 = ((3 - j) * nelt + 1) % 3; |
5680 | int nelt2 = ((3 - j) * nelt + 2) % 3; |
5681 | for (i = 0; i < nelt; i++) |
5682 | { |
5683 | if (3 * i + nelt0 < nelt) |
5684 | sel[3 * i + nelt0] = j0++; |
5685 | if (3 * i + nelt1 < nelt) |
5686 | sel[3 * i + nelt1] = nelt + j1++; |
5687 | if (3 * i + nelt2 < nelt) |
5688 | sel[3 * i + nelt2] = 0; |
5689 | } |
5690 | indices.new_vector (sel, 2, nelt); |
5691 | if (!can_vec_perm_const_p (mode, mode, indices)) |
5692 | { |
5693 | if (dump_enabled_p ()) |
5694 | dump_printf (MSG_MISSED_OPTIMIZATION, |
5695 | "permutation op not supported by target.\n" ); |
5696 | return false; |
5697 | } |
5698 | |
5699 | for (i = 0; i < nelt; i++) |
5700 | { |
5701 | if (3 * i + nelt0 < nelt) |
5702 | sel[3 * i + nelt0] = 3 * i + nelt0; |
5703 | if (3 * i + nelt1 < nelt) |
5704 | sel[3 * i + nelt1] = 3 * i + nelt1; |
5705 | if (3 * i + nelt2 < nelt) |
5706 | sel[3 * i + nelt2] = nelt + j2++; |
5707 | } |
5708 | indices.new_vector (sel, 2, nelt); |
5709 | if (!can_vec_perm_const_p (mode, mode, indices)) |
5710 | { |
5711 | if (dump_enabled_p ()) |
5712 | dump_printf (MSG_MISSED_OPTIMIZATION, |
5713 | "permutation op not supported by target.\n" ); |
5714 | return false; |
5715 | } |
5716 | } |
5717 | return true; |
5718 | } |
5719 | else |
5720 | { |
5721 | /* If the length is not equal to 3 then only a power of 2 is supported. */ |
5722 | gcc_assert (pow2p_hwi (count)); |
5723 | poly_uint64 nelt = GET_MODE_NUNITS (mode); |
5724 | |
5725 | /* The encoding has 2 interleaved stepped patterns. */ |
5726 | if(!multiple_p (a: nelt, b: 2)) |
5727 | return false; |
5728 | vec_perm_builder sel (nelt, 2, 3); |
5729 | sel.quick_grow (len: 6); |
5730 | for (i = 0; i < 3; i++) |
5731 | { |
5732 | sel[i * 2] = i; |
5733 | sel[i * 2 + 1] = i + nelt; |
5734 | } |
5735 | vec_perm_indices indices (sel, 2, nelt); |
5736 | if (can_vec_perm_const_p (mode, mode, indices)) |
5737 | { |
5738 | for (i = 0; i < 6; i++) |
5739 | sel[i] += exact_div (a: nelt, b: 2); |
5740 | indices.new_vector (sel, 2, nelt); |
5741 | if (can_vec_perm_const_p (mode, mode, indices)) |
5742 | return true; |
5743 | } |
5744 | } |
5745 | } |
5746 | |
5747 | if (dump_enabled_p ()) |
5748 | dump_printf (MSG_MISSED_OPTIMIZATION, |
5749 | "permutation op not supported by target.\n" ); |
5750 | return false; |
5751 | } |
5752 | |
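/* Worked example (illustrative): for a group of two V4SI stores, with
   input vectors {a0,a1,a2,a3} and {b0,b1,b2,b3}, the masks checked above
   expand to

     high: { 0, 4, 1, 5 }   ->  { a0, b0, a1, b1 }
     low:  { 2, 6, 3, 7 }   ->  { a2, b2, a3, b3 }

   The builder encodes them as 2 interleaved stepped patterns of length 3,
   which describes the same selectors for variable-length vectors too.  */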
5753 | /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors |
5754 | of type VECTYPE. MASKED_P says whether the masked form is needed. */ |
5755 | |
5756 | internal_fn |
5757 | vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, |
5758 | bool masked_p) |
5759 | { |
5760 | if (vect_lanes_optab_supported_p (name: "vec_mask_len_store_lanes" , |
5761 | optab: vec_mask_len_store_lanes_optab, vectype, |
5762 | count)) |
5763 | return IFN_MASK_LEN_STORE_LANES; |
5764 | else if (masked_p) |
5765 | { |
5766 | if (vect_lanes_optab_supported_p (name: "vec_mask_store_lanes" , |
5767 | optab: vec_mask_store_lanes_optab, vectype, |
5768 | count)) |
5769 | return IFN_MASK_STORE_LANES; |
5770 | } |
5771 | else |
5772 | { |
5773 | if (vect_lanes_optab_supported_p (name: "vec_store_lanes" , |
5774 | optab: vec_store_lanes_optab, vectype, count)) |
5775 | return IFN_STORE_LANES; |
5776 | } |
5777 | return IFN_LAST; |
5778 | } |
5779 | |
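/* For illustration: a store-lanes operation writes COUNT vectors so that
   their elements end up interleaved in memory.  With COUNT == 2 and V4SI
   inputs {a0,a1,a2,a3} and {b0,b1,b2,b3} the resulting memory image is

     a0 b0 a1 b1 a2 b2 a3 b3

   (what e.g. AArch64 ST2 provides directly).  When such an optab exists
   the vectorizer can use it instead of the explicit interleaving
   permutations built by vect_permute_store_chain below.  */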
5780 | |
5781 | /* Function vect_permute_store_chain. |
5782 | |
5783 | Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be |
5784 | a power of 2 or equal to 3, generate interleave_high/low stmts to reorder |
5785 | the data correctly for the stores. Return the final references for stores |
5786 | in RESULT_CHAIN. |
5787 | |
5788 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. |
5789 | The input is 4 vectors each containing 8 elements. We assign a number to |
5790 | each element, the input sequence is: |
5791 | |
5792 | 1st vec: 0 1 2 3 4 5 6 7 |
5793 | 2nd vec: 8 9 10 11 12 13 14 15 |
5794 | 3rd vec: 16 17 18 19 20 21 22 23 |
5795 | 4th vec: 24 25 26 27 28 29 30 31 |
5796 | |
5797 | The output sequence should be: |
5798 | |
5799 | 1st vec: 0 8 16 24 1 9 17 25 |
5800 | 2nd vec: 2 10 18 26 3 11 19 27 |
5801 | 3rd vec: 4 12 20 28 5 13 21 29 |
5802 | 4th vec: 6 14 22 30 7 15 23 31 |
5803 | |
5804 | i.e., we interleave the contents of the four vectors in their order. |
5805 | |
5806 | We use interleave_high/low instructions to create such output. The input of |
5807 | each interleave_high/low operation is two vectors: |
5808 | 1st vec 2nd vec |
5809 | 0 1 2 3 4 5 6 7 |
5810 | the even elements of the result vector are obtained left-to-right from the |
5811 | high/low elements of the first vector. The odd elements of the result are |
5812 | obtained left-to-right from the high/low elements of the second vector. |
5813 | The output of interleave_high will be: 0 4 1 5 |
5814 | and of interleave_low: 2 6 3 7 |
5815 | |
5816 | |
5817 | The permutation is done in log LENGTH stages. In each stage interleave_high |
5818 | and interleave_low stmts are created for each pair of vectors in DR_CHAIN, |
5819 | where the first argument is taken from the first half of DR_CHAIN and the |
5820 | second argument from its second half. |
5821 | In our example, |
5822 | |
5823 | I1: interleave_high (1st vec, 3rd vec) |
5824 | I2: interleave_low (1st vec, 3rd vec) |
5825 | I3: interleave_high (2nd vec, 4th vec) |
5826 | I4: interleave_low (2nd vec, 4th vec) |
5827 | |
5828 | The output for the first stage is: |
5829 | |
5830 | I1: 0 16 1 17 2 18 3 19 |
5831 | I2: 4 20 5 21 6 22 7 23 |
5832 | I3: 8 24 9 25 10 26 11 27 |
5833 | I4: 12 28 13 29 14 30 15 31 |
5834 | |
5835 | The output of the second stage, i.e. the final result is: |
5836 | |
5837 | I1: 0 8 16 24 1 9 17 25 |
5838 | I2: 2 10 18 26 3 11 19 27 |
5839 | I3: 4 12 20 28 5 13 21 29 |
5840 | I4: 6 14 22 30 7 15 23 31. */ |
5841 | |
5842 | void |
5843 | vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain, |
5844 | unsigned int length, |
5845 | stmt_vec_info stmt_info, |
5846 | gimple_stmt_iterator *gsi, |
5847 | vec<tree> *result_chain) |
5848 | { |
5849 | tree vect1, vect2, high, low; |
5850 | gimple *perm_stmt; |
5851 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
5852 | tree perm_mask_low, perm_mask_high; |
5853 | tree data_ref; |
5854 | tree perm3_mask_low, perm3_mask_high; |
5855 | unsigned int i, j, n, log_length = exact_log2 (x: length); |
5856 | |
5857 | result_chain->quick_grow (len: length); |
5858 | memcpy (dest: result_chain->address (), src: dr_chain.address (), |
5859 | n: length * sizeof (tree)); |
5860 | |
5861 | if (length == 3) |
5862 | { |
5863 | /* vect_grouped_store_supported ensures that this is constant. */ |
5864 | unsigned int nelt = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant (); |
5865 | unsigned int j0 = 0, j1 = 0, j2 = 0; |
5866 | |
5867 | vec_perm_builder sel (nelt, nelt, 1); |
5868 | sel.quick_grow (len: nelt); |
5869 | vec_perm_indices indices; |
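/* Build output vector J in two permutes: the first gathers elements from
DR_CHAIN[0] and DR_CHAIN[1], the second fills the remaining lanes from
DR_CHAIN[2]. */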
5870 | for (j = 0; j < 3; j++) |
5871 | { |
5872 | int nelt0 = ((3 - j) * nelt) % 3; |
5873 | int nelt1 = ((3 - j) * nelt + 1) % 3; |
5874 | int nelt2 = ((3 - j) * nelt + 2) % 3; |
5875 | |
5876 | for (i = 0; i < nelt; i++) |
5877 | { |
5878 | if (3 * i + nelt0 < nelt) |
5879 | sel[3 * i + nelt0] = j0++; |
5880 | if (3 * i + nelt1 < nelt) |
5881 | sel[3 * i + nelt1] = nelt + j1++; |
5882 | if (3 * i + nelt2 < nelt) |
5883 | sel[3 * i + nelt2] = 0; |
5884 | } |
5885 | indices.new_vector (sel, 2, nelt); |
5886 | perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); |
5887 | |
5888 | for (i = 0; i < nelt; i++) |
5889 | { |
5890 | if (3 * i + nelt0 < nelt) |
5891 | sel[3 * i + nelt0] = 3 * i + nelt0; |
5892 | if (3 * i + nelt1 < nelt) |
5893 | sel[3 * i + nelt1] = 3 * i + nelt1; |
5894 | if (3 * i + nelt2 < nelt) |
5895 | sel[3 * i + nelt2] = nelt + j2++; |
5896 | } |
5897 | indices.new_vector (sel, 2, nelt); |
5898 | perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); |
5899 | |
5900 | vect1 = dr_chain[0]; |
5901 | vect2 = dr_chain[1]; |
5902 | |
5903 | /* Create interleaving stmt: |
5904 | low = VEC_PERM_EXPR <vect1, vect2, |
5905 | {j, nelt, *, j + 1, nelt + j + 1, *, |
5906 | j + 2, nelt + j + 2, *, ...}> */ |
5907 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_low" ); |
5908 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, |
5909 | vect2, perm3_mask_low); |
5910 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
5911 | |
5912 | vect1 = data_ref; |
5913 | vect2 = dr_chain[2]; |
5914 | /* Create interleaving stmt: |
5915 | low = VEC_PERM_EXPR <vect1, vect2, |
5916 | {0, 1, nelt + j, 3, 4, nelt + j + 1, |
5917 | 6, 7, nelt + j + 2, ...}> */ |
5918 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_high" ); |
5919 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, |
5920 | vect2, perm3_mask_high); |
5921 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
5922 | (*result_chain)[j] = data_ref; |
5923 | } |
5924 | } |
5925 | else |
5926 | { |
5927 | /* If length is not equal to 3 then only power of 2 is supported. */ |
5928 | gcc_assert (pow2p_hwi (length)); |
5929 | |
5930 | /* The encoding has 2 interleaved stepped patterns. */ |
5931 | poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (node: vectype); |
5932 | vec_perm_builder sel (nelt, 2, 3); |
5933 | sel.quick_grow (len: 6); |
5934 | for (i = 0; i < 3; i++) |
5935 | { |
5936 | sel[i * 2] = i; |
5937 | sel[i * 2 + 1] = i + nelt; |
5938 | } |
5939 | vec_perm_indices indices (sel, 2, nelt); |
5940 | perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); |
5941 | |
5942 | for (i = 0; i < 6; i++) |
5943 | sel[i] += exact_div (a: nelt, b: 2); |
5944 | indices.new_vector (sel, 2, nelt); |
5945 | perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); |
5946 | |
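/* Each of the log2 (LENGTH) stages interleaves vector J from the first half
of DR_CHAIN with vector J from the second half; the high/low results go to
adjacent slots of RESULT_CHAIN, which is then copied back to DR_CHAIN as
the input of the next stage. */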
5947 | for (i = 0, n = log_length; i < n; i++) |
5948 | { |
5949 | for (j = 0; j < length/2; j++) |
5950 | { |
5951 | vect1 = dr_chain[j]; |
5952 | vect2 = dr_chain[j+length/2]; |
5953 | |
5954 | /* Create interleaving stmt: |
5955 | high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, |
5956 | ...}> */ |
5957 | high = make_temp_ssa_name (type: vectype, NULL, name: "vect_inter_high" ); |
5958 | perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, |
5959 | vect2, perm_mask_high); |
5960 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
5961 | (*result_chain)[2*j] = high; |
5962 | |
5963 | /* Create interleaving stmt: |
5964 | low = VEC_PERM_EXPR <vect1, vect2, |
5965 | {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, |
5966 | ...}> */ |
5967 | low = make_temp_ssa_name (type: vectype, NULL, name: "vect_inter_low" ); |
5968 | perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, |
5969 | vect2, perm_mask_low); |
5970 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
5971 | (*result_chain)[2*j+1] = low; |
5972 | } |
5973 | memcpy (dest: dr_chain.address (), src: result_chain->address (), |
5974 | n: length * sizeof (tree)); |
5975 | } |
5976 | } |
5977 | } |
5978 | |
5979 | /* Function vect_setup_realignment |
5980 | |
5981 | This function is called when vectorizing an unaligned load using |
5982 | the dr_explicit_realign[_optimized] scheme. |
5983 | This function generates the following code at the loop prolog: |
5984 | |
5985 | p = initial_addr; |
5986 | x msq_init = *(floor(p)); # prolog load |
5987 | realignment_token = call target_builtin; |
5988 | loop: |
5989 | x msq = phi (msq_init, ---) |
5990 | |
5991 | The stmts marked with x are generated only for the case of |
5992 | dr_explicit_realign_optimized. |
5993 | |
5994 | The code above sets up a new (vector) pointer, pointing to the first |
5995 | location accessed by STMT_INFO, and a "floor-aligned" load using that |
5996 | pointer. It also generates code to compute the "realignment-token" |
5997 | (if the relevant target hook was defined), and creates a phi-node at the |
5998 | loop-header bb whose arguments are the result of the prolog-load (created |
5999 | by this function) and the result of a load that takes place in the loop |
6000 | (to be created by the caller to this function). |
6001 | |
6002 | For the case of dr_explicit_realign_optimized: |
6003 | The caller to this function uses the phi-result (msq) to create the |
6004 | realignment code inside the loop, and sets up the missing phi argument, |
6005 | as follows: |
6006 | loop: |
6007 | msq = phi (msq_init, lsq) |
6008 | lsq = *(floor(p')); # load in loop |
6009 | result = realign_load (msq, lsq, realignment_token); |
6010 | |
6011 | For the case of dr_explicit_realign: |
6012 | loop: |
6013 | msq = *(floor(p)); # load in loop |
6014 | p' = p + (VS-1); |
6015 | lsq = *(floor(p')); # load in loop |
6016 | result = realign_load (msq, lsq, realignment_token); |
6017 | |
6018 | Input: |
6019 | STMT_INFO - (scalar) load stmt to be vectorized. This load accesses |
6020 | a memory location that may be unaligned. |
6021 | BSI - place where new code is to be inserted. |
6022 | ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes |
6023 | is used. |
6024 | |
6025 | Output: |
6026 | REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load |
6027 | target hook, if defined. |
6028 | Return value - the result of the loop-header phi node. */ |
6029 | |
6030 | tree |
6031 | vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info, |
6032 | gimple_stmt_iterator *gsi, tree *realignment_token, |
6033 | enum dr_alignment_support alignment_support_scheme, |
6034 | tree init_addr, |
6035 | class loop **at_loop) |
6036 | { |
6037 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
6038 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo); |
6039 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
6040 | struct data_reference *dr = dr_info->dr; |
6041 | class loop *loop = NULL; |
6042 | edge pe = NULL; |
6043 | tree scalar_dest = gimple_assign_lhs (gs: stmt_info->stmt); |
6044 | tree vec_dest; |
6045 | gimple *inc; |
6046 | tree ptr; |
6047 | tree data_ref; |
6048 | basic_block new_bb; |
6049 | tree msq_init = NULL_TREE; |
6050 | tree new_temp; |
6051 | gphi *phi_stmt; |
6052 | tree msq = NULL_TREE; |
6053 | gimple_seq stmts = NULL; |
6054 | bool compute_in_loop = false; |
6055 | bool nested_in_vect_loop = false; |
6056 | class loop *containing_loop = (gimple_bb (g: stmt_info->stmt))->loop_father; |
6057 | class loop *loop_for_initial_load = NULL; |
6058 | |
6059 | if (loop_vinfo) |
6060 | { |
6061 | loop = LOOP_VINFO_LOOP (loop_vinfo); |
6062 | nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info); |
6063 | } |
6064 | |
6065 | gcc_assert (alignment_support_scheme == dr_explicit_realign |
6066 | || alignment_support_scheme == dr_explicit_realign_optimized); |
6067 | |
6068 | /* We need to generate three things: |
6069 | 1. the misalignment computation |
6070 | 2. the extra vector load (for the optimized realignment scheme). |
6071 | 3. the phi node for the two vectors from which the realignment is |
6072 | done (for the optimized realignment scheme). */ |
6073 | |
6074 | /* 1. Determine where to generate the misalignment computation. |
6075 | |
6076 | If INIT_ADDR is NULL_TREE, this indicates that the misalignment |
6077 | calculation will be generated by this function, outside the loop (in the |
6078 | preheader). Otherwise, INIT_ADDR had already been computed for us by the |
6079 | caller, inside the loop. |
6080 | |
6081 | Background: If the misalignment remains fixed throughout the iterations of |
6082 | the loop, then both realignment schemes are applicable, and also the |
6083 | misalignment computation can be done outside LOOP. This is because we are |
6084 | vectorizing LOOP, and so the memory accesses in LOOP advance in steps that |
6085 | are a multiple of VS (the Vector Size), and therefore the misalignment in |
6086 | different vectorized LOOP iterations is always the same. |
6087 | The problem arises only if the memory access is in an inner-loop nested |
6088 | inside LOOP, which is now being vectorized using outer-loop vectorization. |
6089 | This is the only case when the misalignment of the memory access may not |
6090 | remain fixed throughout the iterations of the inner-loop (as explained in |
6091 | detail in vect_supportable_dr_alignment). In this case, not only is the |
6092 | optimized realignment scheme not applicable, but also the misalignment |
6093 | computation (and generation of the realignment token that is passed to |
6094 | REALIGN_LOAD) have to be done inside the loop. |
6095 | |
6096 | In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode |
6097 | or not, which in turn determines if the misalignment is computed inside |
6098 | the inner-loop, or outside LOOP. */ |
6099 | |
6100 | if (init_addr != NULL_TREE || !loop_vinfo) |
6101 | { |
6102 | compute_in_loop = true; |
6103 | gcc_assert (alignment_support_scheme == dr_explicit_realign); |
6104 | } |
6105 | |
6106 | |
6107 | /* 2. Determine where to generate the extra vector load. |
6108 | |
6109 | For the optimized realignment scheme, instead of generating two vector |
6110 | loads in each iteration, we generate a single extra vector load in the |
6111 | preheader of the loop, and in each iteration reuse the result of the |
6112 | vector load from the previous iteration. In case the memory access is in |
6113 | an inner-loop nested inside LOOP, which is now being vectorized using |
6114 | outer-loop vectorization, we need to determine whether this initial vector |
6115 | load should be generated at the preheader of the inner-loop, or can be |
6116 | generated at the preheader of LOOP. If the memory access has no evolution |
6117 | in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has |
6118 | to be generated inside LOOP (in the preheader of the inner-loop). */ |
6119 | |
6120 | if (nested_in_vect_loop) |
6121 | { |
6122 | tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); |
6123 | bool invariant_in_outerloop = |
6124 | (tree_int_cst_compare (t1: outerloop_step, size_zero_node) == 0); |
6125 | loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); |
6126 | } |
6127 | else |
6128 | loop_for_initial_load = loop; |
6129 | if (at_loop) |
6130 | *at_loop = loop_for_initial_load; |
6131 | |
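/* Determine the virtual use for the prolog load: prefer the virtual
operand live on the preheader edge of the loop the load is emitted in,
falling back to the VUSE of the statement at GSI. */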
6132 | tree vuse = NULL_TREE; |
6133 | if (loop_for_initial_load) |
6134 | { |
6135 | pe = loop_preheader_edge (loop_for_initial_load); |
6136 | if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header)) |
6137 | vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe); |
6138 | } |
6139 | if (!vuse) |
6140 | vuse = gimple_vuse (g: gsi_stmt (i: *gsi)); |
6141 | |
6142 | /* 3. For the case of the optimized realignment, create the first vector |
6143 | load at the loop preheader. */ |
6144 | |
6145 | if (alignment_support_scheme == dr_explicit_realign_optimized) |
6146 | { |
6147 | /* Create msq_init = *(floor(p1)) in the loop preheader */ |
6148 | gassign *new_stmt; |
6149 | |
6150 | gcc_assert (!compute_in_loop); |
6151 | vec_dest = vect_create_destination_var (scalar_dest, vectype); |
6152 | ptr = vect_create_data_ref_ptr (vinfo, stmt_info, aggr_type: vectype, |
6153 | at_loop: loop_for_initial_load, NULL_TREE, |
6154 | initial_address: &init_addr, NULL, ptr_incr: &inc, only_init: true); |
6155 | if (TREE_CODE (ptr) == SSA_NAME) |
6156 | new_temp = copy_ssa_name (var: ptr); |
6157 | else |
6158 | new_temp = make_ssa_name (TREE_TYPE (ptr)); |
6159 | poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info); |
6160 | tree type = TREE_TYPE (ptr); |
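/* Round the pointer down to the target alignment boundary (ptr & -align),
giving the "floor (p)" address for the aligned prolog load. */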
6161 | new_stmt = gimple_build_assign |
6162 | (new_temp, BIT_AND_EXPR, ptr, |
6163 | fold_build2 (MINUS_EXPR, type, |
6164 | build_int_cst (type, 0), |
6165 | build_int_cst (type, align))); |
6166 | new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); |
6167 | gcc_assert (!new_bb); |
6168 | data_ref |
6169 | = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, |
6170 | build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); |
6171 | vect_copy_ref_info (dest: data_ref, DR_REF (dr)); |
6172 | new_stmt = gimple_build_assign (vec_dest, data_ref); |
6173 | new_temp = make_ssa_name (var: vec_dest, stmt: new_stmt); |
6174 | gimple_assign_set_lhs (gs: new_stmt, lhs: new_temp); |
6175 | gimple_set_vuse (g: new_stmt, vuse); |
6176 | if (pe) |
6177 | { |
6178 | new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); |
6179 | gcc_assert (!new_bb); |
6180 | } |
6181 | else |
6182 | gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); |
6183 | |
6184 | msq_init = gimple_assign_lhs (gs: new_stmt); |
6185 | } |
6186 | |
6187 | /* 4. Create realignment token using a target builtin, if available. |
6188 | It is done either inside the containing loop, or before LOOP (as |
6189 | determined above). */ |
6190 | |
6191 | if (targetm.vectorize.builtin_mask_for_load) |
6192 | { |
6193 | gcall *new_stmt; |
6194 | tree builtin_decl; |
6195 | |
/* Compute INIT_ADDR - the initial address accessed by this memref. */
6197 | if (!init_addr) |
6198 | { |
6199 | /* Generate the INIT_ADDR computation outside LOOP. */ |
6200 | init_addr = vect_create_addr_base_for_vector_ref (vinfo, |
6201 | stmt_info, new_stmt_list: &stmts, |
6202 | NULL_TREE); |
6203 | if (loop) |
6204 | { |
6205 | pe = loop_preheader_edge (loop); |
6206 | new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
6207 | gcc_assert (!new_bb); |
6208 | } |
6209 | else |
6210 | gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); |
6211 | } |
6212 | |
6213 | builtin_decl = targetm.vectorize.builtin_mask_for_load (); |
6214 | new_stmt = gimple_build_call (builtin_decl, 1, init_addr); |
6215 | vec_dest = |
6216 | vect_create_destination_var (scalar_dest, |
6217 | vectype: gimple_call_return_type (gs: new_stmt)); |
6218 | new_temp = make_ssa_name (var: vec_dest, stmt: new_stmt); |
6219 | gimple_call_set_lhs (gs: new_stmt, lhs: new_temp); |
6220 | |
6221 | if (compute_in_loop) |
6222 | gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); |
6223 | else |
6224 | { |
6225 | /* Generate the misalignment computation outside LOOP. */ |
6226 | pe = loop_preheader_edge (loop); |
6227 | new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); |
6228 | gcc_assert (!new_bb); |
6229 | } |
6230 | |
6231 | *realignment_token = gimple_call_lhs (gs: new_stmt); |
6232 | |
6233 | /* The result of the CALL_EXPR to this builtin is determined from |
6234 | the value of the parameter and no global variables are touched |
6235 | which makes the builtin a "const" function. Requiring the |
6236 | builtin to have the "const" attribute makes it unnecessary |
6237 | to call mark_call_clobbered. */ |
6238 | gcc_assert (TREE_READONLY (builtin_decl)); |
6239 | } |
6240 | |
6241 | if (alignment_support_scheme == dr_explicit_realign) |
6242 | return msq; |
6243 | |
6244 | gcc_assert (!compute_in_loop); |
6245 | gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); |
6246 | |
6247 | |
6248 | /* 5. Create msq = phi <msq_init, lsq> in loop */ |
6249 | |
6250 | pe = loop_preheader_edge (containing_loop); |
6251 | vec_dest = vect_create_destination_var (scalar_dest, vectype); |
6252 | msq = make_ssa_name (var: vec_dest); |
6253 | phi_stmt = create_phi_node (msq, containing_loop->header); |
6254 | add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); |
6255 | |
6256 | return msq; |
6257 | } |
6258 | |
6259 | |
6260 | /* Function vect_grouped_load_supported. |
6261 | |
6262 | COUNT is the size of the load group (the number of statements plus the |
6263 | number of gaps). SINGLE_ELEMENT_P is true if there is actually |
6264 | only one statement, with a gap of COUNT - 1. |
6265 | |
6266 | Returns true if a suitable permute exists. */ |
6267 | |
6268 | bool |
6269 | vect_grouped_load_supported (tree vectype, bool single_element_p, |
6270 | unsigned HOST_WIDE_INT count) |
6271 | { |
6272 | machine_mode mode = TYPE_MODE (vectype); |
6273 | |
/* If this is single-element interleaving with an element distance
that leaves unused vector loads around, punt - we at least create
very sub-optimal code in that case (and blow up memory;
see PR65518). */
6278 | if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) |
6279 | { |
6280 | if (dump_enabled_p ()) |
6281 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6282 | "single-element interleaving not supported " |
6283 | "for not adjacent vector loads\n" ); |
6284 | return false; |
6285 | } |
6286 | |
6287 | /* vect_permute_load_chain requires the group size to be equal to 3 or |
6288 | be a power of two. */ |
6289 | if (count != 3 && exact_log2 (x: count) == -1) |
6290 | { |
6291 | if (dump_enabled_p ()) |
6292 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6293 | "the size of the group of accesses" |
6294 | " is not a power of 2 or not equal to 3\n" ); |
6295 | return false; |
6296 | } |
6297 | |
6298 | /* Check that the permutation is supported. */ |
6299 | if (VECTOR_MODE_P (mode)) |
6300 | { |
6301 | unsigned int i, j; |
6302 | if (count == 3) |
6303 | { |
6304 | unsigned int nelt; |
6305 | if (!GET_MODE_NUNITS (mode).is_constant (const_value: &nelt)) |
6306 | { |
6307 | if (dump_enabled_p ()) |
6308 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6309 | "cannot handle groups of 3 loads for" |
6310 | " variable-length vectors\n" ); |
6311 | return false; |
6312 | } |
6313 | |
6314 | vec_perm_builder sel (nelt, nelt, 1); |
6315 | sel.quick_grow (len: nelt); |
6316 | vec_perm_indices indices; |
6317 | unsigned int k; |
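/* For each of the three output vectors check both permutes that
vect_permute_load_chain would use: one gathering elements from the first
two input vectors and one filling the remaining lanes from the third. */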
6318 | for (k = 0; k < 3; k++) |
6319 | { |
6320 | for (i = 0; i < nelt; i++) |
6321 | if (3 * i + k < 2 * nelt) |
6322 | sel[i] = 3 * i + k; |
6323 | else |
6324 | sel[i] = 0; |
6325 | indices.new_vector (sel, 2, nelt); |
6326 | if (!can_vec_perm_const_p (mode, mode, indices)) |
6327 | { |
6328 | if (dump_enabled_p ()) |
6329 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6330 | "shuffle of 3 loads is not supported by" |
6331 | " target\n" ); |
6332 | return false; |
6333 | } |
6334 | for (i = 0, j = 0; i < nelt; i++) |
6335 | if (3 * i + k < 2 * nelt) |
6336 | sel[i] = i; |
6337 | else |
6338 | sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); |
6339 | indices.new_vector (sel, 2, nelt); |
6340 | if (!can_vec_perm_const_p (mode, mode, indices)) |
6341 | { |
6342 | if (dump_enabled_p ()) |
6343 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6344 | "shuffle of 3 loads is not supported by" |
6345 | " target\n" ); |
6346 | return false; |
6347 | } |
6348 | } |
6349 | return true; |
6350 | } |
6351 | else |
6352 | { |
6353 | /* If length is not equal to 3 then only power of 2 is supported. */ |
6354 | gcc_assert (pow2p_hwi (count)); |
6355 | poly_uint64 nelt = GET_MODE_NUNITS (mode); |
6356 | |
6357 | /* The encoding has a single stepped pattern. */ |
6358 | vec_perm_builder sel (nelt, 1, 3); |
6359 | sel.quick_grow (len: 3); |
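/* {0, 2, 4, ...} extracts the even elements; the second permute checked
below, {1, 3, 5, ...}, extracts the odd elements. */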
6360 | for (i = 0; i < 3; i++) |
6361 | sel[i] = i * 2; |
6362 | vec_perm_indices indices (sel, 2, nelt); |
6363 | if (can_vec_perm_const_p (mode, mode, indices)) |
6364 | { |
6365 | for (i = 0; i < 3; i++) |
6366 | sel[i] = i * 2 + 1; |
6367 | indices.new_vector (sel, 2, nelt); |
6368 | if (can_vec_perm_const_p (mode, mode, indices)) |
6369 | return true; |
6370 | } |
6371 | } |
6372 | } |
6373 | |
6374 | if (dump_enabled_p ()) |
6375 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6376 | "extract even/odd not supported by target\n" ); |
6377 | return false; |
6378 | } |
6379 | |
6380 | /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors |
6381 | of type VECTYPE. MASKED_P says whether the masked form is needed. */ |
6382 | |
6383 | internal_fn |
6384 | vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, |
6385 | bool masked_p) |
6386 | { |
6387 | if (vect_lanes_optab_supported_p (name: "vec_mask_len_load_lanes" , |
6388 | optab: vec_mask_len_load_lanes_optab, vectype, |
6389 | count)) |
6390 | return IFN_MASK_LEN_LOAD_LANES; |
6391 | else if (masked_p) |
6392 | { |
6393 | if (vect_lanes_optab_supported_p (name: "vec_mask_load_lanes" , |
6394 | optab: vec_mask_load_lanes_optab, vectype, |
6395 | count)) |
6396 | return IFN_MASK_LOAD_LANES; |
6397 | } |
6398 | else |
6399 | { |
6400 | if (vect_lanes_optab_supported_p (name: "vec_load_lanes" , optab: vec_load_lanes_optab, |
6401 | vectype, count)) |
6402 | return IFN_LOAD_LANES; |
6403 | } |
6404 | return IFN_LAST; |
6405 | } |
6406 | |
6407 | /* Function vect_permute_load_chain. |
6408 | |
6409 | Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be |
6410 | a power of 2 or equal to 3, generate extract_even/odd stmts to reorder |
6411 | the input data correctly. Return the final references for loads in |
6412 | RESULT_CHAIN. |
6413 | |
6414 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. |
6415 | The input is 4 vectors each containing 8 elements. We assign a number to each |
6416 | element, the input sequence is: |
6417 | |
6418 | 1st vec: 0 1 2 3 4 5 6 7 |
6419 | 2nd vec: 8 9 10 11 12 13 14 15 |
6420 | 3rd vec: 16 17 18 19 20 21 22 23 |
6421 | 4th vec: 24 25 26 27 28 29 30 31 |
6422 | |
6423 | The output sequence should be: |
6424 | |
6425 | 1st vec: 0 4 8 12 16 20 24 28 |
6426 | 2nd vec: 1 5 9 13 17 21 25 29 |
6427 | 3rd vec: 2 6 10 14 18 22 26 30 |
6428 | 4th vec: 3 7 11 15 19 23 27 31 |
6429 | |
6430 | i.e., the first output vector should contain the first elements of each |
6431 | interleaving group, etc. |
6432 | |
6433 | We use extract_even/odd instructions to create such output. The input of |
6434 | each extract_even/odd operation is two vectors |
6435 | 1st vec 2nd vec |
6436 | 0 1 2 3 4 5 6 7 |
6437 | |
6438 | and the output is the vector of extracted even/odd elements. The output of |
6439 | extract_even will be: 0 2 4 6 |
6440 | and of extract_odd: 1 3 5 7 |
6441 | |
6442 | |
6443 | The permutation is done in log LENGTH stages. In each stage extract_even |
6444 | and extract_odd stmts are created for each pair of vectors in DR_CHAIN in |
6445 | their order. In our example, |
6446 | |
6447 | E1: extract_even (1st vec, 2nd vec) |
6448 | E2: extract_odd (1st vec, 2nd vec) |
6449 | E3: extract_even (3rd vec, 4th vec) |
6450 | E4: extract_odd (3rd vec, 4th vec) |
6451 | |
6452 | The output for the first stage will be: |
6453 | |
6454 | E1: 0 2 4 6 8 10 12 14 |
6455 | E2: 1 3 5 7 9 11 13 15 |
6456 | E3: 16 18 20 22 24 26 28 30 |
6457 | E4: 17 19 21 23 25 27 29 31 |
6458 | |
6459 | In order to proceed and create the correct sequence for the next stage (or |
6460 | for the correct output, if the second stage is the last one, as in our |
6461 | example), we first put the output of extract_even operation and then the |
6462 | output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). |
6463 | The input for the second stage is: |
6464 | |
6465 | 1st vec (E1): 0 2 4 6 8 10 12 14 |
6466 | 2nd vec (E3): 16 18 20 22 24 26 28 30 |
6467 | 3rd vec (E2): 1 3 5 7 9 11 13 15 |
6468 | 4th vec (E4): 17 19 21 23 25 27 29 31 |
6469 | |
6470 | The output of the second stage: |
6471 | |
6472 | E1: 0 4 8 12 16 20 24 28 |
6473 | E2: 2 6 10 14 18 22 26 30 |
6474 | E3: 1 5 9 13 17 21 25 29 |
6475 | E4: 3 7 11 15 19 23 27 31 |
6476 | |
6477 | And RESULT_CHAIN after reordering: |
6478 | |
6479 | 1st vec (E1): 0 4 8 12 16 20 24 28 |
6480 | 2nd vec (E3): 1 5 9 13 17 21 25 29 |
6481 | 3rd vec (E2): 2 6 10 14 18 22 26 30 |
6482 | 4th vec (E4): 3 7 11 15 19 23 27 31. */ |
6483 | |
6484 | static void |
6485 | vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, |
6486 | unsigned int length, |
6487 | stmt_vec_info stmt_info, |
6488 | gimple_stmt_iterator *gsi, |
6489 | vec<tree> *result_chain) |
6490 | { |
6491 | tree data_ref, first_vect, second_vect; |
6492 | tree perm_mask_even, perm_mask_odd; |
6493 | tree perm3_mask_low, perm3_mask_high; |
6494 | gimple *perm_stmt; |
6495 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
6496 | unsigned int i, j, log_length = exact_log2 (x: length); |
6497 | |
6498 | result_chain->quick_grow (len: length); |
6499 | memcpy (dest: result_chain->address (), src: dr_chain.address (), |
6500 | n: length * sizeof (tree)); |
6501 | |
6502 | if (length == 3) |
6503 | { |
6504 | /* vect_grouped_load_supported ensures that this is constant. */ |
6505 | unsigned nelt = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant (); |
6506 | unsigned int k; |
6507 | |
6508 | vec_perm_builder sel (nelt, nelt, 1); |
6509 | sel.quick_grow (len: nelt); |
6510 | vec_perm_indices indices; |
6511 | for (k = 0; k < 3; k++) |
6512 | { |
6513 | for (i = 0; i < nelt; i++) |
6514 | if (3 * i + k < 2 * nelt) |
6515 | sel[i] = 3 * i + k; |
6516 | else |
6517 | sel[i] = 0; |
6518 | indices.new_vector (sel, 2, nelt); |
6519 | perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); |
6520 | |
6521 | for (i = 0, j = 0; i < nelt; i++) |
6522 | if (3 * i + k < 2 * nelt) |
6523 | sel[i] = i; |
6524 | else |
6525 | sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); |
6526 | indices.new_vector (sel, 2, nelt); |
6527 | perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); |
6528 | |
6529 | first_vect = dr_chain[0]; |
6530 | second_vect = dr_chain[1]; |
6531 | |
6532 | /* Create interleaving stmt (low part of): |
low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6534 | ...}> */ |
6535 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_low" ); |
6536 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, |
6537 | second_vect, perm3_mask_low); |
6538 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6539 | |
/* Create interleaving stmt (high part of): keep the lanes already
gathered by the first permute and fill the remaining lanes from the
third vector (dr_chain[2]). */
6543 | first_vect = data_ref; |
6544 | second_vect = dr_chain[2]; |
6545 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_high" ); |
6546 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, |
6547 | second_vect, perm3_mask_high); |
6548 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6549 | (*result_chain)[k] = data_ref; |
6550 | } |
6551 | } |
6552 | else |
6553 | { |
6554 | /* If length is not equal to 3 then only power of 2 is supported. */ |
6555 | gcc_assert (pow2p_hwi (length)); |
6556 | |
6557 | /* The encoding has a single stepped pattern. */ |
6558 | poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (node: vectype); |
6559 | vec_perm_builder sel (nelt, 1, 3); |
6560 | sel.quick_grow (len: 3); |
6561 | for (i = 0; i < 3; ++i) |
6562 | sel[i] = i * 2; |
6563 | vec_perm_indices indices (sel, 2, nelt); |
6564 | perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); |
6565 | |
6566 | for (i = 0; i < 3; ++i) |
6567 | sel[i] = i * 2 + 1; |
6568 | indices.new_vector (sel, 2, nelt); |
6569 | perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); |
6570 | |
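/* Each of the log2 (LENGTH) stages extracts the even and the odd elements
of every pair of adjacent vectors in DR_CHAIN; the even results fill the
first half of RESULT_CHAIN, the odd results the second half, and the
stage output becomes the input of the next stage. */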
6571 | for (i = 0; i < log_length; i++) |
6572 | { |
6573 | for (j = 0; j < length; j += 2) |
6574 | { |
6575 | first_vect = dr_chain[j]; |
6576 | second_vect = dr_chain[j+1]; |
6577 | |
6578 | /* data_ref = permute_even (first_data_ref, second_data_ref); */ |
6579 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_perm_even" ); |
6580 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6581 | first_vect, second_vect, |
6582 | perm_mask_even); |
6583 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6584 | (*result_chain)[j/2] = data_ref; |
6585 | |
6586 | /* data_ref = permute_odd (first_data_ref, second_data_ref); */ |
6587 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_perm_odd" ); |
6588 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6589 | first_vect, second_vect, |
6590 | perm_mask_odd); |
6591 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6592 | (*result_chain)[j/2+length/2] = data_ref; |
6593 | } |
6594 | memcpy (dest: dr_chain.address (), src: result_chain->address (), |
6595 | n: length * sizeof (tree)); |
6596 | } |
6597 | } |
6598 | } |
6599 | |
6600 | /* Function vect_shift_permute_load_chain. |
6601 | |
6602 | Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate |
6603 | sequence of stmts to reorder the input data accordingly. |
6604 | Return the final references for loads in RESULT_CHAIN. |
Return true if successful, false otherwise.
6606 | |
6607 | E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. |
6608 | The input is 3 vectors each containing 8 elements. We assign a |
6609 | number to each element, the input sequence is: |
6610 | |
6611 | 1st vec: 0 1 2 3 4 5 6 7 |
6612 | 2nd vec: 8 9 10 11 12 13 14 15 |
6613 | 3rd vec: 16 17 18 19 20 21 22 23 |
6614 | |
6615 | The output sequence should be: |
6616 | |
6617 | 1st vec: 0 3 6 9 12 15 18 21 |
6618 | 2nd vec: 1 4 7 10 13 16 19 22 |
6619 | 3rd vec: 2 5 8 11 14 17 20 23 |
6620 | |
6621 | We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. |
6622 | |
6623 | First we shuffle all 3 vectors to get correct elements order: |
6624 | |
6625 | 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) |
6626 | 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) |
6627 | 3rd vec: (16 19 22) (17 20 23) (18 21) |
6628 | |
6629 | Next we unite and shift vector 3 times: |
6630 | |
6631 | 1st step: |
6632 | shift right by 6 the concatenation of: |
6633 | "1st vec" and "2nd vec" |
6634 | ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) |
6635 | "2nd vec" and "3rd vec" |
6636 | ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) |
6637 | "3rd vec" and "1st vec" |
6638 | (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) |
6639 | | New vectors | |
6640 | |
6641 | So that now new vectors are: |
6642 | |
6643 | 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) |
6644 | 2nd vec: (10 13) (16 19 22) (17 20 23) |
6645 | 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) |
6646 | |
6647 | 2nd step: |
6648 | shift right by 5 the concatenation of: |
6649 | "1st vec" and "3rd vec" |
6650 | ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) |
6651 | "2nd vec" and "1st vec" |
6652 | (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) |
6653 | "3rd vec" and "2nd vec" |
6654 | (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) |
6655 | | New vectors | |
6656 | |
6657 | So that now new vectors are: |
6658 | |
6659 | 1st vec: ( 9 12 15) (18 21) ( 0 3 6) |
6660 | 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) |
6661 | 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY |
6662 | |
6663 | 3rd step: |
6664 | shift right by 5 the concatenation of: |
6665 | "1st vec" and "1st vec" |
6666 | ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) |
6667 | shift right by 3 the concatenation of: |
6668 | "2nd vec" and "2nd vec" |
6669 | (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) |
6670 | | New vectors | |
6671 | |
6672 | So that now all vectors are READY: |
6673 | 1st vec: ( 0 3 6) ( 9 12 15) (18 21) |
6674 | 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) |
6675 | 3rd vec: ( 1 4 7) (10 13) (16 19 22) |
6676 | |
This algorithm is faster than the one in vect_permute_load_chain if:
1. "shift of a concatenation" is faster than a general permutation.
This is usually so.
2. The TARGET machine can't execute vector instructions in parallel.
This is because each step of the algorithm depends on the previous one.
The algorithm in vect_permute_load_chain is much more parallel.
6683 | |
6684 | The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. |
6685 | */ |
6686 | |
6687 | static bool |
6688 | vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, |
6689 | unsigned int length, |
6690 | stmt_vec_info stmt_info, |
6691 | gimple_stmt_iterator *gsi, |
6692 | vec<tree> *result_chain) |
6693 | { |
6694 | tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; |
6695 | tree perm2_mask1, perm2_mask2, perm3_mask; |
6696 | tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; |
6697 | gimple *perm_stmt; |
6698 | |
6699 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
6700 | machine_mode vmode = TYPE_MODE (vectype); |
6701 | unsigned int i; |
6702 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo); |
6703 | |
6704 | unsigned HOST_WIDE_INT nelt, vf; |
6705 | if (!TYPE_VECTOR_SUBPARTS (node: vectype).is_constant (const_value: &nelt) |
6706 | || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (const_value: &vf)) |
6707 | /* Not supported for variable-length vectors. */ |
6708 | return false; |
6709 | |
6710 | vec_perm_builder sel (nelt, nelt, 1); |
6711 | sel.quick_grow (len: nelt); |
6712 | |
6713 | result_chain->quick_grow (len: length); |
6714 | memcpy (dest: result_chain->address (), src: dr_chain.address (), |
6715 | n: length * sizeof (tree)); |
6716 | |
6717 | if (pow2p_hwi (x: length) && vf > 4) |
6718 | { |
6719 | unsigned int j, log_length = exact_log2 (x: length); |
6720 | for (i = 0; i < nelt / 2; ++i) |
6721 | sel[i] = i * 2; |
6722 | for (i = 0; i < nelt / 2; ++i) |
6723 | sel[nelt / 2 + i] = i * 2 + 1; |
6724 | vec_perm_indices indices (sel, 2, nelt); |
6725 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6726 | { |
6727 | if (dump_enabled_p ()) |
6728 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6729 | "shuffle of 2 fields structure is not \ |
6730 | supported by target\n" ); |
6731 | return false; |
6732 | } |
6733 | perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); |
6734 | |
6735 | for (i = 0; i < nelt / 2; ++i) |
6736 | sel[i] = i * 2 + 1; |
6737 | for (i = 0; i < nelt / 2; ++i) |
6738 | sel[nelt / 2 + i] = i * 2; |
6739 | indices.new_vector (sel, 2, nelt); |
6740 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6741 | { |
6742 | if (dump_enabled_p ()) |
6743 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6744 | "shuffle of 2 fields structure is not \ |
6745 | supported by target\n" ); |
6746 | return false; |
6747 | } |
6748 | perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); |
6749 | |
6750 | /* Generating permutation constant to shift all elements. |
6751 | For vector length 8 it is {4 5 6 7 8 9 10 11}. */ |
6752 | for (i = 0; i < nelt; i++) |
6753 | sel[i] = nelt / 2 + i; |
6754 | indices.new_vector (sel, 2, nelt); |
6755 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6756 | { |
6757 | if (dump_enabled_p ()) |
6758 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6759 | "shift permutation is not supported by target\n" ); |
6760 | return false; |
6761 | } |
6762 | shift1_mask = vect_gen_perm_mask_checked (vectype, indices); |
6763 | |
6764 | /* Generating permutation constant to select vector from 2. |
6765 | For vector length 8 it is {0 1 2 3 12 13 14 15}. */ |
6766 | for (i = 0; i < nelt / 2; i++) |
6767 | sel[i] = i; |
6768 | for (i = nelt / 2; i < nelt; i++) |
6769 | sel[i] = nelt + i; |
6770 | indices.new_vector (sel, 2, nelt); |
6771 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6772 | { |
6773 | if (dump_enabled_p ()) |
6774 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6775 | "select is not supported by target\n" ); |
6776 | return false; |
6777 | } |
6778 | select_mask = vect_gen_perm_mask_checked (vectype, indices); |
6779 | |
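/* For each pair (FIRST_VECT, SECOND_VECT): shuffle each vector so that its
even elements land in one half and its odd elements in the other, then
combine the two odd halves with the shift permutation and the two even
halves with the select permutation. */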
6780 | for (i = 0; i < log_length; i++) |
6781 | { |
6782 | for (j = 0; j < length; j += 2) |
6783 | { |
6784 | first_vect = dr_chain[j]; |
6785 | second_vect = dr_chain[j + 1]; |
6786 | |
6787 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle2" ); |
6788 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6789 | first_vect, first_vect, |
6790 | perm2_mask1); |
6791 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6792 | vect[0] = data_ref; |
6793 | |
6794 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle2" ); |
6795 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6796 | second_vect, second_vect, |
6797 | perm2_mask2); |
6798 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6799 | vect[1] = data_ref; |
6800 | |
6801 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift" ); |
6802 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6803 | vect[0], vect[1], shift1_mask); |
6804 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6805 | (*result_chain)[j/2 + length/2] = data_ref; |
6806 | |
6807 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_select" ); |
6808 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6809 | vect[0], vect[1], select_mask); |
6810 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6811 | (*result_chain)[j/2] = data_ref; |
6812 | } |
6813 | memcpy (dest: dr_chain.address (), src: result_chain->address (), |
6814 | n: length * sizeof (tree)); |
6815 | } |
6816 | return true; |
6817 | } |
6818 | if (length == 3 && vf > 2) |
6819 | { |
6820 | unsigned int k = 0, l = 0; |
6821 | |
/* Generating permutation constant to get all elements in right order.
6823 | For vector length 8 it is {0 3 6 1 4 7 2 5}. */ |
6824 | for (i = 0; i < nelt; i++) |
6825 | { |
6826 | if (3 * k + (l % 3) >= nelt) |
6827 | { |
6828 | k = 0; |
6829 | l += (3 - (nelt % 3)); |
6830 | } |
6831 | sel[i] = 3 * k + (l % 3); |
6832 | k++; |
6833 | } |
6834 | vec_perm_indices indices (sel, 2, nelt); |
6835 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6836 | { |
6837 | if (dump_enabled_p ()) |
6838 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6839 | "shuffle of 3 fields structure is not \ |
6840 | supported by target\n" ); |
6841 | return false; |
6842 | } |
6843 | perm3_mask = vect_gen_perm_mask_checked (vectype, indices); |
6844 | |
6845 | /* Generating permutation constant to shift all elements. |
6846 | For vector length 8 it is {6 7 8 9 10 11 12 13}. */ |
6847 | for (i = 0; i < nelt; i++) |
6848 | sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; |
6849 | indices.new_vector (sel, 2, nelt); |
6850 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6851 | { |
6852 | if (dump_enabled_p ()) |
6853 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6854 | "shift permutation is not supported by target\n" ); |
6855 | return false; |
6856 | } |
6857 | shift1_mask = vect_gen_perm_mask_checked (vectype, indices); |
6858 | |
6859 | /* Generating permutation constant to shift all elements. |
6860 | For vector length 8 it is {5 6 7 8 9 10 11 12}. */ |
6861 | for (i = 0; i < nelt; i++) |
6862 | sel[i] = 2 * (nelt / 3) + 1 + i; |
6863 | indices.new_vector (sel, 2, nelt); |
6864 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6865 | { |
6866 | if (dump_enabled_p ()) |
6867 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6868 | "shift permutation is not supported by target\n" ); |
6869 | return false; |
6870 | } |
6871 | shift2_mask = vect_gen_perm_mask_checked (vectype, indices); |
6872 | |
6873 | /* Generating permutation constant to shift all elements. |
6874 | For vector length 8 it is {3 4 5 6 7 8 9 10}. */ |
6875 | for (i = 0; i < nelt; i++) |
6876 | sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; |
6877 | indices.new_vector (sel, 2, nelt); |
6878 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6879 | { |
6880 | if (dump_enabled_p ()) |
6881 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6882 | "shift permutation is not supported by target\n" ); |
6883 | return false; |
6884 | } |
6885 | shift3_mask = vect_gen_perm_mask_checked (vectype, indices); |
6886 | |
6887 | /* Generating permutation constant to shift all elements. |
6888 | For vector length 8 it is {5 6 7 8 9 10 11 12}. */ |
6889 | for (i = 0; i < nelt; i++) |
6890 | sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; |
6891 | indices.new_vector (sel, 2, nelt); |
6892 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6893 | { |
6894 | if (dump_enabled_p ()) |
6895 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6896 | "shift permutation is not supported by target\n" ); |
6897 | return false; |
6898 | } |
6899 | shift4_mask = vect_gen_perm_mask_checked (vectype, indices); |
6900 | |
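/* First shuffle each input vector with PERM3_MASK so that its own elements
appear in the strided order described in the function comment, then apply
the shift rounds that rotate elements between the three vectors and
extract the final output vectors. */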
6901 | for (k = 0; k < 3; k++) |
6902 | { |
6903 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3" ); |
6904 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6905 | dr_chain[k], dr_chain[k], |
6906 | perm3_mask); |
6907 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6908 | vect[k] = data_ref; |
6909 | } |
6910 | |
6911 | for (k = 0; k < 3; k++) |
6912 | { |
6913 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift1" ); |
6914 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6915 | vect[k % 3], vect[(k + 1) % 3], |
6916 | shift1_mask); |
6917 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6918 | vect_shift[k] = data_ref; |
6919 | } |
6920 | |
6921 | for (k = 0; k < 3; k++) |
6922 | { |
6923 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift2" ); |
6924 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6925 | vect_shift[(4 - k) % 3], |
6926 | vect_shift[(3 - k) % 3], |
6927 | shift2_mask); |
6928 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6929 | vect[k] = data_ref; |
6930 | } |
6931 | |
6932 | (*result_chain)[3 - (nelt % 3)] = vect[2]; |
6933 | |
6934 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift3" ); |
6935 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], |
6936 | vect[0], shift3_mask); |
6937 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6938 | (*result_chain)[nelt % 3] = data_ref; |
6939 | |
6940 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift4" ); |
6941 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], |
6942 | vect[1], shift4_mask); |
6943 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6944 | (*result_chain)[0] = data_ref; |
6945 | return true; |
6946 | } |
6947 | return false; |
6948 | } |
6949 | |
6950 | /* Function vect_transform_grouped_load. |
6951 | |
6952 | Given a chain of input interleaved data-refs (in DR_CHAIN), build statements |
to perform their permutation and assign the resulting vectorized statements
to the corresponding scalar statements. */
6956 | |
6957 | void |
6958 | vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info, |
6959 | vec<tree> dr_chain, |
6960 | int size, gimple_stmt_iterator *gsi) |
6961 | { |
6962 | machine_mode mode; |
6963 | vec<tree> result_chain = vNULL; |
6964 | |
6965 | /* DR_CHAIN contains input data-refs that are a part of the interleaving. |
6966 | RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted |
6967 | vectors, that are ready for vector computation. */ |
6968 | result_chain.create (nelems: size); |
6969 | |
/* If the reassociation width for the vector type is 2 or greater, the target
machine can execute 2 or more vector instructions in parallel. Otherwise
try to get the chain for the load group using
vect_shift_permute_load_chain. */
6973 | mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); |
6974 | if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 |
6975 | || pow2p_hwi (x: size) |
6976 | || !vect_shift_permute_load_chain (vinfo, dr_chain, length: size, stmt_info, |
6977 | gsi, result_chain: &result_chain)) |
6978 | vect_permute_load_chain (vinfo, dr_chain, |
6979 | length: size, stmt_info, gsi, result_chain: &result_chain); |
6980 | vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain); |
6981 | result_chain.release (); |
6982 | } |
6983 | |
6984 | /* RESULT_CHAIN contains the output of a group of grouped loads that were |
6985 | generated as part of the vectorization of STMT_INFO. Assign the statement |
6986 | for each vector to the associated scalar statement. */ |
6987 | |
6988 | void |
6989 | vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info, |
6990 | vec<tree> result_chain) |
6991 | { |
6992 | stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); |
6993 | unsigned int i, gap_count; |
6994 | tree tmp_data_ref; |
6995 | |
6996 | /* Put a permuted data-ref in the VECTORIZED_STMT field. |
Since we scan the chain starting from its first node, their order
corresponds to the order of data-refs in RESULT_CHAIN. */
6999 | stmt_vec_info next_stmt_info = first_stmt_info; |
7000 | gap_count = 1; |
7001 | FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) |
7002 | { |
7003 | if (!next_stmt_info) |
7004 | break; |
7005 | |
/* Skip the gaps. Loads created for the gaps will be removed by the dead
code elimination pass later. No need to check for the first stmt in
7008 | the group, since it always exists. |
7009 | DR_GROUP_GAP is the number of steps in elements from the previous |
7010 | access (if there is no gap DR_GROUP_GAP is 1). We skip loads that |
7011 | correspond to the gaps. */ |
7012 | if (next_stmt_info != first_stmt_info |
7013 | && gap_count < DR_GROUP_GAP (next_stmt_info)) |
7014 | { |
7015 | gap_count++; |
7016 | continue; |
7017 | } |
7018 | |
7019 | /* ??? The following needs cleanup after the removal of |
7020 | DR_GROUP_SAME_DR_STMT. */ |
7021 | if (next_stmt_info) |
7022 | { |
7023 | gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref); |
7024 | /* We assume that if VEC_STMT is not NULL, this is a case of multiple |
7025 | copies, and we put the new vector statement last. */ |
7026 | STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (obj: new_stmt); |
7027 | |
7028 | next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); |
7029 | gap_count = 1; |
7030 | } |
7031 | } |
7032 | } |
7033 | |
7034 | /* Function vect_force_dr_alignment_p. |
7035 | |
Return whether the alignment of DECL can be forced to an ALIGNMENT-bit
boundary. */
7038 | |
7039 | bool |
7040 | vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment) |
7041 | { |
7042 | if (!VAR_P (decl)) |
7043 | return false; |
7044 | |
7045 | if (decl_in_symtab_p (decl) |
7046 | && !symtab_node::get (decl)->can_increase_alignment_p ()) |
7047 | return false; |
7048 | |
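/* Variables with static storage are limited by the maximum supported
object-file alignment, automatic variables by the maximum supported
stack alignment. */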
7049 | if (TREE_STATIC (decl)) |
7050 | return (known_le (alignment, |
7051 | (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT)); |
7052 | else |
7053 | return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT)); |
7054 | } |
7055 | |
7056 | /* Return whether the data reference DR_INFO is supported with respect to its |
7057 | alignment. |
VECTYPE is the vector type used for the access and MISALIGNMENT its known
misalignment: zero when the access is known to be aligned, or
DR_MISALIGNMENT_UNKNOWN when the misalignment is not known at compile
time. */
7061 | |
7062 | enum dr_alignment_support |
7063 | vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, |
7064 | tree vectype, int misalignment) |
7065 | { |
7066 | data_reference *dr = dr_info->dr; |
7067 | stmt_vec_info stmt_info = dr_info->stmt; |
7068 | machine_mode mode = TYPE_MODE (vectype); |
7069 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo); |
7070 | class loop *vect_loop = NULL; |
7071 | bool nested_in_vect_loop = false; |
7072 | |
7073 | if (misalignment == 0) |
7074 | return dr_aligned; |
7075 | |
7076 | /* For now assume all conditional loads/stores support unaligned |
7077 | access without any special code. */ |
7078 | if (gcall *stmt = dyn_cast <gcall *> (p: stmt_info->stmt)) |
7079 | if (gimple_call_internal_p (gs: stmt) |
7080 | && (gimple_call_internal_fn (gs: stmt) == IFN_MASK_LOAD |
7081 | || gimple_call_internal_fn (gs: stmt) == IFN_MASK_STORE)) |
7082 | return dr_unaligned_supported; |
7083 | |
7084 | if (loop_vinfo) |
7085 | { |
7086 | vect_loop = LOOP_VINFO_LOOP (loop_vinfo); |
7087 | nested_in_vect_loop = nested_in_vect_loop_p (loop: vect_loop, stmt_info); |
7088 | } |
7089 | |
7090 | /* Possibly unaligned access. */ |
7091 | |
7092 | /* We can choose between using the implicit realignment scheme (generating |
7093 | a misaligned_move stmt) and the explicit realignment scheme (generating |
7094 | aligned loads with a REALIGN_LOAD). There are two variants to the |
7095 | explicit realignment scheme: optimized, and unoptimized. |
7096 | We can optimize the realignment only if the step between consecutive |
7097 | vector loads is equal to the vector size. Since the vector memory |
7098 | accesses advance in steps of VS (Vector Size) in the vectorized loop, it |
7099 | is guaranteed that the misalignment amount remains the same throughout the |
7100 | execution of the vectorized loop. Therefore, we can create the |
7101 | "realignment token" (the permutation mask that is passed to REALIGN_LOAD) |
7102 | at the loop preheader. |
7103 | |
7104 | However, in the case of outer-loop vectorization, when vectorizing a |
7105 | memory access in the inner-loop nested within the LOOP that is now being |
7106 | vectorized, while it is guaranteed that the misalignment of the |
7107 | vectorized memory access will remain the same in different outer-loop |
iterations, it is *not* guaranteed that it will remain the same throughout
7109 | the execution of the inner-loop. This is because the inner-loop advances |
7110 | with the original scalar step (and not in steps of VS). If the inner-loop |
7111 | step happens to be a multiple of VS, then the misalignment remains fixed |
7112 | and we can use the optimized realignment scheme. For example: |
7113 | |
7114 | for (i=0; i<N; i++) |
7115 | for (j=0; j<M; j++) |
7116 | s += a[i+j]; |
7117 | |
7118 | When vectorizing the i-loop in the above example, the step between |
7119 | consecutive vector loads is 1, and so the misalignment does not remain |
7120 | fixed across the execution of the inner-loop, and the realignment cannot |
7121 | be optimized (as illustrated in the following pseudo vectorized loop): |
7122 | |
7123 | for (i=0; i<N; i+=4) |
7124 | for (j=0; j<M; j++){ |
7125 | vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} |
7126 | // when j is {0,1,2,3,4,5,6,7,...} respectively. |
7127 | // (assuming that we start from an aligned address). |
7128 | } |
7129 | |
7130 | We therefore have to use the unoptimized realignment scheme: |
7131 | |
7132 | for (i=0; i<N; i+=4) |
7133 | for (j=k; j<M; j+=4) |
7134 | vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming |
7135 | // that the misalignment of the initial address is |
7136 | // 0). |
7137 | |
7138 | The loop can then be vectorized as follows: |
7139 | |
7140 | for (k=0; k<4; k++){ |
7141 | rt = get_realignment_token (&vp[k]); |
7142 | for (i=0; i<N; i+=4){ |
7143 | v1 = vp[i+k]; |
7144 | for (j=k; j<M; j+=4){ |
7145 | v2 = vp[i+j+VS-1]; |
7146 | va = REALIGN_LOAD <v1,v2,rt>; |
7147 | vs += va; |
7148 | v1 = v2; |
7149 | } |
7150 | } |
7151 | } */ |
7152 | |
7153 | if (DR_IS_READ (dr)) |
7154 | { |
7155 | if (optab_handler (op: vec_realign_load_optab, mode) != CODE_FOR_nothing |
7156 | && (!targetm.vectorize.builtin_mask_for_load |
7157 | || targetm.vectorize.builtin_mask_for_load ())) |
7158 | { |
7159 | /* If we are doing SLP then the accesses need not have the |
same alignment; instead it depends on the SLP group size. */
7161 | if (loop_vinfo |
7162 | && STMT_SLP_TYPE (stmt_info) |
7163 | && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) |
7164 | || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) |
7165 | * (DR_GROUP_SIZE |
7166 | (DR_GROUP_FIRST_ELEMENT (stmt_info))), |
7167 | b: TYPE_VECTOR_SUBPARTS (node: vectype)))) |
7168 | ; |
7169 | else if (!loop_vinfo |
7170 | || (nested_in_vect_loop |
7171 | && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)), |
7172 | b: GET_MODE_SIZE (TYPE_MODE (vectype))))) |
7173 | return dr_explicit_realign; |
7174 | else |
7175 | return dr_explicit_realign_optimized; |
7176 | } |
7177 | } |
7178 | |
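/* Otherwise ask the target whether a plain misaligned vector access of
this type and misalignment is supported. */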
7179 | bool is_packed = false; |
7180 | tree type = TREE_TYPE (DR_REF (dr)); |
7181 | if (misalignment == DR_MISALIGNMENT_UNKNOWN) |
7182 | is_packed = not_size_aligned (DR_REF (dr)); |
7183 | if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment, |
7184 | is_packed)) |
7185 | return dr_unaligned_supported; |
7186 | |
7187 | /* Unsupported. */ |
7188 | return dr_unaligned_unsupported; |
7189 | } |
7190 | |