/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2024 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#define INCLUDE_ALGORITHM
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"
#include "sreal.h"
#include "predict.h"

static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
                                            load_permutation_t &,
                                            const vec<tree> &,
                                            gimple_stmt_iterator *,
                                            poly_uint64, bool, bool,
                                            unsigned *,
                                            unsigned * = nullptr,
                                            bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
                                           slp_tree, lane_permutation_t &,
                                           vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
                                          slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);

static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}


/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  SLP_TREE_SIMD_CLONE_INFO (this).release ();
  if (this->failed)
    free (failed);
}

/* Push the single SSA definition in DEF to the vector of vector defs.  */

void
_slp_tree::push_vec_def (gimple *def)
{
  if (gphi *phi = dyn_cast <gphi *> (def))
    vec_defs.quick_push (gimple_phi_result (phi));
  else
    {
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
    }
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}

/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}


/* Create an SLP node with room for NOPS children and operation code CODE.  */

slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
                          vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement: its vector def-type, its type,
     the operand itself if it is constant, an indication whether it is a
     pattern stmt, and gather/scatter info.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
  bool first_gs_p;
  gather_scatter_info first_gs_info;
} *slp_oprnd_info;
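
/* Illustrative example: for the two-lane group { c0 = a0 + b0, c1 = a1 + b1 }
   two slp_oprnd_info instances are built, the first collecting the def stmts
   and operands of { a0, a1 } and the second those of { b0, b1 }.  */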


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnd_info->first_gs_p = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Return the execution frequency of NODE (so that a higher value indicates
   a "more important" node when optimizing for speed).  */

static sreal
vect_slp_node_weight (slp_tree node)
{
  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
  basic_block bb = gimple_bb (stmt_info->stmt);
  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
}
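
/* For instance (assuming profile counts are available), a node whose
   representative statement sits in a block executed on average four times
   per function entry gets weight 4.0, one in a half-taken branch 0.5.  */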

/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
              || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
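
/* Illustrative example (assuming a group accessing a[0], a[2] and a[3]):
   DR_GROUP_GAP of the second and third elements is 2 and 1 respectively,
   so the function returns 0, 2 and 3 for the three statements.  */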

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */
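
/* A minimal worked example (illustrative values): on a target with 128-bit
   vectors, COUNT == 4 and a 16-bit ELT_TYPE give ELT_BYTES == 8, so the
   first iteration tries to fuse each group of four elements into one
   64-bit integer and build a single V2DI vector; if the interleaving
   permutes this requires are not supported, ELT_BYTES is halved and two
   V4SI vectors are tried instead, doubling NVECTORS on each iteration.  */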

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
          /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type
            = get_vectype_for_scalar_type (vinfo, int_type, count);
          poly_int64 half_nelts;
          if (vector_type
              && VECTOR_MODE_P (TYPE_MODE (vector_type))
              && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
                           GET_MODE_SIZE (base_vector_mode))
              && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
                             2, &half_nelts))
            {
              /* Try fusing consecutive sequences of COUNT / NVECTORS elements
                 together into elements of type INT_TYPE and using the result
                 to build NVECTORS vectors.  */
              poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);

              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              machine_mode vmode = TYPE_MODE (vector_type);
              if (can_vec_perm_const_p (vmode, vmode, indices1)
                  && can_vec_perm_const_p (vmode, vmode, indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
          || ((dta == vect_external_def || dta == vect_constant_def)
              && (dtb == vect_external_def || dtb == vect_constant_def)));
}

static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int arg0_map[] = { 1, 0 };
static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
static const int off_map[] = { 1, -3 };
static const int off_op0_map[] = { 2, -3, 0 };
static const int off_arg2_map[] = { 2, -3, 2 };
static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
static const int mask_call_maps[6][7] = {
  { 1, 1, },
  { 2, 1, 2, },
  { 3, 1, 2, 3, },
  { 4, 1, 2, 3, 4, },
  { 5, 1, 2, 3, 4, 5, },
  { 6, 1, 2, 3, 4, 5, 6 },
};

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.
     The special index -3 is the offset of a gather as analyzed by
     vect_check_gather_scatter.

   SWAP is as for vect_get_and_check_slp_defs.  */
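
/* For example (reading the tables above), arg3_arg2_map == { 2, 3, 2 }
   describes IFN_MASK_STORE: two child nodes, where child 0 corresponds to
   call argument 3 (the stored value) and child 1 to argument 2 (the
   mask).  */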

static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
                      unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
        return cond_expr_maps[swap];
      if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
          && swap)
        return op1_op0_map;
      if (gather_scatter_p)
        return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
                ? off_op0_map : off_map);
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
        switch (gimple_call_internal_fn (call))
          {
          case IFN_MASK_LOAD:
            return gather_scatter_p ? off_arg2_map : arg2_map;

          case IFN_GATHER_LOAD:
            return arg1_map;

          case IFN_MASK_GATHER_LOAD:
          case IFN_MASK_LEN_GATHER_LOAD:
            return arg1_arg4_map;

          case IFN_MASK_STORE:
            return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

          case IFN_MASK_CALL:
            {
              unsigned nargs = gimple_call_num_args (call);
              if (nargs >= 2 && nargs <= 7)
                return mask_call_maps[nargs-2];
              else
                return nullptr;
            }

          case IFN_CLZ:
          case IFN_CTZ:
            return arg0_map;

          default:
            break;
          }
    }
  return nullptr;
}

/* Return the SLP node child index for operand OP of STMT.  */

int
vect_slp_child_index_for_operand (const gimple *stmt, int op,
                                  bool gather_scatter_p)
{
  const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
  if (!opmap)
    return op;
  for (int i = 1; i < 1 + opmap[0]; ++i)
    if (opmap[i] == op)
      return i - 1;
  gcc_unreachable ();
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates a swap is required for COND_EXPR stmts.  Specifically, SWAP is 1
   if STMT is a COND_EXPR and the operands of its comparison need to be
   swapped; SWAP is 2 if STMT is a COND_EXPR and the comparison code needs
   to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the father node of this one, return 1; if everything
   is ok return 0.  */
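
/* For example, if lane 0 computes A < B ? X : Y and lane 1 computes
   D > C ? X : Y, the caller passes SWAP == 1 for lane 1 so that the
   comparison operands are swapped while collecting defs, making both
   lanes use the same comparison code.  */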
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
                             bool *skip_args,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  gather_scatter_info gs_info;
  unsigned int gs_op = -1u;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map
    = vect_get_operand_map (stmt_info->stmt,
                            STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
        commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      oprnd_info = (*oprnds_info)[i];
      int opno = map ? map[i] : int (i);
      if (opno == -3)
        {
          gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
          if (!is_a <loop_vec_info> (vinfo)
              || !vect_check_gather_scatter (stmt_info,
                                             as_a <loop_vec_info> (vinfo),
                                             first ? &oprnd_info->first_gs_info
                                             : &gs_info))
            return -1;

          if (first)
            {
              oprnd_info->first_gs_p = true;
              oprnd = oprnd_info->first_gs_info.offset;
            }
          else
            {
              gs_op = i;
              oprnd = gs_info.offset;
            }
        }
      else if (opno < 0)
        oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
        {
          oprnd = gimple_arg (stmt_info->stmt, opno);
          if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
            {
              edge e = gimple_phi_arg_edge (stmt, opno);
              backedge = (is_a <bb_vec_info> (vinfo)
                          ? e->flags & EDGE_DFS_BACK
                          : dominated_by_p (CDI_DOMINATORS, e->src,
                                            gimple_bb (stmt_info->stmt)));
            }
        }
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (skip_args[i])
        {
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (NULL_TREE);
          oprnd_info->first_dt = vect_uninitialized_def;
          continue;
        }

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
          && is_pattern_stmt_p (def_stmt_info))
        {
          if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
              != def_stmt_info)
            oprnd_info->any_pattern = true;
          else
            /* If we promote this to external use the original stmt def.  */
            oprnd_info->ops.last ()
              = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
        }

      /* If there's an extern def on a backedge make sure we can
         code-generate at the region start.
         ??? This is another case that could be fixed by adjusting
         how we split the function but at the moment we'd have conflicting
         goals there.  */
      if (backedge
          && dts[i] == vect_external_def
          && is_a <bb_vec_info> (vinfo)
          && TREE_CODE (oprnd) == SSA_NAME
          && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
          && !dominated_by_p (CDI_DOMINATORS,
                              as_a <bb_vec_info> (vinfo)->bbs[0],
                              gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: extern def %T only defined "
                             "on backedge\n", oprnd);
          return -1;
        }

      if (first)
        {
          tree type = TREE_TYPE (oprnd);
          dt = dts[i];

          /* For the swapping logic below force vect_reduction_def
             for the reduction op in an SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dts[i] = dt = vect_reduction_def;

          /* Check the types of the definition.  */
          switch (dt)
            {
            case vect_external_def:
            case vect_constant_def:
            case vect_internal_def:
            case vect_reduction_def:
            case vect_induction_def:
            case vect_nested_cycle:
            case vect_first_order_recurrence:
              break;

            default:
              /* FORNOW: Not supported.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: illegal type of def %T\n",
                                 oprnd);
              return -1;
            }

          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = type;
        }
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
        {
          ++i;
          continue;
        }

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: different operand types\n");
          return 1;
        }

      if ((gs_op == i) != oprnd_info->first_gs_p)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: mixed gather and non-gather\n");
          return 1;
        }
      else if (gs_op == i)
        {
          if (!operand_equal_p (oprnd_info->first_gs_info.base,
                                gs_info.base))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different gather base\n");
              return 1;
            }
          if (oprnd_info->first_gs_info.scale != gs_info.scale)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different gather scale\n");
              return 1;
            }
        }

      /* Not first stmt of the group, check that the def-stmt/s match
         the def-stmt/s of the first stmt.  Allow different definition
         types for reduction chains: the first stmt must be a
         vect_reduction_def (a phi node), and the rest
         end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
           && !(oprnd_info->first_dt == vect_reduction_def
                && !STMT_VINFO_DATA_REF (stmt_info)
                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                && def_stmt_info
                && !STMT_VINFO_DATA_REF (def_stmt_info)
                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
          || (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && ((!def_stmt_info
                   || STMT_VINFO_DATA_REF (def_stmt_info)
                   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                  != (oprnd_info->first_dt != vect_reduction_def))))
        {
          /* Try swapping operands if we got a mismatch.  For BB
             vectorization only in case it will clearly improve things.  */
          if (i == commutative_op && !swapped
              && (!is_a <bb_vec_info> (vinfo)
                  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
                                             dts[i+1])
                      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
                          || vect_def_types_match
                               ((*oprnds_info)[i+1]->first_dt, dts[i])))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "trying swapped operands\n");
              std::swap (dts[i], dts[i+1]);
              std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
                         (*oprnds_info)[i+1]->def_stmts[stmt_num]);
              std::swap ((*oprnds_info)[i]->ops[stmt_num],
                         (*oprnds_info)[i+1]->ops[stmt_num]);
              /* After swapping some operands we lose track of whether an
                 operand has any pattern defs, so be conservative here.  */
              if ((*oprnds_info)[i]->any_pattern
                  || (*oprnds_info)[i+1]->any_pattern)
                (*oprnds_info)[i]->any_pattern
                  = (*oprnds_info)[i+1]->any_pattern = true;
              swapped = true;
              continue;
            }

          if (is_a <bb_vec_info> (vinfo)
              && !oprnd_info->any_pattern)
            {
              /* Now for commutative ops we should see whether we can
                 make the other operand match.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "treating operand as external\n");
              oprnd_info->first_dt = dt = vect_external_def;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");
              return 1;
            }
        }

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
        oprnd_info->first_dt = vect_external_def;
      /* For an SLP reduction chain we want to duplicate the reduction to
         each of the chain members.  That gets us a sane SLP graph (still
         the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
                || dt == vect_reduction_def)
               && oprnd_info->first_dt == vect_reduction_def
               && !STMT_VINFO_DATA_REF (stmt_info)
               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
               && !STMT_VINFO_DATA_REF (def_stmt_info)
               && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
        {
          oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
          oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
        }

      ++i;
    }

  /* Report any operand swapping we did.  */
  if (swapped)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
        if (mapi < nkept && map[mapi] == int (i))
          mapi += 1;
        else if (!operand_equal_p (gimple_call_arg (call1, i),
                                   gimple_call_arg (call2, i)))
          return false;
    }

  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
                        unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Check whether the scalar stmts STMTS are isomorphic, whether they require
   data permutation, and whether they use supported operations.  Return true
   if they can be combined into an SLP node, otherwise return false and
   indicate in *MATCHES which stmts are not isomorphic to the first one.  If
   MATCHES[0] is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code of
   the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper rhs_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, ldst_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
         or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
          || stmt_can_throw_internal (cfun, stmt)
          || gimple_has_volatile_ops (stmt))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* ??? For BB vectorization we want to commute operands in a way
             to shuffle all unvectorizable defs into one operand and have
             the other still vectorized.  The following doesn't reliably
             achieve that, but it's the easiest we can do here.  */
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE
          && (!call_stmt
              || !gimple_call_internal_p (stmt)
              || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
                                           &nunits_vectype, group_size))
        {
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }
      /* Record nunits required but continue analysis, producing matches[]
         as if nunits was not an issue.  This allows splitting of groups
         to happen.  */
      if (nunits_vectype
          && !vect_record_max_nunits (vinfo, stmt_info, group_size,
                                      nunits_vectype, max_nunits))
        {
          gcc_assert (is_a <bb_vec_info> (vinfo));
          maybe_soft_fail = true;
          soft_fail_nunits_vectype = nunits_vectype;
        }

      gcc_assert (vectype);

      if (call_stmt)
        {
          combined_fn cfn = gimple_call_combined_fn (call_stmt);
          if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
            rhs_code = cfn;
          else
            rhs_code = CALL_EXPR;

          if (cfn == CFN_MASK_LOAD
              || cfn == CFN_GATHER_LOAD
              || cfn == CFN_MASK_GATHER_LOAD
              || cfn == CFN_MASK_LEN_GATHER_LOAD)
            ldst_p = true;
          else if (cfn == CFN_MASK_STORE)
            {
              ldst_p = true;
              rhs_code = CFN_MASK_STORE;
            }
          else if ((cfn != CFN_LAST
                    && cfn != CFN_MASK_CALL
                    && internal_fn_p (cfn)
                    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 (gimple *) call_stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else if (gimple_code (stmt) == GIMPLE_PHI)
        {
          rhs_code = ERROR_MARK;
          phi_p = true;
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
        }

      /* Check the operation.  */
      if (i == 0)
        {
          *node_vectype = vectype;
          first_stmt_code = rhs_code;
          first_stmt_ldst_p = ldst_p;
          first_stmt_phi_p = phi_p;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              /* First see if we have a vector/vector shift.  */
              if (!directly_supported_p (rhs_code, vectype, optab_vector))
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      if (is_a <bb_vec_info> (vinfo) && i != 0)
                        continue;
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  need_same_oprnds = true;
                  first_op1 = gimple_assign_rhs2 (stmt);
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
          else if (!ldst_p
                   && rhs_code == BIT_FIELD_REF)
            {
              tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
              if (!is_a <bb_vec_info> (vinfo)
                  || TREE_CODE (vec) != SSA_NAME
                  /* When the element types are not compatible we pun the
                     source to the target vectype which requires equal size.  */
                  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
                       || !types_compatible_p (TREE_TYPE (vectype),
                                               TREE_TYPE (TREE_TYPE (vec))))
                      && !operand_equal_p (TYPE_SIZE (vectype),
                                           TYPE_SIZE (TREE_TYPE (vec)))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: "
                                     "BIT_FIELD_REF not supported\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }
            }
          else if (rhs_code == CFN_DIV_POW2)
            {
              need_same_oprnds = true;
              first_op1 = gimple_call_arg (call_stmt, 1);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if ((first_stmt_code != rhs_code
               && (first_stmt_code != IMAGPART_EXPR
                   || rhs_code != REALPART_EXPR)
               && (first_stmt_code != REALPART_EXPR
                   || rhs_code != IMAGPART_EXPR)
               /* Handle mismatches in plus/minus by computing both
                  and merging the results.  */
               && !((first_stmt_code == PLUS_EXPR
                     || first_stmt_code == MINUS_EXPR)
                    && (alt_stmt_code == PLUS_EXPR
                        || alt_stmt_code == MINUS_EXPR)
                    && rhs_code == alt_stmt_code)
               && !(first_stmt_code.is_tree_code ()
                    && rhs_code.is_tree_code ()
                    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
                        == tcc_comparison)
                    && (swap_tree_comparison (tree_code (first_stmt_code))
                        == tree_code (rhs_code)))
               && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                    && (first_stmt_code == ARRAY_REF
                        || first_stmt_code == BIT_FIELD_REF
                        || first_stmt_code == INDIRECT_REF
                        || first_stmt_code == COMPONENT_REF
                        || first_stmt_code == MEM_REF)
                    && (rhs_code == ARRAY_REF
                        || rhs_code == BIT_FIELD_REF
                        || rhs_code == INDIRECT_REF
                        || rhs_code == COMPONENT_REF
                        || rhs_code == MEM_REF)))
              || (ldst_p
                  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
              || (ldst_p
                  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
                      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
              || first_stmt_ldst_p != ldst_p
              || first_stmt_phi_p != phi_p)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (!ldst_p
              && first_stmt_code == BIT_FIELD_REF
              && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
                  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BIT_FIELD_REF "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (call_stmt
              && first_stmt_code != CFN_MASK_LOAD
              && first_stmt_code != CFN_MASK_STORE)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       call_stmt))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
              && (gimple_bb (first_stmt_info->stmt)
                  != gimple_bb (stmt_info->stmt)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BB for PHI "
                                 "or possibly trapping operation in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds)
            {
              tree other_op1 = gimple_arg (stmt, 1);
              if (!operand_equal_p (first_op1, other_op1, 0))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different shift "
                                     "arguments in %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (!types_compatible_p (vectype, *node_vectype))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different vector type "
                                 "in %G", stmt);
              /* Mismatch.  */
              continue;
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          gcc_assert (ldst_p);
          if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
            {
              /* Store.  */
              gcc_assert (rhs_code == CFN_MASK_STORE
                          || REFERENCE_CLASS_P (lhs)
                          || DECL_P (lhs));
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        }
      /* Non-grouped store or load.  */
      else if (ldst_p)
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
              && rhs_code != CFN_GATHER_LOAD
              && rhs_code != CFN_MASK_GATHER_LOAD
              && rhs_code != CFN_MASK_LEN_GATHER_LOAD
              && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
              /* Non-grouped loads are handled as externals for BB
                 vectorization.  For loop vectorization we can handle
                 splats the same way we handle single element
                 interleaving.  */
              && (is_a <bb_vec_info> (vinfo)
                  || stmt_info != first_stmt_info))
            {
              /* Non-grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              if (i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      /* Not a memory operation.  */
      else
        {
          if (!phi_p
              && rhs_code.is_tree_code ()
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR
              && rhs_code != BIT_FIELD_REF)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (rhs_code.is_tree_code ()
              && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
              && (swap_tree_comparison ((tree_code)first_stmt_code)
                  == (tree_code)rhs_code))
            swap[i] = 1;
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node, record that so the blending
     permute is accounted for when the node is vectorized.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
          || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
              && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
             (soft_fail_nunits_vectype).is_constant (&const_nunits)
          || const_nunits > group_size)
        matches[0] = false;
      else
        {
          /* With a constant number of vector elements, simulate a mismatch
             at the point where we need to split.  */
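          /* Illustrative example: with GROUP_SIZE == 7 and CONST_NUNITS == 4
             the tail is 7 & 3 == 3, so matches[4..6] are cleared and the
             caller splits the group at the vector boundary.  (This relies
             on CONST_NUNITS being a power of two.)  */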
          unsigned tail = group_size & (const_nunits - 1);
          memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
        }
      return false;
    }

  return true;
}
1552 | |
1553 | /* Traits for the hash_set to record failed SLP builds for a stmt set. |
1554 | Note we never remove apart from at destruction time so we do not |
1555 | need a special value for deleted that differs from empty. */ |
1556 | struct bst_traits |
1557 | { |
1558 | typedef vec <stmt_vec_info> value_type; |
1559 | typedef vec <stmt_vec_info> compare_type; |
1560 | static inline hashval_t hash (value_type); |
1561 | static inline bool equal (value_type existing, value_type candidate); |
1562 | static inline bool is_empty (value_type x) { return !x.exists (); } |
1563 | static inline bool is_deleted (value_type x) { return !x.exists (); } |
1564 | static const bool empty_zero_p = true; |
1565 | static inline void mark_empty (value_type &x) { x.release (); } |
1566 | static inline void mark_deleted (value_type &x) { x.release (); } |
1567 | static inline void remove (value_type &x) { x.release (); } |
1568 | }; |
1569 | inline hashval_t |
1570 | bst_traits::hash (value_type x) |
1571 | { |
1572 | inchash::hash h; |
1573 | for (unsigned i = 0; i < x.length (); ++i) |
1574 | h.add_int (v: gimple_uid (g: x[i]->stmt)); |
1575 | return h.end (); |
1576 | } |
1577 | inline bool |
1578 | bst_traits::equal (value_type existing, value_type candidate) |
1579 | { |
1580 | if (existing.length () != candidate.length ()) |
1581 | return false; |
1582 | for (unsigned i = 0; i < existing.length (); ++i) |
1583 | if (existing[i] != candidate[i]) |
1584 | return false; |
1585 | return true; |
1586 | } |
1587 | |
1588 | /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree> |
1589 | but then vec::insert does memmove and that's not compatible with |
1590 | std::pair. */ |
1591 | struct chain_op_t |
1592 | { |
1593 | chain_op_t (tree_code code_, vect_def_type dt_, tree op_) |
1594 | : code (code_), dt (dt_), op (op_) {} |
1595 | tree_code code; |
1596 | vect_def_type dt; |
1597 | tree op; |
1598 | }; |
1599 | |
1600 | /* Comparator for sorting associatable chains. */ |
1601 | |
1602 | static int |
1603 | dt_sort_cmp (const void *op1_, const void *op2_, void *) |
1604 | { |
1605 | auto *op1 = (const chain_op_t *) op1_; |
1606 | auto *op2 = (const chain_op_t *) op2_; |
1607 | if (op1->dt != op2->dt) |
1608 | return (int)op1->dt - (int)op2->dt; |
1609 | return (int)op1->code - (int)op2->code; |
1610 | } |
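
/* For example (a sketch, assuming the usual vect_def_type enum order
   where vect_constant_def < vect_external_def < vect_internal_def),
   a chain { +x_internal, +3_constant, -y_external } sorts to
   { +3_constant, -y_external, +x_internal }, grouping invariant
   operands in front of internal defs.  */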
1611 | |
1612 | /* Linearize the associatable expression chain at START with the |
1613 | associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR), |
1614 | filling CHAIN with the result and using WORKLIST as intermediate storage. |
1615 | CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE |
1616 | or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation |
1617 | stmts, starting with START. */ |
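
/* For example, linearizing the lane computation
     t = a - b;  x = t + c;
   with CODE == PLUS_EXPR yields the chain { +a, -b, +c } (modulo
   visiting order), with CODE_STMT set to the PLUS_EXPR stmt and
   ALT_CODE_STMT to the MINUS_EXPR stmt.  */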
1618 | |
1619 | static void |
1620 | vect_slp_linearize_chain (vec_info *vinfo, |
1621 | vec<std::pair<tree_code, gimple *> > &worklist, |
1622 | vec<chain_op_t> &chain, |
1623 | enum tree_code code, gimple *start, |
1624 | gimple *&code_stmt, gimple *&alt_code_stmt, |
1625 | vec<gimple *> *chain_stmts) |
1626 | { |
1627 | /* For each lane linearize the addition/subtraction (or other |
1628 | uniform associatable operation) expression tree. */ |
worklist.safe_push (std::make_pair (code, start));
while (!worklist.is_empty ())
{
auto entry = worklist.pop ();
gassign *stmt = as_a <gassign *> (entry.second);
enum tree_code in_code = entry.first;
enum tree_code this_code = gimple_assign_rhs_code (stmt);
/* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
if (!code_stmt
    && gimple_assign_rhs_code (stmt) == code)
code_stmt = stmt;
else if (!alt_code_stmt
	 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
alt_code_stmt = stmt;
if (chain_stmts)
chain_stmts->safe_push (stmt);
for (unsigned opnum = 1; opnum <= 2; ++opnum)
{
tree op = gimple_op (stmt, opnum);
vect_def_type dt;
stmt_vec_info def_stmt_info;
bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
gcc_assert (res);
if (dt == vect_internal_def
    && is_pattern_stmt_p (def_stmt_info))
op = gimple_get_lhs (def_stmt_info->stmt);
gimple *use_stmt;
use_operand_p use_p;
if (dt == vect_internal_def
    && single_imm_use (op, &use_p, &use_stmt)
    && is_gimple_assign (def_stmt_info->stmt)
    && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
	|| (code == PLUS_EXPR
	    && (gimple_assign_rhs_code (def_stmt_info->stmt)
		== MINUS_EXPR))))
{
tree_code op_def_code = this_code;
if (op_def_code == MINUS_EXPR && opnum == 1)
op_def_code = PLUS_EXPR;
if (in_code == MINUS_EXPR)
op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
worklist.safe_push (std::make_pair (op_def_code,
				    def_stmt_info->stmt));
}
else
{
tree_code op_def_code = this_code;
if (op_def_code == MINUS_EXPR && opnum == 1)
op_def_code = PLUS_EXPR;
if (in_code == MINUS_EXPR)
op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
chain.safe_push (chain_op_t (op_def_code, dt, op));
}
}
}
1684 | } |
1685 | |
1686 | typedef hash_map <vec <stmt_vec_info>, slp_tree, |
1687 | simple_hashmap_traits <bst_traits, slp_tree> > |
1688 | scalar_stmts_to_slp_tree_map_t; |
1689 | |
1690 | static slp_tree |
1691 | vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, |
1692 | vec<stmt_vec_info> stmts, unsigned int group_size, |
1693 | poly_uint64 *max_nunits, |
1694 | bool *matches, unsigned *limit, unsigned *tree_size, |
1695 | scalar_stmts_to_slp_tree_map_t *bst_map); |
1696 | |
1697 | static slp_tree |
1698 | vect_build_slp_tree (vec_info *vinfo, |
1699 | vec<stmt_vec_info> stmts, unsigned int group_size, |
1700 | poly_uint64 *max_nunits, |
1701 | bool *matches, unsigned *limit, unsigned *tree_size, |
1702 | scalar_stmts_to_slp_tree_map_t *bst_map) |
1703 | { |
if (slp_tree *leader = bst_map->get (stmts))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
		 !(*leader)->failed ? "" : "failed ",
		 (void *) *leader);
if (!(*leader)->failed)
{
SLP_TREE_REF_COUNT (*leader)++;
vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
stmts.release ();
return *leader;
}
memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
return NULL;
}

/* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
   so we can pick up backedge destinations during discovery.  */
slp_tree res = new _slp_tree;
SLP_TREE_DEF_TYPE (res) = vect_internal_def;
SLP_TREE_SCALAR_STMTS (res) = stmts;
bst_map->put (stmts.copy (), res);
1727 | |
1728 | if (*limit == 0) |
1729 | { |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "SLP discovery limit exceeded\n");
/* Mark the node invalid so we can detect those when still in use
   as backedge destinations.  */
SLP_TREE_SCALAR_STMTS (res) = vNULL;
SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
res->failed = XNEWVEC (bool, group_size);
memset (res->failed, 0, sizeof (bool) * group_size);
memset (matches, 0, sizeof (bool) * group_size);
return NULL;
}
--*limit;

if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "starting SLP discovery for node %p\n", (void *) res);
1747 | |
1748 | poly_uint64 this_max_nunits = 1; |
slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
				       &this_max_nunits,
				       matches, limit, tree_size, bst_map);
if (!res_)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "SLP discovery for node %p failed\n", (void *) res);
/* Mark the node invalid so we can detect those when still in use
   as backedge destinations.  */
SLP_TREE_SCALAR_STMTS (res) = vNULL;
SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
res->failed = XNEWVEC (bool, group_size);
if (flag_checking)
{
unsigned i;
for (i = 0; i < group_size; ++i)
if (!matches[i])
break;
gcc_assert (i < group_size);
}
memcpy (res->failed, matches, sizeof (bool) * group_size);
}
else
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "SLP discovery for node %p succeeded\n",
		 (void *) res);
gcc_assert (res_ == res);
res->max_nunits = this_max_nunits;
vect_update_max_nunits (max_nunits, this_max_nunits);
/* Keep a reference for the bst_map use.  */
SLP_TREE_REF_COUNT (res)++;
}
return res_;
1785 | } |
1786 | |
1787 | /* Helper for building an associated SLP node chain. */ |
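/* For example (a sketch), for the lane codes { PLUS_EXPR, MINUS_EXPR }
   this builds one full-width node computing OP0 + OP1 and one computing
   OP0 - OP1, and makes PERM a VEC_PERM_EXPR node selecting lane 0 from
   the first child and lane 1 from the second via
   LPERM = { (0, 0), (1, 1) }.  */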
1788 | |
1789 | static void |
1790 | vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype, |
1791 | slp_tree op0, slp_tree op1, |
1792 | stmt_vec_info oper1, stmt_vec_info oper2, |
1793 | vec<std::pair<unsigned, unsigned> > lperm) |
1794 | { |
unsigned group_size = SLP_TREE_LANES (op1);

slp_tree child1 = new _slp_tree;
SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
SLP_TREE_VECTYPE (child1) = vectype;
SLP_TREE_LANES (child1) = group_size;
SLP_TREE_CHILDREN (child1).create (2);
SLP_TREE_CHILDREN (child1).quick_push (op0);
SLP_TREE_CHILDREN (child1).quick_push (op1);
SLP_TREE_REPRESENTATIVE (child1) = oper1;

slp_tree child2 = new _slp_tree;
SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
SLP_TREE_VECTYPE (child2) = vectype;
SLP_TREE_LANES (child2) = group_size;
SLP_TREE_CHILDREN (child2).create (2);
SLP_TREE_CHILDREN (child2).quick_push (op0);
SLP_TREE_REF_COUNT (op0)++;
SLP_TREE_CHILDREN (child2).quick_push (op1);
SLP_TREE_REF_COUNT (op1)++;
SLP_TREE_REPRESENTATIVE (child2) = oper2;

SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
SLP_TREE_VECTYPE (perm) = vectype;
SLP_TREE_LANES (perm) = group_size;
/* ??? We should set this NULL but that's not expected.  */
SLP_TREE_REPRESENTATIVE (perm) = oper1;
SLP_TREE_LANE_PERMUTATION (perm) = lperm;
SLP_TREE_CHILDREN (perm).quick_push (child1);
SLP_TREE_CHILDREN (perm).quick_push (child2);
1826 | } |
1827 | |
/* Recursively build an SLP tree for the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require
   a data permutation or are of unsupported types of operation; in
   that case MATCHES records which lanes of the group did match up.
   On success return the built SLP node.  */
1834 | |
1835 | static slp_tree |
1836 | vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, |
1837 | vec<stmt_vec_info> stmts, unsigned int group_size, |
1838 | poly_uint64 *max_nunits, |
1839 | bool *matches, unsigned *limit, unsigned *tree_size, |
1840 | scalar_stmts_to_slp_tree_map_t *bst_map) |
1841 | { |
1842 | unsigned nops, i, this_tree_size = 0; |
1843 | poly_uint64 this_max_nunits = *max_nunits; |
1844 | |
1845 | matches[0] = false; |
1846 | |
1847 | stmt_vec_info stmt_info = stmts[0]; |
if (!is_a<gcall *> (stmt_info->stmt)
    && !is_a<gassign *> (stmt_info->stmt)
    && !is_a<gphi *> (stmt_info->stmt))
return NULL;

nops = gimple_num_args (stmt_info->stmt);
if (const int *map = vect_get_operand_map (stmt_info->stmt,
					   STMT_VINFO_GATHER_SCATTER_P
					     (stmt_info)))
nops = map[0];

/* If the SLP node is a PHI (induction or reduction), terminate
   the recursion.  */
bool *skip_args = XALLOCAVEC (bool, nops);
memset (skip_args, 0, sizeof (bool) * nops);
if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1865 | { |
1866 | tree scalar_type = TREE_TYPE (PHI_RESULT (stmt)); |
1867 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, |
1868 | group_size); |
1869 | if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype, |
1870 | max_nunits)) |
1871 | return NULL; |
1872 | |
1873 | vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info); |
1874 | if (def_type == vect_induction_def) |
1875 | { |
/* Induction PHIs are not cycles but walk the initial
   value.  Only for inner loops though; for outer loops
   we need to pick up the value from the actual PHIs
   to more easily support peeling and epilogue vectorization.  */
1880 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1881 | if (!nested_in_vect_loop_p (loop, stmt_info)) |
1882 | skip_args[loop_preheader_edge (loop)->dest_idx] = true; |
1883 | else |
1884 | loop = loop->inner; |
1885 | skip_args[loop_latch_edge (loop)->dest_idx] = true; |
1886 | } |
1887 | else if (def_type == vect_reduction_def |
1888 | || def_type == vect_double_reduction_def |
1889 | || def_type == vect_nested_cycle |
1890 | || def_type == vect_first_order_recurrence) |
1891 | { |
1892 | /* Else def types have to match. */ |
1893 | stmt_vec_info other_info; |
1894 | bool all_same = true; |
1895 | FOR_EACH_VEC_ELT (stmts, i, other_info) |
1896 | { |
1897 | if (STMT_VINFO_DEF_TYPE (other_info) != def_type) |
1898 | return NULL; |
1899 | if (other_info != stmt_info) |
1900 | all_same = false; |
1901 | } |
1902 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
/* Reduction initial values are not explicitly represented.  */
1904 | if (def_type != vect_first_order_recurrence |
1905 | && !nested_in_vect_loop_p (loop, stmt_info)) |
1906 | skip_args[loop_preheader_edge (loop)->dest_idx] = true; |
1907 | /* Reduction chain backedge defs are filled manually. |
1908 | ??? Need a better way to identify a SLP reduction chain PHI. |
1909 | Or a better overall way to SLP match those. */ |
1910 | if (all_same && def_type == vect_reduction_def) |
1911 | skip_args[loop_latch_edge (loop)->dest_idx] = true; |
1912 | } |
1913 | else if (def_type != vect_internal_def) |
1914 | return NULL; |
1915 | } |
1916 | |
1917 | |
1918 | bool two_operators = false; |
1919 | unsigned char *swap = XALLOCAVEC (unsigned char, group_size); |
1920 | tree vectype = NULL_TREE; |
if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			    &this_max_nunits, matches, &two_operators,
			    &vectype))
1924 | return NULL; |
1925 | |
1926 | /* If the SLP node is a load, terminate the recursion unless masked. */ |
1927 | if (STMT_VINFO_DATA_REF (stmt_info) |
1928 | && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
1929 | { |
1930 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) |
1931 | gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))); |
1932 | else |
1933 | { |
1934 | *max_nunits = this_max_nunits; |
1935 | (*tree_size)++; |
node = vect_create_new_slp_node (node, stmts, 0);
1937 | SLP_TREE_VECTYPE (node) = vectype; |
1938 | /* And compute the load permutation. Whether it is actually |
1939 | a permutation depends on the unrolling factor which is |
1940 | decided later. */ |
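/* For example (a sketch), for a group of loads
     ... = a[i+1];  ... = a[i];
   from an interleaving chain starting at a[i] this records
   the load permutation { 1, 0 }.  */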
1941 | vec<unsigned> load_permutation; |
1942 | int j; |
1943 | stmt_vec_info load_info; |
load_permutation.create (group_size);
1945 | stmt_vec_info first_stmt_info |
1946 | = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); |
1947 | bool any_permute = false; |
1948 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
1949 | { |
1950 | int load_place; |
1951 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
load_place = vect_get_place_in_interleaving_chain
	       (load_info, first_stmt_info);
1954 | else |
1955 | load_place = 0; |
1956 | gcc_assert (load_place != -1); |
1957 | any_permute |= load_place != j; |
load_permutation.quick_push (load_place);
1959 | } |
1960 | |
if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1962 | { |
1963 | gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) |
1964 | || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) |
1965 | || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) |
1966 | || gimple_call_internal_p (stmt, |
1967 | IFN_MASK_LEN_GATHER_LOAD)); |
1968 | load_permutation.release (); |
1969 | /* We cannot handle permuted masked loads, see PR114375. */ |
1970 | if (any_permute |
1971 | || (STMT_VINFO_GROUPED_ACCESS (stmt_info) |
1972 | && DR_GROUP_SIZE (first_stmt_info) != group_size) |
1973 | || STMT_VINFO_STRIDED_P (stmt_info)) |
1974 | { |
1975 | matches[0] = false; |
1976 | return NULL; |
1977 | } |
1978 | } |
1979 | else |
1980 | { |
1981 | SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; |
1982 | return node; |
1983 | } |
1984 | } |
1985 | } |
else if (gimple_assign_single_p (stmt_info->stmt)
	 && !gimple_vuse (stmt_info->stmt)
	 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1989 | { |
/* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
   the same SSA name vector of a type compatible with VECTYPE.  */
1992 | vec<std::pair<unsigned, unsigned> > lperm = vNULL; |
1993 | tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0); |
1994 | stmt_vec_info estmt_info; |
1995 | FOR_EACH_VEC_ELT (stmts, i, estmt_info) |
1996 | { |
gassign *estmt = as_a <gassign *> (estmt_info->stmt);
tree bfref = gimple_assign_rhs1 (estmt);
HOST_WIDE_INT lane;
if (!known_eq (bit_field_size (bfref),
	       tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
    || !constant_multiple_p (bit_field_offset (bfref),
			     bit_field_size (bfref), &lane))
{
lperm.release ();
matches[0] = false;
return NULL;
}
lperm.safe_push (std::make_pair (0, (unsigned)lane));
2010 | } |
slp_tree vnode = vect_create_new_slp_node (vNULL);
2012 | if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec)))) |
2013 | /* ??? We record vectype here but we hide eventually necessary |
2014 | punning and instead rely on code generation to materialize |
2015 | VIEW_CONVERT_EXPRs as necessary. We instead should make |
2016 | this explicit somehow. */ |
2017 | SLP_TREE_VECTYPE (vnode) = vectype; |
2018 | else |
2019 | { |
2020 | /* For different size but compatible elements we can still |
2021 | use VEC_PERM_EXPR without punning. */ |
2022 | gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec)) |
2023 | && types_compatible_p (TREE_TYPE (vectype), |
2024 | TREE_TYPE (TREE_TYPE (vec)))); |
2025 | SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec); |
2026 | } |
2027 | auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode)); |
2028 | unsigned HOST_WIDE_INT const_nunits; |
if (nunits.is_constant (&const_nunits))
SLP_TREE_LANES (vnode) = const_nunits;
SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
/* We are always building a permutation node even if it is an identity
   permute to shield the rest of the vectorizer from the odd node
   representing an actual vector without any scalar ops.
   ??? We could hide it completely with making the permute node
   external?  */
node = vect_create_new_slp_node (node, stmts, 1);
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
SLP_TREE_LANE_PERMUTATION (node) = lperm;
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CHILDREN (node).quick_push (vnode);
2042 | return node; |
2043 | } |
2044 | /* When discovery reaches an associatable operation see whether we can |
2045 | improve that to match up lanes in a way superior to the operand |
2046 | swapping code which at most looks at two defs. |
2047 | ??? For BB vectorization we cannot do the brute-force search |
2048 | for matching as we can succeed by means of builds from scalars |
2049 | and have no good way to "cost" one build against another. */ |
else if (is_a <loop_vec_info> (vinfo)
	 /* ??? We don't handle !vect_internal_def defs below.  */
	 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
	 && is_gimple_assign (stmt_info->stmt)
	 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
	     || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
	 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
	     || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
		 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2059 | { |
2060 | /* See if we have a chain of (mixed) adds or subtracts or other |
2061 | associatable ops. */ |
enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2063 | if (code == MINUS_EXPR) |
2064 | code = PLUS_EXPR; |
2065 | stmt_vec_info other_op_stmt_info = NULL; |
2066 | stmt_vec_info op_stmt_info = NULL; |
2067 | unsigned chain_len = 0; |
2068 | auto_vec<chain_op_t> chain; |
2069 | auto_vec<std::pair<tree_code, gimple *> > worklist; |
2070 | auto_vec<vec<chain_op_t> > chains (group_size); |
2071 | auto_vec<slp_tree, 4> children; |
2072 | bool hard_fail = true; |
2073 | for (unsigned lane = 0; lane < group_size; ++lane) |
2074 | { |
2075 | /* For each lane linearize the addition/subtraction (or other |
2076 | uniform associatable operation) expression tree. */ |
2077 | gimple *op_stmt = NULL, *other_op_stmt = NULL; |
vect_slp_linearize_chain (vinfo, worklist, chain, code,
			  stmts[lane]->stmt, op_stmt, other_op_stmt,
			  NULL);
2081 | if (!op_stmt_info && op_stmt) |
2082 | op_stmt_info = vinfo->lookup_stmt (op_stmt); |
2083 | if (!other_op_stmt_info && other_op_stmt) |
2084 | other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt); |
2085 | if (chain.length () == 2) |
2086 | { |
2087 | /* In a chain of just two elements resort to the regular |
2088 | operand swapping scheme. If we run into a length |
2089 | mismatch still hard-FAIL. */ |
2090 | if (chain_len == 0) |
2091 | hard_fail = false; |
2092 | else |
2093 | { |
2094 | matches[lane] = false; |
2095 | /* ??? We might want to process the other lanes, but |
2096 | make sure to not give false matching hints to the |
2097 | caller for lanes we did not process. */ |
2098 | if (lane != group_size - 1) |
2099 | matches[0] = false; |
2100 | } |
2101 | break; |
2102 | } |
2103 | else if (chain_len == 0) |
2104 | chain_len = chain.length (); |
2105 | else if (chain.length () != chain_len) |
2106 | { |
2107 | /* ??? Here we could slip in magic to compensate with |
2108 | neutral operands. */ |
2109 | matches[lane] = false; |
2110 | if (lane != group_size - 1) |
2111 | matches[0] = false; |
2112 | break; |
2113 | } |
chains.quick_push (chain.copy ());
chain.truncate (0);
2116 | } |
2117 | if (chains.length () == group_size) |
2118 | { |
2119 | /* We cannot yet use SLP_TREE_CODE to communicate the operation. */ |
2120 | if (!op_stmt_info) |
2121 | { |
2122 | hard_fail = false; |
2123 | goto out; |
2124 | } |
2125 | /* Now we have a set of chains with the same length. */ |
2126 | /* 1. pre-sort according to def_type and operation. */ |
2127 | for (unsigned lane = 0; lane < group_size; ++lane) |
chains[lane].stablesort (dt_sort_cmp, vinfo);
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
		 "pre-sorted chains of %s\n",
		 get_tree_code_name (code));
for (unsigned lane = 0; lane < group_size; ++lane)
{
for (unsigned opnum = 0; opnum < chain_len; ++opnum)
dump_printf (MSG_NOTE, "%s %T ",
	     get_tree_code_name (chains[lane][opnum].code),
	     chains[lane][opnum].op);
dump_printf (MSG_NOTE, "\n");
}
}
2143 | /* 2. try to build children nodes, associating as necessary. */ |
2144 | for (unsigned n = 0; n < chain_len; ++n) |
2145 | { |
2146 | vect_def_type dt = chains[0][n].dt; |
2147 | unsigned lane; |
2148 | for (lane = 0; lane < group_size; ++lane) |
2149 | if (chains[lane][n].dt != dt) |
2150 | { |
2151 | if (dt == vect_constant_def |
2152 | && chains[lane][n].dt == vect_external_def) |
2153 | dt = vect_external_def; |
2154 | else if (dt == vect_external_def |
2155 | && chains[lane][n].dt == vect_constant_def) |
2156 | ; |
2157 | else |
2158 | break; |
2159 | } |
2160 | if (lane != group_size) |
2161 | { |
2162 | if (dump_enabled_p ()) |
dump_printf_loc (MSG_NOTE, vect_location,
		 "giving up on chain due to mismatched "
		 "def types\n");
2166 | matches[lane] = false; |
2167 | if (lane != group_size - 1) |
2168 | matches[0] = false; |
2169 | goto out; |
2170 | } |
2171 | if (dt == vect_constant_def |
2172 | || dt == vect_external_def) |
2173 | { |
2174 | /* Check whether we can build the invariant. If we can't |
2175 | we never will be able to. */ |
2176 | tree type = TREE_TYPE (chains[0][n].op); |
if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
    && (TREE_CODE (type) == BOOLEAN_TYPE
	|| !can_duplicate_and_interleave_p (vinfo, group_size,
					    type)))
2181 | { |
2182 | matches[0] = false; |
2183 | goto out; |
2184 | } |
vec<tree> ops;
ops.create (group_size);
for (lane = 0; lane < group_size; ++lane)
ops.quick_push (chains[lane][n].op);
slp_tree child = vect_create_new_slp_node (ops);
SLP_TREE_DEF_TYPE (child) = dt;
children.safe_push (child);
2192 | } |
2193 | else if (dt != vect_internal_def) |
2194 | { |
/* Not sure; we might need something special here.
   gcc.dg/vect/pr96854.c,
   gfortran.dg/vect/fast-math-pr37021.f90
   and gfortran.dg/vect/pr61171.f trigger.  */
/* Soft-fail for now.  */
hard_fail = false;
goto out;
2202 | } |
2203 | else |
2204 | { |
vec<stmt_vec_info> op_stmts;
op_stmts.create (group_size);
slp_tree child = NULL;
/* Brute-force our way.  We have to consider a lane
   failing after fixing an earlier fail up in the
   SLP discovery recursion.  So track the current
   permute per lane.  */
unsigned *perms = XALLOCAVEC (unsigned, group_size);
memset (perms, 0, sizeof (unsigned) * group_size);
do
{
op_stmts.truncate (0);
for (lane = 0; lane < group_size; ++lane)
op_stmts.quick_push
  (vinfo->lookup_def (chains[lane][n].op));
child = vect_build_slp_tree (vinfo, op_stmts,
			     group_size, &this_max_nunits,
			     matches, limit,
			     &this_tree_size, bst_map);
2224 | /* ??? We're likely getting too many fatal mismatches |
2225 | here so maybe we want to ignore them (but then we |
2226 | have no idea which lanes fatally mismatched). */ |
2227 | if (child || !matches[0]) |
2228 | break; |
2229 | /* Swap another lane we have not yet matched up into |
2230 | lanes that did not match. If we run out of |
2231 | permute possibilities for a lane terminate the |
2232 | search. */ |
2233 | bool term = false; |
2234 | for (lane = 1; lane < group_size; ++lane) |
2235 | if (!matches[lane]) |
2236 | { |
2237 | if (n + perms[lane] + 1 == chain_len) |
2238 | { |
2239 | term = true; |
2240 | break; |
2241 | } |
std::swap (chains[lane][n],
	   chains[lane][n + perms[lane] + 1]);
perms[lane]++;
2245 | } |
2246 | if (term) |
2247 | break; |
2248 | } |
2249 | while (1); |
2250 | if (!child) |
2251 | { |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "failed to match up op %d\n", n);
op_stmts.release ();
if (lane != group_size - 1)
matches[0] = false;
else
matches[lane] = false;
goto out;
}
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
		 "matched up op %d to\n", n);
vect_print_slp_tree (MSG_NOTE, vect_location, child);
}
children.safe_push (child);
2269 | } |
2270 | } |
2271 | /* 3. build SLP nodes to combine the chain. */ |
2272 | for (unsigned lane = 0; lane < group_size; ++lane) |
2273 | if (chains[lane][0].code != code) |
2274 | { |
2275 | /* See if there's any alternate all-PLUS entry. */ |
2276 | unsigned n; |
2277 | for (n = 1; n < chain_len; ++n) |
2278 | { |
2279 | for (lane = 0; lane < group_size; ++lane) |
2280 | if (chains[lane][n].code != code) |
2281 | break; |
2282 | if (lane == group_size) |
2283 | break; |
2284 | } |
2285 | if (n != chain_len) |
2286 | { |
2287 | /* Swap that in at first position. */ |
std::swap (children[0], children[n]);
for (lane = 0; lane < group_size; ++lane)
std::swap (chains[lane][0], chains[lane][n]);
2291 | } |
2292 | else |
2293 | { |
2294 | /* ??? When this triggers and we end up with two |
2295 | vect_constant/external_def up-front things break (ICE) |
2296 | spectacularly finding an insertion place for the |
2297 | all-constant op. We should have a fully |
2298 | vect_internal_def operand though(?) so we can swap |
2299 | that into first place and then prepend the all-zero |
2300 | constant. */ |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "inserting constant zero to compensate "
		 "for (partially) negated first "
		 "operand\n");
chain_len++;
for (lane = 0; lane < group_size; ++lane)
chains[lane].safe_insert
  (0, chain_op_t (code, vect_constant_def, NULL_TREE));
vec<tree> zero_ops;
zero_ops.create (group_size);
zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
for (lane = 1; lane < group_size; ++lane)
zero_ops.quick_push (zero_ops[0]);
slp_tree zero = vect_create_new_slp_node (zero_ops);
SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
children.safe_insert (0, zero);
2318 | } |
2319 | break; |
2320 | } |
2321 | for (unsigned i = 1; i < children.length (); ++i) |
2322 | { |
2323 | slp_tree op0 = children[i - 1]; |
2324 | slp_tree op1 = children[i]; |
2325 | bool this_two_op = false; |
2326 | for (unsigned lane = 0; lane < group_size; ++lane) |
2327 | if (chains[lane][i].code != chains[0][i].code) |
2328 | { |
2329 | this_two_op = true; |
2330 | break; |
2331 | } |
slp_tree child;
if (i == children.length () - 1)
child = vect_create_new_slp_node (node, stmts, 2);
else
child = vect_create_new_slp_node (2, ERROR_MARK);
if (this_two_op)
{
vec<std::pair<unsigned, unsigned> > lperm;
lperm.create (group_size);
for (unsigned lane = 0; lane < group_size; ++lane)
lperm.quick_push (std::make_pair
  (chains[lane][i].code != chains[0][i].code, lane));
vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
				   (chains[0][i].code == code
				    ? op_stmt_info
				    : other_op_stmt_info),
				   (chains[0][i].code == code
				    ? other_op_stmt_info
				    : op_stmt_info),
				   lperm);
}
else
{
SLP_TREE_DEF_TYPE (child) = vect_internal_def;
SLP_TREE_VECTYPE (child) = vectype;
SLP_TREE_LANES (child) = group_size;
SLP_TREE_CHILDREN (child).quick_push (op0);
SLP_TREE_CHILDREN (child).quick_push (op1);
SLP_TREE_REPRESENTATIVE (child)
  = (chains[0][i].code == code
     ? op_stmt_info : other_op_stmt_info);
}
2364 | children[i] = child; |
2365 | } |
2366 | *tree_size += this_tree_size + 1; |
2367 | *max_nunits = this_max_nunits; |
2368 | while (!chains.is_empty ()) |
2369 | chains.pop ().release (); |
2370 | return node; |
2371 | } |
2372 | out: |
2373 | while (!children.is_empty ()) |
vect_free_slp_tree (children.pop ());
2375 | while (!chains.is_empty ()) |
2376 | chains.pop ().release (); |
2377 | /* Hard-fail, otherwise we might run into quadratic processing of the |
2378 | chains starting one stmt into the chain again. */ |
2379 | if (hard_fail) |
2380 | return NULL; |
2381 | /* Fall thru to normal processing. */ |
2382 | } |
2383 | |
2384 | /* Get at the operands, verifying they are compatible. */ |
2385 | vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size); |
2386 | slp_oprnd_info oprnd_info; |
2387 | FOR_EACH_VEC_ELT (stmts, i, stmt_info) |
2388 | { |
int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
				       stmts, i, &oprnds_info);
2391 | if (res != 0) |
2392 | matches[(res == -1) ? 0 : i] = false; |
2393 | if (!matches[0]) |
2394 | break; |
2395 | } |
2396 | for (i = 0; i < group_size; ++i) |
2397 | if (!matches[i]) |
2398 | { |
2399 | vect_free_oprnd_info (oprnds_info); |
2400 | return NULL; |
2401 | } |
2402 | swap = NULL; |
2403 | |
2404 | auto_vec<slp_tree, 4> children; |
2405 | |
2406 | stmt_info = stmts[0]; |
2407 | |
2408 | /* Create SLP_TREE nodes for the definition node/s. */ |
2409 | FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) |
2410 | { |
2411 | slp_tree child = nullptr; |
2412 | unsigned int j; |
2413 | |
2414 | /* We're skipping certain operands from processing, for example |
2415 | outer loop reduction initial defs. */ |
2416 | if (skip_args[i]) |
2417 | { |
2418 | children.safe_push (NULL); |
2419 | continue; |
2420 | } |
2421 | |
2422 | if (oprnd_info->first_dt == vect_uninitialized_def) |
2423 | { |
/* COND_EXPRs end up with one operand too many when the
   condition is an SSA name.  */
2426 | gcc_assert (i == 3 && nops == 4); |
2427 | continue; |
2428 | } |
2429 | |
if (is_a <bb_vec_info> (vinfo)
2431 | && oprnd_info->first_dt == vect_internal_def |
2432 | && !oprnd_info->any_pattern) |
2433 | { |
2434 | /* For BB vectorization, if all defs are the same do not |
2435 | bother to continue the build along the single-lane |
2436 | graph but use a splat of the scalar value. */ |
2437 | stmt_vec_info first_def = oprnd_info->def_stmts[0]; |
2438 | for (j = 1; j < group_size; ++j) |
2439 | if (oprnd_info->def_stmts[j] != first_def) |
2440 | break; |
if (j == group_size
    /* But avoid doing this for loads where we may be
       able to CSE things, unless the stmt is not
       vectorizable.  */
    && (!STMT_VINFO_VECTORIZABLE (first_def)
	|| !gimple_vuse (first_def->stmt)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "Using a splat of the uniform operand %G",
		 first_def->stmt);
oprnd_info->first_dt = vect_external_def;
}
2454 | } |
2455 | |
2456 | if (oprnd_info->first_dt == vect_external_def |
2457 | || oprnd_info->first_dt == vect_constant_def) |
2458 | { |
if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2460 | { |
2461 | tree op0; |
2462 | tree uniform_val = op0 = oprnd_info->ops[0]; |
2463 | for (j = 1; j < oprnd_info->ops.length (); ++j) |
2464 | if (!operand_equal_p (uniform_val, oprnd_info->ops[j])) |
2465 | { |
2466 | uniform_val = NULL_TREE; |
2467 | break; |
2468 | } |
if (!uniform_val
    && !can_duplicate_and_interleave_p (vinfo,
					oprnd_info->ops.length (),
					TREE_TYPE (op0)))
{
matches[j] = false;
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		 "Build SLP failed: invalid type of def "
		 "for variable-length SLP %T\n", op0);
goto fail;
}
}
slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
oprnd_info->ops = vNULL;
children.safe_push (invnode);
continue;
2487 | } |
2488 | |
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
				  group_size, &this_max_nunits,
				  matches, limit,
				  &this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
continue;
}
2498 | |
/* If the SLP build for operand zero failed and operand zero
   and one can be commuted try that for the scalar stmts
   that failed the match.  */
if (i == 0
    /* A first scalar stmt mismatch signals a fatal mismatch.  */
    && matches[0]
    /* ??? For COND_EXPRs we can swap the comparison operands
       as well as the arms under some constraints.  */
    && nops == 2
    && oprnds_info[1]->first_dt == vect_internal_def
    && is_gimple_assign (stmt_info->stmt)
    /* Swapping operands for reductions breaks assumptions later on.  */
    && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
    && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2513 | { |
2514 | /* See whether we can swap the matching or the non-matching |
2515 | stmt operands. */ |
2516 | bool swap_not_matching = true; |
2517 | do |
2518 | { |
2519 | for (j = 0; j < group_size; ++j) |
2520 | { |
2521 | if (matches[j] != !swap_not_matching) |
2522 | continue; |
2523 | stmt_vec_info stmt_info = stmts[j]; |
2524 | /* Verify if we can swap operands of this stmt. */ |
gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
if (!stmt
    || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2528 | { |
2529 | if (!swap_not_matching) |
2530 | goto fail; |
2531 | swap_not_matching = false; |
2532 | break; |
2533 | } |
2534 | } |
2535 | } |
2536 | while (j != group_size); |
2537 | |
2538 | /* Swap mismatched definition stmts. */ |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "Re-trying with swapped operands of stmts ");
for (j = 0; j < group_size; ++j)
if (matches[j] == !swap_not_matching)
{
std::swap (oprnds_info[0]->def_stmts[j],
	   oprnds_info[1]->def_stmts[j]);
std::swap (oprnds_info[0]->ops[j],
	   oprnds_info[1]->ops[j]);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "%d ", j);
}
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "\n");
/* After swapping some operands we lost track of whether an
   operand has any pattern defs so be conservative here.  */
if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
/* And try again with scratch 'matches' ...  */
bool *tem = XALLOCAVEC (bool, group_size);
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
				  group_size, &this_max_nunits,
				  tem, limit,
				  &this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
continue;
}
2569 | } |
2570 | fail: |
2571 | |
2572 | /* If the SLP build failed and we analyze a basic-block |
2573 | simply treat nodes we fail to build as externally defined |
2574 | (and thus build vectors from the scalar defs). |
2575 | The cost model will reject outright expensive cases. |
??? This doesn't treat cases where permutation ultimately
2577 | fails (or we don't try permutation below). Ideally we'd |
2578 | even compute a permutation that will end up with the maximum |
2579 | SLP tree size... */ |
if (is_a <bb_vec_info> (vinfo)
    /* ??? Rejecting patterns this way doesn't work.  We'd have to
       do extra work to cancel the pattern so the uses see the
       scalar version.  */
    && !is_pattern_stmt_p (stmt_info)
    && !oprnd_info->any_pattern)
2586 | { |
2587 | /* But if there's a leading vector sized set of matching stmts |
2588 | fail here so we can split the group. This matches the condition |
2589 | vect_analyze_slp_instance uses. */ |
2590 | /* ??? We might want to split here and combine the results to support |
2591 | multiple vector sizes better. */ |
2592 | for (j = 0; j < group_size; ++j) |
2593 | if (!matches[j]) |
2594 | break; |
2595 | if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))) |
2596 | { |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "Building vector operands from scalars\n");
this_tree_size++;
child = vect_create_new_slp_node (oprnd_info->ops);
children.safe_push (child);
oprnd_info->ops = vNULL;
continue;
2605 | } |
2606 | } |
2607 | |
gcc_assert (child == NULL);
FOR_EACH_VEC_ELT (children, j, child)
if (child)
vect_free_slp_tree (child);
vect_free_oprnd_info (oprnds_info);
return NULL;
2614 | } |
2615 | |
2616 | vect_free_oprnd_info (oprnds_info); |
2617 | |
/* If all children of this node are built up from uniform scalars,
   or if building them requires more than one possibly expensive
   vector construction, throw the node away so it is instead built
   up from scalars.  The exception is the SLP node for a vector
   store.  */
if (is_a <bb_vec_info> (vinfo)
    && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
    /* ??? Rejecting patterns this way doesn't work.  We'd have to
       do extra work to cancel the pattern so the uses see the
       scalar version.  */
    && !is_pattern_stmt_p (stmt_info))
2628 | { |
2629 | slp_tree child; |
2630 | unsigned j; |
2631 | bool all_uniform_p = true; |
2632 | unsigned n_vector_builds = 0; |
2633 | FOR_EACH_VEC_ELT (children, j, child) |
2634 | { |
2635 | if (!child) |
2636 | ; |
2637 | else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
2638 | all_uniform_p = false; |
else if (!vect_slp_tree_uniform_p (child))
2640 | { |
2641 | all_uniform_p = false; |
2642 | if (SLP_TREE_DEF_TYPE (child) == vect_external_def) |
2643 | n_vector_builds++; |
2644 | } |
2645 | } |
2646 | if (all_uniform_p |
2647 | || n_vector_builds > 1 |
2648 | || (n_vector_builds == children.length () |
&& is_a <gphi *> (stmt_info->stmt)))
2650 | { |
2651 | /* Roll back. */ |
2652 | matches[0] = false; |
FOR_EACH_VEC_ELT (children, j, child)
if (child)
vect_free_slp_tree (child);

if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "Building parent vector operands from "
		 "scalars instead\n");
2661 | return NULL; |
2662 | } |
2663 | } |
2664 | |
2665 | *tree_size += this_tree_size + 1; |
2666 | *max_nunits = this_max_nunits; |
2667 | |
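/* A two-operator node mixes exactly two rhs codes across lanes, for
   example (a sketch)
     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];
   Below this is represented as a VEC_PERM_EXPR node with a full-width
   PLUS child and a full-width MINUS child, blended by the lane
   permutation { (0, 0), (1, 1) }.  */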
2668 | if (two_operators) |
2669 | { |
2670 | /* ??? We'd likely want to either cache in bst_map sth like |
2671 | { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or |
2672 | the true { a+b, a+b, a+b, a+b } ... but there we don't have |
2673 | explicit stmts to put in so the keying on 'stmts' doesn't |
2674 | work (but we have the same issue with nodes that use 'ops'). */ |
2675 | slp_tree one = new _slp_tree; |
2676 | slp_tree two = new _slp_tree; |
2677 | SLP_TREE_DEF_TYPE (one) = vect_internal_def; |
2678 | SLP_TREE_DEF_TYPE (two) = vect_internal_def; |
2679 | SLP_TREE_VECTYPE (one) = vectype; |
2680 | SLP_TREE_VECTYPE (two) = vectype; |
SLP_TREE_CHILDREN (one).safe_splice (children);
SLP_TREE_CHILDREN (two).safe_splice (children);
slp_tree child;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
SLP_TREE_REF_COUNT (child)++;

/* Here we record the original defs since this
   node represents the final lane configuration.  */
node = vect_create_new_slp_node (node, stmts, 2);
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
SLP_TREE_CHILDREN (node).quick_push (one);
SLP_TREE_CHILDREN (node).quick_push (two);
gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
enum tree_code code0 = gimple_assign_rhs_code (stmt);
enum tree_code ocode = ERROR_MARK;
stmt_vec_info ostmt_info;
unsigned j = 0;
FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
{
gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
if (gimple_assign_rhs_code (ostmt) != code0)
{
SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
ocode = gimple_assign_rhs_code (ostmt);
j = i;
}
else
SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
}
SLP_TREE_CODE (one) = code0;
SLP_TREE_CODE (two) = ocode;
SLP_TREE_LANES (one) = stmts.length ();
SLP_TREE_LANES (two) = stmts.length ();
SLP_TREE_REPRESENTATIVE (one) = stmts[0];
SLP_TREE_REPRESENTATIVE (two) = stmts[j];
return node;
2718 | } |
2719 | |
node = vect_create_new_slp_node (node, stmts, nops);
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CHILDREN (node).splice (children);
2723 | return node; |
2724 | } |
2725 | |
2726 | /* Dump a single SLP tree NODE. */ |
2727 | |
2728 | static void |
2729 | vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, |
2730 | slp_tree node) |
2731 | { |
2732 | unsigned i, j; |
2733 | slp_tree child; |
2734 | stmt_vec_info stmt_info; |
2735 | tree op; |
2736 | |
2737 | dump_metadata_t metadata (dump_kind, loc.get_impl_location ()); |
2738 | dump_user_location_t user_loc = loc.get_user_location (); |
dump_printf_loc (metadata, user_loc,
		 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
		 ", refcnt=%u)",
		 SLP_TREE_DEF_TYPE (node) == vect_external_def
		 ? " (external)"
		 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		    ? " (constant)"
		    : ""), (void *) node,
		 estimated_poly_value (node->max_nunits),
		 SLP_TREE_REF_COUNT (node));
if (SLP_TREE_VECTYPE (node))
dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
dump_printf (metadata, "\n");
if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
{
if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
else
dump_printf_loc (metadata, user_loc, "op template: %G",
		 SLP_TREE_REPRESENTATIVE (node)->stmt);
}
2760 | if (SLP_TREE_SCALAR_STMTS (node).exists ()) |
2761 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
else
{
dump_printf_loc (metadata, user_loc, "\t{ ");
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
dump_printf (metadata, "%T%s ", op,
	     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
dump_printf (metadata, "}\n");
2770 | } |
2771 | if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
2772 | { |
dump_printf_loc (metadata, user_loc, "\tload permutation {");
FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
dump_printf (dump_kind, " %u", j);
dump_printf (dump_kind, " }\n");
2777 | } |
2778 | if (SLP_TREE_LANE_PERMUTATION (node).exists ()) |
2779 | { |
dump_printf_loc (metadata, user_loc, "\tlane permutation {");
for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
dump_printf (dump_kind, " %u[%u]",
	     SLP_TREE_LANE_PERMUTATION (node)[i].first,
	     SLP_TREE_LANE_PERMUTATION (node)[i].second);
dump_printf (dump_kind, " }\n");
2786 | } |
2787 | if (SLP_TREE_CHILDREN (node).is_empty ()) |
2788 | return; |
dump_printf_loc (metadata, user_loc, "\tchildren");
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
dump_printf (dump_kind, " %p", (void *)child);
dump_printf (dump_kind, "\n");
2793 | } |
2794 | |
2795 | DEBUG_FUNCTION void |
2796 | debug (slp_tree node) |
2797 | { |
2798 | debug_dump_context ctx; |
vect_print_slp_tree (MSG_NOTE,
		     dump_location_t::from_location_t (UNKNOWN_LOCATION),
		     node);
2802 | } |
2803 | |
2804 | /* Recursive helper for the dot producer below. */ |
2805 | |
2806 | static void |
2807 | dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited) |
2808 | { |
if (visited.add (node))
return;

fprintf (f, "\"%p\" [label=\"", (void *)node);
vect_print_slp_tree (MSG_NOTE,
		     dump_location_t::from_location_t (UNKNOWN_LOCATION),
		     node);
fprintf (f, "\"];\n");

for (slp_tree child : SLP_TREE_CHILDREN (node))
fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);

for (slp_tree child : SLP_TREE_CHILDREN (node))
if (child)
dot_slp_tree (f, child, visited);
2825 | } |
2826 | |
2827 | DEBUG_FUNCTION void |
2828 | dot_slp_tree (const char *fname, slp_tree node) |
2829 | { |
FILE *f = fopen (fname, "w");
fprintf (f, "digraph {\n");
fflush (f);
{
debug_dump_context ctx (f);
hash_set<slp_tree> visited;
dot_slp_tree (f, node, visited);
}
fflush (f);
fprintf (f, "}\n");
fclose (f);
2841 | } |
2842 | |
2843 | /* Dump a slp tree NODE using flags specified in DUMP_KIND. */ |
2844 | |
2845 | static void |
2846 | vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc, |
2847 | slp_tree node, hash_set<slp_tree> &visited) |
2848 | { |
2849 | unsigned i; |
2850 | slp_tree child; |
2851 | |
if (visited.add (node))
2853 | return; |
2854 | |
2855 | vect_print_slp_tree (dump_kind, loc, node); |
2856 | |
2857 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2858 | if (child) |
vect_print_slp_graph (dump_kind, loc, child, visited);
2860 | } |
2861 | |
2862 | static void |
2863 | vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc, |
2864 | slp_tree entry) |
2865 | { |
2866 | hash_set<slp_tree> visited; |
vect_print_slp_graph (dump_kind, loc, entry, visited);
2868 | } |
2869 | |
2870 | /* Mark the tree rooted at NODE with PURE_SLP. */ |
2871 | |
2872 | static void |
2873 | vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited) |
2874 | { |
2875 | int i; |
2876 | stmt_vec_info stmt_info; |
2877 | slp_tree child; |
2878 | |
2879 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
2880 | return; |
2881 | |
if (visited.add (node))
2883 | return; |
2884 | |
2885 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
2886 | STMT_SLP_TYPE (stmt_info) = pure_slp; |
2887 | |
2888 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2889 | if (child) |
vect_mark_slp_stmts (child, visited);
2891 | } |
2892 | |
2893 | static void |
2894 | vect_mark_slp_stmts (slp_tree node) |
2895 | { |
2896 | hash_set<slp_tree> visited; |
2897 | vect_mark_slp_stmts (node, visited); |
2898 | } |
2899 | |
2900 | /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */ |
2901 | |
2902 | static void |
2903 | vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited) |
2904 | { |
2905 | int i; |
2906 | stmt_vec_info stmt_info; |
2907 | slp_tree child; |
2908 | |
2909 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
2910 | return; |
2911 | |
if (visited.add (node))
2913 | return; |
2914 | |
2915 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
2916 | { |
2917 | gcc_assert (!STMT_VINFO_RELEVANT (stmt_info) |
2918 | || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope); |
2919 | STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope; |
2920 | } |
2921 | |
2922 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2923 | if (child) |
vect_mark_slp_stmts_relevant (child, visited);
2925 | } |
2926 | |
2927 | static void |
2928 | vect_mark_slp_stmts_relevant (slp_tree node) |
2929 | { |
2930 | hash_set<slp_tree> visited; |
2931 | vect_mark_slp_stmts_relevant (node, visited); |
2932 | } |
2933 | |
2934 | |
/* Gather the load nodes in the SLP graph rooted at NODE into LOADS.  */
2936 | |
2937 | static void |
2938 | vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node, |
2939 | hash_set<slp_tree> &visited) |
2940 | { |
if (!node || visited.add (node))
2942 | return; |
2943 | |
2944 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
2945 | return; |
2946 | |
2947 | if (SLP_TREE_CODE (node) != VEC_PERM_EXPR) |
2948 | { |
2949 | stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); |
2950 | if (STMT_VINFO_DATA_REF (stmt_info) |
2951 | && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
loads.safe_push (node);
2953 | } |
2954 | |
2955 | unsigned i; |
2956 | slp_tree child; |
2957 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
vect_gather_slp_loads (loads, child, visited);
2959 | } |
2960 | |
2961 | |
/* Find the last scalar stmt in NODE.  */
2963 | |
2964 | stmt_vec_info |
2965 | vect_find_last_scalar_stmt_in_slp (slp_tree node) |
2966 | { |
2967 | stmt_vec_info last = NULL; |
2968 | stmt_vec_info stmt_vinfo; |
2969 | |
for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
{
stmt_vinfo = vect_orig_stmt (stmt_vinfo);
last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2974 | } |
2975 | |
2976 | return last; |
2977 | } |
2978 | |
2979 | /* Find the first stmt in NODE. */ |
2980 | |
2981 | stmt_vec_info |
2982 | vect_find_first_scalar_stmt_in_slp (slp_tree node) |
2983 | { |
2984 | stmt_vec_info first = NULL; |
2985 | stmt_vec_info stmt_vinfo; |
2986 | |
for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
{
stmt_vinfo = vect_orig_stmt (stmt_vinfo);
if (!first
    || get_later_stmt (stmt_vinfo, first) == first)
2992 | first = stmt_vinfo; |
2993 | } |
2994 | |
2995 | return first; |
2996 | } |
2997 | |
2998 | /* Splits a group of stores, currently beginning at FIRST_VINFO, into |
2999 | two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE |
3000 | (also containing the first GROUP1_SIZE stmts, since stores are |
3001 | consecutive), the second containing the remainder. |
3002 | Return the first stmt in the second group. */ |
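/* For example (a sketch), splitting a store group { s0, s1, s2, s3, s4 }
   with GROUP1_SIZE == 2 yields the groups { s0, s1 } (still headed by
   FIRST_VINFO) and { s2, s3, s4 }, returning s2.  */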
3003 | |
3004 | static stmt_vec_info |
3005 | vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size) |
3006 | { |
3007 | gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo); |
3008 | gcc_assert (group1_size > 0); |
3009 | int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size; |
3010 | gcc_assert (group2_size > 0); |
3011 | DR_GROUP_SIZE (first_vinfo) = group1_size; |
3012 | |
3013 | stmt_vec_info stmt_info = first_vinfo; |
3014 | for (unsigned i = group1_size; i > 1; i--) |
3015 | { |
3016 | stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info); |
3017 | gcc_assert (DR_GROUP_GAP (stmt_info) == 1); |
3018 | } |
3019 | /* STMT is now the last element of the first group. */ |
3020 | stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info); |
3021 | DR_GROUP_NEXT_ELEMENT (stmt_info) = 0; |
3022 | |
3023 | DR_GROUP_SIZE (group2) = group2_size; |
3024 | for (stmt_info = group2; stmt_info; |
3025 | stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) |
3026 | { |
3027 | DR_GROUP_FIRST_ELEMENT (stmt_info) = group2; |
3028 | gcc_assert (DR_GROUP_GAP (stmt_info) == 1); |
3029 | } |
3030 | |
3031 | /* For the second group, the DR_GROUP_GAP is that before the original group, |
3032 | plus skipping over the first vector. */ |
3033 | DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size; |
3034 | |
3035 | /* DR_GROUP_GAP of the first group now has to skip over the second group too. */ |
3036 | DR_GROUP_GAP (first_vinfo) += group2_size; |
3037 | |
3038 | if (dump_enabled_p ()) |
dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		 group1_size, group2_size);
3041 | |
3042 | return group2; |
3043 | } |
3044 | |
3045 | /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE |
3046 | statements and a vector of NUNITS elements. */ |
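/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the least common
   multiple is 12, so the SLP instance is unrolled by a factor of
   12 / 6 == 2.  */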
3047 | |
3048 | static poly_uint64 |
3049 | calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) |
3050 | { |
return exact_div (common_multiple (nunits, group_size), group_size);
3052 | } |
3053 | |
3054 | /* Helper that checks to see if a node is a load node. */ |
3055 | |
3056 | static inline bool |
3057 | vect_is_slp_load_node (slp_tree root) |
3058 | { |
3059 | return SLP_TREE_DEF_TYPE (root) == vect_internal_def |
3060 | && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root)) |
3061 | && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))); |
3062 | } |
3063 | |
3064 | |
3065 | /* Helper function of optimize_load_redistribution that performs the operation |
3066 | recursively. */ |
3067 | |
3068 | static slp_tree |
3069 | optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map, |
3070 | vec_info *vinfo, unsigned int group_size, |
3071 | hash_map<slp_tree, slp_tree> *load_map, |
3072 | slp_tree root) |
3073 | { |
3074 | if (slp_tree *leader = load_map->get (k: root)) |
3075 | return *leader; |
3076 | |
3077 | slp_tree node; |
3078 | unsigned i; |
3079 | |
3080 | /* For now, we don't know anything about externals so do not do anything. */ |
3081 | if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def) |
3082 | return NULL; |
3083 | else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR) |
3084 | { |
      /* First convert this node into a load node, add it to the leaves
	 list, and flatten the permute from a lane permutation to a load
	 permutation.  If it's unneeded it will be elided later. */
3088 | vec<stmt_vec_info> stmts; |
3089 | stmts.create (SLP_TREE_LANES (root)); |
3090 | lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root); |
3091 | for (unsigned j = 0; j < lane_perm.length (); j++) |
3092 | { |
3093 | std::pair<unsigned, unsigned> perm = lane_perm[j]; |
3094 | node = SLP_TREE_CHILDREN (root)[perm.first]; |
3095 | |
3096 | if (!vect_is_slp_load_node (root: node) |
3097 | || SLP_TREE_CHILDREN (node).exists ()) |
3098 | { |
3099 | stmts.release (); |
3100 | goto next; |
3101 | } |
3102 | |
3103 | stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]); |
3104 | } |
3105 | |
3106 | if (dump_enabled_p ()) |
3107 | dump_printf_loc (MSG_NOTE, vect_location, |
3108 | "converting stmts on permute node %p\n" , |
3109 | (void *) root); |
3110 | |
3111 | bool *matches = XALLOCAVEC (bool, group_size); |
3112 | poly_uint64 max_nunits = 1; |
3113 | unsigned tree_size = 0, limit = 1; |
3114 | node = vect_build_slp_tree (vinfo, stmts, group_size, max_nunits: &max_nunits, |
3115 | matches, limit: &limit, tree_size: &tree_size, bst_map); |
3116 | if (!node) |
3117 | stmts.release (); |
3118 | |
3119 | load_map->put (k: root, v: node); |
3120 | return node; |
3121 | } |
3122 | |
3123 | next: |
3124 | load_map->put (k: root, NULL); |
3125 | |
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
3127 | { |
3128 | slp_tree value |
3129 | = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map, |
3130 | root: node); |
3131 | if (value) |
3132 | { |
3133 | SLP_TREE_REF_COUNT (value)++; |
3134 | SLP_TREE_CHILDREN (root)[i] = value; |
3135 | /* ??? We know the original leafs of the replaced nodes will |
3136 | be referenced by bst_map, only the permutes created by |
3137 | pattern matching are not. */ |
3138 | if (SLP_TREE_REF_COUNT (node) == 1) |
3139 | load_map->remove (k: node); |
3140 | vect_free_slp_tree (node); |
3141 | } |
3142 | } |
3143 | |
3144 | return NULL; |
3145 | } |
3146 | |
3147 | /* Temporary workaround for loads not being CSEd during SLP build. This |
3148 | function will traverse the SLP tree rooted in ROOT for INSTANCE and find |
3149 | VEC_PERM nodes that blend vectors from multiple nodes that all read from the |
3150 | same DR such that the final operation is equal to a permuted load. Such |
3151 | NODES are then directly converted into LOADS themselves. The nodes are |
3152 | CSEd using BST_MAP. */ |
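/* As an illustrative example, assume two load nodes A = { a[0], a[1] } and
   B = { a[2], a[3] } reading from the same DR, combined by a VEC_PERM node
   selecting { A[0], B[1] }.  The permute node can be replaced by a single
   load node with scalar stmts { a[0], a[3] }, i.e. a (permuted) load, which
   SLP discovery can then CSE via BST_MAP. */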
3153 | |
3154 | static void |
3155 | optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map, |
3156 | vec_info *vinfo, unsigned int group_size, |
3157 | hash_map<slp_tree, slp_tree> *load_map, |
3158 | slp_tree root) |
3159 | { |
3160 | slp_tree node; |
3161 | unsigned i; |
3162 | |
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
3164 | { |
3165 | slp_tree value |
3166 | = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map, |
3167 | root: node); |
3168 | if (value) |
3169 | { |
3170 | SLP_TREE_REF_COUNT (value)++; |
3171 | SLP_TREE_CHILDREN (root)[i] = value; |
3172 | /* ??? We know the original leafs of the replaced nodes will |
3173 | be referenced by bst_map, only the permutes created by |
3174 | pattern matching are not. */ |
3175 | if (SLP_TREE_REF_COUNT (node) == 1) |
3176 | load_map->remove (k: node); |
3177 | vect_free_slp_tree (node); |
3178 | } |
3179 | } |
3180 | } |
3181 | |
3182 | /* Helper function of vect_match_slp_patterns. |
3183 | |
3184 | Attempts to match patterns against the slp tree rooted in REF_NODE using |
3185 | VINFO. Patterns are matched in post-order traversal. |
3186 | |
   Return true if any pattern matched.  On success the value in REF_NODE
   is updated in place, otherwise it is left unchanged. */
3189 | |
3190 | static bool |
3191 | vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo, |
3192 | slp_tree_to_load_perm_map_t *perm_cache, |
3193 | slp_compat_nodes_map_t *compat_cache, |
3194 | hash_set<slp_tree> *visited) |
3195 | { |
3196 | unsigned i; |
3197 | slp_tree node = *ref_node; |
3198 | bool found_p = false; |
3199 | if (!node || visited->add (k: node)) |
3200 | return false; |
3201 | |
3202 | slp_tree child; |
3203 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
3204 | found_p |= vect_match_slp_patterns_2 (ref_node: &SLP_TREE_CHILDREN (node)[i], |
3205 | vinfo, perm_cache, compat_cache, |
3206 | visited); |
3207 | |
3208 | for (unsigned x = 0; x < num__slp_patterns; x++) |
3209 | { |
3210 | vect_pattern *pattern |
3211 | = slp_patterns[x] (perm_cache, compat_cache, ref_node); |
3212 | if (pattern) |
3213 | { |
3214 | pattern->build (vinfo); |
3215 | delete pattern; |
3216 | found_p = true; |
3217 | } |
3218 | } |
3219 | |
3220 | return found_p; |
3221 | } |
3222 | |
/* Applies pattern matching to the SLP tree of INSTANCE using vec_info
   VINFO.

   Return true if any pattern matched; the tree is modified in place.
   Patterns are tried in order and multiple patterns may match. */
3228 | |
3229 | static bool |
3230 | vect_match_slp_patterns (slp_instance instance, vec_info *vinfo, |
3231 | hash_set<slp_tree> *visited, |
3232 | slp_tree_to_load_perm_map_t *perm_cache, |
3233 | slp_compat_nodes_map_t *compat_cache) |
3234 | { |
  DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3236 | slp_tree *ref_node = &SLP_INSTANCE_TREE (instance); |
3237 | |
3238 | if (dump_enabled_p ()) |
3239 | dump_printf_loc (MSG_NOTE, vect_location, |
3240 | "Analyzing SLP tree %p for patterns\n" , |
3241 | (void *) SLP_INSTANCE_TREE (instance)); |
3242 | |
3243 | return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache, |
3244 | visited); |
3245 | } |
3246 | |
3247 | /* STMT_INFO is a store group of size GROUP_SIZE that we are considering |
3248 | splitting into two, with the first split group having size NEW_GROUP_SIZE. |
3249 | Return true if we could use IFN_STORE_LANES instead and if that appears |
3250 | to be the better approach. */ |
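/* E.g. for GROUP_SIZE == 3, NEW_GROUP_SIZE == 2 and two-element vectors,
   the two-stmt half operates on whole vectors within one scalar iteration,
   so the split is allowed and we do not prefer IFN_STORE_LANES (the
   function returns false). */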
3251 | |
3252 | static bool |
3253 | vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info, |
3254 | unsigned int group_size, |
3255 | unsigned int new_group_size) |
3256 | { |
3257 | tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); |
3258 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); |
3259 | if (!vectype) |
3260 | return false; |
3261 | /* Allow the split if one of the two new groups would operate on full |
3262 | vectors *within* rather than across one scalar loop iteration. |
3263 | This is purely a heuristic, but it should work well for group |
3264 | sizes of 3 and 4, where the possible splits are: |
3265 | |
3266 | 3->2+1: OK if the vector has exactly two elements |
3267 | 4->2+2: Likewise |
3268 | 4->3+1: Less clear-cut. */ |
3269 | if (multiple_p (a: group_size - new_group_size, b: TYPE_VECTOR_SUBPARTS (node: vectype)) |
3270 | || multiple_p (a: new_group_size, b: TYPE_VECTOR_SUBPARTS (node: vectype))) |
3271 | return false; |
3272 | return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST; |
3273 | } |
3274 | |
3275 | /* Analyze an SLP instance starting from a group of grouped stores. Call |
3276 | vect_build_slp_tree to build a tree of packed stmts if possible. |
3277 | Return FALSE if it's impossible to SLP any stmt in the loop. */ |
3278 | |
3279 | static bool |
3280 | vect_analyze_slp_instance (vec_info *vinfo, |
3281 | scalar_stmts_to_slp_tree_map_t *bst_map, |
3282 | stmt_vec_info stmt_info, slp_instance_kind kind, |
3283 | unsigned max_tree_size, unsigned *limit); |
3284 | |
/* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
   of kind KIND.  Return true if successful. */
3287 | |
3288 | static bool |
3289 | vect_build_slp_instance (vec_info *vinfo, |
3290 | slp_instance_kind kind, |
3291 | vec<stmt_vec_info> &scalar_stmts, |
3292 | vec<stmt_vec_info> &root_stmt_infos, |
3293 | vec<tree> &remain, |
3294 | unsigned max_tree_size, unsigned *limit, |
3295 | scalar_stmts_to_slp_tree_map_t *bst_map, |
3296 | /* ??? We need stmt_info for group splitting. */ |
3297 | stmt_vec_info stmt_info_) |
3298 | { |
3299 | if (kind == slp_inst_kind_ctor) |
3300 | { |
3301 | if (dump_enabled_p ()) |
3302 | dump_printf_loc (MSG_NOTE, vect_location, |
3303 | "Analyzing vectorizable constructor: %G\n" , |
3304 | root_stmt_infos[0]->stmt); |
3305 | } |
3306 | |
3307 | if (dump_enabled_p ()) |
3308 | { |
3309 | dump_printf_loc (MSG_NOTE, vect_location, |
3310 | "Starting SLP discovery for\n" ); |
3311 | for (unsigned i = 0; i < scalar_stmts.length (); ++i) |
3312 | dump_printf_loc (MSG_NOTE, vect_location, |
3313 | " %G" , scalar_stmts[i]->stmt); |
3314 | } |
3315 | |
3316 | /* Build the tree for the SLP instance. */ |
3317 | unsigned int group_size = scalar_stmts.length (); |
3318 | bool *matches = XALLOCAVEC (bool, group_size); |
3319 | poly_uint64 max_nunits = 1; |
3320 | unsigned tree_size = 0; |
3321 | unsigned i; |
3322 | slp_tree node = vect_build_slp_tree (vinfo, stmts: scalar_stmts, group_size, |
3323 | max_nunits: &max_nunits, matches, limit, |
3324 | tree_size: &tree_size, bst_map); |
3325 | if (node != NULL) |
3326 | { |
3327 | /* Calculate the unrolling factor based on the smallest type. */ |
3328 | poly_uint64 unrolling_factor |
3329 | = calculate_unrolling_factor (nunits: max_nunits, group_size); |
3330 | |
3331 | if (maybe_ne (a: unrolling_factor, b: 1U) |
3332 | && is_a <bb_vec_info> (p: vinfo)) |
3333 | { |
3334 | unsigned HOST_WIDE_INT const_max_nunits; |
3335 | if (!max_nunits.is_constant (const_value: &const_max_nunits) |
3336 | || const_max_nunits > group_size) |
3337 | { |
3338 | if (dump_enabled_p ()) |
3339 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3340 | "Build SLP failed: store group " |
3341 | "size not a multiple of the vector size " |
3342 | "in basic block SLP\n" ); |
3343 | vect_free_slp_tree (node); |
3344 | return false; |
3345 | } |
	  /* Fake a fatal mismatch at the last full-vector boundary so
	     the group will be split there below. */
3347 | if (dump_enabled_p ()) |
3348 | dump_printf_loc (MSG_NOTE, vect_location, |
3349 | "SLP discovery succeeded but node needs " |
3350 | "splitting\n" ); |
3351 | memset (s: matches, c: true, n: group_size); |
3352 | matches[group_size / const_max_nunits * const_max_nunits] = false; |
3353 | vect_free_slp_tree (node); |
3354 | } |
3355 | else |
3356 | { |
3357 | /* Create a new SLP instance. */ |
3358 | slp_instance new_instance = XNEW (class _slp_instance); |
3359 | SLP_INSTANCE_TREE (new_instance) = node; |
3360 | SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; |
3361 | SLP_INSTANCE_LOADS (new_instance) = vNULL; |
3362 | SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos; |
3363 | SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain; |
3364 | SLP_INSTANCE_KIND (new_instance) = kind; |
3365 | new_instance->reduc_phis = NULL; |
3366 | new_instance->cost_vec = vNULL; |
3367 | new_instance->subgraph_entries = vNULL; |
3368 | |
3369 | if (dump_enabled_p ()) |
3370 | dump_printf_loc (MSG_NOTE, vect_location, |
3371 | "SLP size %u vs. limit %u.\n" , |
3372 | tree_size, max_tree_size); |
3373 | |
3374 | /* Fixup SLP reduction chains. */ |
3375 | if (kind == slp_inst_kind_reduc_chain) |
3376 | { |
3377 | /* If this is a reduction chain with a conversion in front |
3378 | amend the SLP tree with a node for that. */ |
3379 | gimple *scalar_def |
3380 | = vect_orig_stmt (stmt_info: scalar_stmts[group_size - 1])->stmt; |
3381 | if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def) |
3382 | { |
3383 | /* Get at the conversion stmt - we know it's the single use |
3384 | of the last stmt of the reduction chain. */ |
3385 | use_operand_p use_p; |
3386 | bool r = single_imm_use (var: gimple_assign_lhs (gs: scalar_def), |
3387 | use_p: &use_p, stmt: &scalar_def); |
3388 | gcc_assert (r); |
3389 | stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def); |
3390 | next_info = vect_stmt_to_vectorize (stmt_info: next_info); |
3391 | scalar_stmts = vNULL; |
3392 | scalar_stmts.create (nelems: group_size); |
3393 | for (unsigned i = 0; i < group_size; ++i) |
3394 | scalar_stmts.quick_push (obj: next_info); |
3395 | slp_tree conv = vect_create_new_slp_node (scalar_stmts, nops: 1); |
3396 | SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info); |
3397 | SLP_TREE_CHILDREN (conv).quick_push (obj: node); |
3398 | SLP_INSTANCE_TREE (new_instance) = conv; |
3399 | /* We also have to fake this conversion stmt as SLP reduction |
3400 | group so we don't have to mess with too much code |
3401 | elsewhere. */ |
3402 | REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info; |
3403 | REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL; |
3404 | } |
3405 | /* Fill the backedge child of the PHI SLP node. The |
3406 | general matching code cannot find it because the |
3407 | scalar code does not reflect how we vectorize the |
3408 | reduction. */ |
3409 | use_operand_p use_p; |
3410 | imm_use_iterator imm_iter; |
3411 | class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo)); |
3412 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, |
3413 | gimple_get_lhs (scalar_def)) |
3414 | /* There are exactly two non-debug uses, the reduction |
3415 | PHI and the loop-closed PHI node. */ |
3416 | if (!is_gimple_debug (USE_STMT (use_p)) |
3417 | && gimple_bb (USE_STMT (use_p)) == loop->header) |
3418 | { |
3419 | auto_vec<stmt_vec_info, 64> phis (group_size); |
3420 | stmt_vec_info phi_info |
3421 | = vinfo->lookup_stmt (USE_STMT (use_p)); |
3422 | for (unsigned i = 0; i < group_size; ++i) |
3423 | phis.quick_push (obj: phi_info); |
3424 | slp_tree *phi_node = bst_map->get (k: phis); |
3425 | unsigned dest_idx = loop_latch_edge (loop)->dest_idx; |
3426 | SLP_TREE_CHILDREN (*phi_node)[dest_idx] |
3427 | = SLP_INSTANCE_TREE (new_instance); |
3428 | SLP_INSTANCE_TREE (new_instance)->refcnt++; |
3429 | } |
3430 | } |
3431 | |
3432 | vinfo->slp_instances.safe_push (obj: new_instance); |
3433 | |
3434 | /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with |
3435 | the number of scalar stmts in the root in a few places. |
3436 | Verify that assumption holds. */ |
3437 | gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance)) |
3438 | .length () == group_size); |
3439 | |
3440 | if (dump_enabled_p ()) |
3441 | { |
3442 | dump_printf_loc (MSG_NOTE, vect_location, |
3443 | "Final SLP tree for instance %p:\n" , |
3444 | (void *) new_instance); |
3445 | vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location, |
3446 | SLP_INSTANCE_TREE (new_instance)); |
3447 | } |
3448 | |
3449 | return true; |
3450 | } |
3451 | } |
3452 | else |
3453 | { |
3454 | /* Failed to SLP. */ |
3455 | /* Free the allocated memory. */ |
3456 | scalar_stmts.release (); |
3457 | } |
3458 | |
3459 | stmt_vec_info stmt_info = stmt_info_; |
3460 | /* Try to break the group up into pieces. */ |
3461 | if (kind == slp_inst_kind_store) |
3462 | { |
3463 | /* ??? We could delay all the actual splitting of store-groups |
3464 | until after SLP discovery of the original group completed. |
3465 | Then we can recurse to vect_build_slp_instance directly. */ |
3466 | for (i = 0; i < group_size; i++) |
3467 | if (!matches[i]) |
3468 | break; |
3469 | |
3470 | /* For basic block SLP, try to break the group up into multiples of |
3471 | a vector size. */ |
3472 | if (is_a <bb_vec_info> (p: vinfo) |
3473 | && (i > 1 && i < group_size)) |
3474 | { |
3475 | tree scalar_type |
3476 | = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); |
3477 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, |
3478 | 1 << floor_log2 (x: i)); |
3479 | unsigned HOST_WIDE_INT const_nunits; |
3480 | if (vectype |
3481 | && TYPE_VECTOR_SUBPARTS (node: vectype).is_constant (const_value: &const_nunits)) |
3482 | { |
3483 | /* Split into two groups at the first vector boundary. */ |
3484 | gcc_assert ((const_nunits & (const_nunits - 1)) == 0); |
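	      /* E.g. a first mismatch at I == 5 with 4-element vectors
		 gives GROUP1_SIZE == (5 & ~3) == 4. */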
3485 | unsigned group1_size = i & ~(const_nunits - 1); |
3486 | |
3487 | if (dump_enabled_p ()) |
3488 | dump_printf_loc (MSG_NOTE, vect_location, |
3489 | "Splitting SLP group at stmt %u\n" , i); |
3490 | stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info, |
3491 | group1_size); |
3492 | bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info, |
3493 | kind, max_tree_size, |
3494 | limit); |
3495 | /* Split the rest at the failure point and possibly |
3496 | re-analyze the remaining matching part if it has |
3497 | at least two lanes. */ |
3498 | if (group1_size < i |
3499 | && (i + 1 < group_size |
3500 | || i - group1_size > 1)) |
3501 | { |
3502 | stmt_vec_info rest2 = rest; |
3503 | rest = vect_split_slp_store_group (first_vinfo: rest, group1_size: i - group1_size); |
3504 | if (i - group1_size > 1) |
3505 | res |= vect_analyze_slp_instance (vinfo, bst_map, stmt_info: rest2, |
3506 | kind, max_tree_size, |
3507 | limit); |
3508 | } |
3509 | /* Re-analyze the non-matching tail if it has at least |
3510 | two lanes. */ |
3511 | if (i + 1 < group_size) |
3512 | res |= vect_analyze_slp_instance (vinfo, bst_map, |
3513 | stmt_info: rest, kind, max_tree_size, |
3514 | limit); |
3515 | return res; |
3516 | } |
3517 | } |
3518 | |
    /* For loop vectorization, split into arbitrary pieces of size > 1. */
3520 | if (is_a <loop_vec_info> (p: vinfo) |
3521 | && (i > 1 && i < group_size) |
3522 | && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, new_group_size: i)) |
3523 | { |
3524 | unsigned group1_size = i; |
3525 | |
3526 | if (dump_enabled_p ()) |
3527 | dump_printf_loc (MSG_NOTE, vect_location, |
3528 | "Splitting SLP group at stmt %u\n" , i); |
3529 | |
3530 | stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info, |
3531 | group1_size); |
3532 | /* Loop vectorization cannot handle gaps in stores, make sure |
3533 | the split group appears as strided. */ |
3534 | STMT_VINFO_STRIDED_P (rest) = 1; |
3535 | DR_GROUP_GAP (rest) = 0; |
3536 | STMT_VINFO_STRIDED_P (stmt_info) = 1; |
3537 | DR_GROUP_GAP (stmt_info) = 0; |
3538 | |
3539 | bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info, |
3540 | kind, max_tree_size, limit); |
3541 | if (i + 1 < group_size) |
3542 | res |= vect_analyze_slp_instance (vinfo, bst_map, |
3543 | stmt_info: rest, kind, max_tree_size, limit); |
3544 | |
3545 | return res; |
3546 | } |
3547 | |
3548 | /* Even though the first vector did not all match, we might be able to SLP |
3549 | (some) of the remainder. FORNOW ignore this possibility. */ |
3550 | } |
3551 | |
3552 | /* Failed to SLP. */ |
3553 | if (dump_enabled_p ()) |
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3555 | return false; |
3556 | } |
3557 | |
3558 | |
3559 | /* Analyze an SLP instance starting from a group of grouped stores. Call |
3560 | vect_build_slp_tree to build a tree of packed stmts if possible. |
3561 | Return FALSE if it's impossible to SLP any stmt in the loop. */ |
3562 | |
3563 | static bool |
3564 | vect_analyze_slp_instance (vec_info *vinfo, |
3565 | scalar_stmts_to_slp_tree_map_t *bst_map, |
3566 | stmt_vec_info stmt_info, |
3567 | slp_instance_kind kind, |
3568 | unsigned max_tree_size, unsigned *limit) |
3569 | { |
3570 | unsigned int i; |
3571 | vec<stmt_vec_info> scalar_stmts; |
3572 | |
3573 | if (is_a <bb_vec_info> (p: vinfo)) |
3574 | vect_location = stmt_info->stmt; |
3575 | |
3576 | stmt_vec_info next_info = stmt_info; |
3577 | if (kind == slp_inst_kind_store) |
3578 | { |
3579 | /* Collect the stores and store them in scalar_stmts. */ |
3580 | scalar_stmts.create (DR_GROUP_SIZE (stmt_info)); |
3581 | while (next_info) |
3582 | { |
3583 | scalar_stmts.quick_push (obj: vect_stmt_to_vectorize (stmt_info: next_info)); |
3584 | next_info = DR_GROUP_NEXT_ELEMENT (next_info); |
3585 | } |
3586 | } |
3587 | else if (kind == slp_inst_kind_reduc_chain) |
3588 | { |
3589 | /* Collect the reduction stmts and store them in scalar_stmts. */ |
3590 | scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info)); |
3591 | while (next_info) |
3592 | { |
3593 | scalar_stmts.quick_push (obj: vect_stmt_to_vectorize (stmt_info: next_info)); |
3594 | next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); |
3595 | } |
3596 | /* Mark the first element of the reduction chain as reduction to properly |
3597 | transform the node. In the reduction analysis phase only the last |
3598 | element of the chain is marked as reduction. */ |
3599 | STMT_VINFO_DEF_TYPE (stmt_info) |
3600 | = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); |
3601 | STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) |
3602 | = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); |
3603 | } |
3604 | else if (kind == slp_inst_kind_reduc_group) |
3605 | { |
3606 | /* Collect reduction statements. */ |
3607 | const vec<stmt_vec_info> &reductions |
3608 | = as_a <loop_vec_info> (p: vinfo)->reductions; |
3609 | scalar_stmts.create (nelems: reductions.length ()); |
3610 | for (i = 0; reductions.iterate (ix: i, ptr: &next_info); i++) |
3611 | if ((STMT_VINFO_RELEVANT_P (next_info) |
3612 | || STMT_VINFO_LIVE_P (next_info)) |
3613 | /* ??? Make sure we didn't skip a conversion around a reduction |
3614 | path. In that case we'd have to reverse engineer that conversion |
3615 | stmt following the chain using reduc_idx and from the PHI |
3616 | using reduc_def. */ |
3617 | && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def) |
3618 | scalar_stmts.quick_push (obj: next_info); |
3619 | /* If less than two were relevant/live there's nothing to SLP. */ |
3620 | if (scalar_stmts.length () < 2) |
3621 | return false; |
3622 | } |
3623 | else |
3624 | gcc_unreachable (); |
3625 | |
3626 | vec<stmt_vec_info> roots = vNULL; |
3627 | vec<tree> remain = vNULL; |
3628 | /* Build the tree for the SLP instance. */ |
3629 | bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts, |
3630 | root_stmt_infos&: roots, remain, |
3631 | max_tree_size, limit, bst_map, |
3632 | stmt_info_: kind == slp_inst_kind_store |
3633 | ? stmt_info : NULL); |
3634 | |
3635 | /* ??? If this is slp_inst_kind_store and the above succeeded here's |
3636 | where we should do store group splitting. */ |
3637 | |
3638 | return res; |
3639 | } |
3640 | |
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible. */
3643 | |
3644 | opt_result |
3645 | vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) |
3646 | { |
3647 | unsigned int i; |
3648 | stmt_vec_info first_element; |
3649 | slp_instance instance; |
3650 | |
  DUMP_VECT_SCOPE ("vect_analyze_slp");
3652 | |
3653 | unsigned limit = max_tree_size; |
3654 | |
3655 | scalar_stmts_to_slp_tree_map_t *bst_map |
3656 | = new scalar_stmts_to_slp_tree_map_t (); |
3657 | |
3658 | /* Find SLP sequences starting from groups of grouped stores. */ |
3659 | FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) |
3660 | vect_analyze_slp_instance (vinfo, bst_map, stmt_info: first_element, |
3661 | kind: slp_inst_kind_store, max_tree_size, limit: &limit); |
3662 | |
3663 | if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo)) |
3664 | { |
3665 | for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i) |
3666 | { |
3667 | vect_location = bb_vinfo->roots[i].roots[0]->stmt; |
3668 | /* Apply patterns. */ |
3669 | for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j) |
3670 | bb_vinfo->roots[i].stmts[j] |
3671 | = vect_stmt_to_vectorize (stmt_info: bb_vinfo->roots[i].stmts[j]); |
3672 | if (vect_build_slp_instance (vinfo: bb_vinfo, kind: bb_vinfo->roots[i].kind, |
3673 | scalar_stmts&: bb_vinfo->roots[i].stmts, |
3674 | root_stmt_infos&: bb_vinfo->roots[i].roots, |
3675 | remain&: bb_vinfo->roots[i].remain, |
3676 | max_tree_size, limit: &limit, bst_map, NULL)) |
3677 | { |
3678 | bb_vinfo->roots[i].stmts = vNULL; |
3679 | bb_vinfo->roots[i].roots = vNULL; |
3680 | bb_vinfo->roots[i].remain = vNULL; |
3681 | } |
3682 | } |
3683 | } |
3684 | |
3685 | if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo)) |
3686 | { |
3687 | /* Find SLP sequences starting from reduction chains. */ |
3688 | FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element) |
3689 | if (! STMT_VINFO_RELEVANT_P (first_element) |
3690 | && ! STMT_VINFO_LIVE_P (first_element)) |
3691 | ; |
3692 | else if (! vect_analyze_slp_instance (vinfo, bst_map, stmt_info: first_element, |
3693 | kind: slp_inst_kind_reduc_chain, |
3694 | max_tree_size, limit: &limit)) |
3695 | { |
3696 | /* Dissolve reduction chain group. */ |
3697 | stmt_vec_info vinfo = first_element; |
3698 | stmt_vec_info last = NULL; |
3699 | while (vinfo) |
3700 | { |
3701 | stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo); |
3702 | REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL; |
3703 | REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL; |
3704 | last = vinfo; |
3705 | vinfo = next; |
3706 | } |
3707 | STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def; |
3708 | /* It can be still vectorized as part of an SLP reduction. */ |
3709 | loop_vinfo->reductions.safe_push (obj: last); |
3710 | } |
3711 | |
3712 | /* Find SLP sequences starting from groups of reductions. */ |
3713 | if (loop_vinfo->reductions.length () > 1) |
3714 | vect_analyze_slp_instance (vinfo, bst_map, stmt_info: loop_vinfo->reductions[0], |
3715 | kind: slp_inst_kind_reduc_group, max_tree_size, |
3716 | limit: &limit); |
3717 | } |
3718 | |
3719 | hash_set<slp_tree> visited_patterns; |
3720 | slp_tree_to_load_perm_map_t perm_cache; |
3721 | slp_compat_nodes_map_t compat_cache; |
3722 | |
3723 | /* See if any patterns can be found in the SLP tree. */ |
3724 | bool pattern_found = false; |
3725 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) |
3726 | pattern_found |= vect_match_slp_patterns (instance, vinfo, |
3727 | visited: &visited_patterns, perm_cache: &perm_cache, |
3728 | compat_cache: &compat_cache); |
3729 | |
3730 | /* If any were found optimize permutations of loads. */ |
3731 | if (pattern_found) |
3732 | { |
3733 | hash_map<slp_tree, slp_tree> load_map; |
3734 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) |
3735 | { |
3736 | slp_tree root = SLP_INSTANCE_TREE (instance); |
3737 | optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root), |
3738 | load_map: &load_map, root); |
3739 | } |
3740 | } |
3741 | |
3742 | |
3743 | |
  /* The map keeps a reference on SLP nodes built; release that. */
3745 | for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); |
3746 | it != bst_map->end (); ++it) |
3747 | if ((*it).second) |
3748 | vect_free_slp_tree (node: (*it).second); |
3749 | delete bst_map; |
3750 | |
3751 | if (pattern_found && dump_enabled_p ()) |
3752 | { |
3753 | dump_printf_loc (MSG_NOTE, vect_location, |
3754 | "Pattern matched SLP tree\n" ); |
3755 | hash_set<slp_tree> visited; |
3756 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) |
3757 | vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location, |
3758 | SLP_INSTANCE_TREE (instance), visited); |
3759 | } |
3760 | |
3761 | return opt_result::success (); |
3762 | } |
3763 | |
3764 | /* Estimates the cost of inserting layout changes into the SLP graph. |
3765 | It can also say that the insertion is impossible. */ |
3766 | |
3767 | struct slpg_layout_cost |
3768 | { |
3769 | slpg_layout_cost () = default; |
3770 | slpg_layout_cost (sreal, bool); |
3771 | |
3772 | static slpg_layout_cost impossible () { return { sreal::max (), 0 }; } |
3773 | bool is_possible () const { return depth != sreal::max (); } |
3774 | |
3775 | bool operator== (const slpg_layout_cost &) const; |
3776 | bool operator!= (const slpg_layout_cost &) const; |
3777 | |
3778 | bool is_better_than (const slpg_layout_cost &, bool) const; |
3779 | |
3780 | void add_parallel_cost (const slpg_layout_cost &); |
3781 | void add_serial_cost (const slpg_layout_cost &); |
3782 | void split (unsigned int); |
3783 | |
3784 | /* The longest sequence of layout changes needed during any traversal |
3785 | of the partition dag, weighted by execution frequency. |
3786 | |
3787 | This is the most important metric when optimizing for speed, since |
3788 | it helps to ensure that we keep the number of operations on |
3789 | critical paths to a minimum. */ |
3790 | sreal depth = 0; |
3791 | |
3792 | /* An estimate of the total number of operations needed. It is weighted by |
3793 | execution frequency when optimizing for speed but not when optimizing for |
3794 | size. In order to avoid double-counting, a node with a fanout of N will |
3795 | distribute 1/N of its total cost to each successor. |
3796 | |
3797 | This is the most important metric when optimizing for size, since |
     it helps to keep the total number of operations to a minimum. */
3799 | sreal total = 0; |
3800 | }; |
3801 | |
3802 | /* Construct costs for a node with weight WEIGHT. A higher weight |
3803 | indicates more frequent execution. IS_FOR_SIZE is true if we are |
3804 | optimizing for size rather than speed. */ |
3805 | |
3806 | slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size) |
3807 | : depth (weight), total (is_for_size && weight > 0 ? 1 : weight) |
3808 | { |
3809 | } |
3810 | |
3811 | bool |
3812 | slpg_layout_cost::operator== (const slpg_layout_cost &other) const |
3813 | { |
3814 | return depth == other.depth && total == other.total; |
3815 | } |
3816 | |
3817 | bool |
3818 | slpg_layout_cost::operator!= (const slpg_layout_cost &other) const |
3819 | { |
3820 | return !operator== (other); |
3821 | } |
3822 | |
3823 | /* Return true if these costs are better than OTHER. IS_FOR_SIZE is |
3824 | true if we are optimizing for size rather than speed. */ |
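/* For example, with illustrative numbers: when optimizing for speed,
   { depth 2, total 9 } is better than { depth 3, total 4 } since depth
   is compared first; when optimizing for size the preference flips
   because total is compared first. */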
3825 | |
3826 | bool |
3827 | slpg_layout_cost::is_better_than (const slpg_layout_cost &other, |
3828 | bool is_for_size) const |
3829 | { |
3830 | if (is_for_size) |
3831 | { |
3832 | if (total != other.total) |
3833 | return total < other.total; |
3834 | return depth < other.depth; |
3835 | } |
3836 | else |
3837 | { |
3838 | if (depth != other.depth) |
3839 | return depth < other.depth; |
3840 | return total < other.total; |
3841 | } |
3842 | } |
3843 | |
3844 | /* Increase the costs to account for something with cost INPUT_COST |
3845 | happening in parallel with the current costs. */ |
3846 | |
3847 | void |
3848 | slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost) |
3849 | { |
3850 | depth = std::max (a: depth, b: input_cost.depth); |
3851 | total += input_cost.total; |
3852 | } |
3853 | |
3854 | /* Increase the costs to account for something with cost INPUT_COST |
3855 | happening in series with the current costs. */ |
3856 | |
3857 | void |
3858 | slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other) |
3859 | { |
3860 | depth += other.depth; |
3861 | total += other.total; |
3862 | } |
3863 | |
3864 | /* Split the total cost among TIMES successors or predecessors. */ |
3865 | |
3866 | void |
3867 | slpg_layout_cost::split (unsigned int times) |
3868 | { |
3869 | if (times > 1) |
3870 | total /= times; |
3871 | } |
3872 | |
3873 | /* Information about one node in the SLP graph, for use during |
3874 | vect_optimize_slp_pass. */ |
3875 | |
3876 | struct slpg_vertex |
3877 | { |
3878 | slpg_vertex (slp_tree node_) : node (node_) {} |
3879 | |
3880 | /* The node itself. */ |
3881 | slp_tree node; |
3882 | |
3883 | /* Which partition the node belongs to, or -1 if none. Nodes outside of |
3884 | partitions are flexible; they can have whichever layout consumers |
3885 | want them to have. */ |
3886 | int partition = -1; |
3887 | |
3888 | /* The number of nodes that directly use the result of this one |
3889 | (i.e. the number of nodes that count this one as a child). */ |
3890 | unsigned int out_degree = 0; |
3891 | |
3892 | /* The execution frequency of the node. */ |
3893 | sreal weight = 0; |
3894 | |
3895 | /* The total execution frequency of all nodes that directly use the |
3896 | result of this one. */ |
3897 | sreal out_weight = 0; |
3898 | }; |
3899 | |
3900 | /* Information about one partition of the SLP graph, for use during |
3901 | vect_optimize_slp_pass. */ |
3902 | |
3903 | struct slpg_partition_info |
3904 | { |
3905 | /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END) |
3906 | of m_partitioned_nodes. */ |
3907 | unsigned int node_begin = 0; |
3908 | unsigned int node_end = 0; |
3909 | |
3910 | /* Which layout we've chosen to use for this partition, or -1 if |
3911 | we haven't picked one yet. */ |
3912 | int layout = -1; |
3913 | |
3914 | /* The number of predecessors and successors in the partition dag. |
3915 | The predecessors always have lower partition numbers and the |
3916 | successors always have higher partition numbers. |
3917 | |
3918 | Note that the directions of these edges are not necessarily the |
3919 | same as in the data flow graph. For example, if an SCC has separate |
3920 | partitions for an inner loop and an outer loop, the inner loop's |
3921 | partition will have at least two incoming edges from the outer loop's |
3922 | partition: one for a live-in value and one for a live-out value. |
3923 | In data flow terms, one of these edges would also be from the outer loop |
3924 | to the inner loop, but the other would be in the opposite direction. */ |
3925 | unsigned int in_degree = 0; |
3926 | unsigned int out_degree = 0; |
3927 | }; |
3928 | |
3929 | /* Information about the costs of using a particular layout for a |
3930 | particular partition. It can also say that the combination is |
3931 | impossible. */ |
3932 | |
3933 | struct slpg_partition_layout_costs |
3934 | { |
3935 | bool is_possible () const { return internal_cost.is_possible (); } |
3936 | void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); } |
3937 | |
3938 | /* The costs inherited from predecessor partitions. */ |
3939 | slpg_layout_cost in_cost; |
3940 | |
3941 | /* The inherent cost of the layout within the node itself. For example, |
3942 | this is nonzero for a load if choosing a particular layout would require |
3943 | the load to permute the loaded elements. It is nonzero for a |
3944 | VEC_PERM_EXPR if the permutation cannot be eliminated or converted |
3945 | to full-vector moves. */ |
3946 | slpg_layout_cost internal_cost; |
3947 | |
3948 | /* The costs inherited from successor partitions. */ |
3949 | slpg_layout_cost out_cost; |
3950 | }; |
3951 | |
3952 | /* This class tries to optimize the layout of vectors in order to avoid |
3953 | unnecessary shuffling. At the moment, the set of possible layouts are |
3954 | restricted to bijective permutations. |
3955 | |
3956 | The goal of the pass depends on whether we're optimizing for size or |
3957 | for speed. When optimizing for size, the goal is to reduce the overall |
3958 | number of layout changes (including layout changes implied by things |
3959 | like load permutations). When optimizing for speed, the goal is to |
3960 | reduce the maximum latency attributable to layout changes on any |
3961 | non-cyclical path through the data flow graph. |
3962 | |
3963 | For example, when optimizing a loop nest for speed, we will prefer |
3964 | to make layout changes outside of a loop rather than inside of a loop, |
3965 | and will prefer to make layout changes in parallel rather than serially, |
3966 | even if that increases the overall number of layout changes. |
3967 | |
3968 | The high-level procedure is: |
3969 | |
3970 | (1) Build a graph in which edges go from uses (parents) to definitions |
3971 | (children). |
3972 | |
3973 | (2) Divide the graph into a dag of strongly-connected components (SCCs). |
3974 | |
3975 | (3) When optimizing for speed, partition the nodes in each SCC based |
3976 | on their containing cfg loop. When optimizing for size, treat |
3977 | each SCC as a single partition. |
3978 | |
3979 | This gives us a dag of partitions. The goal is now to assign a |
3980 | layout to each partition. |
3981 | |
3982 | (4) Construct a set of vector layouts that are worth considering. |
3983 | Record which nodes must keep their current layout. |
3984 | |
3985 | (5) Perform a forward walk over the partition dag (from loads to stores) |
3986 | accumulating the "forward" cost of using each layout. When visiting |
3987 | each partition, assign a tentative choice of layout to the partition |
3988 | and use that choice when calculating the cost of using a different |
3989 | layout in successor partitions. |
3990 | |
3991 | (6) Perform a backward walk over the partition dag (from stores to loads), |
3992 | accumulating the "backward" cost of using each layout. When visiting |
3993 | each partition, make a final choice of layout for that partition based |
3994 | on the accumulated forward costs (from (5)) and backward costs |
3995 | (from (6)). |
3996 | |
3997 | (7) Apply the chosen layouts to the SLP graph. |
3998 | |
3999 | For example, consider the SLP statements: |
4000 | |
4001 | S1: a_1 = load |
4002 | loop: |
4003 | S2: a_2 = PHI<a_1, a_3> |
4004 | S3: b_1 = load |
4005 | S4: a_3 = a_2 + b_1 |
4006 | exit: |
4007 | S5: a_4 = PHI<a_3> |
4008 | S6: store a_4 |
4009 | |
4010 | S2 and S4 form an SCC and are part of the same loop. Every other |
4011 | statement is in a singleton SCC. In this example there is a one-to-one |
   mapping between SCCs and partitions, and the partition dag looks like this:
4013 | |
4014 | S1 S3 |
4015 | \ / |
4016 | S2+S4 |
4017 | | |
4018 | S5 |
4019 | | |
4020 | S6 |
4021 | |
4022 | S2, S3 and S4 will have a higher execution frequency than the other |
4023 | statements, so when optimizing for speed, the goal is to avoid any |
4024 | layout changes: |
4025 | |
4026 | - within S3 |
4027 | - within S2+S4 |
4028 | - on the S3->S2+S4 edge |
4029 | |
4030 | For example, if S3 was originally a reversing load, the goal of the |
4031 | pass is to make it an unreversed load and change the layout on the |
4032 | S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout |
4033 | on S1->S2+S4 and S5->S6 would also be acceptable.) |
4034 | |
4035 | The difference between SCCs and partitions becomes important if we |
4036 | add an outer loop: |
4037 | |
4038 | S1: a_1 = ... |
4039 | loop1: |
4040 | S2: a_2 = PHI<a_1, a_6> |
4041 | S3: b_1 = load |
4042 | S4: a_3 = a_2 + b_1 |
4043 | loop2: |
4044 | S5: a_4 = PHI<a_3, a_5> |
4045 | S6: c_1 = load |
4046 | S7: a_5 = a_4 + c_1 |
4047 | exit2: |
4048 | S8: a_6 = PHI<a_5> |
4049 | S9: store a_6 |
4050 | exit1: |
4051 | |
4052 | Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing |
4053 | for speed, we usually do not want restrictions in the outer loop to "infect" |
4054 | the decision for the inner loop. For example, if an outer-loop node |
4055 | in the SCC contains a statement with a fixed layout, that should not |
4056 | prevent the inner loop from using a different layout. Conversely, |
4057 | the inner loop should not dictate a layout to the outer loop: if the |
4058 | outer loop does a lot of computation, then it may not be efficient to |
4059 | do all of that computation in the inner loop's preferred layout. |
4060 | |
4061 | So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer) |
4062 | and S5+S7 (inner). We also try to arrange partitions so that: |
4063 | |
4064 | - the partition for an outer loop comes before the partition for |
4065 | an inner loop |
4066 | |
4067 | - if a sibling loop A dominates a sibling loop B, A's partition |
4068 | comes before B's |
4069 | |
4070 | This gives the following partition dag for the example above: |
4071 | |
4072 | S1 S3 |
4073 | \ / |
4074 | S2+S4+S8 S6 |
4075 | | \\ / |
4076 | | S5+S7 |
4077 | | |
4078 | S9 |
4079 | |
4080 | There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and |
4081 | one for a reversal of the edge S7->S8. |
4082 | |
4083 | The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice |
4084 | for S2+S4+S8 therefore has to balance the cost of using the outer loop's |
4085 | preferred layout against the cost of changing the layout on entry to the |
4086 | inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed). |
4087 | |
4088 | Although this works well when optimizing for speed, it has the downside |
4089 | when optimizing for size that the choice of layout for S5+S7 is completely |
4090 | independent of S9, which lessens the chance of reducing the overall number |
4091 | of permutations. We therefore do not partition SCCs when optimizing |
4092 | for size. |
4093 | |
4094 | To give a concrete example of the difference between optimizing |
4095 | for size and speed, consider: |
4096 | |
4097 | a[0] = (b[1] << c[3]) - d[1]; |
4098 | a[1] = (b[0] << c[2]) - d[0]; |
4099 | a[2] = (b[3] << c[1]) - d[3]; |
4100 | a[3] = (b[2] << c[0]) - d[2]; |
4101 | |
4102 | There are three different layouts here: one for a, one for b and d, |
4103 | and one for c. When optimizing for speed it is better to permute each |
4104 | of b, c and d into the order required by a, since those permutations |
4105 | happen in parallel. But when optimizing for size, it is better to: |
4106 | |
4107 | - permute c into the same order as b |
4108 | - do the arithmetic |
4109 | - permute the result into the order required by a |
4110 | |
4111 | This gives 2 permutations rather than 3. */ |
4112 | |
4113 | class vect_optimize_slp_pass |
4114 | { |
4115 | public: |
4116 | vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {} |
4117 | void run (); |
4118 | |
4119 | private: |
4120 | /* Graph building. */ |
4121 | struct loop *containing_loop (slp_tree); |
4122 | bool is_cfg_latch_edge (graph_edge *); |
4123 | void build_vertices (hash_set<slp_tree> &, slp_tree); |
4124 | void build_vertices (); |
4125 | void build_graph (); |
4126 | |
4127 | /* Partitioning. */ |
4128 | void create_partitions (); |
4129 | template<typename T> void for_each_partition_edge (unsigned int, T); |
4130 | |
4131 | /* Layout selection. */ |
4132 | bool is_compatible_layout (slp_tree, unsigned int); |
4133 | int change_layout_cost (slp_tree, unsigned int, unsigned int); |
4134 | slpg_partition_layout_costs &partition_layout_costs (unsigned int, |
4135 | unsigned int); |
4136 | void change_vec_perm_layout (slp_tree, lane_permutation_t &, |
4137 | int, unsigned int); |
4138 | int internal_node_cost (slp_tree, int, unsigned int); |
4139 | void start_choosing_layouts (); |
4140 | |
4141 | /* Cost propagation. */ |
4142 | slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int, |
4143 | unsigned int, unsigned int); |
4144 | slpg_layout_cost total_in_cost (unsigned int); |
4145 | slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int); |
4146 | slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int); |
4147 | void forward_pass (); |
4148 | void backward_pass (); |
4149 | |
4150 | /* Rematerialization. */ |
4151 | slp_tree get_result_with_layout (slp_tree, unsigned int); |
4152 | void materialize (); |
4153 | |
4154 | /* Clean-up. */ |
4155 | void remove_redundant_permutations (); |
4156 | |
4157 | void dump (); |
4158 | |
4159 | vec_info *m_vinfo; |
4160 | |
4161 | /* True if we should optimize the graph for size, false if we should |
4162 | optimize it for speed. (It wouldn't be easy to make this decision |
4163 | more locally.) */ |
4164 | bool m_optimize_size; |
4165 | |
4166 | /* A graph of all SLP nodes, with edges leading from uses to definitions. |
4167 | In other words, a node's predecessors are its slp_tree parents and |
4168 | a node's successors are its slp_tree children. */ |
4169 | graph *m_slpg = nullptr; |
4170 | |
4171 | /* The vertices of M_SLPG, indexed by slp_tree::vertex. */ |
4172 | auto_vec<slpg_vertex> m_vertices; |
4173 | |
  /* The list of all leaves of M_SLPG, such as external definitions, constants,
4175 | and loads. */ |
4176 | auto_vec<int> m_leafs; |
4177 | |
4178 | /* This array has one entry for every vector layout that we're considering. |
4179 | Element 0 is null and indicates "no change". Other entries describe |
4180 | permutations that are inherent in the current graph and that we would |
4181 | like to reverse if possible. |
4182 | |
4183 | For example, a permutation { 1, 2, 3, 0 } means that something has |
4184 | effectively been permuted in that way, such as a load group |
4185 | { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]). |
4186 | We'd then like to apply the reverse permutation { 3, 0, 1, 2 } |
4187 | in order to put things "back" in order. */ |
4188 | auto_vec<vec<unsigned> > m_perms; |
4189 | |
4190 | /* A partitioning of the nodes for which a layout must be chosen. |
4191 | Each partition represents an <SCC, cfg loop> pair; that is, |
4192 | nodes in different SCCs belong to different partitions, and nodes |
4193 | within an SCC can be further partitioned according to a containing |
4194 | cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if: |
4195 | |
4196 | - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk |
4197 | from leaves (such as loads) to roots (such as stores). |
4198 | |
4199 | - SCC1 == SCC2 and L1's header strictly dominates L2's header. */ |
4200 | auto_vec<slpg_partition_info> m_partitions; |
4201 | |
4202 | /* The list of all nodes for which a layout must be chosen. Nodes for |
4203 | partition P come before the nodes for partition P+1. Nodes within a |
4204 | partition are in reverse postorder. */ |
4205 | auto_vec<unsigned int> m_partitioned_nodes; |
4206 | |
4207 | /* Index P * num-layouts + L contains the cost of using layout L |
4208 | for partition P. */ |
4209 | auto_vec<slpg_partition_layout_costs> m_partition_layout_costs; |
4210 | |
4211 | /* Index N * num-layouts + L, if nonnull, is a node that provides the |
4212 | original output of node N adjusted to have layout L. */ |
4213 | auto_vec<slp_tree> m_node_layouts; |
4214 | }; |
4215 | |
4216 | /* Fill the vertices and leafs vector with all nodes in the SLP graph. |
4217 | Also record whether we should optimize anything for speed rather |
4218 | than size. */ |
4219 | |
4220 | void |
4221 | vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited, |
4222 | slp_tree node) |
4223 | { |
4224 | unsigned i; |
4225 | slp_tree child; |
4226 | |
4227 | if (visited.add (k: node)) |
4228 | return; |
4229 | |
4230 | if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node)) |
4231 | { |
4232 | basic_block bb = gimple_bb (g: vect_orig_stmt (stmt_info: rep)->stmt); |
4233 | if (optimize_bb_for_speed_p (bb)) |
4234 | m_optimize_size = false; |
4235 | } |
4236 | |
4237 | node->vertex = m_vertices.length (); |
4238 | m_vertices.safe_push (obj: slpg_vertex (node)); |
4239 | |
4240 | bool leaf = true; |
4241 | bool force_leaf = false; |
4242 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
4243 | if (child) |
4244 | { |
4245 | leaf = false; |
4246 | build_vertices (visited, node: child); |
4247 | } |
4248 | else |
4249 | force_leaf = true; |
4250 | /* Since SLP discovery works along use-def edges all cycles have an |
4251 | entry - but there's the exception of cycles where we do not handle |
     the entry explicitly (but with a NULL SLP node), like some reductions
     and inductions.  Force those SLP PHIs to act as leaves to make them
4254 | backwards reachable. */ |
4255 | if (leaf || force_leaf) |
4256 | m_leafs.safe_push (obj: node->vertex); |
4257 | } |
4258 | |
4259 | /* Fill the vertices and leafs vector with all nodes in the SLP graph. */ |
4260 | |
4261 | void |
4262 | vect_optimize_slp_pass::build_vertices () |
4263 | { |
4264 | hash_set<slp_tree> visited; |
4265 | unsigned i; |
4266 | slp_instance instance; |
4267 | FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance) |
4268 | build_vertices (visited, SLP_INSTANCE_TREE (instance)); |
4269 | } |
4270 | |
/* Apply (reverse) bijective PERM to VEC. */
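/* For example, with PERM == { 1, 2, 3, 0 } and VEC == { a, b, c, d },
   the forward direction computes vec[i] = saved[perm[i]], yielding
   { b, c, d, a }, while the reverse direction computes
   vec[perm[i]] = saved[i], yielding { d, a, b, c }.  The asserts also
   check that PERM is indeed bijective. */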
4272 | |
4273 | template <class T> |
4274 | static void |
4275 | vect_slp_permute (vec<unsigned> perm, |
4276 | vec<T> &vec, bool reverse) |
4277 | { |
4278 | auto_vec<T, 64> saved; |
4279 | saved.create (vec.length ()); |
4280 | for (unsigned i = 0; i < vec.length (); ++i) |
4281 | saved.quick_push (vec[i]); |
4282 | |
4283 | if (reverse) |
4284 | { |
4285 | for (unsigned i = 0; i < vec.length (); ++i) |
4286 | vec[perm[i]] = saved[i]; |
4287 | for (unsigned i = 0; i < vec.length (); ++i) |
4288 | gcc_assert (vec[perm[i]] == saved[i]); |
4289 | } |
4290 | else |
4291 | { |
4292 | for (unsigned i = 0; i < vec.length (); ++i) |
4293 | vec[i] = saved[perm[i]]; |
4294 | for (unsigned i = 0; i < vec.length (); ++i) |
4295 | gcc_assert (vec[i] == saved[perm[i]]); |
4296 | } |
4297 | } |
4298 | |
4299 | /* Return the cfg loop that contains NODE. */ |
4300 | |
4301 | struct loop * |
4302 | vect_optimize_slp_pass::containing_loop (slp_tree node) |
4303 | { |
4304 | stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node); |
4305 | if (!rep) |
4306 | return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father; |
4307 | return gimple_bb (g: vect_orig_stmt (stmt_info: rep)->stmt)->loop_father; |
4308 | } |
4309 | |
4310 | /* Return true if UD (an edge from a use to a definition) is associated |
4311 | with a loop latch edge in the cfg. */ |
4312 | |
4313 | bool |
4314 | vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud) |
4315 | { |
4316 | slp_tree use = m_vertices[ud->src].node; |
4317 | slp_tree def = m_vertices[ud->dest].node; |
4318 | if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def |
4319 | || SLP_TREE_CODE (use) == VEC_PERM_EXPR) |
4320 | || SLP_TREE_DEF_TYPE (def) != vect_internal_def) |
4321 | return false; |
4322 | |
4323 | stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use)); |
4324 | return (is_a<gphi *> (p: use_rep->stmt) |
4325 | && bb_loop_header_p (gimple_bb (g: use_rep->stmt)) |
4326 | && containing_loop (node: def) == containing_loop (node: use)); |
4327 | } |
4328 | |
4329 | /* Build the graph. Mark edges that correspond to cfg loop latch edges with |
4330 | a nonnull data field. */ |
4331 | |
4332 | void |
4333 | vect_optimize_slp_pass::build_graph () |
4334 | { |
4335 | m_optimize_size = true; |
4336 | build_vertices (); |
4337 | |
4338 | m_slpg = new_graph (m_vertices.length ()); |
4339 | for (slpg_vertex &v : m_vertices) |
4340 | for (slp_tree child : SLP_TREE_CHILDREN (v.node)) |
4341 | if (child) |
4342 | { |
4343 | graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex); |
4344 | if (is_cfg_latch_edge (ud)) |
4345 | ud->data = this; |
4346 | } |
4347 | } |
4348 | |
4349 | /* Return true if E corresponds to a loop latch edge in the cfg. */ |
4350 | |
4351 | static bool |
4352 | skip_cfg_latch_edges (graph_edge *e) |
4353 | { |
4354 | return e->data; |
4355 | } |
4356 | |
4357 | /* Create the node partitions. */ |
4358 | |
4359 | void |
4360 | vect_optimize_slp_pass::create_partitions () |
4361 | { |
4362 | /* Calculate a postorder of the graph, ignoring edges that correspond |
4363 | to natural latch edges in the cfg. Reading the vector from the end |
4364 | to the beginning gives the reverse postorder. */ |
4365 | auto_vec<int> initial_rpo; |
4366 | graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo, |
4367 | false, NULL, skip_cfg_latch_edges); |
4368 | gcc_assert (initial_rpo.length () == m_vertices.length ()); |
4369 | |
4370 | /* Calculate the strongly connected components of the graph. */ |
4371 | auto_vec<int> scc_grouping; |
4372 | unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping); |
4373 | |
4374 | /* Create a new index order in which all nodes from the same SCC are |
4375 | consecutive. Use scc_pos to record the index of the first node in |
4376 | each SCC. */ |
4377 | auto_vec<unsigned int> scc_pos (num_sccs); |
4378 | int last_component = -1; |
4379 | unsigned int node_count = 0; |
4380 | for (unsigned int node_i : scc_grouping) |
4381 | { |
4382 | if (last_component != m_slpg->vertices[node_i].component) |
4383 | { |
4384 | last_component = m_slpg->vertices[node_i].component; |
4385 | gcc_assert (last_component == int (scc_pos.length ())); |
4386 | scc_pos.quick_push (obj: node_count); |
4387 | } |
4388 | node_count += 1; |
4389 | } |
4390 | gcc_assert (node_count == initial_rpo.length () |
4391 | && last_component + 1 == int (num_sccs)); |
4392 | |
4393 | /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes |
4394 | inside each SCC following the RPO we calculated above. The fact that |
4395 | we ignored natural latch edges when calculating the RPO should ensure |
4396 | that, for natural loop nests: |
4397 | |
4398 | - the first node that we encounter in a cfg loop is the loop header phi |
4399 | - the loop header phis are in dominance order |
4400 | |
4401 | Arranging for this is an optimization (see below) rather than a |
4402 | correctness issue. Unnatural loops with a tangled mess of backedges |
4403 | will still work correctly, but might give poorer results. |
4404 | |
4405 | Also update scc_pos so that it gives 1 + the index of the last node |
4406 | in the SCC. */ |
4407 | m_partitioned_nodes.safe_grow (len: node_count); |
4408 | for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;) |
4409 | { |
4410 | unsigned int node_i = initial_rpo[old_i]; |
4411 | unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++; |
4412 | m_partitioned_nodes[new_i] = node_i; |
4413 | } |
4414 | |
4415 | /* When optimizing for speed, partition each SCC based on the containing |
4416 | cfg loop. The order we constructed above should ensure that, for natural |
4417 | cfg loops, we'll create sub-SCC partitions for outer loops before |
4418 | the corresponding sub-SCC partitions for inner loops. Similarly, |
4419 | when one sibling loop A dominates another sibling loop B, we should |
4420 | create a sub-SCC partition for A before a sub-SCC partition for B. |
4421 | |
4422 | As above, nothing depends for correctness on whether this achieves |
4423 | a natural nesting, but we should get better results when it does. */ |
4424 | m_partitions.reserve (nelems: m_vertices.length ()); |
4425 | unsigned int next_partition_i = 0; |
4426 | hash_map<struct loop *, int> loop_partitions; |
4427 | unsigned int rpo_begin = 0; |
4428 | unsigned int num_partitioned_nodes = 0; |
4429 | for (unsigned int rpo_end : scc_pos) |
4430 | { |
4431 | loop_partitions.empty (); |
4432 | unsigned int partition_i = next_partition_i; |
4433 | for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i) |
4434 | { |
4435 | /* Handle externals and constants optimistically throughout. |
4436 | But treat existing vectors as fixed since we do not handle |
4437 | permuting them. */ |
4438 | unsigned int node_i = m_partitioned_nodes[rpo_i]; |
4439 | auto &vertex = m_vertices[node_i]; |
4440 | if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def |
4441 | && !SLP_TREE_VEC_DEFS (vertex.node).exists ()) |
4442 | || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def) |
4443 | vertex.partition = -1; |
4444 | else |
4445 | { |
4446 | bool existed; |
4447 | if (m_optimize_size) |
4448 | existed = next_partition_i > partition_i; |
4449 | else |
4450 | { |
4451 | struct loop *loop = containing_loop (node: vertex.node); |
4452 | auto &entry = loop_partitions.get_or_insert (k: loop, existed: &existed); |
4453 | if (!existed) |
4454 | entry = next_partition_i; |
4455 | partition_i = entry; |
4456 | } |
4457 | if (!existed) |
4458 | { |
4459 | m_partitions.quick_push (obj: slpg_partition_info ()); |
4460 | next_partition_i += 1; |
4461 | } |
4462 | vertex.partition = partition_i; |
4463 | num_partitioned_nodes += 1; |
4464 | m_partitions[partition_i].node_end += 1; |
4465 | } |
4466 | } |
4467 | rpo_begin = rpo_end; |
4468 | } |
4469 | |
4470 | /* Assign ranges of consecutive node indices to each partition, |
4471 | in partition order. Start with node_end being the same as |
4472 | node_begin so that the next loop can use it as a counter. */ |
4473 | unsigned int node_begin = 0; |
4474 | for (auto &partition : m_partitions) |
4475 | { |
4476 | partition.node_begin = node_begin; |
4477 | node_begin += partition.node_end; |
4478 | partition.node_end = partition.node_begin; |
4479 | } |
4480 | gcc_assert (node_begin == num_partitioned_nodes); |
4481 | |
4482 | /* Finally build the list of nodes in partition order. */ |
4483 | m_partitioned_nodes.truncate (size: num_partitioned_nodes); |
4484 | for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i) |
4485 | { |
4486 | int partition_i = m_vertices[node_i].partition; |
4487 | if (partition_i >= 0) |
4488 | { |
4489 | unsigned int order_i = m_partitions[partition_i].node_end++; |
4490 | m_partitioned_nodes[order_i] = node_i; |
4491 | } |
4492 | } |
4493 | } |
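
/* Editorial illustration (not from the original sources): suppose the nodes
   fall into two partitions, P0 with 2 nodes and P1 with 3.  After the
   counting done while assigning vertices, P0.node_end == 2 and
   P1.node_end == 3.  The prefix-sum loop above rewrites these to

     P0: node_begin == 0, node_end == 0
     P1: node_begin == 2, node_end == 2

   and the final loop bumps node_end as it places each node, ending with

     P0: node_begin == 0, node_end == 2
     P1: node_begin == 2, node_end == 5

   so that m_partitioned_nodes lists all of P0's nodes before P1's.  */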
4494 | |
4495 | /* Look for edges from earlier partitions into node NODE_I and edges from |
4496 | node NODE_I into later partitions. Call: |
4497 | |
4498 | FN (ud, other_node_i) |
4499 | |
4500 | for each such use-to-def edge ud, where other_node_i is the node at the |
4501 | other end of the edge. */ |
4502 | |
4503 | template<typename T> |
4504 | void |
4505 | vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn) |
4506 | { |
4507 | int partition_i = m_vertices[node_i].partition; |
4508 | for (graph_edge *pred = m_slpg->vertices[node_i].pred; |
4509 | pred; pred = pred->pred_next) |
4510 | { |
4511 | int src_partition_i = m_vertices[pred->src].partition; |
4512 | if (src_partition_i >= 0 && src_partition_i != partition_i) |
4513 | fn (pred, pred->src); |
4514 | } |
4515 | for (graph_edge *succ = m_slpg->vertices[node_i].succ; |
4516 | succ; succ = succ->succ_next) |
4517 | { |
4518 | int dest_partition_i = m_vertices[succ->dest].partition; |
4519 | if (dest_partition_i >= 0 && dest_partition_i != partition_i) |
4520 | fn (succ, succ->dest); |
4521 | } |
4522 | } |
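
/* Editorial usage sketch (hypothetical caller, not part of the pass):

     unsigned int n_cross = 0;
     for_each_partition_edge (node_i, [&](graph_edge *, unsigned int)
       { n_cross += 1; });

   counts NODE_I's cross-partition edges.  Edges whose other end lies in
   the same partition, or in no materialized partition (partition == -1),
   are never visited.  */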
4523 | |
4524 | /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes |
4525 | that NODE would operate on. This test is independent of NODE's actual |
4526 | operation. */ |
4527 | |
4528 | bool |
4529 | vect_optimize_slp_pass::is_compatible_layout (slp_tree node, |
4530 | unsigned int layout_i) |
4531 | { |
4532 | if (layout_i == 0) |
4533 | return true; |
4534 | |
4535 | if (SLP_TREE_LANES (node) != m_perms[layout_i].length ()) |
4536 | return false; |
4537 | |
4538 | return true; |
4539 | } |
4540 | |
4541 | /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I |
4542 | to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the |
4543 | layouts is incompatible with NODE or if the change is not possible for |
4544 | some other reason. |
4545 | |
4546 | The properties taken from NODE include the number of lanes and the |
4547 | vector type. The actual operation doesn't matter. */ |
4548 | |
4549 | int |
4550 | vect_optimize_slp_pass::change_layout_cost (slp_tree node, |
4551 | unsigned int from_layout_i, |
4552 | unsigned int to_layout_i) |
4553 | { |
4554 | if (!is_compatible_layout (node, from_layout_i) |
4555 | || !is_compatible_layout (node, to_layout_i)) |
4556 | return -1; |
4557 | |
4558 | if (from_layout_i == to_layout_i) |
4559 | return 0; |
4560 | |
4561 | auto_vec<slp_tree, 1> children (1); |
4562 | children.quick_push (node); |
4563 | auto_lane_permutation_t perm (SLP_TREE_LANES (node)); |
4564 | if (from_layout_i > 0) |
4565 | for (unsigned int i : m_perms[from_layout_i]) |
4566 | perm.quick_push ({ 0, i }); |
4567 | else |
4568 | for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i) |
4569 | perm.quick_push ({ 0, i }); |
4570 | if (to_layout_i > 0) |
4571 | vect_slp_permute (m_perms[to_layout_i], perm, true); |
4572 | auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm, |
4573 | children, false); |
4574 | if (count >= 0) |
4575 | return MAX (count, 1); |
4576 | |
4577 | /* ??? In principle we could try changing via layout 0, giving two |
4578 | layout changes rather than 1. Doing that would require |
4579 | corresponding support in get_result_with_layout. */ |
4580 | return -1; |
4581 | } |
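
/* Editorial illustration: assuming vect_slp_permute with REVERSE true
   applies the inverse of the given permutation, and taking the involutions
   m_perms[1] = { 1, 0, 3, 2 } and m_perms[2] = { 2, 3, 0, 1 } on a 4-lane
   node, going from layout 1 to layout 2 first builds

     perm = { {0,1}, {0,0}, {0,3}, {0,2} }

   and then reorders it into

     perm = { {0,3}, {0,2}, {0,1}, {0,0} }

   which is the single VEC_PERM_EXPR that would convert data held in
   layout 1 into layout 2.  */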
4582 | |
4583 | /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */ |
4584 | |
4585 | inline slpg_partition_layout_costs & |
4586 | vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i, |
4587 | unsigned int layout_i) |
4588 | { |
4589 | return m_partition_layout_costs[partition_i * m_perms.length () + layout_i]; |
4590 | } |
4591 | |
4592 | /* Change PERM in one of two ways: |
4593 | |
4594 | - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been |
4595 | chosen for child I of NODE. |
4596 | |
4597 | - if IN_LAYOUT_I >= 0, accept all input operands with that layout. |
4598 | |
4599 | In both cases, arrange for the output to have layout OUT_LAYOUT_I. */ |
4600 | |
4601 | void |
4602 | vect_optimize_slp_pass:: |
4603 | change_vec_perm_layout (slp_tree node, lane_permutation_t &perm, |
4604 | int in_layout_i, unsigned int out_layout_i) |
4605 | { |
4606 | for (auto &entry : perm) |
4607 | { |
4608 | int this_in_layout_i = in_layout_i; |
4609 | if (this_in_layout_i < 0) |
4610 | { |
4611 | slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first]; |
4612 | unsigned int in_partition_i = m_vertices[in_node->vertex].partition; |
4613 | this_in_layout_i = m_partitions[in_partition_i].layout; |
4614 | } |
4615 | if (this_in_layout_i > 0) |
4616 | entry.second = m_perms[this_in_layout_i][entry.second]; |
4617 | } |
4618 | if (out_layout_i > 0) |
4619 | vect_slp_permute (m_perms[out_layout_i], perm, true); |
4620 | } |
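
/* Editorial illustration: with the involution m_perms[1] = { 1, 0, 3, 2 }
   chosen for child 0's partition and IN_LAYOUT_I < 0, an entry { 0, 2 }
   that used to select lane 2 of child 0 is relabelled to { 0, 3 },
   matching where that data now sits; a nonzero OUT_LAYOUT_I then
   additionally reorders PERM as a whole.  */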
4621 | |
4622 | /* Check whether the target allows NODE to be rearranged so that the node's |
4623 | output has layout OUT_LAYOUT_I. Return the cost of the change if so, |
4624 | in the same arbitrary units as for change_layout_cost. Return -1 otherwise. |
4625 | |
4626 | If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether |
4627 | NODE can adapt to the layout changes that have (perhaps provisionally) |
4628 | been chosen for NODE's children, so that no extra permutations are |
4629 | needed on either the input or the output of NODE. |
4630 | |
4631 | If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume |
4632 | that all inputs will be forced into layout IN_LAYOUT_I beforehand. |
4633 | |
4634 | IN_LAYOUT_I has no meaning for other types of node. |
4635 | |
4636 | Keeping the node as-is is always valid. If the target doesn't appear |
4637 | to support the node as-is, but might realistically support other layouts, |
4638 | then layout 0 instead has the cost of a worst-case permutation. On the |
4639 | one hand, this ensures that every node has at least one valid layout, |
4640 | avoiding what would otherwise be an awkward special case. On the other, |
4641 | it still encourages the pass to change an invalid pre-existing layout |
4642 | choice into a valid one. */ |
4643 | |
4644 | int |
4645 | vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i, |
4646 | unsigned int out_layout_i) |
4647 | { |
4648 | const int fallback_cost = 1; |
4649 | |
4650 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
4651 | { |
4652 | auto_lane_permutation_t tmp_perm; |
4653 | tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node)); |
4654 | |
4655 | /* Check that the child nodes support the chosen layout. Checking |
4656 | the first child is enough, since any second child would have the |
4657 | same shape. */ |
4658 | auto first_child = SLP_TREE_CHILDREN (node)[0]; |
4659 | if (in_layout_i > 0 |
4660 | && !is_compatible_layout (first_child, in_layout_i)) |
4661 | return -1; |
4662 | |
4663 | change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i); |
4664 | int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, |
4665 | node, tmp_perm, |
4666 | SLP_TREE_CHILDREN (node), |
4667 | false); |
4668 | if (count < 0) |
4669 | { |
4670 | if (in_layout_i == 0 && out_layout_i == 0) |
4671 | { |
4672 | /* Use the fallback cost if the node could in principle support |
4673 | some nonzero layout for both the inputs and the outputs. |
4674 | Otherwise assume that the node will be rejected later |
4675 | and rebuilt from scalars. */ |
4676 | if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child)) |
4677 | return fallback_cost; |
4678 | return 0; |
4679 | } |
4680 | return -1; |
4681 | } |
4682 | |
4683 | /* We currently have no way of telling whether the new layout is cheaper |
4684 | or more expensive than the old one. But at least in principle, |
4685 | it should be worth making zero permutations (whole-vector shuffles) |
4686 | cheaper than real permutations, in case the pass is able to remove |
4687 | the latter. */ |
4688 | return count == 0 ? 0 : 1; |
4689 | } |
4690 | |
4691 | stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node); |
4692 | if (rep |
4693 | && STMT_VINFO_DATA_REF (rep) |
4694 | && DR_IS_READ (STMT_VINFO_DATA_REF (rep)) |
4695 | && SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
4696 | { |
4697 | auto_load_permutation_t tmp_perm; |
4698 | tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node)); |
4699 | if (out_layout_i > 0) |
4700 | vect_slp_permute (m_perms[out_layout_i], tmp_perm, true); |
4701 | |
4702 | poly_uint64 vf = 1; |
4703 | if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo)) |
4704 | vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
4705 | unsigned int n_perms; |
4706 | if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL, |
4707 | nullptr, vf, true, false, &n_perms)) |
4708 | { |
4709 | auto rep = SLP_TREE_REPRESENTATIVE (node); |
4710 | if (out_layout_i == 0) |
4711 | { |
4712 | /* Use the fallback cost if the load is an N-to-N permutation. |
4713 | Otherwise assume that the node will be rejected later |
4714 | and rebuilt from scalars. */ |
4715 | if (STMT_VINFO_GROUPED_ACCESS (rep) |
4716 | && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep)) |
4717 | == SLP_TREE_LANES (node))) |
4718 | return fallback_cost; |
4719 | return 0; |
4720 | } |
4721 | return -1; |
4722 | } |
4723 | |
4724 | /* See the comment above the corresponding VEC_PERM_EXPR handling. */ |
4725 | return n_perms == 0 ? 0 : 1; |
4726 | } |
4727 | |
4728 | return 0; |
4729 | } |
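
/* Editorial note: the return value acts as a factor on the node's
   execution weight in the cost model (see forward_pass): -1 rejects the
   (input, output) layout pair outright, 0 means the layouts can be
   absorbed for free, and a positive factor (here at most the fallback
   cost of 1) is multiplied by the node's weight when costing it.  */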
4730 | |
4731 | /* Decide which element layouts we should consider using. Calculate the |
4732 | weights associated with inserting layout changes on partition edges. |
4733 | Also mark partitions that cannot change layout, by setting their |
4734 | layout to zero. */ |
4735 | |
4736 | void |
4737 | vect_optimize_slp_pass::start_choosing_layouts () |
4738 | { |
4739 | /* Used to assign unique permutation indices. */ |
4740 | using perm_hash = unbounded_hashmap_traits< |
4741 | vec_free_hash_base<int_hash_base<unsigned>>, |
4742 | int_hash<int, -1, -2> |
4743 | >; |
4744 | hash_map<vec<unsigned>, int, perm_hash> layout_ids; |
4745 | |
4746 | /* Layout 0 is "no change". */ |
4747 | m_perms.safe_push (vNULL); |
4748 | |
4749 | /* Create layouts from existing permutations. */ |
4750 | auto_load_permutation_t tmp_perm; |
4751 | for (unsigned int node_i : m_partitioned_nodes) |
4752 | { |
4753 | /* Leaves also double as entries to the reverse graph. Allow the |
4754 | layout of those to be changed. */ |
4755 | auto &vertex = m_vertices[node_i]; |
4756 | auto &partition = m_partitions[vertex.partition]; |
4757 | if (!m_slpg->vertices[node_i].succ) |
4758 | partition.layout = 0; |
4759 | |
4760 | /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */ |
4761 | slp_tree node = vertex.node; |
4762 | stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node); |
4763 | slp_tree child; |
4764 | unsigned HOST_WIDE_INT imin, imax = 0; |
4765 | bool any_permute = false; |
4766 | tmp_perm.truncate (0); |
4767 | if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
4768 | { |
4769 | /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node |
4770 | unpermuted, record a layout that reverses this permutation. |
4771 | |
4772 | We would need more work to cope with loads that are internally |
4773 | permuted and also have inputs (such as masks for |
4774 | IFN_MASK_LOADs). */ |
4775 | gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ); |
4776 | if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt)) |
4777 | { |
4778 | partition.layout = -1; |
4779 | continue; |
4780 | } |
4781 | dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt); |
4782 | imin = DR_GROUP_SIZE (dr_stmt) + 1; |
4783 | tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node)); |
4784 | } |
4785 | else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR |
4786 | && SLP_TREE_CHILDREN (node).length () == 1 |
4787 | && (child = SLP_TREE_CHILDREN (node)[0]) |
4788 | && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child)) |
4789 | .is_constant (&imin))) |
4790 | { |
4791 | /* If the child has the same vector size as this node, |
4792 | reversing the permutation can make the permutation a no-op. |
4793 | In other cases it can change a true permutation into a |
4794 | full-vector extract. */ |
4795 | tmp_perm.reserve (SLP_TREE_LANES (node)); |
4796 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4797 | tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second); |
4798 | } |
4799 | else |
4800 | continue; |
4801 | |
4802 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4803 | { |
4804 | unsigned idx = tmp_perm[j]; |
4805 | imin = MIN (imin, idx); |
4806 | imax = MAX (imax, idx); |
4807 | if (idx - tmp_perm[0] != j) |
4808 | any_permute = true; |
4809 | } |
4810 | /* If the span doesn't match we'd disrupt VF computation; avoid |
4811 | that for now. */ |
4812 | if (imax - imin + 1 != SLP_TREE_LANES (node)) |
4813 | continue; |
4814 | /* If there's no permute no need to split one out. In this case |
4815 | we can consider turning a load into a permuted load, if that |
4816 | turns out to be cheaper than alternatives. */ |
4817 | if (!any_permute) |
4818 | { |
4819 | partition.layout = -1; |
4820 | continue; |
4821 | } |
4822 | |
4823 | /* For now only handle true permutes, as |
4824 | vect_attempt_slp_rearrange_stmts did. This allows us to be lazy |
4825 | when permuting constants and invariants, keeping the permute |
4826 | bijective. */ |
4827 | auto_sbitmap load_index (SLP_TREE_LANES (node)); |
4828 | bitmap_clear (load_index); |
4829 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4830 | bitmap_set_bit (load_index, tmp_perm[j] - imin); |
4831 | unsigned j; |
4832 | for (j = 0; j < SLP_TREE_LANES (node); ++j) |
4833 | if (!bitmap_bit_p (load_index, j)) |
4834 | break; |
4835 | if (j != SLP_TREE_LANES (node)) |
4836 | continue; |
4837 | |
4838 | vec<unsigned> perm = vNULL; |
4839 | perm.safe_grow (SLP_TREE_LANES (node), true); |
4840 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4841 | perm[j] = tmp_perm[j] - imin; |
4842 | |
4843 | if (int (m_perms.length ()) >= param_vect_max_layout_candidates) |
4844 | { |
4845 | /* Continue to use existing layouts, but don't add any more. */ |
4846 | int *entry = layout_ids.get (perm); |
4847 | partition.layout = entry ? *entry : 0; |
4848 | perm.release (); |
4849 | } |
4850 | else |
4851 | { |
4852 | bool existed; |
4853 | int &layout_i = layout_ids.get_or_insert (perm, &existed); |
4854 | if (existed) |
4855 | perm.release (); |
4856 | else |
4857 | { |
4858 | layout_i = m_perms.length (); |
4859 | m_perms.safe_push (perm); |
4860 | } |
4861 | partition.layout = layout_i; |
4862 | } |
4863 | } |
4864 | |
4865 | /* Initially assume that every layout is possible and has zero cost |
4866 | in every partition. */ |
4867 | m_partition_layout_costs.safe_grow_cleared (m_partitions.length () |
4868 | * m_perms.length ()); |
4869 | |
4870 | /* We have to mark as to-be-materialized the outgoing permutations that |
4871 | face graph entries for non-associating reductions, since such entries |
4872 | are not themselves represented in the graph. slp_inst_kind_bb_reduc |
4873 | currently only covers associatable reductions. */ |
4873 | for (slp_instance instance : m_vinfo->slp_instances) |
4874 | if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor) |
4875 | { |
4876 | unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex; |
4877 | m_partitions[m_vertices[node_i].partition].layout = 0; |
4878 | } |
4879 | else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain) |
4880 | { |
4881 | stmt_vec_info stmt_info |
4882 | = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance)); |
4883 | stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info); |
4884 | if (needs_fold_left_reduction_p (TREE_TYPE |
4885 | (gimple_get_lhs (stmt_info->stmt)), |
4886 | STMT_VINFO_REDUC_CODE (reduc_info))) |
4887 | { |
4888 | unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex; |
4889 | m_partitions[m_vertices[node_i].partition].layout = 0; |
4890 | } |
4891 | } |
4892 | |
4893 | /* Check which layouts each node and partition can handle. Calculate the |
4894 | weights associated with inserting layout changes on edges. */ |
4895 | for (unsigned int node_i : m_partitioned_nodes) |
4896 | { |
4897 | auto &vertex = m_vertices[node_i]; |
4898 | auto &partition = m_partitions[vertex.partition]; |
4899 | slp_tree node = vertex.node; |
4900 | |
4901 | if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node)) |
4902 | { |
4903 | vertex.weight = vect_slp_node_weight (node); |
4904 | |
4905 | /* We do not handle stores with a permutation, so all |
4906 | incoming permutations must have been materialized. |
4907 | |
4908 | We also don't handle masked grouped loads, which lack a |
4909 | permutation vector. In this case the memory locations |
4910 | form an implicit second input to the loads, on top of the |
4911 | explicit mask input, and the memory input's layout cannot |
4912 | be changed. |
4913 | |
4914 | On the other hand, we do support permuting gather loads and |
4915 | masked gather loads, where each scalar load is independent |
4916 | of the others. This can be useful if the address/index input |
4917 | benefits from permutation. */ |
4918 | if (STMT_VINFO_DATA_REF (rep) |
4919 | && STMT_VINFO_GROUPED_ACCESS (rep) |
4920 | && !SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
4921 | partition.layout = 0; |
4922 | |
4923 | /* We cannot change the layout of an operation that does not |
4924 | operate on each lane independently. Note this is an explicit |
4925 | negative list since that's much shorter than the respective |
4926 | positive one, but it's critical to keep maintaining it. */ |
4927 | if (is_gimple_call (STMT_VINFO_STMT (rep))) |
4928 | switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep))) |
4929 | { |
4930 | case CFN_COMPLEX_ADD_ROT90: |
4931 | case CFN_COMPLEX_ADD_ROT270: |
4932 | case CFN_COMPLEX_MUL: |
4933 | case CFN_COMPLEX_MUL_CONJ: |
4934 | case CFN_VEC_ADDSUB: |
4935 | case CFN_VEC_FMADDSUB: |
4936 | case CFN_VEC_FMSUBADD: |
4937 | partition.layout = 0; |
4938 | default:; |
4939 | } |
4940 | } |
4941 | |
4942 | auto process_edge = [&](graph_edge *ud, unsigned int other_node_i) |
4943 | { |
4944 | auto &other_vertex = m_vertices[other_node_i]; |
4945 | |
4946 | /* Count the number of edges from earlier partitions and the number |
4947 | of edges to later partitions. */ |
4948 | if (other_vertex.partition < vertex.partition) |
4949 | partition.in_degree += 1; |
4950 | else |
4951 | partition.out_degree += 1; |
4952 | |
4953 | /* If the current node uses the result of OTHER_NODE_I, accumulate |
4954 | the effects of that. */ |
4955 | if (ud->src == int (node_i)) |
4956 | { |
4957 | other_vertex.out_weight += vertex.weight; |
4958 | other_vertex.out_degree += 1; |
4959 | } |
4960 | }; |
4961 | for_each_partition_edge (node_i, process_edge); |
4962 | } |
4963 | } |
4964 | |
4965 | /* Return the incoming costs for node NODE_I, assuming that each input keeps |
4966 | its current (provisional) choice of layout. The inputs do not necessarily |
4967 | have the same layout as each other. */ |
4968 | |
4969 | slpg_layout_cost |
4970 | vect_optimize_slp_pass::total_in_cost (unsigned int node_i) |
4971 | { |
4972 | auto &vertex = m_vertices[node_i]; |
4973 | slpg_layout_cost cost; |
4974 | auto add_cost = [&](graph_edge *, unsigned int other_node_i) |
4975 | { |
4976 | auto &other_vertex = m_vertices[other_node_i]; |
4977 | if (other_vertex.partition < vertex.partition) |
4978 | { |
4979 | auto &other_partition = m_partitions[other_vertex.partition]; |
4980 | auto &other_costs = partition_layout_costs (other_vertex.partition, |
4981 | other_partition.layout); |
4982 | slpg_layout_cost this_cost = other_costs.in_cost; |
4983 | this_cost.add_serial_cost (other_costs.internal_cost); |
4984 | this_cost.split (other_partition.out_degree); |
4985 | cost.add_parallel_cost (this_cost); |
4986 | } |
4987 | }; |
4988 | for_each_partition_edge (node_i, add_cost); |
4989 | return cost; |
4990 | } |
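
/* Editorial illustration, assuming slpg_layout_cost::split spreads a
   partition's cost evenly over its out_degree consumers: if predecessor
   P1 has in_cost + internal_cost totalling 8 units and out_degree 2,
   this node is charged 4 units for P1; a second predecessor P2
   contributing 6 units over 3 consumers adds 2 more, combined with
   add_parallel_cost rather than serially.  This division is what stops
   one expensive, widely shared definition from being billed in full at
   every one of its uses.  */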
4991 | |
4992 | /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I) |
4993 | and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return |
4994 | slpg_layout_cost::impossible () if the change isn't possible. */ |
4995 | |
4996 | slpg_layout_cost |
4997 | vect_optimize_slp_pass:: |
4998 | edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i, |
4999 | unsigned int layout2_i) |
5000 | { |
5001 | auto &def_vertex = m_vertices[ud->dest]; |
5002 | auto &use_vertex = m_vertices[ud->src]; |
5003 | auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i; |
5004 | auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i; |
5005 | auto factor = change_layout_cost (def_vertex.node, def_layout_i, |
5006 | use_layout_i); |
5007 | if (factor < 0) |
5008 | return slpg_layout_cost::impossible (); |
5009 | |
5010 | /* We have a choice of putting the layout change at the site of the |
5011 | definition or at the site of the use. Prefer the former when |
5012 | optimizing for size or when the execution frequency of the |
5013 | definition is no greater than the combined execution frequencies of |
5014 | the uses. When putting the layout change at the site of the definition, |
5015 | divvy up the cost among all consumers. */ |
5016 | if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight) |
5017 | { |
5018 | slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size }; |
5019 | cost.split (def_vertex.out_degree); |
5020 | return cost; |
5021 | } |
5022 | return { use_vertex.weight * factor, m_optimize_size }; |
5023 | } |
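
/* Editorial illustration: with FACTOR 1, a definition of weight 2.0,
   out_degree 2 and combined consumer weight (out_weight) 12.0, the
   2.0 <= 12.0 test above places the change at the definition, and each
   edge is charged 2.0 / 2 = 1.0.  Materializing at a use would instead
   cost that use's full weight, say 6.0, so the def site wins here.  */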
5024 | |
5025 | /* UD represents a use-def link between FROM_NODE_I and a node in a later |
5026 | partition; FROM_NODE_I could be the definition node or the use node. |
5027 | The node at the other end of the link wants to use layout TO_LAYOUT_I. |
5028 | Return the cost of any necessary fix-ups on edge UD, or return |
5029 | slpg_layout_cost::impossible () if the change isn't possible. |
5030 | |
5031 | At this point, FROM_NODE_I's partition has chosen the cheapest |
5032 | layout based on the information available so far, but this choice |
5033 | is only provisional. */ |
5034 | |
5035 | slpg_layout_cost |
5036 | vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i, |
5037 | unsigned int to_layout_i) |
5038 | { |
5039 | auto &from_vertex = m_vertices[from_node_i]; |
5040 | unsigned int from_partition_i = from_vertex.partition; |
5041 | slpg_partition_info &from_partition = m_partitions[from_partition_i]; |
5042 | gcc_assert (from_partition.layout >= 0); |
5043 | |
5044 | /* First calculate the cost on the assumption that FROM_PARTITION sticks |
5045 | with its current layout preference. */ |
5046 | slpg_layout_cost cost = slpg_layout_cost::impossible (); |
5047 | auto edge_cost = edge_layout_cost (ud, from_node_i, |
5048 | from_partition.layout, to_layout_i); |
5049 | if (edge_cost.is_possible ()) |
5050 | { |
5051 | auto &from_costs = partition_layout_costs (from_partition_i, |
5052 | from_partition.layout); |
5053 | cost = from_costs.in_cost; |
5054 | cost.add_serial_cost (from_costs.internal_cost); |
5055 | cost.split (from_partition.out_degree); |
5056 | cost.add_serial_cost (edge_cost); |
5057 | } |
5058 | else if (from_partition.layout == 0) |
5059 | /* We must allow the source partition to have layout 0 as a fallback, |
5060 | in case all other options turn out to be impossible. */ |
5061 | return cost; |
5062 | |
5063 | /* Take the minimum of that cost and the cost that applies if |
5064 | FROM_PARTITION instead switches to TO_LAYOUT_I. */ |
5065 | auto &direct_layout_costs = partition_layout_costs (from_partition_i, |
5066 | to_layout_i); |
5067 | if (direct_layout_costs.is_possible ()) |
5068 | { |
5069 | slpg_layout_cost direct_cost = direct_layout_costs.in_cost; |
5070 | direct_cost.add_serial_cost (direct_layout_costs.internal_cost); |
5071 | direct_cost.split (from_partition.out_degree); |
5072 | if (!cost.is_possible () |
5073 | || direct_cost.is_better_than (cost, m_optimize_size)) |
5074 | cost = direct_cost; |
5075 | } |
5076 | |
5077 | return cost; |
5078 | } |
5079 | |
5080 | /* UD represents a use-def link between TO_NODE_I and a node in an earlier |
5081 | partition; TO_NODE_I could be the definition node or the use node. |
5082 | The node at the other end of the link wants to use layout FROM_LAYOUT_I; |
5083 | return the cost of any necessary fix-ups on edge UD, or |
5084 | slpg_layout_cost::impossible () if the choice cannot be made. |
5085 | |
5086 | At this point, TO_NODE_I's partition has a fixed choice of layout. */ |
5087 | |
5088 | slpg_layout_cost |
5089 | vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i, |
5090 | unsigned int from_layout_i) |
5091 | { |
5092 | auto &to_vertex = m_vertices[to_node_i]; |
5093 | unsigned int to_partition_i = to_vertex.partition; |
5094 | slpg_partition_info &to_partition = m_partitions[to_partition_i]; |
5095 | gcc_assert (to_partition.layout >= 0); |
5096 | |
5097 | /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be |
5098 | adjusted for this input having layout FROM_LAYOUT_I. Assume that |
5099 | any other inputs keep their current choice of layout. */ |
5100 | auto &to_costs = partition_layout_costs (to_partition_i, |
5101 | to_partition.layout); |
5102 | if (ud->src == int (to_node_i) |
5103 | && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR) |
5104 | { |
5105 | auto &from_partition = m_partitions[m_vertices[ud->dest].partition]; |
5106 | auto old_layout = from_partition.layout; |
5107 | from_partition.layout = from_layout_i; |
5108 | int factor = internal_node_cost (to_vertex.node, -1, |
5109 | to_partition.layout); |
5110 | from_partition.layout = old_layout; |
5111 | if (factor >= 0) |
5112 | { |
5113 | slpg_layout_cost cost = to_costs.out_cost; |
5114 | cost.add_serial_cost ({ to_vertex.weight * factor, |
5115 | m_optimize_size }); |
5116 | cost.split (to_partition.in_degree); |
5117 | return cost; |
5118 | } |
5119 | } |
5120 | |
5121 | /* Compute the cost if we insert any necessary layout change on edge UD. */ |
5122 | auto edge_cost = edge_layout_cost (ud, to_node_i, |
5123 | to_partition.layout, from_layout_i); |
5124 | if (edge_cost.is_possible ()) |
5125 | { |
5126 | slpg_layout_cost cost = to_costs.out_cost; |
5127 | cost.add_serial_cost (to_costs.internal_cost); |
5128 | cost.split (to_partition.in_degree); |
5129 | cost.add_serial_cost (edge_cost); |
5130 | return cost; |
5131 | } |
5132 | |
5133 | return slpg_layout_cost::impossible (); |
5134 | } |
5135 | |
5136 | /* Make a forward pass through the partitions, accumulating input costs. |
5137 | Make a tentative (provisional) choice of layout for each partition, |
5138 | ensuring that this choice still allows later partitions to keep |
5139 | their original layout. */ |
5140 | |
5141 | void |
5142 | vect_optimize_slp_pass::forward_pass () |
5143 | { |
5144 | for (unsigned int partition_i = 0; partition_i < m_partitions.length (); |
5145 | ++partition_i) |
5146 | { |
5147 | auto &partition = m_partitions[partition_i]; |
5148 | |
5149 | /* If the partition consists of a single VEC_PERM_EXPR, precompute |
5150 | the incoming cost that would apply if every predecessor partition |
5151 | keeps its current layout. This is used within the loop below. */ |
5152 | slpg_layout_cost in_cost; |
5153 | slp_tree single_node = nullptr; |
5154 | if (partition.node_end == partition.node_begin + 1) |
5155 | { |
5156 | unsigned int node_i = m_partitioned_nodes[partition.node_begin]; |
5157 | single_node = m_vertices[node_i].node; |
5158 | if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR) |
5159 | in_cost = total_in_cost (node_i); |
5160 | } |
5161 | |
5162 | /* Go through the possible layouts. Decide which ones are valid |
5163 | for this partition and record which of the valid layouts has |
5164 | the lowest cost. */ |
5165 | unsigned int min_layout_i = 0; |
5166 | slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible (); |
5167 | for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i) |
5168 | { |
5169 | auto &layout_costs = partition_layout_costs (partition_i, layout_i); |
5170 | if (!layout_costs.is_possible ()) |
5171 | continue; |
5172 | |
5173 | /* If the recorded layout is already 0 then the layout cannot |
5174 | change. */ |
5175 | if (partition.layout == 0 && layout_i != 0) |
5176 | { |
5177 | layout_costs.mark_impossible (); |
5178 | continue; |
5179 | } |
5180 | |
5181 | bool is_possible = true; |
5182 | for (unsigned int order_i = partition.node_begin; |
5183 | order_i < partition.node_end; ++order_i) |
5184 | { |
5185 | unsigned int node_i = m_partitioned_nodes[order_i]; |
5186 | auto &vertex = m_vertices[node_i]; |
5187 | |
5188 | /* Reject the layout if it is individually incompatible |
5189 | with any node in the partition. */ |
5190 | if (!is_compatible_layout (vertex.node, layout_i)) |
5191 | { |
5192 | is_possible = false; |
5193 | break; |
5194 | } |
5195 | |
5196 | auto add_cost = [&](graph_edge *ud, unsigned int other_node_i) |
5197 | { |
5198 | auto &other_vertex = m_vertices[other_node_i]; |
5199 | if (other_vertex.partition < vertex.partition) |
5200 | { |
5201 | /* Accumulate the incoming costs from earlier |
5202 | partitions, plus the cost of any layout changes |
5203 | on UD itself. */ |
5204 | auto cost = forward_cost (ud, other_node_i, layout_i); |
5205 | if (!cost.is_possible ()) |
5206 | is_possible = false; |
5207 | else |
5208 | layout_costs.in_cost.add_parallel_cost (cost); |
5209 | } |
5210 | else |
5211 | /* Reject the layout if it would make layout 0 impossible |
5212 | for later partitions. This amounts to testing that the |
5213 | target supports reversing the layout change on edges |
5214 | to later partitions. |
5215 | |
5216 | In principle, it might be possible to push a layout |
5217 | change all the way down a graph, so that it never |
5218 | needs to be reversed and so that the target doesn't |
5219 | need to support the reverse operation. But it would |
5220 | be awkward to bail out if we hit a partition that |
5221 | does not support the new layout, especially since |
5222 | we are not dealing with a lattice. */ |
5223 | is_possible &= edge_layout_cost (ud, other_node_i, 0, |
5224 | layout_i).is_possible (); |
5225 | }; |
5226 | for_each_partition_edge (node_i, add_cost); |
5227 | |
5228 | /* Accumulate the cost of using LAYOUT_I within NODE, |
5229 | both for the inputs and the outputs. */ |
5230 | int factor = internal_node_cost (vertex.node, layout_i, |
5231 | layout_i); |
5232 | if (factor < 0) |
5233 | { |
5234 | is_possible = false; |
5235 | break; |
5236 | } |
5237 | else if (factor) |
5238 | layout_costs.internal_cost.add_serial_cost |
5239 | ({ vertex.weight * factor, m_optimize_size }); |
5240 | } |
5241 | if (!is_possible) |
5242 | { |
5243 | layout_costs.mark_impossible (); |
5244 | continue; |
5245 | } |
5246 | |
5247 | /* Combine the incoming and partition-internal costs. */ |
5248 | slpg_layout_cost combined_cost = layout_costs.in_cost; |
5249 | combined_cost.add_serial_cost (layout_costs.internal_cost); |
5250 | |
5251 | /* If this partition consists of a single VEC_PERM_EXPR, see |
5252 | if the VEC_PERM_EXPR can be changed to support output layout |
5253 | LAYOUT_I while keeping all the provisional choices of input |
5254 | layout. */ |
5255 | if (single_node |
5256 | && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR) |
5257 | { |
5258 | int factor = internal_node_cost (single_node, -1, layout_i); |
5259 | if (factor >= 0) |
5260 | { |
5261 | auto weight = m_vertices[single_node->vertex].weight; |
5262 | slpg_layout_cost internal_cost |
5263 | = { weight * factor, m_optimize_size }; |
5264 | |
5265 | slpg_layout_cost alt_cost = in_cost; |
5266 | alt_cost.add_serial_cost (internal_cost); |
5267 | if (alt_cost.is_better_than (combined_cost, m_optimize_size)) |
5268 | { |
5269 | combined_cost = alt_cost; |
5270 | layout_costs.in_cost = in_cost; |
5271 | layout_costs.internal_cost = internal_cost; |
5272 | } |
5273 | } |
5274 | } |
5275 | |
5276 | /* Record the layout with the lowest cost. Prefer layout 0 in |
5277 | the event of a tie between it and another layout. */ |
5278 | if (!min_layout_cost.is_possible () |
5279 | || combined_cost.is_better_than (min_layout_cost, |
5280 | m_optimize_size)) |
5281 | { |
5282 | min_layout_i = layout_i; |
5283 | min_layout_cost = combined_cost; |
5284 | } |
5285 | } |
5286 | |
5287 | /* This loop's handling of earlier partitions should ensure that |
5288 | choosing the original layout for the current partition is no |
5289 | less valid than it was in the original graph, even with the |
5290 | provisional layout choices for those earlier partitions. */ |
5291 | gcc_assert (min_layout_cost.is_possible ()); |
5292 | partition.layout = min_layout_i; |
5293 | } |
5294 | } |
5295 | |
5296 | /* Make a backward pass through the partitions, accumulating output costs. |
5297 | Make a final choice of layout for each partition. */ |
5298 | |
5299 | void |
5300 | vect_optimize_slp_pass::backward_pass () |
5301 | { |
5302 | for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;) |
5303 | { |
5304 | auto &partition = m_partitions[partition_i]; |
5305 | |
5306 | unsigned int min_layout_i = 0; |
5307 | slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible (); |
5308 | for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i) |
5309 | { |
5310 | auto &layout_costs = partition_layout_costs (partition_i, layout_i); |
5311 | if (!layout_costs.is_possible ()) |
5312 | continue; |
5313 | |
5314 | /* Accumulate the costs from successor partitions. */ |
5315 | bool is_possible = true; |
5316 | for (unsigned int order_i = partition.node_begin; |
5317 | order_i < partition.node_end; ++order_i) |
5318 | { |
5319 | unsigned int node_i = m_partitioned_nodes[order_i]; |
5320 | auto &vertex = m_vertices[node_i]; |
5321 | auto add_cost = [&](graph_edge *ud, unsigned int other_node_i) |
5322 | { |
5323 | auto &other_vertex = m_vertices[other_node_i]; |
5324 | auto &other_partition = m_partitions[other_vertex.partition]; |
5325 | if (other_vertex.partition > vertex.partition) |
5326 | { |
5327 | /* Accumulate the incoming costs from later |
5328 | partitions, plus the cost of any layout changes |
5329 | on UD itself. */ |
5330 | auto cost = backward_cost (ud, other_node_i, layout_i); |
5331 | if (!cost.is_possible ()) |
5332 | is_possible = false; |
5333 | else |
5334 | layout_costs.out_cost.add_parallel_cost (cost); |
5335 | } |
5336 | else |
5337 | /* Make sure that earlier partitions can (if necessary |
5338 | or beneficial) keep the layout that they chose in |
5339 | the forward pass. This ensures that there is at |
5340 | least one valid choice of layout. */ |
5341 | is_possible &= edge_layout_cost (ud, other_node_i, |
5342 | other_partition.layout, |
5343 | layout_i).is_possible (); |
5344 | }; |
5345 | for_each_partition_edge (node_i, add_cost); |
5346 | } |
5347 | if (!is_possible) |
5348 | { |
5349 | layout_costs.mark_impossible (); |
5350 | continue; |
5351 | } |
5352 | |
5353 | /* Locally combine the costs from the forward and backward passes. |
5354 | (This combined cost is not passed on, since that would lead |
5355 | to double counting.) */ |
5356 | slpg_layout_cost combined_cost = layout_costs.in_cost; |
5357 | combined_cost.add_serial_cost (layout_costs.internal_cost); |
5358 | combined_cost.add_serial_cost (layout_costs.out_cost); |
5359 | |
5360 | /* Record the layout with the lowest cost. Prefer layout 0 in |
5361 | the event of a tie between it and another layout. */ |
5362 | if (!min_layout_cost.is_possible () |
5363 | || combined_cost.is_better_than (min_layout_cost, |
5364 | m_optimize_size)) |
5365 | { |
5366 | min_layout_i = layout_i; |
5367 | min_layout_cost = combined_cost; |
5368 | } |
5369 | } |
5370 | |
5371 | gcc_assert (min_layout_cost.is_possible ()); |
5372 | partition.layout = min_layout_i; |
5373 | } |
5374 | } |
5375 | |
5376 | /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE. |
5377 | NODE already has the layout that was selected for its partition. */ |
5378 | |
5379 | slp_tree |
5380 | vect_optimize_slp_pass::get_result_with_layout (slp_tree node, |
5381 | unsigned int to_layout_i) |
5382 | { |
5383 | unsigned int result_i = node->vertex * m_perms.length () + to_layout_i; |
5384 | slp_tree result = m_node_layouts[result_i]; |
5385 | if (result) |
5386 | return result; |
5387 | |
5388 | if (SLP_TREE_DEF_TYPE (node) == vect_constant_def |
5389 | || (SLP_TREE_DEF_TYPE (node) == vect_external_def |
5390 | /* We can't permute vector defs in place. */ |
5391 | && SLP_TREE_VEC_DEFS (node).is_empty ())) |
5392 | { |
5393 | /* If the vector is uniform or unchanged, there's nothing to do. */ |
5394 | if (to_layout_i == 0 || vect_slp_tree_uniform_p (node)) |
5395 | result = node; |
5396 | else |
5397 | { |
5398 | auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy (); |
5399 | result = vect_create_new_slp_node (scalar_ops); |
5400 | vect_slp_permute (m_perms[to_layout_i], scalar_ops, true); |
5401 | } |
5402 | } |
5403 | else |
5404 | { |
5405 | unsigned int partition_i = m_vertices[node->vertex].partition; |
5406 | unsigned int from_layout_i = m_partitions[partition_i].layout; |
5407 | if (from_layout_i == to_layout_i) |
5408 | return node; |
5409 | |
5410 | /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel |
5411 | permutation instead of a serial one. Leave the new permutation |
5412 | in TMP_PERM on success. */ |
5413 | auto_lane_permutation_t tmp_perm; |
5414 | unsigned int num_inputs = 1; |
5415 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
5416 | { |
5417 | tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node)); |
5418 | if (from_layout_i != 0) |
5419 | vect_slp_permute (m_perms[from_layout_i], tmp_perm, false); |
5420 | if (to_layout_i != 0) |
5421 | vect_slp_permute (m_perms[to_layout_i], tmp_perm, true); |
5422 | if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, |
5423 | tmp_perm, |
5424 | SLP_TREE_CHILDREN (node), |
5425 | false) >= 0) |
5426 | num_inputs = SLP_TREE_CHILDREN (node).length (); |
5427 | else |
5428 | tmp_perm.truncate (0); |
5429 | } |
5430 | |
5431 | if (dump_enabled_p ()) |
5432 | { |
5433 | if (tmp_perm.length () > 0) |
5434 | dump_printf_loc (MSG_NOTE, vect_location, |
5435 | "duplicating permutation node %p with" |
5436 | " layout %d\n", |
5437 | (void *) node, to_layout_i); |
5438 | else |
5439 | dump_printf_loc (MSG_NOTE, vect_location, |
5440 | "inserting permutation node in place of %p\n", |
5441 | (void *) node); |
5442 | } |
5443 | |
5444 | unsigned int num_lanes = SLP_TREE_LANES (node); |
5445 | result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR); |
5446 | if (SLP_TREE_SCALAR_STMTS (node).length ()) |
5447 | { |
5448 | auto &stmts = SLP_TREE_SCALAR_STMTS (result); |
5449 | stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node)); |
5450 | if (from_layout_i != 0) |
5451 | vect_slp_permute (m_perms[from_layout_i], stmts, false); |
5452 | if (to_layout_i != 0) |
5453 | vect_slp_permute (m_perms[to_layout_i], stmts, true); |
5454 | } |
5455 | SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node); |
5456 | SLP_TREE_LANES (result) = num_lanes; |
5457 | SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node); |
5458 | result->vertex = -1; |
5459 | |
5460 | auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result); |
5461 | if (tmp_perm.length ()) |
5462 | { |
5463 | lane_perm.safe_splice (tmp_perm); |
5464 | SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node)); |
5465 | } |
5466 | else |
5467 | { |
5468 | lane_perm.create (num_lanes); |
5469 | for (unsigned j = 0; j < num_lanes; ++j) |
5470 | lane_perm.quick_push ({ 0, j }); |
5471 | if (from_layout_i != 0) |
5472 | vect_slp_permute (m_perms[from_layout_i], lane_perm, false); |
5473 | if (to_layout_i != 0) |
5474 | vect_slp_permute (m_perms[to_layout_i], lane_perm, true); |
5475 | SLP_TREE_CHILDREN (result).safe_push (node); |
5476 | } |
5477 | for (slp_tree child : SLP_TREE_CHILDREN (result)) |
5478 | child->refcnt++; |
5479 | } |
5480 | m_node_layouts[result_i] = result; |
5481 | return result; |
5482 | } |
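
/* Editorial illustration: for a 4-lane node that is not a VEC_PERM_EXPR,
   whose partition chose layout m_perms[1] = { 1, 0, 3, 2 } (an involution)
   and with TO_LAYOUT_I == 0, the wrapper VEC_PERM_EXPR created above
   starts from the identity lane permutation
   { {0,0}, {0,1}, {0,2}, {0,3} } and becomes
   { {0,1}, {0,0}, {0,3}, {0,2} }, restoring the original element order
   on NODE's output without touching NODE itself.  */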
5483 | |
5484 | /* Apply the chosen vector layouts to the SLP graph. */ |
5485 | |
5486 | void |
5487 | vect_optimize_slp_pass::materialize () |
5488 | { |
5489 | /* We no longer need the costs, so avoid having two O(N * P) arrays |
5490 | live at the same time. */ |
5491 | m_partition_layout_costs.release (); |
5492 | m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ()); |
5493 | |
5494 | auto_sbitmap fully_folded (m_vertices.length ()); |
5495 | bitmap_clear (fully_folded); |
5496 | for (unsigned int node_i : m_partitioned_nodes) |
5497 | { |
5498 | auto &vertex = m_vertices[node_i]; |
5499 | slp_tree node = vertex.node; |
5500 | int layout_i = m_partitions[vertex.partition].layout; |
5501 | gcc_assert (layout_i >= 0); |
5502 | |
5503 | /* Rearrange the scalar statements to match the chosen layout. */ |
5504 | if (layout_i > 0) |
5505 | vect_slp_permute (m_perms[layout_i], |
5506 | SLP_TREE_SCALAR_STMTS (node), true); |
5507 | |
5508 | /* Update load and lane permutations. */ |
5509 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
5510 | { |
5511 | /* First try to absorb the input vector layouts. If that fails, |
5512 | force the inputs to have layout LAYOUT_I too. We checked that |
5513 | that was possible before deciding to use nonzero output layouts. |
5514 | (Note that at this stage we don't really have any guarantee that |
5515 | the target supports the original VEC_PERM_EXPR.) */ |
5516 | auto &perm = SLP_TREE_LANE_PERMUTATION (node); |
5517 | auto_lane_permutation_t tmp_perm; |
5518 | tmp_perm.safe_splice (perm); |
5519 | change_vec_perm_layout (node, tmp_perm, -1, layout_i); |
5520 | if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, |
5521 | tmp_perm, |
5522 | SLP_TREE_CHILDREN (node), |
5523 | false) >= 0) |
5524 | { |
5525 | if (dump_enabled_p () |
5526 | && !std::equal (tmp_perm.begin (), tmp_perm.end (), |
5527 | perm.begin ())) |
5528 | dump_printf_loc (MSG_NOTE, vect_location, |
5529 | "absorbing input layouts into %p\n", |
5530 | (void *) node); |
5531 | std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ()); |
5532 | bitmap_set_bit (fully_folded, node_i); |
5533 | } |
5534 | else |
5535 | { |
5536 | /* Not MSG_MISSED because it would make no sense to users. */ |
5537 | if (dump_enabled_p ()) |
5538 | dump_printf_loc (MSG_NOTE, vect_location, |
5539 | "failed to absorb input layouts into %p\n", |
5540 | (void *) node); |
5541 | change_vec_perm_layout (nullptr, perm, layout_i, layout_i); |
5542 | } |
5543 | } |
5544 | else |
5545 | { |
5546 | gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ()); |
5547 | auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node); |
5548 | if (layout_i > 0) |
5549 | /* ??? When we handle non-bijective permutes the idea |
5550 | is that we can force the load-permutation to be |
5551 | { min, min + 1, min + 2, ... max }. But then the |
5552 | scalar defs might no longer match the lane content |
5553 | which means wrong-code with live lane vectorization. |
5554 | So we possibly have to have NULL entries for those. */ |
5555 | vect_slp_permute (m_perms[layout_i], load_perm, true); |
5556 | } |
5557 | } |
5558 | |
5559 | /* Do this before any nodes disappear, since it involves a walk |
5560 | over the leaves. */ |
5561 | remove_redundant_permutations (); |
5562 | |
5563 | /* Replace each child with a correctly laid-out version. */ |
5564 | for (unsigned int node_i : m_partitioned_nodes) |
5565 | { |
5566 | /* Skip nodes that have already been handled above. */ |
5567 | if (bitmap_bit_p (fully_folded, node_i)) |
5568 | continue; |
5569 | |
5570 | auto &vertex = m_vertices[node_i]; |
5571 | int in_layout_i = m_partitions[vertex.partition].layout; |
5572 | gcc_assert (in_layout_i >= 0); |
5573 | |
5574 | unsigned j; |
5575 | slp_tree child; |
5576 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child) |
5577 | { |
5578 | if (!child) |
5579 | continue; |
5580 | |
5581 | slp_tree new_child = get_result_with_layout (child, in_layout_i); |
5582 | if (new_child != child) |
5583 | { |
5584 | vect_free_slp_tree (child); |
5585 | SLP_TREE_CHILDREN (vertex.node)[j] = new_child; |
5586 | new_child->refcnt += 1; |
5587 | } |
5588 | } |
5589 | } |
5590 | } |
5591 | |
5592 | /* Elide load permutations that are not necessary. Such permutations might |
5593 | be pre-existing, rather than created by the layout optimizations. */ |
5594 | |
5595 | void |
5596 | vect_optimize_slp_pass::remove_redundant_permutations () |
5597 | { |
5598 | for (unsigned int node_i : m_leafs) |
5599 | { |
5600 | slp_tree node = m_vertices[node_i].node; |
5601 | if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
5602 | continue; |
5603 | |
5604 | /* In basic block vectorization we allow any subchain of an interleaving |
5605 | chain. |
5606 | FORNOW: not in loop SLP because of realignment complications. */ |
5607 | if (is_a <bb_vec_info> (m_vinfo)) |
5608 | { |
5609 | bool subchain_p = true; |
5610 | stmt_vec_info next_load_info = NULL; |
5611 | stmt_vec_info load_info; |
5612 | unsigned j; |
5613 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
5614 | { |
5615 | if (j != 0 |
5616 | && (next_load_info != load_info |
5617 | || DR_GROUP_GAP (load_info) != 1)) |
5618 | { |
5619 | subchain_p = false; |
5620 | break; |
5621 | } |
5622 | next_load_info = DR_GROUP_NEXT_ELEMENT (load_info); |
5623 | } |
5624 | if (subchain_p) |
5625 | { |
5626 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
5627 | continue; |
5628 | } |
5629 | } |
5630 | else |
5631 | { |
5632 | loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo); |
5633 | stmt_vec_info load_info; |
5634 | bool this_load_permuted = false; |
5635 | unsigned j; |
5636 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
5637 | if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j) |
5638 | { |
5639 | this_load_permuted = true; |
5640 | break; |
5641 | } |
5642 | /* When this isn't a grouped access we know it's single element |
5643 | and contiguous. */ |
5644 | if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0])) |
5645 | { |
5646 | if (!this_load_permuted |
5647 | && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U) |
5648 | || SLP_TREE_LANES (node) == 1)) |
5649 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
5650 | continue; |
5651 | } |
5652 | stmt_vec_info first_stmt_info |
5653 | = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); |
5654 | if (!this_load_permuted |
5655 | /* The load requires permutation when unrolling exposes |
5656 | a gap either because the group is larger than the SLP |
5657 | group-size or because there is a gap between the groups. */ |
5658 | && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U) |
5659 | || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info)) |
5660 | && DR_GROUP_GAP (first_stmt_info) == 0))) |
5661 | { |
5662 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
5663 | continue; |
5664 | } |
5665 | } |
5666 | } |
5667 | } |
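
/* Editorial illustration: in a basic block, an SLP node that loads a[1]
   and a[2] from an interleaving group over a[0..3] is a subchain in
   group order, so its load permutation (here { 1, 2 }) carries no
   information beyond the scalar stmts themselves and is released
   above.  */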
5668 | |
5669 | /* Print the partition graph and layout information to the dump file. */ |
5670 | |
5671 | void |
5672 | vect_optimize_slp_pass::dump () |
5673 | { |
5674 | dump_printf_loc (MSG_NOTE, vect_location, |
5675 | "SLP optimize permutations:\n"); |
5676 | for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i) |
5677 | { |
5678 | dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i); |
5679 | const char *sep = ""; |
5680 | for (unsigned int idx : m_perms[layout_i]) |
5681 | { |
5682 | dump_printf (MSG_NOTE, "%s%d", sep, idx); |
5683 | sep = ", "; |
5684 | } |
5685 | dump_printf (MSG_NOTE, " }\n"); |
5686 | } |
5687 | dump_printf_loc (MSG_NOTE, vect_location, |
5688 | "SLP optimize partitions:\n"); |
5689 | for (unsigned int partition_i = 0; partition_i < m_partitions.length (); |
5690 | ++partition_i) |
5691 | { |
5692 | auto &partition = m_partitions[partition_i]; |
5693 | dump_printf_loc (MSG_NOTE, vect_location, " -------------\n"); |
5694 | dump_printf_loc (MSG_NOTE, vect_location, |
5695 | " partition %d (layout %d):\n", |
5696 | partition_i, partition.layout); |
5697 | dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n"); |
5698 | for (unsigned int order_i = partition.node_begin; |
5699 | order_i < partition.node_end; ++order_i) |
5700 | { |
5701 | auto &vertex = m_vertices[m_partitioned_nodes[order_i]]; |
5702 | dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n", |
5703 | (void *) vertex.node); |
5704 | dump_printf_loc (MSG_NOTE, vect_location, |
5705 | " weight: %f\n", |
5706 | vertex.weight.to_double ()); |
5707 | if (vertex.out_degree) |
5708 | dump_printf_loc (MSG_NOTE, vect_location, |
5709 | " out weight: %f (degree %d)\n", |
5710 | vertex.out_weight.to_double (), |
5711 | vertex.out_degree); |
5712 | if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR) |
5713 | dump_printf_loc (MSG_NOTE, vect_location, |
5714 | " op: VEC_PERM_EXPR\n"); |
5715 | else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node)) |
5716 | dump_printf_loc (MSG_NOTE, vect_location, |
5717 | " op template: %G", rep->stmt); |
5718 | } |
5719 | dump_printf_loc (MSG_NOTE, vect_location, " edges:\n"); |
5720 | for (unsigned int order_i = partition.node_begin; |
5721 | order_i < partition.node_end; ++order_i) |
5722 | { |
5723 | unsigned int node_i = m_partitioned_nodes[order_i]; |
5724 | auto &vertex = m_vertices[node_i]; |
5725 | auto print_edge = [&](graph_edge *, unsigned int other_node_i) |
5726 | { |
5727 | auto &other_vertex = m_vertices[other_node_i]; |
5728 | if (other_vertex.partition < vertex.partition) |
5729 | dump_printf_loc (MSG_NOTE, vect_location, |
5730 | " - %p [%d] --> %p\n", |
5731 | (void *) other_vertex.node, |
5732 | other_vertex.partition, |
5733 | (void *) vertex.node); |
5734 | else |
5735 | dump_printf_loc (MSG_NOTE, vect_location, |
5736 | " - %p --> [%d] %p\n", |
5737 | (void *) vertex.node, |
5738 | other_vertex.partition, |
5739 | (void *) other_vertex.node); |
5740 | }; |
5741 | for_each_partition_edge (node_i, print_edge); |
5742 | } |
5743 | |
5744 | for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i) |
5745 | { |
5746 | auto &layout_costs = partition_layout_costs (partition_i, layout_i); |
5747 | if (layout_costs.is_possible ()) |
5748 | { |
5749 | dump_printf_loc (MSG_NOTE, vect_location, |
5750 | " layout %d:%s\n", layout_i, |
5751 | partition.layout == int (layout_i) |
5752 | ? " (*)" : ""); |
5753 | slpg_layout_cost combined_cost = layout_costs.in_cost; |
5754 | combined_cost.add_serial_cost (layout_costs.internal_cost); |
5755 | combined_cost.add_serial_cost (layout_costs.out_cost); |
5756 | #define TEMPLATE "{depth: %f, total: %f}" |
5757 | dump_printf_loc (MSG_NOTE, vect_location, |
5758 | " " TEMPLATE "\n", |
5759 | layout_costs.in_cost.depth.to_double (), |
5760 | layout_costs.in_cost.total.to_double ()); |
5761 | dump_printf_loc (MSG_NOTE, vect_location, |
5762 | " + " TEMPLATE "\n", |
5763 | layout_costs.internal_cost.depth.to_double (), |
5764 | layout_costs.internal_cost.total.to_double ()); |
5765 | dump_printf_loc (MSG_NOTE, vect_location, |
5766 | " + " TEMPLATE "\n", |
5767 | layout_costs.out_cost.depth.to_double (), |
5768 | layout_costs.out_cost.total.to_double ()); |
5769 | dump_printf_loc (MSG_NOTE, vect_location, |
5770 | " = " TEMPLATE "\n", |
5771 | combined_cost.depth.to_double (), |
5772 | combined_cost.total.to_double ()); |
5773 | #undef TEMPLATE |
5774 | } |
5775 | else |
5776 | dump_printf_loc (MSG_NOTE, vect_location, |
5777 | " layout %d: rejected\n", layout_i); |
5778 | } |
5779 | } |
5780 | } |
5781 | |
5782 | /* Main entry point for the SLP graph optimization pass. */ |
5783 | |
5784 | void |
5785 | vect_optimize_slp_pass::run () |
5786 | { |
5787 | build_graph (); |
5788 | create_partitions (); |
5789 | start_choosing_layouts (); |
5790 | if (m_perms.length () > 1) |
5791 | { |
5792 | forward_pass (); |
5793 | backward_pass (); |
5794 | if (dump_enabled_p ()) |
5795 | dump (); |
5796 | materialize (); |
5797 | while (!m_perms.is_empty ()) |
5798 | m_perms.pop ().release (); |
5799 | } |
5800 | else |
5801 | remove_redundant_permutations (); |
5802 | free_graph (m_slpg); |
5803 | } |
5804 | |
5805 | /* Optimize the SLP graph of VINFO. */ |
5806 | |
5807 | void |
5808 | vect_optimize_slp (vec_info *vinfo) |
5809 | { |
5810 | if (vinfo->slp_instances.is_empty ()) |
5811 | return; |
5812 | vect_optimize_slp_pass (vinfo).run (); |
5813 | } |
5814 | |
5815 | /* Gather loads reachable from the individual SLP graph entries. */ |
5816 | |
5817 | void |
5818 | vect_gather_slp_loads (vec_info *vinfo) |
5819 | { |
5820 | unsigned i; |
5821 | slp_instance instance; |
5822 | FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) |
5823 | { |
5824 | hash_set<slp_tree> visited; |
5825 | vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance), |
5826 | SLP_INSTANCE_TREE (instance), visited); |
5827 | } |
5828 | } |
5829 | |
5830 | |
5831 | /* For each possible SLP instance decide whether to SLP it and calculate overall |
5832 | unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at |
5833 | least one instance. */ |
5834 | |
5835 | bool |
5836 | vect_make_slp_decision (loop_vec_info loop_vinfo) |
5837 | { |
5838 | unsigned int i; |
5839 | poly_uint64 unrolling_factor = 1; |
5840 | const vec<slp_instance> &slp_instances |
5841 | = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); |
5842 | slp_instance instance; |
5843 | int decided_to_slp = 0; |
5844 | |
5845 | DUMP_VECT_SCOPE ("vect_make_slp_decision"); |
5846 | |
5847 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
5848 | { |
5849 | /* FORNOW: SLP if you can. */ |
5850 | /* All unroll factors have the form: |
5851 | |
5852 | GET_MODE_SIZE (vinfo->vector_mode) * X |
5853 | |
5854 | for some rational X, so they must have a common multiple. */ |
5855 | unrolling_factor |
5856 | = force_common_multiple (unrolling_factor, |
5857 | SLP_INSTANCE_UNROLLING_FACTOR (instance)); |
5858 | |
5859 | /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we |
5860 | call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and |
5861 | loop-based vectorization. Such stmts will be marked as HYBRID. */ |
5862 | vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); |
5863 | decided_to_slp++; |
5864 | } |
5865 | |
5866 | LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor; |
5867 | |
5868 | if (decided_to_slp && dump_enabled_p ()) |
5869 | { |
5870 | dump_printf_loc (MSG_NOTE, vect_location, |
5871 | "Decided to SLP %d instances. Unrolling factor ", |
5872 | decided_to_slp); |
5873 | dump_dec (MSG_NOTE, unrolling_factor); |
5874 | dump_printf (MSG_NOTE, "\n"); |
5875 | } |
5876 | |
5877 | return (decided_to_slp > 0); |
5878 | } |
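
/* Editorial illustration: if one instance requires unrolling factor 2
   and another requires 3, force_common_multiple yields 6 -- the least
   factor that lets both instances be vectorized in the same unrolled
   loop body.  */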
5879 | |
5880 | /* Private data for vect_detect_hybrid_slp. */ |
5881 | struct vdhs_data |
5882 | { |
5883 | loop_vec_info loop_vinfo; |
5884 | vec<stmt_vec_info> *worklist; |
5885 | }; |
5886 | |
5887 | /* Walker for walk_gimple_op. */ |
5888 | |
5889 | static tree |
5890 | vect_detect_hybrid_slp (tree *tp, int *, void *data) |
5891 | { |
5892 | walk_stmt_info *wi = (walk_stmt_info *)data; |
5893 | vdhs_data *dat = (vdhs_data *)wi->info; |
5894 | |
5895 | if (wi->is_lhs) |
5896 | return NULL_TREE; |
5897 | |
5898 | stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp); |
5899 | if (!def_stmt_info) |
5900 | return NULL_TREE; |
  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5902 | if (PURE_SLP_STMT (def_stmt_info)) |
5903 | { |
5904 | if (dump_enabled_p ()) |
5905 | dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G" , |
5906 | def_stmt_info->stmt); |
5907 | STMT_SLP_TYPE (def_stmt_info) = hybrid; |
      dat->worklist->safe_push (def_stmt_info);
5909 | } |
5910 | |
5911 | return NULL_TREE; |
5912 | } |
5913 | |
/* Check whether STMT_INFO is consumed by SLP indirectly and mark it
   pure_slp if so; otherwise push it to WORKLIST.  */
5916 | |
5917 | static void |
5918 | maybe_push_to_hybrid_worklist (vec_info *vinfo, |
5919 | vec<stmt_vec_info> &worklist, |
5920 | stmt_vec_info stmt_info) |
5921 | { |
5922 | if (dump_enabled_p ()) |
5923 | dump_printf_loc (MSG_NOTE, vect_location, |
5924 | "Processing hybrid candidate : %G" , stmt_info->stmt); |
5925 | stmt_vec_info orig_info = vect_orig_stmt (stmt_info); |
5926 | imm_use_iterator iter2; |
5927 | ssa_op_iter iter1; |
5928 | use_operand_p use_p; |
5929 | def_operand_p def_p; |
5930 | bool any_def = false; |
5931 | FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF) |
5932 | { |
5933 | any_def = true; |
5934 | FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p)) |
5935 | { |
5936 | if (is_gimple_debug (USE_STMT (use_p))) |
5937 | continue; |
5938 | stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p)); |
5939 | /* An out-of loop use means this is a loop_vect sink. */ |
5940 | if (!use_info) |
5941 | { |
5942 | if (dump_enabled_p ()) |
5943 | dump_printf_loc (MSG_NOTE, vect_location, |
5944 | "Found loop_vect sink: %G" , stmt_info->stmt); |
5945 | worklist.safe_push (obj: stmt_info); |
5946 | return; |
5947 | } |
5948 | else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info))) |
5949 | { |
5950 | if (dump_enabled_p ()) |
5951 | dump_printf_loc (MSG_NOTE, vect_location, |
5952 | "Found loop_vect use: %G" , use_info->stmt); |
5953 | worklist.safe_push (obj: stmt_info); |
5954 | return; |
5955 | } |
5956 | } |
5957 | } |
  /* No def means this is a loop_vect sink.  */
5959 | if (!any_def) |
5960 | { |
5961 | if (dump_enabled_p ()) |
5962 | dump_printf_loc (MSG_NOTE, vect_location, |
5963 | "Found loop_vect sink: %G" , stmt_info->stmt); |
5964 | worklist.safe_push (obj: stmt_info); |
5965 | return; |
5966 | } |
5967 | if (dump_enabled_p ()) |
5968 | dump_printf_loc (MSG_NOTE, vect_location, |
5969 | "Marked SLP consumed stmt pure: %G" , stmt_info->stmt); |
5970 | STMT_SLP_TYPE (stmt_info) = pure_slp; |
5971 | } |
5972 | |
5973 | /* Find stmts that must be both vectorized and SLPed. */ |
5974 | |
5975 | void |
5976 | vect_detect_hybrid_slp (loop_vec_info loop_vinfo) |
5977 | { |
5978 | DUMP_VECT_SCOPE ("vect_detect_hybrid_slp" ); |
5979 | |
5980 | /* All stmts participating in SLP are marked pure_slp, all other |
5981 | stmts are loop_vect. |
5982 | First collect all loop_vect stmts into a worklist. |
5983 | SLP patterns cause not all original scalar stmts to appear in |
5984 | SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp. |
5985 | Rectify this here and do a backward walk over the IL only considering |
5986 | stmts as loop_vect when they are used by a loop_vect stmt and otherwise |
5987 | mark them as pure_slp. */ |
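  /* As an illustrative sketch (not taken from real IL): if x_1 below is
     computed by a pure_slp stmt feeding an SLP store group but also
     feeds a loop-vectorized reduction,

       x_1 = ...;
       a[i] = x_1;	<- pure_slp
       s_2 += x_1;	<- loop_vect use

     then walking from the loop_vect worklist reaches the definition of
     x_1 and marks it hybrid: it needs both an SLP and a loop-based
     vector form.  */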
5988 | auto_vec<stmt_vec_info> worklist; |
5989 | for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i) |
5990 | { |
5991 | basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; |
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
5994 | { |
5995 | gphi *phi = gsi.phi (); |
5996 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi); |
5997 | if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info)) |
	    maybe_push_to_hybrid_worklist (loop_vinfo,
5999 | worklist, stmt_info); |
6000 | } |
      for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
	   gsi_prev (&gsi))
6003 | { |
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
6006 | continue; |
6007 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt); |
6008 | if (STMT_VINFO_IN_PATTERN_P (stmt_info)) |
6009 | { |
6010 | for (gimple_stmt_iterator gsi2 |
6011 | = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); |
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
6013 | { |
6014 | stmt_vec_info patt_info |
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
6016 | if (!STMT_SLP_TYPE (patt_info) |
6017 | && STMT_VINFO_RELEVANT (patt_info)) |
		    maybe_push_to_hybrid_worklist (loop_vinfo,
						   worklist, patt_info);
6020 | } |
6021 | stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); |
6022 | } |
6023 | if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info)) |
	    maybe_push_to_hybrid_worklist (loop_vinfo,
6025 | worklist, stmt_info); |
6026 | } |
6027 | } |
6028 | |
6029 | /* Now we have a worklist of non-SLP stmts, follow use->def chains and |
6030 | mark any SLP vectorized stmt as hybrid. |
6031 | ??? We're visiting def stmts N times (once for each non-SLP and |
6032 | once for each hybrid-SLP use). */ |
6033 | walk_stmt_info wi; |
6034 | vdhs_data dat; |
6035 | dat.worklist = &worklist; |
6036 | dat.loop_vinfo = loop_vinfo; |
  memset (&wi, 0, sizeof (wi));
6038 | wi.info = (void *)&dat; |
6039 | while (!worklist.is_empty ()) |
6040 | { |
6041 | stmt_vec_info stmt_info = worklist.pop (); |
6042 | /* Since SSA operands are not set up for pattern stmts we need |
6043 | to use walk_gimple_op. */ |
6044 | wi.is_lhs = 0; |
6045 | walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi); |
6046 | /* For gather/scatter make sure to walk the offset operand, that |
6047 | can be a scaling and conversion away. */ |
6048 | gather_scatter_info gs_info; |
6049 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) |
6050 | && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info)) |
6051 | { |
6052 | int dummy; |
	  vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6054 | } |
6055 | } |
6056 | } |
6057 | |
6058 | |
6059 | /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */ |
6060 | |
6061 | _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared) |
6062 | : vec_info (vec_info::bb, shared), |
6063 | bbs (_bbs), |
6064 | roots (vNULL) |
6065 | { |
6066 | for (unsigned i = 0; i < bbs.length (); ++i) |
6067 | { |
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, 0);
	    add_stmt (phi);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	}
6085 | } |
6086 | } |
6087 | |
6088 | |
6089 | /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the |
6090 | stmts in the basic block. */ |
6091 | |
6092 | _bb_vec_info::~_bb_vec_info () |
6093 | { |
6094 | /* Reset region marker. */ |
6095 | for (unsigned i = 0; i < bbs.length (); ++i) |
6096 | { |
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, -1);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
	}
6110 | } |
6111 | |
6112 | for (unsigned i = 0; i < roots.length (); ++i) |
6113 | { |
6114 | roots[i].stmts.release (); |
6115 | roots[i].roots.release (); |
6116 | roots[i].remain.release (); |
6117 | } |
6118 | roots.release (); |
6119 | } |
6120 | |
6121 | /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE, |
   given that child nodes have already been processed, and that
6123 | their def types currently match their SLP node's def type. */ |
6124 | |
6125 | static bool |
6126 | vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, |
6127 | slp_instance node_instance, |
6128 | stmt_vector_for_cost *cost_vec) |
6129 | { |
6130 | stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); |
6131 | |
6132 | /* Calculate the number of vector statements to be created for the |
6133 | scalar stmts in this node. For SLP reductions it is equal to the |
6134 | number of vector statements in the children (which has already been |
6135 | calculated by the recursive call). Otherwise it is the number of |
6136 | scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by |
6137 | VF divided by the number of elements in a vector. */ |
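  /* For example (an illustrative sketch, not tied to a particular
     target): with VF = 4, SLP_TREE_LANES = 2 and V8HI vectors this is
     4 * 2 / 8 = 1 vector stmt; with V4SI vectors it would be
     4 * 2 / 4 = 2.  */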
6138 | if (SLP_TREE_CODE (node) != VEC_PERM_EXPR |
6139 | && !STMT_VINFO_DATA_REF (stmt_info) |
6140 | && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
6141 | { |
6142 | for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i) |
6143 | if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def) |
6144 | { |
6145 | SLP_TREE_NUMBER_OF_VEC_STMTS (node) |
6146 | = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]); |
6147 | break; |
6148 | } |
6149 | } |
6150 | else |
6151 | { |
6152 | poly_uint64 vf; |
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6154 | vf = loop_vinfo->vectorization_factor; |
6155 | else |
6156 | vf = 1; |
6157 | unsigned int group_size = SLP_TREE_LANES (node); |
6158 | tree vectype = SLP_TREE_VECTYPE (node); |
6159 | SLP_TREE_NUMBER_OF_VEC_STMTS (node) |
	= vect_get_num_vectors (vf * group_size, vectype);
6161 | } |
6162 | |
6163 | /* Handle purely internal nodes. */ |
6164 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
6165 | { |
6166 | if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec)) |
6167 | return false; |
6168 | |
6169 | stmt_vec_info slp_stmt_info; |
6170 | unsigned int i; |
6171 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info) |
6172 | { |
6173 | if (STMT_VINFO_LIVE_P (slp_stmt_info) |
6174 | && !vectorizable_live_operation (vinfo, slp_stmt_info, node, |
6175 | node_instance, i, |
6176 | false, cost_vec)) |
6177 | return false; |
6178 | } |
6179 | return true; |
6180 | } |
6181 | |
6182 | bool dummy; |
6183 | return vect_analyze_stmt (vinfo, stmt_info, &dummy, |
6184 | node, node_instance, cost_vec); |
6185 | } |
6186 | |
6187 | /* Try to build NODE from scalars, returning true on success. |
6188 | NODE_INSTANCE is the SLP instance that contains NODE. */ |
6189 | |
6190 | static bool |
6191 | vect_slp_convert_to_external (vec_info *vinfo, slp_tree node, |
6192 | slp_instance node_instance) |
6193 | { |
6194 | stmt_vec_info stmt_info; |
6195 | unsigned int i; |
6196 | |
  if (!is_a <bb_vec_info> (vinfo)
6198 | || node == SLP_INSTANCE_TREE (node_instance) |
6199 | || !SLP_TREE_SCALAR_STMTS (node).exists () |
6200 | || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)) |
6201 | /* Force the mask use to be built from scalars instead. */ |
6202 | || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))) |
6203 | return false; |
6204 | |
6205 | if (dump_enabled_p ()) |
6206 | dump_printf_loc (MSG_NOTE, vect_location, |
6207 | "Building vector operands of %p from scalars instead\n" , |
6208 | (void *) node); |
6209 | |
6210 | /* Don't remove and free the child nodes here, since they could be |
6211 | referenced by other structures. The analysis and scheduling phases |
6212 | (need to) ignore child nodes of anything that isn't vect_internal_def. */ |
6213 | unsigned int group_size = SLP_TREE_LANES (node); |
6214 | SLP_TREE_DEF_TYPE (node) = vect_external_def; |
6215 | /* Invariants get their vector type from the uses. */ |
6216 | SLP_TREE_VECTYPE (node) = NULL_TREE; |
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6218 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
6219 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
6220 | { |
6221 | tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt); |
6222 | SLP_TREE_SCALAR_OPS (node)[i] = lhs; |
6223 | } |
6224 | return true; |
6225 | } |
6226 | |
6227 | /* Return true if all elements of the slice are the same. */ |
6228 | bool |
6229 | vect_scalar_ops_slice::all_same_p () const |
6230 | { |
6231 | for (unsigned int i = 1; i < length; ++i) |
    if (!operand_equal_p (op (0), op (i)))
6233 | return false; |
6234 | return true; |
6235 | } |
6236 | |
6237 | hashval_t |
6238 | vect_scalar_ops_slice_hash::hash (const value_type &s) |
6239 | { |
6240 | hashval_t hash = 0; |
6241 | for (unsigned i = 0; i < s.length; ++i) |
    hash = iterative_hash_expr (s.op (i), hash);
6243 | return hash; |
6244 | } |
6245 | |
6246 | bool |
6247 | vect_scalar_ops_slice_hash::equal (const value_type &s1, |
6248 | const compare_type &s2) |
6249 | { |
6250 | if (s1.length != s2.length) |
6251 | return false; |
6252 | for (unsigned i = 0; i < s1.length; ++i) |
6253 | if (!operand_equal_p (s1.op (i), s2.op (i))) |
6254 | return false; |
6255 | return true; |
6256 | } |
6257 | |
6258 | /* Compute the prologue cost for invariant or constant operands represented |
6259 | by NODE. */ |
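
/* For example (illustrative): a two-lane node with scalar ops {x, x}
   built into V4SI vectors produces {x, x, x, x} and is costed as a
   single scalar_to_vec splat; {x, y} repeated as {x, y, x, y} is costed
   once as vec_construct since every generated vector is the same, and
   all-constant nodes are costed as a vector_load from the constant
   pool.  */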
6260 | |
6261 | static void |
6262 | vect_prologue_cost_for_slp (slp_tree node, |
6263 | stmt_vector_for_cost *cost_vec) |
6264 | { |
6265 | /* There's a special case of an existing vector, that costs nothing. */ |
6266 | if (SLP_TREE_SCALAR_OPS (node).length () == 0 |
6267 | && !SLP_TREE_VEC_DEFS (node).is_empty ()) |
6268 | return; |
6269 | /* Without looking at the actual initializer a vector of |
6270 | constants can be implemented as load from the constant pool. |
6271 | When all elements are the same we can use a splat. */ |
6272 | tree vectype = SLP_TREE_VECTYPE (node); |
6273 | unsigned group_size = SLP_TREE_SCALAR_OPS (node).length (); |
6274 | unsigned HOST_WIDE_INT const_nunits; |
6275 | unsigned nelt_limit; |
6276 | auto ops = &SLP_TREE_SCALAR_OPS (node); |
6277 | auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); |
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      nelt_limit = const_nunits;
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
	if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
	  starts.quick_push (i * const_nunits);
6286 | } |
6287 | else |
6288 | { |
6289 | /* If either the vector has variable length or the vectors |
6290 | are composed of repeated whole groups we only need to |
6291 | cost construction once. All vectors will be the same. */ |
6292 | nelt_limit = group_size; |
      starts.quick_push (0);
6294 | } |
6295 | /* ??? We're just tracking whether vectors in a single node are the same. |
6296 | Ideally we'd do something more global. */ |
6297 | bool passed = false; |
6298 | for (unsigned int start : starts) |
6299 | { |
6300 | vect_cost_for_stmt kind; |
6301 | if (SLP_TREE_DEF_TYPE (node) == vect_constant_def) |
6302 | kind = vector_load; |
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6304 | kind = scalar_to_vec; |
6305 | else |
6306 | kind = vec_construct; |
6307 | /* The target cost hook has no idea which part of the SLP node |
6308 | we are costing so avoid passing it down more than once. Pass |
6309 | it to the first vec_construct or scalar_to_vec part since for those |
6310 | the x86 backend tries to account for GPR to XMM register moves. */ |
6311 | record_stmt_cost (cost_vec, 1, kind, |
6312 | (kind != vector_load && !passed) ? node : nullptr, |
6313 | vectype, 0, vect_prologue); |
6314 | if (kind != vector_load) |
6315 | passed = true; |
6316 | } |
6317 | } |
6318 | |
6319 | /* Analyze statements contained in SLP tree NODE after recursively analyzing |
6320 | the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE. |
6321 | |
6322 | Return true if the operations are supported. */ |
6323 | |
6324 | static bool |
6325 | vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, |
6326 | slp_instance node_instance, |
6327 | hash_set<slp_tree> &visited_set, |
6328 | vec<slp_tree> &visited_vec, |
6329 | stmt_vector_for_cost *cost_vec) |
6330 | { |
6331 | int i, j; |
6332 | slp_tree child; |
6333 | |
6334 | /* Assume we can code-generate all invariants. */ |
6335 | if (!node |
6336 | || SLP_TREE_DEF_TYPE (node) == vect_constant_def |
6337 | || SLP_TREE_DEF_TYPE (node) == vect_external_def) |
6338 | return true; |
6339 | |
6340 | if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def) |
6341 | { |
6342 | if (dump_enabled_p ()) |
6343 | dump_printf_loc (MSG_NOTE, vect_location, |
6344 | "Failed cyclic SLP reference in %p\n" , (void *) node); |
6345 | return false; |
6346 | } |
6347 | gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def); |
6348 | |
6349 | /* If we already analyzed the exact same set of scalar stmts we're done. |
6350 | We share the generated vector stmts for those. */ |
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);
6354 | |
6355 | bool res = true; |
6356 | unsigned visited_rec_start = visited_vec.length (); |
6357 | unsigned cost_vec_rec_start = cost_vec->length (); |
6358 | bool seen_non_constant_child = false; |
6359 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6360 | { |
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6362 | visited_set, visited_vec, |
6363 | cost_vec); |
6364 | if (!res) |
6365 | break; |
6366 | if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def) |
6367 | seen_non_constant_child = true; |
6368 | } |
6369 | /* We're having difficulties scheduling nodes with just constant |
6370 | operands and no scalar stmts since we then cannot compute a stmt |
6371 | insertion place. */ |
6372 | if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ()) |
6373 | { |
6374 | if (dump_enabled_p ()) |
6375 | dump_printf_loc (MSG_NOTE, vect_location, |
6376 | "Cannot vectorize all-constant op node %p\n" , |
6377 | (void *) node); |
6378 | res = false; |
6379 | } |
6380 | |
6381 | if (res) |
6382 | res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance, |
6383 | cost_vec); |
6384 | /* If analysis failed we have to pop all recursive visited nodes |
6385 | plus ourselves. */ |
6386 | if (!res) |
6387 | { |
6388 | while (visited_vec.length () >= visited_rec_start) |
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
6391 | } |
6392 | |
  /* When the node can be vectorized, cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
6395 | vectorizable_* calls to nail down the invariant nodes vector type |
6396 | and possibly unshare it if it needs a different vector type than |
6397 | other referrers. */ |
6398 | if (res) |
6399 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) |
6400 | if (child |
6401 | && (SLP_TREE_DEF_TYPE (child) == vect_constant_def |
6402 | || SLP_TREE_DEF_TYPE (child) == vect_external_def) |
6403 | /* Perform usual caching, note code-generation still |
6404 | code-gens these nodes multiple times but we expect |
6405 | to CSE them later. */ |
	    && !visited_set.add (child))
	  {
	    visited_vec.safe_push (child);
6409 | /* ??? After auditing more code paths make a "default" |
6410 | and push the vector type from NODE to all children |
6411 | if it is not already set. */ |
6412 | /* Compute the number of vectors to be generated. */ |
6413 | tree vector_type = SLP_TREE_VECTYPE (child); |
6414 | if (!vector_type) |
6415 | { |
6416 | /* For shifts with a scalar argument we don't need |
6417 | to cost or code-generate anything. |
	       ??? Represent this more explicitly.  */
6419 | gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) |
6420 | == shift_vec_info_type) |
6421 | && j == 1); |
6422 | continue; |
6423 | } |
6424 | unsigned group_size = SLP_TREE_LANES (child); |
6425 | poly_uint64 vf = 1; |
	    if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6427 | vf = loop_vinfo->vectorization_factor; |
6428 | SLP_TREE_NUMBER_OF_VEC_STMTS (child) |
	      = vect_get_num_vectors (vf * group_size, vector_type);
6430 | /* And cost them. */ |
	    vect_prologue_cost_for_slp (child, cost_vec);
6432 | } |
6433 | |
6434 | /* If this node or any of its children can't be vectorized, try pruning |
6435 | the tree here rather than felling the whole thing. */ |
6436 | if (!res && vect_slp_convert_to_external (vinfo, node, node_instance)) |
6437 | { |
6438 | /* We'll need to revisit this for invariant costing and number |
6439 | of vectorized stmt setting. */ |
6440 | res = true; |
6441 | } |
6442 | |
6443 | return res; |
6444 | } |
6445 | |
6446 | /* Given a definition DEF, analyze if it will have any live scalar use after |
6447 | performing SLP vectorization whose information is represented by BB_VINFO, |
6448 | and record result into hash map SCALAR_USE_MAP as cache for later fast |
6449 | check. If recursion DEPTH exceeds a limit, stop analysis and make a |
6450 | conservative assumption. Return 0 if no scalar use, 1 if there is, -1 |
6451 | means recursion is limited. */ |
6452 | |
6453 | static int |
6454 | vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def, |
6455 | hash_map<tree, int> &scalar_use_map, |
6456 | int depth = 0) |
6457 | { |
6458 | const int depth_limit = 2; |
6459 | imm_use_iterator use_iter; |
6460 | gimple *use_stmt; |
6461 | |
  if (int *res = scalar_use_map.get (def))
6463 | return *res; |
6464 | |
6465 | int scalar_use = 1; |
6466 | |
6467 | FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def) |
6468 | { |
      if (is_gimple_debug (use_stmt))
6470 | continue; |
6471 | |
6472 | stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt); |
6473 | |
6474 | if (!use_stmt_info) |
6475 | break; |
6476 | |
6477 | if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))) |
6478 | continue; |
6479 | |
      /* Do not step forward when encountering a PHI statement, since it
	 may involve a cyclic reference and cause infinite recursive
	 invocation.  */
      if (gimple_code (use_stmt) == GIMPLE_PHI)
	break;
6484 | |
6485 | /* When pattern recognition is involved, a statement whose definition is |
	 consumed in some pattern may not be included in the final replacement
6487 | pattern statements, so would be skipped when building SLP graph. |
6488 | |
6489 | * Original |
6490 | char a_c = *(char *) a; |
6491 | char b_c = *(char *) b; |
6492 | unsigned short a_s = (unsigned short) a_c; |
6493 | int a_i = (int) a_s; |
6494 | int b_i = (int) b_c; |
6495 | int r_i = a_i - b_i; |
6496 | |
6497 | * After pattern replacement |
6498 | a_s = (unsigned short) a_c; |
6499 | a_i = (int) a_s; |
6500 | |
6501 | patt_b_s = (unsigned short) b_c; // b_i = (int) b_c |
6502 | patt_b_i = (int) patt_b_s; // b_i = (int) b_c |
6503 | |
6504 | patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i |
6505 | patt_r_i = (int) patt_r_s; // r_i = a_i - b_i |
6506 | |
	 The definitions of a_i (original statement) and b_i (pattern
	 statement) are related to, but actually not part of, the
	 widen_minus pattern.  Vectorizing the pattern does not cause
	 these definition statements to be marked as PURE_SLP.  For this
	 case, we need to recursively check whether their uses are all
	 absorbed into vectorized code.  But there is an exception: some
	 use may participate in a vectorized operation via an external
	 SLP node containing that use as an element.  The parameter
	 "scalar_use_map" tags such SSA names as having a scalar use in
	 advance.  */
6516 | tree lhs = gimple_get_lhs (use_stmt); |
6517 | |
6518 | if (!lhs || TREE_CODE (lhs) != SSA_NAME) |
6519 | break; |
6520 | |
6521 | if (depth_limit && depth >= depth_limit) |
6522 | return -1; |
6523 | |
      if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
						depth + 1)))
6526 | break; |
6527 | } |
6528 | |
  if (end_imm_use_stmt_p (&use_iter))
6530 | scalar_use = 0; |
6531 | |
6532 | /* If recursion is limited, do not cache result for non-root defs. */ |
6533 | if (!depth || scalar_use >= 0) |
6534 | { |
      bool added = scalar_use_map.put (def, scalar_use);
6536 | gcc_assert (!added); |
6537 | } |
6538 | |
6539 | return scalar_use; |
6540 | } |
6541 | |
6542 | /* Mark lanes of NODE that are live outside of the basic-block vectorized |
6543 | region and that can be vectorized using vectorizable_live_operation |
   with STMT_VINFO_LIVE_P.  Live operations that are not handled will
   cause the scalar code computing them to be retained.  */
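
/* For example (illustrative): if lane 1 of a vectorized node defines a
   value that is also returned from the function, that use is outside the
   vectorized region; the lane is marked STMT_VINFO_LIVE_P and a lane
   extract is emitted by vectorizable_live_operation instead of keeping
   the scalar computation.  */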
6546 | |
6547 | static void |
6548 | vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node, |
6549 | slp_instance instance, |
6550 | stmt_vector_for_cost *cost_vec, |
6551 | hash_map<tree, int> &scalar_use_map, |
6552 | hash_set<stmt_vec_info> &svisited, |
6553 | hash_set<slp_tree> &visited) |
6554 | { |
  if (visited.add (node))
6556 | return; |
6557 | |
6558 | unsigned i; |
6559 | stmt_vec_info stmt_info; |
6560 | stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node); |
6561 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
6562 | { |
      if (svisited.contains (stmt_info))
6564 | continue; |
6565 | stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
6566 | if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info) |
6567 | && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info) |
6568 | /* Only the pattern root stmt computes the original scalar value. */ |
6569 | continue; |
6570 | bool mark_visited = true; |
6571 | gimple *orig_stmt = orig_stmt_info->stmt; |
6572 | ssa_op_iter op_iter; |
6573 | def_operand_p def_p; |
6574 | FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF) |
6575 | { |
6576 | if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p), |
6577 | scalar_use_map)) |
6578 | { |
6579 | STMT_VINFO_LIVE_P (stmt_info) = true; |
6580 | if (vectorizable_live_operation (bb_vinfo, stmt_info, node, |
6581 | instance, i, false, cost_vec)) |
6582 | /* ??? So we know we can vectorize the live stmt from one SLP |
6583 | node. If we cannot do so from all or none consistently |
6584 | we'd have to record which SLP node (and lane) we want to |
6585 | use for the live operation. So make sure we can |
6586 | code-generate from all nodes. */ |
6587 | mark_visited = false; |
6588 | else |
6589 | STMT_VINFO_LIVE_P (stmt_info) = false; |
6590 | } |
6591 | |
6592 | /* We have to verify whether we can insert the lane extract |
6593 | before all uses. The following is a conservative approximation. |
6594 | We cannot put this into vectorizable_live_operation because |
6595 | iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT |
6596 | doesn't work. |
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem, leaves we construct
	     from scalars are vectorized after the last scalar def.
6600 | ??? If we'd actually compute the insert location during |
6601 | analysis we could use sth less conservative than the last |
6602 | scalar stmt in the node for the dominance check. */ |
6603 | /* ??? What remains is "live" uses in vector CTORs in the same |
6604 | SLP graph which is where those uses can end up code-generated |
6605 | right after their definition instead of close to their original |
6606 | use. But that would restrict us to code-generate lane-extracts |
6607 | from the latest stmt in a node. So we compensate for this |
6608 | during code-generation, simply not replacing uses for those |
6609 | hopefully rare cases. */ |
6610 | imm_use_iterator use_iter; |
6611 | gimple *use_stmt; |
6612 | stmt_vec_info use_stmt_info; |
6613 | |
6614 | if (STMT_VINFO_LIVE_P (stmt_info)) |
6615 | FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) |
	  if (!is_gimple_debug (use_stmt)
6617 | && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt)) |
6618 | || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))) |
6619 | && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt)) |
6620 | { |
6621 | if (dump_enabled_p ()) |
6622 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6623 | "Cannot determine insertion place for " |
6624 | "lane extract\n" ); |
6625 | STMT_VINFO_LIVE_P (stmt_info) = false; |
6626 | mark_visited = true; |
6627 | } |
6628 | } |
6629 | if (mark_visited) |
	svisited.add (stmt_info);
6631 | } |
6632 | |
6633 | slp_tree child; |
6634 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6635 | if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6637 | scalar_use_map, svisited, visited); |
6638 | } |
6639 | |
6640 | /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that |
6641 | are live outside of the basic-block vectorized region and that can be |
6642 | vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */ |
6643 | |
6644 | static void |
6645 | vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo) |
6646 | { |
6647 | if (bb_vinfo->slp_instances.is_empty ()) |
6648 | return; |
6649 | |
6650 | hash_set<stmt_vec_info> svisited; |
6651 | hash_set<slp_tree> visited; |
6652 | hash_map<tree, int> scalar_use_map; |
6653 | auto_vec<slp_tree> worklist; |
6654 | |
6655 | for (slp_instance instance : bb_vinfo->slp_instances) |
6656 | { |
6657 | if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc) |
6658 | for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance)) |
6659 | if (TREE_CODE (op) == SSA_NAME) |
	    scalar_use_map.put (op, 1);
6661 | if (!visited.add (SLP_INSTANCE_TREE (instance))) |
6662 | worklist.safe_push (SLP_INSTANCE_TREE (instance)); |
6663 | } |
6664 | |
6665 | do |
6666 | { |
6667 | slp_tree node = worklist.pop (); |
6668 | |
6669 | if (SLP_TREE_DEF_TYPE (node) == vect_external_def) |
6670 | { |
6671 | for (tree op : SLP_TREE_SCALAR_OPS (node)) |
6672 | if (TREE_CODE (op) == SSA_NAME) |
	      scalar_use_map.put (op, 1);
6674 | } |
6675 | else |
6676 | { |
6677 | for (slp_tree child : SLP_TREE_CHILDREN (node)) |
	    if (child && !visited.add (child))
	      worklist.safe_push (child);
6680 | } |
6681 | } |
6682 | while (!worklist.is_empty ()); |
6683 | |
6684 | visited.empty (); |
6685 | |
6686 | for (slp_instance instance : bb_vinfo->slp_instances) |
6687 | { |
6688 | vect_location = instance->location (); |
6689 | vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance), |
				   instance, &instance->cost_vec,
6691 | scalar_use_map, svisited, visited); |
6692 | } |
6693 | } |
6694 | |
6695 | /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */ |
6696 | |
6697 | static bool |
6698 | vectorizable_bb_reduc_epilogue (slp_instance instance, |
6699 | stmt_vector_for_cost *cost_vec) |
6700 | { |
  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6703 | if (reduc_code == MINUS_EXPR) |
6704 | reduc_code = PLUS_EXPR; |
6705 | internal_fn reduc_fn; |
6706 | tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance)); |
6707 | if (!vectype |
6708 | || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn) |
6709 | || reduc_fn == IFN_LAST |
6710 | || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH) |
6711 | || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)), |
6712 | TREE_TYPE (vectype))) |
6713 | { |
6714 | if (dump_enabled_p ()) |
6715 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6716 | "not vectorized: basic block reduction epilogue " |
6717 | "operation unsupported.\n" ); |
6718 | return false; |
6719 | } |
6720 | |
6721 | /* There's no way to cost a horizontal vector reduction via REDUC_FN so |
6722 | cost log2 vector operations plus shuffles and one extraction. */ |
  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6724 | record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0], |
6725 | vectype, 0, vect_body); |
6726 | record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0], |
6727 | vectype, 0, vect_body); |
6728 | record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0], |
6729 | vectype, 0, vect_body); |
6730 | |
6731 | /* Since we replace all stmts of a possibly longer scalar reduction |
6732 | chain account for the extra scalar stmts for that. */ |
  record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
		    instance->root_stmts[0], 0, vect_body);
6735 | return true; |
6736 | } |
6737 | |
6738 | /* Prune from ROOTS all stmts that are computed as part of lanes of NODE |
6739 | and recurse to children. */ |
6740 | |
6741 | static void |
6742 | vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots, |
6743 | hash_set<slp_tree> &visited) |
6744 | { |
6745 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def |
      || visited.add (node))
6747 | return; |
6748 | |
6749 | stmt_vec_info stmt; |
6750 | unsigned i; |
6751 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt) |
    roots.remove (vect_orig_stmt (stmt));
6753 | |
6754 | slp_tree child; |
6755 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6756 | if (child) |
      vect_slp_prune_covered_roots (child, roots, visited);
6758 | } |
6759 | |
6760 | /* Analyze statements in SLP instances of VINFO. Return true if the |
6761 | operations are supported. */ |
6762 | |
6763 | bool |
6764 | vect_slp_analyze_operations (vec_info *vinfo) |
6765 | { |
6766 | slp_instance instance; |
6767 | int i; |
6768 | |
6769 | DUMP_VECT_SCOPE ("vect_slp_analyze_operations" ); |
6770 | |
6771 | hash_set<slp_tree> visited; |
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6773 | { |
6774 | auto_vec<slp_tree> visited_vec; |
6775 | stmt_vector_for_cost cost_vec; |
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
6783 | /* CTOR instances require vectorized defs for the SLP tree root. */ |
6784 | || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor |
6785 | && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) |
6786 | != vect_internal_def |
6787 | /* Make sure we vectorized with the expected type. */ |
6788 | || !useless_type_conversion_p |
6789 | (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1 |
6790 | (instance->root_stmts[0]->stmt))), |
6791 | TREE_TYPE (SLP_TREE_VECTYPE |
6792 | (SLP_INSTANCE_TREE (instance)))))) |
6793 | /* Check we can vectorize the reduction. */ |
6794 | || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc |
6795 | && !vectorizable_bb_reduc_epilogue (instance, cost_vec: &cost_vec))) |
6796 | { |
6797 | slp_tree node = SLP_INSTANCE_TREE (instance); |
6798 | stmt_vec_info stmt_info; |
6799 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
6800 | stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0]; |
6801 | else |
6802 | stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
6803 | if (dump_enabled_p ()) |
6804 | dump_printf_loc (MSG_NOTE, vect_location, |
6805 | "removing SLP instance operations starting from: %G" , |
6806 | stmt_info->stmt); |
6807 | vect_free_slp_instance (instance); |
	  vinfo->slp_instances.ordered_remove (i);
6809 | cost_vec.release (); |
6810 | while (!visited_vec.is_empty ()) |
	    visited.remove (visited_vec.pop ());
6812 | } |
6813 | else |
6814 | { |
6815 | i++; |
	  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
	    {
	      add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6819 | cost_vec.release (); |
6820 | } |
6821 | else |
6822 | /* For BB vectorization remember the SLP graph entry |
6823 | cost for later. */ |
6824 | instance->cost_vec = cost_vec; |
6825 | } |
6826 | } |
6827 | |
6828 | /* Now look for SLP instances with a root that are covered by other |
6829 | instances and remove them. */ |
6830 | hash_set<stmt_vec_info> roots; |
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6832 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
6833 | roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]); |
6834 | if (!roots.is_empty ()) |
6835 | { |
6836 | visited.empty (); |
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6838 | vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots, |
6839 | visited); |
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6841 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty () |
6842 | && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0])) |
6843 | { |
6844 | stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0]; |
6845 | if (dump_enabled_p ()) |
6846 | dump_printf_loc (MSG_NOTE, vect_location, |
6847 | "removing SLP instance operations starting " |
6848 | "from: %G" , root->stmt); |
6849 | vect_free_slp_instance (instance); |
	    vinfo->slp_instances.ordered_remove (i);
6851 | } |
6852 | else |
6853 | ++i; |
6854 | } |
6855 | |
6856 | /* Compute vectorizable live stmts. */ |
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6858 | vect_bb_slp_mark_live_stmts (bb_vinfo); |
6859 | |
6860 | return !vinfo->slp_instances.is_empty (); |
6861 | } |
6862 | |
/* Get the SLP instance leader from INSTANCE_LEADER, transitively
   following and thereby compressing any chain of leaders.  */
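
/* For instance (illustrative): with INSTANCE_LEADER containing A -> B,
   B -> C and C -> C, get_ultimate_leader (A) returns C and rewrites the
   entries for A and B to point directly at C, so later lookups are
   constant time: the usual union-find path compression.  */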
6865 | |
6866 | static slp_instance |
6867 | get_ultimate_leader (slp_instance instance, |
6868 | hash_map<slp_instance, slp_instance> &instance_leader) |
6869 | { |
6870 | auto_vec<slp_instance *, 8> chain; |
6871 | slp_instance *tem; |
  while (*(tem = instance_leader.get (instance)) != instance)
    {
      chain.safe_push (tem);
6875 | instance = *tem; |
6876 | } |
6877 | while (!chain.is_empty ()) |
6878 | *chain.pop () = instance; |
6879 | return instance; |
6880 | } |
6881 | |
6882 | namespace { |
6883 | /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in |
6884 | KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping |
6885 | for KEY. Return true if KEY was already in KEY_TO_INSTANCE. |
6886 | |
6887 | INSTANCE_LEADER is as for get_ultimate_leader. */ |
6888 | |
6889 | template<typename T> |
6890 | bool |
6891 | vect_map_to_instance (slp_instance instance, T key, |
6892 | hash_map<T, slp_instance> &key_to_instance, |
6893 | hash_map<slp_instance, slp_instance> &instance_leader) |
6894 | { |
6895 | bool existed_p; |
6896 | slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p); |
6897 | if (!existed_p) |
6898 | ; |
6899 | else if (key_instance != instance) |
6900 | { |
6901 | /* If we're running into a previously marked key make us the |
6902 | leader of the current ultimate leader. This keeps the |
6903 | leader chain acyclic and works even when the current instance |
6904 | connects two previously independent graph parts. */ |
6905 | slp_instance key_leader |
	= get_ultimate_leader (key_instance, instance_leader);
      if (key_leader != instance)
	instance_leader.put (key_leader, instance);
6909 | } |
6910 | key_instance = instance; |
6911 | return existed_p; |
6912 | } |
6913 | } |
6914 | |
6915 | /* Worker of vect_bb_partition_graph, recurse on NODE. */ |
6916 | |
6917 | static void |
6918 | vect_bb_partition_graph_r (bb_vec_info bb_vinfo, |
6919 | slp_instance instance, slp_tree node, |
6920 | hash_map<stmt_vec_info, slp_instance> &stmt_to_instance, |
6921 | hash_map<slp_tree, slp_instance> &node_to_instance, |
6922 | hash_map<slp_instance, slp_instance> &instance_leader) |
6923 | { |
6924 | stmt_vec_info stmt_info; |
6925 | unsigned i; |
6926 | |
6927 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
    vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6929 | instance_leader); |
6930 | |
  if (vect_map_to_instance (instance, node, node_to_instance,
6932 | instance_leader)) |
6933 | return; |
6934 | |
6935 | slp_tree child; |
6936 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6937 | if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6939 | node_to_instance, instance_leader); |
6940 | } |
6941 | |
6942 | /* Partition the SLP graph into pieces that can be costed independently. */ |
6943 | |
6944 | static void |
6945 | vect_bb_partition_graph (bb_vec_info bb_vinfo) |
6946 | { |
6947 | DUMP_VECT_SCOPE ("vect_bb_partition_graph" ); |
6948 | |
6949 | /* First walk the SLP graph assigning each involved scalar stmt a |
6950 | corresponding SLP graph entry and upon visiting a previously |
     marked stmt, make the stmt's leader the current SLP graph entry.  */
6952 | hash_map<stmt_vec_info, slp_instance> stmt_to_instance; |
6953 | hash_map<slp_tree, slp_instance> node_to_instance; |
6954 | hash_map<slp_instance, slp_instance> instance_leader; |
6955 | slp_instance instance; |
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      instance_leader.put (instance, instance);
6959 | vect_bb_partition_graph_r (bb_vinfo, |
6960 | instance, SLP_INSTANCE_TREE (instance), |
6961 | stmt_to_instance, node_to_instance, |
6962 | instance_leader); |
6963 | } |
6964 | |
6965 | /* Then collect entries to each independent subgraph. */ |
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6967 | { |
6968 | slp_instance leader = get_ultimate_leader (instance, instance_leader); |
      leader->subgraph_entries.safe_push (instance);
6970 | if (dump_enabled_p () |
6971 | && leader != instance) |
6972 | dump_printf_loc (MSG_NOTE, vect_location, |
6973 | "instance %p is leader of %p\n" , |
6974 | (void *) leader, (void *) instance); |
6975 | } |
6976 | } |
6977 | |
6978 | /* Compute the set of scalar stmts participating in internal and external |
6979 | nodes. */ |
6980 | |
6981 | static void |
6982 | vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node, |
6983 | hash_set<slp_tree> &visited, |
6984 | hash_set<stmt_vec_info> &vstmts, |
6985 | hash_set<stmt_vec_info> &estmts) |
6986 | { |
6987 | int i; |
6988 | stmt_vec_info stmt_info; |
6989 | slp_tree child; |
6990 | |
  if (visited.add (node))
6992 | return; |
6993 | |
6994 | if (SLP_TREE_DEF_TYPE (node) == vect_internal_def) |
6995 | { |
6996 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
	vstmts.add (stmt_info);
6998 | |
6999 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
7000 | if (child) |
	  vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
7002 | vstmts, estmts); |
7003 | } |
7004 | else |
7005 | for (tree def : SLP_TREE_SCALAR_OPS (node)) |
7006 | { |
7007 | stmt_vec_info def_stmt = vinfo->lookup_def (def); |
7008 | if (def_stmt) |
	  estmts.add (def_stmt);
7010 | } |
7011 | } |
7012 | |
7013 | |
/* Compute the scalar cost of the SLP node NODE and its children,
   recording it in COST_VEC.  Do not account defs that are marked in
   LIFE, and update LIFE according to uses of NODE.  */
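
/* A sketch of the LIFE protocol (illustrative): LIFE[i] set on entry
   means lane i's scalar stmt is kept live anyway, so it is not costed;
   when a lane's def turns out to have a non-vectorized use, LIFE[i] is
   set here so that child lanes feeding only that lane are not costed
   either.  */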
7017 | |
7018 | static void |
7019 | vect_bb_slp_scalar_cost (vec_info *vinfo, |
7020 | slp_tree node, vec<bool, va_heap> *life, |
7021 | stmt_vector_for_cost *cost_vec, |
7022 | hash_set<stmt_vec_info> &vectorized_scalar_stmts, |
7023 | hash_set<slp_tree> &visited) |
7024 | { |
7025 | unsigned i; |
7026 | stmt_vec_info stmt_info; |
7027 | slp_tree child; |
7028 | |
  if (visited.add (node))
7030 | return; |
7031 | |
7032 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
7033 | { |
7034 | ssa_op_iter op_iter; |
7035 | def_operand_p def_p; |
7036 | |
7037 | if ((*life)[i]) |
7038 | continue; |
7039 | |
7040 | stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
7041 | gimple *orig_stmt = orig_stmt_info->stmt; |
7042 | |
7043 | /* If there is a non-vectorized use of the defs then the scalar |
7044 | stmt is kept live in which case we do not account it or any |
7045 | required defs in the SLP children in the scalar cost. This |
7046 | way we make the vectorization more costly when compared to |
7047 | the scalar cost. */ |
7048 | if (!STMT_VINFO_LIVE_P (stmt_info)) |
7049 | { |
7050 | auto_vec<gimple *, 8> worklist; |
7051 | hash_set<gimple *> *worklist_visited = NULL; |
	  worklist.quick_push (orig_stmt);
7053 | do |
7054 | { |
7055 | gimple *work_stmt = worklist.pop (); |
7056 | FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF) |
7057 | { |
7058 | imm_use_iterator use_iter; |
7059 | gimple *use_stmt; |
7060 | FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, |
7061 | DEF_FROM_PTR (def_p)) |
		    if (!is_gimple_debug (use_stmt))
7063 | { |
7064 | stmt_vec_info use_stmt_info |
7065 | = vinfo->lookup_stmt (use_stmt); |
7066 | if (!use_stmt_info |
			    || !vectorized_scalar_stmts.contains (use_stmt_info))
7068 | { |
7069 | if (use_stmt_info |
7070 | && STMT_VINFO_IN_PATTERN_P (use_stmt_info)) |
7071 | { |
7072 | /* For stmts participating in patterns we have |
7073 | to check its uses recursively. */ |
7074 | if (!worklist_visited) |
7075 | worklist_visited = new hash_set<gimple *> (); |
			      if (!worklist_visited->add (use_stmt))
				worklist.safe_push (use_stmt);
7078 | continue; |
7079 | } |
7080 | (*life)[i] = true; |
7081 | goto next_lane; |
7082 | } |
7083 | } |
7084 | } |
7085 | } |
7086 | while (!worklist.is_empty ()); |
7087 | next_lane: |
7088 | if (worklist_visited) |
7089 | delete worklist_visited; |
7090 | if ((*life)[i]) |
7091 | continue; |
7092 | } |
7093 | |
7094 | /* Count scalar stmts only once. */ |
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);
7098 | |
7099 | vect_cost_for_stmt kind; |
7100 | if (STMT_VINFO_DATA_REF (orig_stmt_info)) |
7101 | { |
7102 | if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info))) |
7103 | kind = scalar_load; |
7104 | else |
7105 | kind = scalar_store; |
7106 | } |
7107 | else if (vect_nop_conversion_p (orig_stmt_info)) |
7108 | continue; |
7109 | /* For single-argument PHIs assume coalescing which means zero cost |
7110 | for the scalar and the vector PHIs. This avoids artificially |
7111 | favoring the vector path (but may pessimize it in some cases). */ |
      else if (is_a <gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7115 | continue; |
7116 | else |
7117 | kind = scalar_stmt; |
7118 | record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, |
7119 | SLP_TREE_VECTYPE (node), 0, vect_body); |
7120 | } |
7121 | |
7122 | auto_vec<bool, 20> subtree_life; |
7123 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
7124 | { |
7125 | if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
7126 | { |
7127 | /* Do not directly pass LIFE to the recursive call, copy it to |
7128 | confine changes in the callee to the current child/subtree. */ |
7129 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
7130 | { |
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7132 | for (unsigned j = 0; |
7133 | j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j) |
7134 | { |
7135 | auto perm = SLP_TREE_LANE_PERMUTATION (node)[j]; |
7136 | if (perm.first == i) |
7137 | subtree_life[perm.second] = (*life)[j]; |
7138 | } |
7139 | } |
7140 | else |
7141 | { |
7142 | gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child)); |
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   vectorized_scalar_stmts, visited);
	  subtree_life.truncate (0);
7148 | } |
7149 | } |
7150 | } |
7151 | |
7152 | /* Comparator for the loop-index sorted cost vectors. */ |
7153 | |
7154 | static int |
7155 | li_cost_vec_cmp (const void *a_, const void *b_) |
7156 | { |
7157 | auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_; |
7158 | auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_; |
7159 | if (a->first < b->first) |
7160 | return -1; |
7161 | else if (a->first == b->first) |
7162 | return 0; |
7163 | return 1; |
7164 | } |
7165 | |
7166 | /* Check if vectorization of the basic block is profitable for the |
7167 | subgraph denoted by SLP_INSTANCES. */ |
7168 | |
7169 | static bool |
7170 | vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, |
7171 | vec<slp_instance> slp_instances, |
7172 | loop_p orig_loop) |
7173 | { |
7174 | slp_instance instance; |
7175 | int i; |
7176 | unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0; |
7177 | unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0; |
7178 | |
7179 | if (dump_enabled_p ()) |
7180 | { |
7181 | dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n" ); |
7182 | hash_set<slp_tree> visited; |
7183 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
	vect_print_slp_graph (MSG_NOTE, vect_location,
7185 | SLP_INSTANCE_TREE (instance), visited); |
7186 | } |
7187 | |
7188 | /* Compute the set of scalar stmts we know will go away 'locally' when |
7189 | vectorizing. This used to be tracked with just PURE_SLP_STMT but that's |
7190 | not accurate for nodes promoted extern late or for scalar stmts that |
7191 | are used both in extern defs and in vectorized defs. */ |
7192 | hash_set<stmt_vec_info> vectorized_scalar_stmts; |
7193 | hash_set<stmt_vec_info> scalar_stmts_in_externs; |
7194 | hash_set<slp_tree> visited; |
7195 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
7196 | { |
      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
					       SLP_INSTANCE_TREE (instance),
					       visited,
					       vectorized_scalar_stmts,
					       scalar_stmts_in_externs);
      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
	vectorized_scalar_stmts.add (rstmt);
7204 | } |
  /* Scalar stmts used as defs in external nodes need to be preserved, so
7206 | remove them from vectorized_scalar_stmts. */ |
7207 | for (stmt_vec_info stmt : scalar_stmts_in_externs) |
    vectorized_scalar_stmts.remove (stmt);
7209 | |
7210 | /* Calculate scalar cost and sum the cost for the vector stmts |
7211 | previously collected. */ |
7212 | stmt_vector_for_cost scalar_costs = vNULL; |
7213 | stmt_vector_for_cost vector_costs = vNULL; |
7214 | visited.empty (); |
7215 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
7216 | { |
7217 | auto_vec<bool, 20> life; |
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
			      true);
7220 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  scalar_stmt,
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, vectorized_scalar_stmts,
			       visited);
      vector_costs.safe_splice (instance->cost_vec);
7230 | instance->cost_vec.release (); |
7231 | } |
7232 | |
7233 | if (dump_enabled_p ()) |
7234 | dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n" ); |
7235 | |
  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may not be entered or may execute an arbitrary
     number of iterations (??? static information can provide more
     precise info here) which means we can simply cost each containing
     loop's stmts separately.  */
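  /* Illustrative example: for a subgraph spanning loops 1 and 2 the
     vector cost must not exceed the scalar cost for the loop-1 stmts and
     for the loop-2 stmts independently; a saving of 2 in loop 1 cannot
     pay for a loss of 1 in loop 2 since the loops' trip counts are
     unknown.  */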
7242 | |
7243 | /* First produce cost vectors sorted by loop index. */ |
7244 | auto_vec<std::pair<unsigned, stmt_info_for_cost *> > |
7245 | li_scalar_costs (scalar_costs.length ()); |
7246 | auto_vec<std::pair<unsigned, stmt_info_for_cost *> > |
7247 | li_vector_costs (vector_costs.length ()); |
7248 | stmt_info_for_cost *cost; |
7249 | FOR_EACH_VEC_ELT (scalar_costs, i, cost) |
7250 | { |
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
7253 | } |
  /* Use an arbitrary loop from the scalar costs as a fallback in case the
     first vector_costs entry does not have a stmt_info associated with
     it.  */
7256 | unsigned l = li_scalar_costs[0].first; |
7257 | FOR_EACH_VEC_ELT (vector_costs, i, cost) |
7258 | { |
      /* We inherit the loop index from the previous COST; invariants,
	 externals and extracts immediately follow the cost for the
	 related stmt.  */
7261 | if (cost->stmt_info) |
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
7264 | } |
7265 | li_scalar_costs.qsort (li_cost_vec_cmp); |
7266 | li_vector_costs.qsort (li_cost_vec_cmp); |
7267 | |
7268 | /* Now cost the portions individually. */ |
7269 | unsigned vi = 0; |
7270 | unsigned si = 0; |
7271 | bool profitable = true; |
7272 | while (si < li_scalar_costs.length () |
7273 | && vi < li_vector_costs.length ()) |
7274 | { |
7275 | unsigned sl = li_scalar_costs[si].first; |
7276 | unsigned vl = li_vector_costs[vi].first; |
7277 | if (sl != vl) |
7278 | { |
7279 | if (dump_enabled_p ()) |
7280 | dump_printf_loc (MSG_NOTE, vect_location, |
7281 | "Scalar %d and vector %d loop part do not " |
7282 | "match up, skipping scalar part\n" , sl, vl); |
7283 | /* Skip the scalar part, assuming zero cost on the vector side. */ |
7284 | do |
7285 | { |
7286 | si++; |
7287 | } |
7288 | while (si < li_scalar_costs.length () |
7289 | && li_scalar_costs[si].first == sl); |
7290 | continue; |
7291 | } |
7292 | |
      class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7294 | do |
7295 | { |
	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7297 | si++; |
7298 | } |
7299 | while (si < li_scalar_costs.length () |
7300 | && li_scalar_costs[si].first == sl); |
7301 | unsigned dummy; |
      finish_cost (scalar_target_cost_data, nullptr,
		   &dummy, &scalar_cost, &dummy);
7304 | |
7305 | /* Complete the target-specific vector cost calculation. */ |
      class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7307 | do |
7308 | { |
	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7310 | vi++; |
7311 | } |
7312 | while (vi < li_vector_costs.length () |
7313 | && li_vector_costs[vi].first == vl); |
      finish_cost (vect_target_cost_data, scalar_target_cost_data,
		   &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7316 | delete scalar_target_cost_data; |
7317 | delete vect_target_cost_data; |
7318 | |
7319 | vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; |
7320 | |
7321 | if (dump_enabled_p ()) |
7322 | { |
7323 | dump_printf_loc (MSG_NOTE, vect_location, |
7324 | "Cost model analysis for part in loop %d:\n" , sl); |
7325 | dump_printf (MSG_NOTE, " Vector cost: %d\n" , |
7326 | vec_inside_cost + vec_outside_cost); |
7327 | dump_printf (MSG_NOTE, " Scalar cost: %d\n" , scalar_cost); |
7328 | } |
7329 | |
7330 | /* Vectorization is profitable if its cost is less than the cost of the
7331 | scalar version.  Note that we err on the vector side for equal cost
7332 | because the cost estimate is otherwise quite pessimistic (constant
7333 | uses are free on the scalar side but cost a load on the vector side,
7334 | for example).  */
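/* For instance, vec_inside_cost 6 plus vec_outside_cost 2 against a
scalar_cost of 8 keeps the subgraph (the tie errs on the vector side),
while a scalar_cost of 7 would reject it.  */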
7335 | if (vec_outside_cost + vec_inside_cost > scalar_cost) |
7336 | { |
7337 | profitable = false; |
7338 | break; |
7339 | } |
7340 | } |
7341 | if (profitable && vi < li_vector_costs.length ()) |
7342 | { |
7343 | if (dump_enabled_p ()) |
7344 | dump_printf_loc (MSG_NOTE, vect_location, |
7345 | "Excess vector cost for part in loop %d:\n" , |
7346 | li_vector_costs[vi].first); |
7347 | profitable = false; |
7348 | } |
7349 | |
7350 | /* Unset the visited flag.  This is delayed when the subgraph is profitable
7351 | and we process the loop for remaining unvectorized if-converted code. */ |
7352 | if (!orig_loop || !profitable) |
7353 | FOR_EACH_VEC_ELT (scalar_costs, i, cost) |
7354 | gimple_set_visited (stmt: cost->stmt_info->stmt, visited_p: false); |
7355 | |
7356 | scalar_costs.release (); |
7357 | vector_costs.release (); |
7358 | |
7359 | return profitable; |
7360 | } |
7361 | |
7362 | /* qsort comparator for lane defs. */ |
7363 | |
7364 | static int |
7365 | vld_cmp (const void *a_, const void *b_) |
7366 | { |
7367 | auto *a = (const std::pair<unsigned, tree> *)a_; |
7368 | auto *b = (const std::pair<unsigned, tree> *)b_; |
7369 | return a->first - b->first; |
7370 | } |
7371 | |
7372 | /* Return true if USE_STMT is a vector lane insert into VEC and set |
7373 | *THIS_LANE to the number of the inserted lane.  */
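/* E.g., assuming 32-bit vector elements, the following insert of
scalar_3 into lane 2 of veca_1 is matched, the bit position (rhs3)
being a constant multiple of the element size:
vecb_2 = BIT_INSERT_EXPR <veca_1, scalar_3, 64>;  */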
7374 | |
7375 | static bool |
7376 | vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane) |
7377 | { |
7378 | gassign *use_ass = dyn_cast <gassign *> (p: use_stmt); |
7379 | if (!use_ass |
7380 | || gimple_assign_rhs_code (gs: use_ass) != BIT_INSERT_EXPR |
7381 | || (vec |
7382 | ? gimple_assign_rhs1 (gs: use_ass) != vec |
7383 | : ((vec = gimple_assign_rhs1 (gs: use_ass)), false)) |
7384 | || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)), |
7385 | TREE_TYPE (gimple_assign_rhs2 (use_ass))) |
7386 | || !constant_multiple_p |
7387 | (a: tree_to_poly_uint64 (gimple_assign_rhs3 (gs: use_ass)), |
7388 | b: tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))), |
7389 | multiple: this_lane)) |
7390 | return false; |
7391 | return true; |
7392 | } |
7393 | |
7394 | /* Find any vectorizable constructors, lane-insert chains and reduction
7395 | chains in the region and record them as SLP instance roots.  */
7396 | |
7397 | static void |
7398 | vect_slp_check_for_roots (bb_vec_info bb_vinfo) |
7399 | { |
7400 | for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i) |
7401 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bb_vinfo->bbs[i]); |
7402 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
7403 | { |
7404 | gassign *assign = dyn_cast<gassign *> (p: gsi_stmt (i: gsi)); |
7405 | if (!assign) |
7406 | continue; |
7407 | |
7408 | tree rhs = gimple_assign_rhs1 (gs: assign); |
7409 | enum tree_code code = gimple_assign_rhs_code (gs: assign); |
7410 | use_operand_p use_p; |
7411 | gimple *use_stmt; |
7412 | if (code == CONSTRUCTOR) |
7413 | { |
7414 | if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) |
7415 | || maybe_ne (a: TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)), |
7416 | CONSTRUCTOR_NELTS (rhs)) |
7417 | || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value)) |
7418 | || uniform_vector_p (rhs)) |
7419 | continue; |
7420 | |
7421 | unsigned j; |
7422 | tree val; |
7423 | FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val) |
7424 | if (TREE_CODE (val) != SSA_NAME |
7425 | || !bb_vinfo->lookup_def (val)) |
7426 | break; |
7427 | if (j != CONSTRUCTOR_NELTS (rhs)) |
7428 | continue; |
7429 | |
7430 | vec<stmt_vec_info> roots = vNULL; |
7431 | roots.safe_push (obj: bb_vinfo->lookup_stmt (assign)); |
7432 | vec<stmt_vec_info> stmts; |
7433 | stmts.create (CONSTRUCTOR_NELTS (rhs)); |
7434 | FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val) |
7435 | stmts.quick_push |
7436 | (obj: vect_stmt_to_vectorize (stmt_info: bb_vinfo->lookup_def (val))); |
7437 | bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_ctor, |
7438 | stmts, roots)); |
7439 | } |
7440 | else if (code == BIT_INSERT_EXPR |
7441 | && VECTOR_TYPE_P (TREE_TYPE (rhs)) |
7442 | && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant () |
7443 | && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1 |
7444 | && integer_zerop (gimple_assign_rhs3 (gs: assign)) |
7445 | && useless_type_conversion_p |
7446 | (TREE_TYPE (TREE_TYPE (rhs)), |
7447 | TREE_TYPE (gimple_assign_rhs2 (assign))) |
7448 | && bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: assign))) |
7449 | { |
7450 | /* We start matching at an insert to lane zero, but since the
7451 | inserts need not be ordered we have to search both
7452 | the def and the use chains.  */
7453 | tree vectype = TREE_TYPE (rhs); |
7454 | unsigned nlanes = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant (); |
7455 | auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes); |
7456 | auto_sbitmap lanes (nlanes); |
7457 | bitmap_clear (lanes); |
7458 | bitmap_set_bit (map: lanes, bitno: 0); |
7459 | tree def = gimple_assign_lhs (gs: assign); |
7460 | lane_defs.quick_push |
7461 | (obj: std::make_pair (x: 0, y: gimple_assign_rhs2 (gs: assign))); |
7462 | unsigned lanes_found = 1; |
7463 | /* Start with the use chains; the last stmt will be the root.  */
7464 | stmt_vec_info last = bb_vinfo->lookup_stmt (assign); |
7465 | vec<stmt_vec_info> roots = vNULL; |
7466 | roots.safe_push (obj: last); |
7467 | do |
7468 | { |
7469 | use_operand_p use_p; |
7470 | gimple *use_stmt; |
7471 | if (!single_imm_use (var: def, use_p: &use_p, stmt: &use_stmt)) |
7472 | break; |
7473 | unsigned this_lane; |
7474 | if (!bb_vinfo->lookup_stmt (use_stmt) |
7475 | || !vect_slp_is_lane_insert (use_stmt, vec: def, this_lane: &this_lane) |
7476 | || !bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: use_stmt))) |
7477 | break; |
7478 | if (bitmap_bit_p (map: lanes, bitno: this_lane)) |
7479 | break; |
7480 | lanes_found++; |
7481 | bitmap_set_bit (map: lanes, bitno: this_lane); |
7482 | gassign *use_ass = as_a <gassign *> (p: use_stmt); |
7483 | lane_defs.quick_push (obj: std::make_pair |
7484 | (x&: this_lane, y: gimple_assign_rhs2 (gs: use_ass))); |
7485 | last = bb_vinfo->lookup_stmt (use_ass); |
7486 | roots.safe_push (obj: last); |
7487 | def = gimple_assign_lhs (gs: use_ass); |
7488 | } |
7489 | while (lanes_found < nlanes); |
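/* The instance root, i.e. the last lane insert found above, is
expected first, so move it to the front.  */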
7490 | if (roots.length () > 1) |
7491 | std::swap(a&: roots[0], b&: roots[roots.length () - 1]); |
7492 | if (lanes_found < nlanes) |
7493 | { |
7494 | /* Now search the def chain. */ |
7495 | def = gimple_assign_rhs1 (gs: assign); |
7496 | do |
7497 | { |
7498 | if (TREE_CODE (def) != SSA_NAME |
7499 | || !has_single_use (var: def)) |
7500 | break; |
7501 | gimple *def_stmt = SSA_NAME_DEF_STMT (def); |
7502 | unsigned this_lane; |
7503 | if (!bb_vinfo->lookup_stmt (def_stmt) |
7504 | || !vect_slp_is_lane_insert (use_stmt: def_stmt, |
7505 | NULL_TREE, this_lane: &this_lane) |
7506 | || !bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: def_stmt))) |
7507 | break; |
7508 | if (bitmap_bit_p (map: lanes, bitno: this_lane)) |
7509 | break; |
7510 | lanes_found++; |
7511 | bitmap_set_bit (map: lanes, bitno: this_lane); |
7512 | lane_defs.quick_push (obj: std::make_pair |
7513 | (x&: this_lane, |
7514 | y: gimple_assign_rhs2 (gs: def_stmt))); |
7515 | roots.safe_push (obj: bb_vinfo->lookup_stmt (def_stmt)); |
7516 | def = gimple_assign_rhs1 (gs: def_stmt); |
7517 | } |
7518 | while (lanes_found < nlanes); |
7519 | } |
7520 | if (lanes_found == nlanes) |
7521 | { |
7522 | /* Sort lane_defs by the lane index and register the root.  */
7523 | lane_defs.qsort (vld_cmp); |
7524 | vec<stmt_vec_info> stmts; |
7525 | stmts.create (nelems: nlanes); |
7526 | for (unsigned i = 0; i < nlanes; ++i) |
7527 | stmts.quick_push (obj: bb_vinfo->lookup_def (lane_defs[i].second)); |
7528 | bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_ctor, |
7529 | stmts, roots)); |
7530 | } |
7531 | else |
7532 | roots.release (); |
7533 | } |
7534 | else if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) |
7535 | && (associative_tree_code (code) || code == MINUS_EXPR) |
7536 | /* ??? This pessimizes a two-element reduction. PR54400. |
7537 | ??? In-order reduction could be handled if we only |
7538 | traverse one operand chain in vect_slp_linearize_chain. */ |
7539 | && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code) |
7540 | /* Ops with constants at the tail can be stripped here. */ |
7541 | && TREE_CODE (rhs) == SSA_NAME |
7542 | && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME |
7543 | /* Should be the chain end. */ |
7544 | && (!single_imm_use (var: gimple_assign_lhs (gs: assign), |
7545 | use_p: &use_p, stmt: &use_stmt) |
7546 | || !is_gimple_assign (gs: use_stmt) |
7547 | || (gimple_assign_rhs_code (gs: use_stmt) != code |
7548 | && ((code != PLUS_EXPR && code != MINUS_EXPR) |
7549 | || (gimple_assign_rhs_code (gs: use_stmt) |
7550 | != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR)))))) |
7551 | { |
7552 | /* We start the match at the end of a possible association |
7553 | chain. */ |
7554 | auto_vec<chain_op_t> chain; |
7555 | auto_vec<std::pair<tree_code, gimple *> > worklist; |
7556 | auto_vec<gimple *> chain_stmts; |
7557 | gimple *code_stmt = NULL, *alt_code_stmt = NULL; |
7558 | if (code == MINUS_EXPR) |
7559 | code = PLUS_EXPR; |
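/* Canonicalize to PLUS_EXPR: there is no minus reduction ifn and the
linearization below walks plus/minus chains together.  Chain entries
that end up with a MINUS_EXPR code are still rejected below.  */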
7560 | internal_fn reduc_fn; |
7561 | if (!reduction_fn_for_scalar_code (code, &reduc_fn) |
7562 | || reduc_fn == IFN_LAST) |
7563 | continue; |
7564 | vect_slp_linearize_chain (vinfo: bb_vinfo, worklist, chain, code, start: assign, |
7565 | /* ??? */ |
7566 | code_stmt, alt_code_stmt, chain_stmts: &chain_stmts); |
7567 | if (chain.length () > 1) |
7568 | { |
7569 | /* Sort the chain according to def_type and operation. */ |
7570 | chain.sort (cmp: dt_sort_cmp, data: bb_vinfo); |
7571 | /* ??? Now we'd want to strip externals and constants |
7572 | but record those to be handled in the epilogue. */ |
7573 | /* ??? For now do not allow mixing ops or externs/constants. */ |
7574 | bool invalid = false; |
7575 | unsigned remain_cnt = 0; |
7576 | unsigned last_idx = 0; |
7577 | for (unsigned i = 0; i < chain.length (); ++i) |
7578 | { |
7579 | if (chain[i].code != code) |
7580 | { |
7581 | invalid = true; |
7582 | break; |
7583 | } |
7584 | if (chain[i].dt != vect_internal_def |
7585 | /* Avoid stmts where the def is not the LHS, like |
7586 | ASMs. */ |
7587 | || (gimple_get_lhs (bb_vinfo->lookup_def |
7588 | (chain[i].op)->stmt) |
7589 | != chain[i].op)) |
7590 | remain_cnt++; |
7591 | else |
7592 | last_idx = i; |
7593 | } |
7594 | /* Make sure to have an even number of lanes as we later do |
7595 | all-or-nothing discovery, not trying to split further. */ |
7596 | if ((chain.length () - remain_cnt) & 1) |
7597 | remain_cnt++; |
7598 | if (!invalid && chain.length () - remain_cnt > 1) |
7599 | { |
7600 | vec<stmt_vec_info> stmts; |
7601 | vec<tree> remain = vNULL; |
7602 | stmts.create (nelems: chain.length ()); |
7603 | if (remain_cnt > 0) |
7604 | remain.create (nelems: remain_cnt); |
7605 | for (unsigned i = 0; i < chain.length (); ++i) |
7606 | { |
7607 | stmt_vec_info stmt_info; |
7608 | if (chain[i].dt == vect_internal_def |
7609 | && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)), |
7610 | gimple_get_lhs (stmt_info->stmt) == chain[i].op) |
7611 | && (i != last_idx |
7612 | || (stmts.length () & 1))) |
7613 | stmts.quick_push (obj: stmt_info); |
7614 | else |
7615 | remain.quick_push (obj: chain[i].op); |
7616 | } |
7617 | vec<stmt_vec_info> roots; |
7618 | roots.create (nelems: chain_stmts.length ()); |
7619 | for (unsigned i = 0; i < chain_stmts.length (); ++i) |
7620 | roots.quick_push (obj: bb_vinfo->lookup_stmt (chain_stmts[i])); |
7621 | bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_bb_reduc, |
7622 | stmts, roots, remain)); |
7623 | } |
7624 | } |
7625 | } |
7626 | } |
7627 | } |
7628 | |
7629 | /* Walk the grouped store chains and replace entries with their |
7630 | pattern variant if any. */ |
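/* E.g. when the group leader was replaced by a pattern stmt, that stmt
becomes the new DR_GROUP_FIRST_ELEMENT of all group members and
inherits the group size and gap of the original leader.  */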
7631 | |
7632 | static void |
7633 | vect_fixup_store_groups_with_patterns (vec_info *vinfo) |
7634 | { |
7635 | stmt_vec_info first_element; |
7636 | unsigned i; |
7637 | |
7638 | FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) |
7639 | { |
7640 | /* We also have CTORs in this array. */ |
7641 | if (!STMT_VINFO_GROUPED_ACCESS (first_element)) |
7642 | continue; |
7643 | if (STMT_VINFO_IN_PATTERN_P (first_element)) |
7644 | { |
7645 | stmt_vec_info orig = first_element; |
7646 | first_element = STMT_VINFO_RELATED_STMT (first_element); |
7647 | DR_GROUP_FIRST_ELEMENT (first_element) = first_element; |
7648 | DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig); |
7649 | DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig); |
7650 | DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig); |
7651 | vinfo->grouped_stores[i] = first_element; |
7652 | } |
7653 | stmt_vec_info prev = first_element; |
7654 | while (DR_GROUP_NEXT_ELEMENT (prev)) |
7655 | { |
7656 | stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev); |
7657 | if (STMT_VINFO_IN_PATTERN_P (elt)) |
7658 | { |
7659 | stmt_vec_info orig = elt; |
7660 | elt = STMT_VINFO_RELATED_STMT (elt); |
7661 | DR_GROUP_NEXT_ELEMENT (prev) = elt; |
7662 | DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig); |
7663 | DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig); |
7664 | } |
7665 | DR_GROUP_FIRST_ELEMENT (elt) = first_element; |
7666 | prev = elt; |
7667 | } |
7668 | } |
7669 | } |
7670 | |
7671 | /* Check if the region described by BB_VINFO can be vectorized, returning |
7672 | true if so. When returning false, set FATAL to true if the same failure |
7673 | would prevent vectorization at other vector sizes, false if it is still |
7674 | worth trying other sizes. N_STMTS is the number of statements in the |
7675 | region. */ |
7676 | |
7677 | static bool |
7678 | vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, |
7679 | vec<int> *dataref_groups) |
7680 | { |
7681 | DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7682 | |
7683 | slp_instance instance; |
7684 | int i; |
7685 | poly_uint64 min_vf = 2; |
7686 | |
7687 | /* The first group of checks is independent of the vector size. */ |
7688 | fatal = true; |
7689 | |
7690 | /* Analyze the data references. */ |
7691 | |
7692 | if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL)) |
7693 | { |
7694 | if (dump_enabled_p ()) |
7695 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7696 | "not vectorized: unhandled data-ref in basic " |
7697 | "block.\n" ); |
7698 | return false; |
7699 | } |
7700 | |
7701 | if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups)) |
7702 | { |
7703 | if (dump_enabled_p ()) |
7704 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7705 | "not vectorized: unhandled data access in " |
7706 | "basic block.\n" ); |
7707 | return false; |
7708 | } |
7709 | |
7710 | vect_slp_check_for_roots (bb_vinfo); |
7711 | |
7712 | /* If there are no grouped stores and no constructors in the region |
7713 | there is no need to continue with pattern recog as vect_analyze_slp |
7714 | will fail anyway. */ |
7715 | if (bb_vinfo->grouped_stores.is_empty () |
7716 | && bb_vinfo->roots.is_empty ()) |
7717 | { |
7718 | if (dump_enabled_p ()) |
7719 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7720 | "not vectorized: no grouped stores in " |
7721 | "basic block.\n" ); |
7722 | return false; |
7723 | } |
7724 | |
7725 | /* In contrast, the rest of the analysis below depends on the vector size in some way.  */
7726 | fatal = false; |
7727 | |
7728 | vect_pattern_recog (bb_vinfo); |
7729 | |
7730 | /* Update store groups from pattern processing. */ |
7731 | vect_fixup_store_groups_with_patterns (vinfo: bb_vinfo); |
7732 | |
7733 | /* Check the SLP opportunities in the basic block, analyze and build SLP |
7734 | trees. */ |
7735 | if (!vect_analyze_slp (vinfo: bb_vinfo, max_tree_size: n_stmts)) |
7736 | { |
7737 | if (dump_enabled_p ()) |
7738 | { |
7739 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7740 | "Failed to SLP the basic block.\n" ); |
7741 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7742 | "not vectorized: failed to find SLP opportunities " |
7743 | "in basic block.\n" ); |
7744 | } |
7745 | return false; |
7746 | } |
7747 | |
7748 | /* Optimize permutations. */ |
7749 | vect_optimize_slp (vinfo: bb_vinfo); |
7750 | |
7751 | /* Gather the loads reachable from the SLP graph entries. */ |
7752 | vect_gather_slp_loads (vinfo: bb_vinfo); |
7753 | |
7754 | vect_record_base_alignments (bb_vinfo); |
7755 | |
7756 | /* Analyze and verify the alignment of data references and the |
7757 | dependence in the SLP instances. */ |
7758 | for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (ix: i, ptr: &instance); ) |
7759 | { |
7760 | vect_location = instance->location (); |
7761 | if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance) |
7762 | || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance)) |
7763 | { |
7764 | slp_tree node = SLP_INSTANCE_TREE (instance); |
7765 | stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
7766 | if (dump_enabled_p ()) |
7767 | dump_printf_loc (MSG_NOTE, vect_location, |
7768 | "removing SLP instance operations starting from: %G" , |
7769 | stmt_info->stmt); |
7770 | vect_free_slp_instance (instance); |
7771 | BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (ix: i); |
7772 | continue; |
7773 | } |
7774 | |
7775 | /* Mark all the statements that we want to vectorize as pure SLP and |
7776 | relevant. */ |
7777 | vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); |
7778 | vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); |
7779 | unsigned j; |
7780 | stmt_vec_info root; |
7781 | /* Likewise consider instance root stmts as vectorized. */ |
7782 | FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root) |
7783 | STMT_SLP_TYPE (root) = pure_slp; |
7784 | |
7785 | i++; |
7786 | } |
7787 | if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) |
7788 | return false; |
7789 | |
7790 | if (!vect_slp_analyze_operations (vinfo: bb_vinfo)) |
7791 | { |
7792 | if (dump_enabled_p ()) |
7793 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7794 | "not vectorized: bad operation in basic block.\n" ); |
7795 | return false; |
7796 | } |
7797 | |
7798 | vect_bb_partition_graph (bb_vinfo); |
7799 | |
7800 | return true; |
7801 | } |
7802 | |
7803 | /* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
7804 | basic blocks in BBS, returning true on success. |
7805 | The region has N_STMTS statements and has the datarefs given by DATAREFS. */ |
7806 | |
7807 | static bool |
7808 | vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, |
7809 | vec<int> *dataref_groups, unsigned int n_stmts, |
7810 | loop_p orig_loop) |
7811 | { |
7812 | bb_vec_info bb_vinfo; |
7813 | auto_vector_modes vector_modes; |
7814 | |
7815 | /* Autodetect the first vector mode to try.  */
7816 | machine_mode next_vector_mode = VOIDmode; |
7817 | targetm.vectorize.autovectorize_vector_modes (&vector_modes, false); |
7818 | unsigned int mode_i = 0; |
7819 | |
7820 | vec_info_shared shared; |
7821 | |
7822 | machine_mode autodetected_vector_mode = VOIDmode; |
7823 | while (1) |
7824 | { |
7825 | bool vectorized = false; |
7826 | bool fatal = false; |
7827 | bb_vinfo = new _bb_vec_info (bbs, &shared); |
7828 | |
7829 | bool first_time_p = shared.datarefs.is_empty (); |
7830 | BB_VINFO_DATAREFS (bb_vinfo) = datarefs; |
7831 | if (first_time_p) |
7832 | bb_vinfo->shared->save_datarefs (); |
7833 | else |
7834 | bb_vinfo->shared->check_datarefs (); |
7835 | bb_vinfo->vector_mode = next_vector_mode; |
7836 | |
7837 | if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)) |
7838 | { |
7839 | if (dump_enabled_p ()) |
7840 | { |
7841 | dump_printf_loc (MSG_NOTE, vect_location, |
7842 | "***** Analysis succeeded with vector mode" |
7843 | " %s\n" , GET_MODE_NAME (bb_vinfo->vector_mode)); |
7844 | dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n" ); |
7845 | } |
7846 | |
7847 | bb_vinfo->shared->check_datarefs (); |
7848 | |
7849 | bool force_clear = false; |
7850 | auto_vec<slp_instance> profitable_subgraphs; |
7851 | for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo)) |
7852 | { |
7853 | if (instance->subgraph_entries.is_empty ()) |
7854 | continue; |
7855 | |
7856 | dump_user_location_t saved_vect_location = vect_location; |
7857 | vect_location = instance->location (); |
7858 | if (!unlimited_cost_model (NULL) |
7859 | && !vect_bb_vectorization_profitable_p |
7860 | (bb_vinfo, slp_instances: instance->subgraph_entries, orig_loop)) |
7861 | { |
7862 | if (dump_enabled_p ()) |
7863 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7864 | "not vectorized: vectorization is not " |
7865 | "profitable.\n" ); |
7866 | vect_location = saved_vect_location; |
7867 | continue; |
7868 | } |
7869 | |
7870 | vect_location = saved_vect_location; |
7871 | if (!dbg_cnt (index: vect_slp)) |
7872 | { |
7873 | force_clear = true; |
7874 | continue; |
7875 | } |
7876 | |
7877 | profitable_subgraphs.safe_push (obj: instance); |
7878 | } |
7879 | |
7880 | /* When we're vectorizing an if-converted loop body make sure |
7881 | we vectorized all if-converted code. */ |
7882 | if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop) |
7883 | { |
7884 | gcc_assert (bb_vinfo->bbs.length () == 1); |
7885 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bb_vinfo->bbs[0]); |
7886 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
7887 | { |
7888 | /* On profitable subgraphs the costing above left the now DCEable
7889 | vectorized scalar stmts with their visited flag set.  Do the
7890 | delayed clearing of the flag here.  */
7891 | if (gimple_visited_p (stmt: gsi_stmt (i: gsi))) |
7892 | { |
7893 | gimple_set_visited (stmt: gsi_stmt (i: gsi), visited_p: false); |
7894 | continue; |
7895 | } |
7896 | if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED) |
7897 | continue; |
7898 | |
7899 | if (gassign *ass = dyn_cast <gassign *> (p: gsi_stmt (i: gsi))) |
7900 | if (gimple_assign_rhs_code (gs: ass) == COND_EXPR) |
7901 | { |
7902 | if (!profitable_subgraphs.is_empty () |
7903 | && dump_enabled_p ()) |
7904 | dump_printf_loc (MSG_NOTE, vect_location, |
7905 | "not profitable because of " |
7906 | "unprofitable if-converted scalar " |
7907 | "code\n" ); |
7908 | profitable_subgraphs.truncate (size: 0); |
7909 | } |
7910 | } |
7911 | } |
7912 | |
7913 | /* Finally schedule the profitable subgraphs. */ |
7914 | for (slp_instance instance : profitable_subgraphs) |
7915 | { |
7916 | if (!vectorized && dump_enabled_p ()) |
7917 | dump_printf_loc (MSG_NOTE, vect_location, |
7918 | "Basic block will be vectorized " |
7919 | "using SLP\n" ); |
7920 | vectorized = true; |
7921 | |
7922 | /* Dump before scheduling, as store vectorization will remove
7923 | the original stores and mess with the instance tree,
7924 | so querying its location would eventually ICE.  */
7925 | if (flag_checking) |
7926 | for (slp_instance sub : instance->subgraph_entries) |
7927 | gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub))); |
7928 | unsigned HOST_WIDE_INT bytes; |
7929 | if (dump_enabled_p ()) |
7930 | for (slp_instance sub : instance->subgraph_entries) |
7931 | { |
7932 | tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)); |
7933 | if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (const_value: &bytes)) |
7934 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, |
7935 | sub->location (), |
7936 | "basic block part vectorized using %wu " |
7937 | "byte vectors\n" , bytes); |
7938 | else |
7939 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, |
7940 | sub->location (), |
7941 | "basic block part vectorized using " |
7942 | "variable length vectors\n" ); |
7943 | } |
7944 | |
7945 | dump_user_location_t saved_vect_location = vect_location; |
7946 | vect_location = instance->location (); |
7947 | |
7948 | vect_schedule_slp (bb_vinfo, instance->subgraph_entries); |
7949 | |
7950 | vect_location = saved_vect_location; |
7951 | } |
7952 | } |
7953 | else |
7954 | { |
7955 | if (dump_enabled_p ()) |
7956 | dump_printf_loc (MSG_NOTE, vect_location, |
7957 | "***** Analysis failed with vector mode %s\n" , |
7958 | GET_MODE_NAME (bb_vinfo->vector_mode)); |
7959 | } |
7960 | |
7961 | if (mode_i == 0) |
7962 | autodetected_vector_mode = bb_vinfo->vector_mode; |
7963 | |
7964 | if (!fatal) |
7965 | while (mode_i < vector_modes.length () |
7966 | && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i])) |
7967 | { |
7968 | if (dump_enabled_p ()) |
7969 | dump_printf_loc (MSG_NOTE, vect_location, |
7970 | "***** The result for vector mode %s would" |
7971 | " be the same\n" , |
7972 | GET_MODE_NAME (vector_modes[mode_i])); |
7973 | mode_i += 1; |
7974 | } |
7975 | |
7976 | delete bb_vinfo; |
7977 | |
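/* When the next mode and the autodetected mode are each other's
related_vector_mode the analysis would just repeat with the same
set of vector types, so skip the mode.  */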
7978 | if (mode_i < vector_modes.length () |
7979 | && VECTOR_MODE_P (autodetected_vector_mode) |
7980 | && (related_vector_mode (vector_modes[mode_i], |
7981 | GET_MODE_INNER (autodetected_vector_mode)) |
7982 | == autodetected_vector_mode) |
7983 | && (related_vector_mode (autodetected_vector_mode, |
7984 | GET_MODE_INNER (vector_modes[mode_i])) |
7985 | == vector_modes[mode_i])) |
7986 | { |
7987 | if (dump_enabled_p ()) |
7988 | dump_printf_loc (MSG_NOTE, vect_location, |
7989 | "***** Skipping vector mode %s, which would" |
7990 | " repeat the analysis for %s\n" , |
7991 | GET_MODE_NAME (vector_modes[mode_i]), |
7992 | GET_MODE_NAME (autodetected_vector_mode)); |
7993 | mode_i += 1; |
7994 | } |
7995 | |
7996 | if (vectorized |
7997 | || mode_i == vector_modes.length () |
7998 | || autodetected_vector_mode == VOIDmode |
7999 | /* If vect_slp_analyze_bb_1 signaled that analysis for all |
8000 | vector sizes will fail do not bother iterating. */ |
8001 | || fatal) |
8002 | return vectorized; |
8003 | |
8004 | /* Try the next vector mode in the list.  */
8005 | next_vector_mode = vector_modes[mode_i++]; |
8006 | if (dump_enabled_p ()) |
8007 | dump_printf_loc (MSG_NOTE, vect_location, |
8008 | "***** Re-trying analysis with vector mode %s\n" , |
8009 | GET_MODE_NAME (next_vector_mode)); |
8010 | } |
8011 | } |
8012 | |
8013 | |
8014 | /* Main entry for the BB vectorizer.  Analyze and transform BBS, returning
8015 | true if anything in the region was vectorized.  */
8016 | |
8017 | static bool |
8018 | vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop) |
8019 | { |
8020 | vec<data_reference_p> datarefs = vNULL; |
8021 | auto_vec<int> dataref_groups; |
8022 | int insns = 0; |
8023 | int current_group = 0; |
8024 | |
8025 | for (unsigned i = 0; i < bbs.length (); i++) |
8026 | { |
8027 | basic_block bb = bbs[i]; |
8028 | for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (i: gsi); |
8029 | gsi_next (i: &gsi)) |
8030 | { |
8031 | gimple *stmt = gsi_stmt (i: gsi); |
8032 | if (is_gimple_debug (gs: stmt)) |
8033 | continue; |
8034 | |
8035 | insns++; |
8036 | |
8037 | if (gimple_location (g: stmt) != UNKNOWN_LOCATION) |
8038 | vect_location = stmt; |
8039 | |
8040 | if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs, |
8041 | &dataref_groups, current_group)) |
8042 | ++current_group; |
8043 | } |
8044 | /* New BBs always start a new DR group. */ |
8045 | ++current_group; |
8046 | } |
8047 | |
8048 | return vect_slp_region (bbs, datarefs, dataref_groups: &dataref_groups, n_stmts: insns, orig_loop); |
8049 | } |
8050 | |
8051 | /* Special entry for the BB vectorizer. Analyze and transform a single |
8052 | if-converted BB with ORIG_LOOP's body being the non-if-converted
8053 | representation. Returns true if anything in the basic-block was |
8054 | vectorized. */ |
8055 | |
8056 | bool |
8057 | vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop) |
8058 | { |
8059 | auto_vec<basic_block> bbs; |
8060 | bbs.safe_push (obj: bb); |
8061 | return vect_slp_bbs (bbs, orig_loop); |
8062 | } |
8063 | |
8064 | /* Main entry for the BB vectorizer.  Analyze and transform the blocks of
8065 | FUN, returning true if anything was vectorized.  */
8066 | |
8067 | bool |
8068 | vect_slp_function (function *fun) |
8069 | { |
8070 | bool r = false; |
8071 | int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun)); |
8072 | auto_bitmap exit_bbs; |
8073 | bitmap_set_bit (exit_bbs, EXIT_BLOCK); |
8074 | edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun)); |
8075 | unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs, |
8076 | true, rpo, NULL); |
8077 | |
8078 | /* For the moment split the function into pieces to avoid making |
8079 | the iteration on the vector mode moot. Split at points we know |
8080 | to not handle well which is CFG merges (SLP discovery doesn't |
8081 | handle non-loop-header PHIs) and loop exits. Since pattern |
8082 | recog requires reverse iteration to visit uses before defs |
8083 | simply chop RPO into pieces. */ |
8084 | auto_vec<basic_block> bbs; |
8085 | for (unsigned i = 0; i < n; i++) |
8086 | { |
8087 | basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]); |
8088 | bool split = false; |
8089 | |
8090 | /* Split when a BB is not dominated by the first block. */ |
8091 | if (!bbs.is_empty () |
8092 | && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0])) |
8093 | { |
8094 | if (dump_enabled_p ()) |
8095 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8096 | "splitting region at dominance boundary bb%d\n" , |
8097 | bb->index); |
8098 | split = true; |
8099 | } |
8100 | /* Split when the loop determined by the first block |
8101 | is exited. This is because we eventually insert |
8102 | invariants at the start of the region.  */
8103 | else if (!bbs.is_empty () |
8104 | && bbs[0]->loop_father != bb->loop_father |
8105 | && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father)) |
8106 | { |
8107 | if (dump_enabled_p ()) |
8108 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8109 | "splitting region at loop %d exit at bb%d\n" , |
8110 | bbs[0]->loop_father->num, bb->index); |
8111 | split = true; |
8112 | } |
8113 | else if (!bbs.is_empty () |
8114 | && bb->loop_father->header == bb |
8115 | && bb->loop_father->dont_vectorize) |
8116 | { |
8117 | if (dump_enabled_p ()) |
8118 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8119 | "splitting region at dont-vectorize loop %d " |
8120 | "entry at bb%d\n" , |
8121 | bb->loop_father->num, bb->index); |
8122 | split = true; |
8123 | } |
8124 | |
8125 | if (split && !bbs.is_empty ()) |
8126 | { |
8127 | r |= vect_slp_bbs (bbs, NULL); |
8128 | bbs.truncate (size: 0); |
8129 | } |
8130 | |
8131 | if (bbs.is_empty ()) |
8132 | { |
8133 | /* We need to be able to insert at the head of the region, which
8134 | we cannot do for a region starting with a returns-twice call.  */
8135 | if (gcall *first = safe_dyn_cast <gcall *> (p: first_stmt (bb))) |
8136 | if (gimple_call_flags (first) & ECF_RETURNS_TWICE) |
8137 | { |
8138 | if (dump_enabled_p ()) |
8139 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8140 | "skipping bb%d as start of region as it " |
8141 | "starts with returns-twice call\n" , |
8142 | bb->index); |
8143 | continue; |
8144 | } |
8145 | /* If the loop this BB belongs to is marked as not to be vectorized |
8146 | honor that also for BB vectorization. */ |
8147 | if (bb->loop_father->dont_vectorize) |
8148 | continue; |
8149 | } |
8150 | |
8151 | bbs.safe_push (obj: bb); |
8152 | |
8153 | /* When a stmt ends this block and defines a value, inserting
8154 | a vector containing its definition after it would require
8155 | inserting on edges.  Avoid this for now.  */
8156 | if (gimple *last = *gsi_last_bb (bb)) |
8157 | if (gimple_get_lhs (last) |
8158 | && is_ctrl_altering_stmt (last)) |
8159 | { |
8160 | if (dump_enabled_p ()) |
8161 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8162 | "splitting region at control altering " |
8163 | "definition %G" , last); |
8164 | r |= vect_slp_bbs (bbs, NULL); |
8165 | bbs.truncate (size: 0); |
8166 | } |
8167 | } |
8168 | |
8169 | if (!bbs.is_empty ()) |
8170 | r |= vect_slp_bbs (bbs, NULL); |
8171 | |
8172 | free (ptr: rpo); |
8173 | |
8174 | return r; |
8175 | } |
8176 | |
8177 | /* Build a variable-length vector in which the elements in ELTS are repeated |
8178 | to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
8179 | RESULTS and add any new instructions to SEQ. |
8180 | |
8181 | The approach we use is: |
8182 | |
8183 | (1) Find a vector mode VM with integer elements of mode IM. |
8184 | |
8185 | (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of |
8186 | ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs |
8187 | from small vectors to IM. |
8188 | |
8189 | (3) Duplicate each ELTS'[I] into a vector of mode VM. |
8190 | |
8191 | (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the |
8192 | correct byte contents. |
8193 | |
8194 | (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type. |
8195 | |
8196 | We try to find the largest IM for which this sequence works, in order |
8197 | to cut down on the number of interleaves. */ |
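/* A sketch with target-dependent choices: for 32-bit ELTS { a, b, c, d }
and a VNx4SI VECTOR_TYPE we may pick IM = DImode and VM = VNx2DI.
Steps (2) and (3) build the duplicates { ab, ab, ... } and
{ cd, cd, ... }, a single interleaving VEC_PERM_EXPR in step (4)
yields { ab, cd, ab, cd, ... } and step (5) reinterprets that as
{ a, b, c, d, a, b, c, d, ... }.  */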
8198 | |
8199 | void |
8200 | duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type, |
8201 | const vec<tree> &elts, unsigned int nresults, |
8202 | vec<tree> &results) |
8203 | { |
8204 | unsigned int nelts = elts.length (); |
8205 | tree element_type = TREE_TYPE (vector_type); |
8206 | |
8207 | /* (1) Find a vector mode VM with integer elements of mode IM. */ |
8208 | unsigned int nvectors = 1; |
8209 | tree new_vector_type; |
8210 | tree permutes[2]; |
8211 | if (!can_duplicate_and_interleave_p (vinfo, count: nelts, elt_type: element_type, |
8212 | nvectors_out: &nvectors, vector_type_out: &new_vector_type, |
8213 | permutes)) |
8214 | gcc_unreachable (); |
8215 | |
8216 | /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */ |
8217 | unsigned int partial_nelts = nelts / nvectors; |
8218 | tree partial_vector_type = build_vector_type (element_type, partial_nelts); |
8219 | |
8220 | tree_vector_builder partial_elts; |
8221 | auto_vec<tree, 32> pieces (nvectors * 2); |
8222 | pieces.quick_grow_cleared (len: nvectors * 2); |
8223 | for (unsigned int i = 0; i < nvectors; ++i) |
8224 | { |
8225 | /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of |
8226 | ELTS' has mode IM. */ |
8227 | partial_elts.new_vector (type: partial_vector_type, npatterns: partial_nelts, nelts_per_pattern: 1); |
8228 | for (unsigned int j = 0; j < partial_nelts; ++j) |
8229 | partial_elts.quick_push (obj: elts[i * partial_nelts + j]); |
8230 | tree t = gimple_build_vector (seq, builder: &partial_elts); |
8231 | t = gimple_build (seq, code: VIEW_CONVERT_EXPR, |
8232 | TREE_TYPE (new_vector_type), ops: t); |
8233 | |
8234 | /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */ |
8235 | pieces[i] = gimple_build_vector_from_val (seq, type: new_vector_type, op: t); |
8236 | } |
8237 | |
8238 | /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the |
8239 | correct byte contents. |
8240 | |
8241 | Conceptually, we need to repeat the following operation log2(nvectors) |
8242 | times, where hi_start = nvectors / 2: |
8243 | |
8244 | out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute); |
8245 | out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute); |
8246 | |
8247 | However, if each input repeats every N elements and the VF is |
8248 | a multiple of N * 2, the HI result is the same as the LO result. |
8249 | This will be true for the first N1 iterations of the outer loop, |
8250 | followed by N2 iterations for which both the LO and HI results |
8251 | are needed. I.e.: |
8252 | |
8253 | N1 + N2 = log2(nvectors) |
8254 | |
8255 | Each "N1 iteration" doubles the number of redundant vectors and the |
8256 | effect of the process as a whole is to have a sequence of nvectors/2**N1 |
8257 | vectors that repeats 2**N1 times. Rather than generate these redundant |
8258 | vectors, we halve the number of vectors for each N1 iteration. */ |
8259 | unsigned int in_start = 0; |
8260 | unsigned int out_start = nvectors; |
8261 | unsigned int new_nvectors = nvectors; |
8262 | for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2) |
8263 | { |
8264 | unsigned int hi_start = new_nvectors / 2; |
8265 | unsigned int out_i = 0; |
8266 | for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i) |
8267 | { |
8268 | if ((in_i & 1) != 0 |
8269 | && multiple_p (a: TYPE_VECTOR_SUBPARTS (node: new_vector_type), |
8270 | b: 2 * in_repeat)) |
8271 | continue; |
8272 | |
8273 | tree output = make_ssa_name (var: new_vector_type); |
8274 | tree input1 = pieces[in_start + (in_i / 2)]; |
8275 | tree input2 = pieces[in_start + (in_i / 2) + hi_start]; |
8276 | gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR, |
8277 | input1, input2, |
8278 | permutes[in_i & 1]); |
8279 | gimple_seq_add_stmt (seq, stmt); |
8280 | pieces[out_start + out_i] = output; |
8281 | out_i += 1; |
8282 | } |
8283 | std::swap (a&: in_start, b&: out_start); |
8284 | new_nvectors = out_i; |
8285 | } |
8286 | |
8287 | /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */ |
8288 | results.reserve (nelems: nresults); |
8289 | for (unsigned int i = 0; i < nresults; ++i) |
8290 | if (i < new_nvectors) |
8291 | results.quick_push (obj: gimple_build (seq, code: VIEW_CONVERT_EXPR, type: vector_type, |
8292 | ops: pieces[in_start + i])); |
8293 | else |
8294 | results.quick_push (obj: results[i - new_nvectors]); |
8295 | } |
8296 | |
8297 | |
8298 | /* For constant and loop invariant defs in OP_NODE this function creates |
8299 | vector defs that will be used in the vectorized stmts and stores them |
8300 | to SLP_TREE_VEC_DEFS of OP_NODE. */ |
8301 | |
8302 | static void |
8303 | vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node) |
8304 | { |
8305 | unsigned HOST_WIDE_INT nunits; |
8306 | tree vec_cst; |
8307 | unsigned j, number_of_places_left_in_vector; |
8308 | tree vector_type; |
8309 | tree vop; |
8310 | int group_size = op_node->ops.length (); |
8311 | unsigned int vec_num, i; |
8312 | unsigned number_of_copies = 1; |
8313 | bool constant_p; |
8314 | gimple_seq ctor_seq = NULL; |
8315 | auto_vec<tree, 16> permute_results; |
8316 | |
8317 | /* We always want SLP_TREE_VECTYPE (op_node) to be set correctly here.  */
8318 | vector_type = SLP_TREE_VECTYPE (op_node); |
8319 | |
8320 | unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node); |
8321 | SLP_TREE_VEC_DEFS (op_node).create (nelems: number_of_vectors); |
8322 | auto_vec<tree> voprnds (number_of_vectors); |
8323 | |
8324 | /* NUMBER_OF_COPIES is the number of times we need to use the same values in |
8325 | created vectors. It is greater than 1 if unrolling is performed. |
8326 | |
8327 | For example, we have two scalar operands, s1 and s2 (e.g., group of |
8328 | strided accesses of size two), while NUNITS is four (i.e., four scalars |
8329 | of this type can be packed in a vector). The output vector will contain |
8330 | two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES |
8331 | will be 2). |
8332 | |
8333 | If GROUP_SIZE > NUNITS, the scalars will be split into several vectors |
8334 | containing the operands. |
8335 | |
8336 | For example, NUNITS is four as before, and the group size is 8 |
8337 | (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and |
8338 | {s5, s6, s7, s8}. */ |
8339 | |
8340 | /* When using duplicate_and_interleave, we just need one element for |
8341 | each scalar statement. */ |
8342 | if (!TYPE_VECTOR_SUBPARTS (node: vector_type).is_constant (const_value: &nunits)) |
8343 | nunits = group_size; |
8344 | |
8345 | number_of_copies = nunits * number_of_vectors / group_size; |
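/* E.g. for the group { s1, s2 } and NUNITS 4 from above this computes
4 * 1 / 2 == 2 copies for a single output vector { s1, s2, s1, s2 }.  */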
8346 | |
8347 | number_of_places_left_in_vector = nunits; |
8348 | constant_p = true; |
8349 | tree uniform_elt = NULL_TREE; |
8350 | tree_vector_builder elts (vector_type, nunits, 1); |
8351 | elts.quick_grow (len: nunits); |
8352 | stmt_vec_info insert_after = NULL; |
8353 | for (j = 0; j < number_of_copies; j++) |
8354 | { |
8355 | tree op; |
8356 | for (i = group_size - 1; op_node->ops.iterate (ix: i, ptr: &op); i--) |
8357 | { |
8358 | /* Create 'vect_ = {op0,op1,...,opn}'. */ |
8359 | tree orig_op = op; |
8360 | if (number_of_places_left_in_vector == nunits) |
8361 | uniform_elt = op; |
8362 | else if (uniform_elt && operand_equal_p (uniform_elt, op)) |
8363 | op = elts[number_of_places_left_in_vector]; |
8364 | else |
8365 | uniform_elt = NULL_TREE; |
8366 | number_of_places_left_in_vector--; |
8367 | if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op))) |
8368 | { |
8369 | if (CONSTANT_CLASS_P (op)) |
8370 | { |
8371 | if (VECTOR_BOOLEAN_TYPE_P (vector_type)) |
8372 | { |
8373 | /* Can't use VIEW_CONVERT_EXPR for booleans because |
8374 | of possibly different sizes of scalar value and |
8375 | vector element. */ |
8376 | if (integer_zerop (op)) |
8377 | op = build_int_cst (TREE_TYPE (vector_type), 0); |
8378 | else if (integer_onep (op)) |
8379 | op = build_all_ones_cst (TREE_TYPE (vector_type)); |
8380 | else |
8381 | gcc_unreachable (); |
8382 | } |
8383 | else |
8384 | op = fold_unary (VIEW_CONVERT_EXPR, |
8385 | TREE_TYPE (vector_type), op); |
8386 | gcc_assert (op && CONSTANT_CLASS_P (op)); |
8387 | } |
8388 | else |
8389 | { |
8390 | tree new_temp = make_ssa_name (TREE_TYPE (vector_type)); |
8391 | gimple *init_stmt; |
8392 | if (VECTOR_BOOLEAN_TYPE_P (vector_type)) |
8393 | { |
8394 | tree true_val |
8395 | = build_all_ones_cst (TREE_TYPE (vector_type)); |
8396 | tree false_val |
8397 | = build_zero_cst (TREE_TYPE (vector_type)); |
8398 | gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op))); |
8399 | init_stmt = gimple_build_assign (new_temp, COND_EXPR, |
8400 | op, true_val, |
8401 | false_val); |
8402 | } |
8403 | else |
8404 | { |
8405 | op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), |
8406 | op); |
8407 | init_stmt |
8408 | = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, |
8409 | op); |
8410 | } |
8411 | gimple_seq_add_stmt (&ctor_seq, init_stmt); |
8412 | op = new_temp; |
8413 | } |
8414 | } |
8415 | elts[number_of_places_left_in_vector] = op; |
8416 | if (!CONSTANT_CLASS_P (op)) |
8417 | constant_p = false; |
8418 | /* For BB vectorization we have to compute an insert location |
8419 | when a def is inside the analyzed region since we cannot |
8420 | simply insert at the BB start in this case. */ |
8421 | stmt_vec_info opdef; |
8422 | if (TREE_CODE (orig_op) == SSA_NAME |
8423 | && !SSA_NAME_IS_DEFAULT_DEF (orig_op) |
8424 | && is_a <bb_vec_info> (p: vinfo) |
8425 | && (opdef = vinfo->lookup_def (orig_op))) |
8426 | { |
8427 | if (!insert_after) |
8428 | insert_after = opdef; |
8429 | else |
8430 | insert_after = get_later_stmt (stmt1_info: insert_after, stmt2_info: opdef); |
8431 | } |
8432 | |
8433 | if (number_of_places_left_in_vector == 0) |
8434 | { |
8435 | auto type_nunits = TYPE_VECTOR_SUBPARTS (node: vector_type); |
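/* Materialize the vector: splat it when all elements turned out equal,
build it directly from the elements when a whole number of them fills
a vector, and use duplicate_and_interleave otherwise, e.g. for
variable-length vectors.  */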
8436 | if (uniform_elt) |
8437 | vec_cst = gimple_build_vector_from_val (seq: &ctor_seq, type: vector_type, |
8438 | op: elts[0]); |
8439 | else if (constant_p |
8440 | ? multiple_p (a: type_nunits, b: nunits) |
8441 | : known_eq (type_nunits, nunits)) |
8442 | vec_cst = gimple_build_vector (seq: &ctor_seq, builder: &elts); |
8443 | else |
8444 | { |
8445 | if (permute_results.is_empty ()) |
8446 | duplicate_and_interleave (vinfo, seq: &ctor_seq, vector_type, |
8447 | elts, nresults: number_of_vectors, |
8448 | results&: permute_results); |
8449 | vec_cst = permute_results[number_of_vectors - j - 1]; |
8450 | } |
8451 | if (!gimple_seq_empty_p (s: ctor_seq)) |
8452 | { |
8453 | if (insert_after) |
8454 | { |
8455 | gimple_stmt_iterator gsi; |
8456 | if (gimple_code (g: insert_after->stmt) == GIMPLE_PHI) |
8457 | { |
8458 | gsi = gsi_after_labels (bb: gimple_bb (g: insert_after->stmt)); |
8459 | gsi_insert_seq_before (&gsi, ctor_seq, |
8460 | GSI_CONTINUE_LINKING); |
8461 | } |
8462 | else if (!stmt_ends_bb_p (insert_after->stmt)) |
8463 | { |
8464 | gsi = gsi_for_stmt (insert_after->stmt); |
8465 | gsi_insert_seq_after (&gsi, ctor_seq, |
8466 | GSI_CONTINUE_LINKING); |
8467 | } |
8468 | else |
8469 | { |
8470 | /* When we want to insert after a def where the |
8471 | defining stmt throws then insert on the fallthru |
8472 | edge. */ |
8473 | edge e = find_fallthru_edge |
8474 | (edges: gimple_bb (g: insert_after->stmt)->succs); |
8475 | basic_block new_bb |
8476 | = gsi_insert_seq_on_edge_immediate (e, ctor_seq); |
8477 | gcc_assert (!new_bb); |
8478 | } |
8479 | } |
8480 | else |
8481 | vinfo->insert_seq_on_entry (NULL, ctor_seq); |
8482 | ctor_seq = NULL; |
8483 | } |
8484 | voprnds.quick_push (obj: vec_cst); |
8485 | insert_after = NULL; |
8486 | number_of_places_left_in_vector = nunits; |
8487 | constant_p = true; |
8488 | elts.new_vector (type: vector_type, npatterns: nunits, nelts_per_pattern: 1); |
8489 | elts.quick_grow (len: nunits); |
8490 | } |
8491 | } |
8492 | } |
8493 | |
8494 | /* Since the vectors were created in reverse order, we have to reverse
8495 | them here.  */
8496 | vec_num = voprnds.length (); |
8497 | for (j = vec_num; j != 0; j--) |
8498 | { |
8499 | vop = voprnds[j - 1]; |
8500 | SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop); |
8501 | } |
8502 | |
8503 | /* If VF is greater than the unrolling factor needed for the SLP
8504 | group of stmts, NUMBER_OF_VECTORS to be created is greater than |
8505 | NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have |
8506 | to replicate the vectors. */ |
8507 | while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ()) |
8508 | for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (ix: i, ptr: &vop) && i < vec_num; |
8509 | i++) |
8510 | SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop); |
8511 | } |
8512 | |
8513 | /* Get the Ith vectorized definition from SLP_NODE. */ |
8514 | |
8515 | tree |
8516 | vect_get_slp_vect_def (slp_tree slp_node, unsigned i) |
8517 | { |
8518 | return SLP_TREE_VEC_DEFS (slp_node)[i]; |
8519 | } |
8520 | |
8521 | /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */ |
8522 | |
8523 | void |
8524 | vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs) |
8525 | { |
8526 | vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)); |
8527 | vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node)); |
8528 | } |
8529 | |
8530 | /* Get N vectorized definitions for SLP_NODE. */ |
8531 | |
8532 | void |
8533 | vect_get_slp_defs (vec_info *, |
8534 | slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n) |
8535 | { |
8536 | if (n == -1U) |
8537 | n = SLP_TREE_CHILDREN (slp_node).length (); |
8538 | |
8539 | for (unsigned i = 0; i < n; ++i) |
8540 | { |
8541 | slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; |
8542 | vec<tree> vec_defs = vNULL; |
8543 | vect_get_slp_defs (slp_node: child, vec_defs: &vec_defs); |
8544 | vec_oprnds->quick_push (obj: vec_defs); |
8545 | } |
8546 | } |
8547 | |
8548 | /* A subroutine of vect_transform_slp_perm_load with two extra arguments: |
8549 | - PERM gives the permutation that the caller wants to use for NODE, |
8550 | which might be different from SLP_LOAD_PERMUTATION. |
8551 | - DUMP_P controls whether the function dumps information. */ |
8552 | |
8553 | static bool |
8554 | vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node, |
8555 | load_permutation_t &perm, |
8556 | const vec<tree> &dr_chain, |
8557 | gimple_stmt_iterator *gsi, poly_uint64 vf, |
8558 | bool analyze_only, bool dump_p, |
8559 | unsigned *n_perms, unsigned int *n_loads, |
8560 | bool dce_chain) |
8561 | { |
8562 | stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
8563 | int vec_index = 0; |
8564 | tree vectype = SLP_TREE_VECTYPE (node); |
8565 | unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length (); |
8566 | unsigned int mask_element; |
8567 | unsigned dr_group_size; |
8568 | machine_mode mode; |
8569 | |
8570 | if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
8571 | dr_group_size = 1; |
8572 | else |
8573 | { |
8574 | stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); |
8575 | dr_group_size = DR_GROUP_SIZE (stmt_info); |
8576 | } |
8577 | |
8578 | mode = TYPE_MODE (vectype); |
8579 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype); |
8580 | unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
8581 | |
8582 | /* Initialize the vect stmts of NODE to properly insert the generated |
8583 | stmts later. */ |
8584 | if (! analyze_only) |
8585 | for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++) |
8586 | SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE); |
8587 | |
8588 | /* Generate permutation masks for every NODE. Number of masks for each NODE |
8589 | is equal to GROUP_SIZE. |
8590 | E.g., we have a group of three nodes with three loads from the same |
8591 | location in each node, and the vector size is 4.  I.e., we have an
8592 | a0b0c0a1b1c1... sequence and we need to create the following vectors: |
8593 | for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 |
8594 | for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 |
8595 | ... |
8596 | |
8597 | The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}. |
8598 | The last mask is illegal since we assume two operands for the permute
8599 | operation, and the mask element values can't be outside that range. |
8600 | Hence, the last mask must be converted into {2,5,5,5}. |
8601 | For the first two permutations we need the first and the second input |
8602 | vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation |
8603 | we need the second and the third vectors: {b1,c1,a2,b2} and |
8604 | {c2,a3,b3,c3}. */ |
8605 | |
8606 | int vect_stmts_counter = 0; |
8607 | unsigned int index = 0; |
8608 | int first_vec_index = -1; |
8609 | int second_vec_index = -1; |
8610 | bool noop_p = true; |
8611 | *n_perms = 0; |
8612 | |
8613 | vec_perm_builder mask; |
8614 | unsigned int nelts_to_build; |
8615 | unsigned int nvectors_per_build; |
8616 | unsigned int in_nlanes; |
8617 | bool repeating_p = (group_size == dr_group_size |
8618 | && multiple_p (a: nunits, b: group_size)); |
8619 | if (repeating_p) |
8620 | { |
8621 | /* A single vector contains a whole number of copies of the node, so: |
8622 | (a) all permutes can use the same mask; and |
8623 | (b) the permutes only need a single vector input. */ |
8624 | mask.new_vector (full_nelts: nunits, npatterns: group_size, nelts_per_pattern: 3); |
8625 | nelts_to_build = mask.encoded_nelts (); |
8626 | /* It's possible to obtain zero nstmts during analyze_only, so make |
8627 | it at least one to ensure the later computation for n_perms |
8628 | proceeds.  */
8629 | nvectors_per_build = nstmts > 0 ? nstmts : 1; |
8630 | in_nlanes = dr_group_size * 3; |
8631 | } |
8632 | else |
8633 | { |
8634 | /* We need to construct a separate mask for each vector statement. */ |
8635 | unsigned HOST_WIDE_INT const_nunits, const_vf; |
8636 | if (!nunits.is_constant (const_value: &const_nunits) |
8637 | || !vf.is_constant (const_value: &const_vf)) |
8638 | return false; |
8639 | mask.new_vector (full_nelts: const_nunits, npatterns: const_nunits, nelts_per_pattern: 1); |
8640 | nelts_to_build = const_vf * group_size; |
8641 | nvectors_per_build = 1; |
8642 | in_nlanes = const_vf * dr_group_size; |
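/* E.g. for const_vf 2 and group_size == dr_group_size == 3 this sets
up six mask elements to be built over six input lanes.  */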
8643 | } |
8644 | auto_sbitmap used_in_lanes (in_nlanes); |
8645 | bitmap_clear (used_in_lanes); |
8646 | auto_bitmap used_defs; |
8647 | |
8648 | unsigned int count = mask.encoded_nelts (); |
8649 | mask.quick_grow (len: count); |
8650 | vec_perm_indices indices; |
8651 | |
8652 | for (unsigned int j = 0; j < nelts_to_build; j++) |
8653 | { |
8654 | unsigned int iter_num = j / group_size; |
8655 | unsigned int stmt_num = j % group_size; |
8656 | unsigned int i = (iter_num * dr_group_size + perm[stmt_num]); |
8657 | bitmap_set_bit (map: used_in_lanes, bitno: i); |
8658 | if (repeating_p) |
8659 | { |
8660 | first_vec_index = 0; |
8661 | mask_element = i; |
8662 | } |
8663 | else |
8664 | { |
8665 | /* Enforced before the loop when !repeating_p. */ |
8666 | unsigned int const_nunits = nunits.to_constant (); |
8667 | vec_index = i / const_nunits; |
8668 | mask_element = i % const_nunits; |
8669 | if (vec_index == first_vec_index |
8670 | || first_vec_index == -1) |
8671 | { |
8672 | first_vec_index = vec_index; |
8673 | } |
8674 | else if (vec_index == second_vec_index |
8675 | || second_vec_index == -1) |
8676 | { |
8677 | second_vec_index = vec_index; |
8678 | mask_element += const_nunits; |
8679 | } |
8680 | else |
8681 | { |
8682 | if (dump_p) |
8683 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8684 | "permutation requires at " |
8685 | "least three vectors %G" , |
8686 | stmt_info->stmt); |
8687 | gcc_assert (analyze_only); |
8688 | return false; |
8689 | } |
8690 | |
8691 | gcc_assert (mask_element < 2 * const_nunits); |
8692 | } |
8693 | |
8694 | if (mask_element != index) |
8695 | noop_p = false; |
8696 | mask[index++] = mask_element; |
8697 | |
8698 | if (index == count) |
8699 | { |
8700 | if (!noop_p) |
8701 | { |
8702 | indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits); |
8703 | if (!can_vec_perm_const_p (mode, mode, indices)) |
8704 | { |
8705 | if (dump_p) |
8706 | { |
8707 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8708 | "unsupported vect permute { " ); |
8709 | for (i = 0; i < count; ++i) |
8710 | { |
8711 | dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); |
8712 | dump_printf (MSG_MISSED_OPTIMIZATION, " " ); |
8713 | } |
8714 | dump_printf (MSG_MISSED_OPTIMIZATION, "}\n" ); |
8715 | } |
8716 | gcc_assert (analyze_only); |
8717 | return false; |
8718 | } |
8719 | |
8720 | tree mask_vec = NULL_TREE; |
8721 | if (!analyze_only) |
8722 | mask_vec = vect_gen_perm_mask_checked (vectype, indices); |
8723 | |
8724 | if (second_vec_index == -1) |
8725 | second_vec_index = first_vec_index; |
8726 | |
8727 | for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) |
8728 | { |
8729 | ++*n_perms; |
8730 | if (analyze_only) |
8731 | continue; |
8732 | /* Generate the permute statement if necessary. */ |
8733 | tree first_vec = dr_chain[first_vec_index + ri]; |
8734 | tree second_vec = dr_chain[second_vec_index + ri]; |
8735 | gassign *stmt = as_a<gassign *> (p: stmt_info->stmt); |
8736 | tree perm_dest |
8737 | = vect_create_destination_var (gimple_assign_lhs (gs: stmt), |
8738 | vectype); |
8739 | perm_dest = make_ssa_name (var: perm_dest); |
8740 | gimple *perm_stmt |
8741 | = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec, |
8742 | second_vec, mask_vec); |
8743 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, |
8744 | gsi); |
8745 | if (dce_chain) |
8746 | { |
8747 | bitmap_set_bit (used_defs, first_vec_index + ri); |
8748 | bitmap_set_bit (used_defs, second_vec_index + ri); |
8749 | } |
8750 | |
8751 | /* Store the vector statement in NODE. */ |
8752 | SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest; |
8753 | } |
8754 | } |
8755 | else if (!analyze_only) |
8756 | { |
8757 | for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) |
8758 | { |
8759 | tree first_vec = dr_chain[first_vec_index + ri]; |
8760 | /* If mask was NULL_TREE generate the requested |
8761 | identity transform. */ |
8762 | if (dce_chain) |
8763 | bitmap_set_bit (used_defs, first_vec_index + ri); |
8764 | |
8765 | /* Store the vector statement in NODE. */ |
8766 | SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec; |
8767 | } |
8768 | } |
8769 | |
8770 | index = 0; |
8771 | first_vec_index = -1; |
8772 | second_vec_index = -1; |
8773 | noop_p = true; |
8774 | } |
8775 | } |
8776 | |
8777 | if (n_loads) |
8778 | { |
8779 | if (repeating_p) |
8780 | *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
8781 | else |
8782 | { |
8783 | /* Enforced above when !repeating_p. */ |
8784 | unsigned int const_nunits = nunits.to_constant (); |
8785 | *n_loads = 0; |
8786 | bool load_seen = false; |
8787 | for (unsigned i = 0; i < in_nlanes; ++i) |
8788 | { |
8789 | if (i % const_nunits == 0) |
8790 | { |
8791 | if (load_seen) |
8792 | *n_loads += 1; |
8793 | load_seen = false; |
8794 | } |
8795 | if (bitmap_bit_p (map: used_in_lanes, bitno: i)) |
8796 | load_seen = true; |
8797 | } |
8798 | if (load_seen) |
8799 | *n_loads += 1; |
8800 | } |
8801 | } |
8802 | |
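/* If requested, release the defs in DR_CHAIN that no permute used,
removing also the single-use VIEW_CONVERT_EXPR/CONSTRUCTOR chains
that computed them.  */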
8803 | if (dce_chain) |
8804 | for (unsigned i = 0; i < dr_chain.length (); ++i) |
8805 | if (!bitmap_bit_p (used_defs, i)) |
8806 | { |
8807 | tree def = dr_chain[i]; |
8808 | do |
8809 | { |
8810 | gimple *stmt = SSA_NAME_DEF_STMT (def); |
8811 | if (is_gimple_assign (stmt) |
8812 | && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR |
8813 | || gimple_assign_rhs_code (stmt) == CONSTRUCTOR)) |
8814 | def = single_ssa_tree_operand (stmt, SSA_OP_USE); |
8815 | else |
8816 | def = NULL; |
8817 | gimple_stmt_iterator rgsi = gsi_for_stmt (stmt); |
8818 | gsi_remove (&rgsi, true); |
8819 | release_defs (stmt); |
8820 | } |
8821 | while (def); |
8822 | } |
8823 | |
8824 | return true; |
8825 | } |
8826 | |
8827 | /* Generate vector permute statements from a list of loads in DR_CHAIN. |
8828 | If ANALYZE_ONLY is TRUE, only check that it is possible to create valid |
8829 | permute statements for the SLP node NODE. Store the number of vector |
8830 | permute instructions in *N_PERMS and the number of vector load |
8831 | instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions |
8832 | that were not needed. */ |
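| /* As an illustration (a hypothetical example, not taken from the sources): |
| for a group of four int loads with load permutation { 1, 0, 3, 2 } and |
| V4SI vectors, the mask built is { 1, 0, 3, 2 } applied to a single input |
| vector, so *N_PERMS == 1 and *N_LOADS == 1.  */ |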
8833 | |
8834 | bool |
8835 | vect_transform_slp_perm_load (vec_info *vinfo, |
8836 | slp_tree node, const vec<tree> &dr_chain, |
8837 | gimple_stmt_iterator *gsi, poly_uint64 vf, |
8838 | bool analyze_only, unsigned *n_perms, |
8839 | unsigned int *n_loads, bool dce_chain) |
8840 | { |
8841 | return vect_transform_slp_perm_load_1 (vinfo, node, |
8842 | SLP_TREE_LOAD_PERMUTATION (node), |
8843 | dr_chain, gsi, vf, analyze_only, |
8844 | dump_enabled_p (), n_perms, n_loads, |
8845 | dce_chain); |
8846 | } |
8847 | |
8848 | /* Produce the next vector result for SLP permutation NODE by adding a vector |
8849 | statement at GSI. If MASK_VEC is nonnull, add: |
8850 | |
8851 | <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC> |
8852 | |
8853 | otherwise add: |
8854 | |
8855 | <new SSA name> = FIRST_DEF. */ |
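| /* A hypothetical illustration of the non-trivial identity cases handled |
| below: extracting a V2SI result from a V4SI FIRST_DEF at IDENTITY_OFFSET 2 |
| becomes BIT_FIELD_REF <first_def, 64, 64>, while building a V8SI result |
| from two V4SI inputs becomes a two-element CONSTRUCTOR of FIRST_DEF and |
| SECOND_DEF.  */ |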
8856 | |
8857 | static void |
8858 | vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi, |
8859 | slp_tree node, tree first_def, tree second_def, |
8860 | tree mask_vec, poly_uint64 identity_offset) |
8861 | { |
8862 | tree vectype = SLP_TREE_VECTYPE (node); |
8863 | |
8864 | /* ??? We SLP match existing vector element extracts but |
8865 | allow punning which we need to re-instantiate at uses |
8866 | but have no good way of explicitly representing. */ |
8867 | if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype)) |
8868 | && !types_compatible_p (TREE_TYPE (first_def), vectype)) |
8869 | { |
8870 | gassign *conv_stmt |
8871 | = gimple_build_assign (make_ssa_name (vectype), |
8872 | build1 (VIEW_CONVERT_EXPR, vectype, first_def)); |
8873 | vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi); |
8874 | first_def = gimple_assign_lhs (conv_stmt); |
8875 | } |
8876 | gassign *perm_stmt; |
8877 | tree perm_dest = make_ssa_name (vectype); |
8878 | if (mask_vec) |
8879 | { |
8880 | if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), |
8881 | TYPE_SIZE (vectype)) |
8882 | && !types_compatible_p (TREE_TYPE (second_def), vectype)) |
8883 | { |
8884 | gassign *conv_stmt |
8885 | = gimple_build_assign (make_ssa_name (vectype), |
8886 | build1 (VIEW_CONVERT_EXPR, |
8887 | vectype, second_def)); |
8888 | vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi); |
8889 | second_def = gimple_assign_lhs (conv_stmt); |
8890 | } |
8891 | perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR, |
8892 | first_def, second_def, |
8893 | mask_vec); |
8894 | } |
8895 | else if (!types_compatible_p (TREE_TYPE (first_def), vectype)) |
8896 | { |
8897 | /* For identity permutes we still need to handle the case |
8898 | of offsetted extracts or concats. */ |
8899 | unsigned HOST_WIDE_INT c; |
8900 | auto first_def_nunits |
8901 | = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def)); |
8902 | if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits)) |
8903 | { |
8904 | unsigned HOST_WIDE_INT elsz |
8905 | = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def)))); |
8906 | tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def, |
8907 | TYPE_SIZE (vectype), |
8908 | bitsize_int (identity_offset * elsz)); |
8909 | perm_stmt = gimple_build_assign (perm_dest, lowpart); |
8910 | } |
8911 | else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), |
8912 | first_def_nunits, &c) && c == 2) |
8913 | { |
8914 | tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def, |
8915 | NULL_TREE, second_def); |
8916 | perm_stmt = gimple_build_assign (perm_dest, ctor); |
8917 | } |
8918 | else |
8919 | gcc_unreachable (); |
8920 | } |
8921 | else |
8922 | { |
8923 | /* We need a copy here in case the def was external. */ |
8924 | perm_stmt = gimple_build_assign (perm_dest, first_def); |
8925 | } |
8926 | vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi); |
8927 | /* Store the vector statement in NODE. */ |
8928 | node->push_vec_def (perm_stmt); |
8929 | } |
8930 | |
8931 | /* Subroutine of vectorizable_slp_permutation. Check whether the target |
8932 | can perform permutation PERM on the (1 or 2) input nodes in CHILDREN. |
8933 | If GSI is nonnull, emit the permutation there. |
8934 | |
8935 | When GSI is null, the only purpose of NODE is to give properties |
8936 | of the result, such as the vector type and number of SLP lanes. |
8937 | The node does not need to be a VEC_PERM_EXPR. |
8938 | |
8939 | If the target supports the operation, return the number of individual |
8940 | VEC_PERM_EXPRs needed, otherwise return -1. Print information to the |
8941 | dump file if DUMP_P is true. */ |
8942 | |
8943 | static int |
8944 | vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, |
8945 | slp_tree node, lane_permutation_t &perm, |
8946 | vec<slp_tree> &children, bool dump_p) |
8947 | { |
8948 | tree vectype = SLP_TREE_VECTYPE (node); |
8949 | |
8950 | /* ??? We currently only support all same vector input types |
8951 | while the SLP IL should really do a concat + select and thus accept |
8952 | arbitrary mismatches. */ |
8953 | slp_tree child; |
8954 | unsigned i; |
8955 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); |
8956 | bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node)); |
8957 | tree op_vectype = NULL_TREE; |
8958 | FOR_EACH_VEC_ELT (children, i, child) |
8959 | if (SLP_TREE_VECTYPE (child)) |
8960 | { |
8961 | op_vectype = SLP_TREE_VECTYPE (child); |
8962 | break; |
8963 | } |
8964 | if (!op_vectype) |
8965 | op_vectype = vectype; |
8966 | FOR_EACH_VEC_ELT (children, i, child) |
8967 | { |
8968 | if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def |
8969 | && !vect_maybe_update_slp_op_vectype (child, op_vectype)) |
8970 | || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype) |
8971 | || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype))) |
8972 | { |
8973 | if (dump_p) |
8974 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8975 | "Unsupported vector types in lane permutation\n" ); |
8976 | return -1; |
8977 | } |
8978 | if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node)) |
8979 | repeating_p = false; |
8980 | } |
8981 | |
8982 | gcc_assert (perm.length () == SLP_TREE_LANES (node)); |
8983 | if (dump_p) |
8984 | { |
8985 | dump_printf_loc (MSG_NOTE, vect_location, |
8986 | "vectorizing permutation" ); |
8987 | for (unsigned i = 0; i < perm.length (); ++i) |
8988 | dump_printf (MSG_NOTE, " op%u[%u]" , perm[i].first, perm[i].second); |
8989 | if (repeating_p) |
8990 | dump_printf (MSG_NOTE, " (repeat %d)\n" , SLP_TREE_LANES (node)); |
8991 | dump_printf (MSG_NOTE, "\n" ); |
8992 | } |
8993 | |
8994 | /* REPEATING_P is true if every output vector is guaranteed to use the |
8995 | same permute vector. We can handle that case for both variable-length |
8996 | and constant-length vectors, but we only handle other cases for |
8997 | constant-length vectors. |
8998 | |
8999 | Set: |
9000 | |
9001 | - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute |
9002 | mask vector that we want to build. |
9003 | |
9004 | - NCOPIES to the number of copies of PERM that we need in order |
9005 | to build the necessary permute mask vectors. |
9006 | |
9007 | - NOUTPUTS_PER_MASK to the number of output vectors we want to create |
9008 | for each permute mask vector. This is only relevant when GSI is |
9009 | nonnull. */ |
9010 | uint64_t npatterns; |
9011 | unsigned nelts_per_pattern; |
9012 | uint64_t ncopies; |
9013 | unsigned noutputs_per_mask; |
9014 | if (repeating_p) |
9015 | { |
9016 | /* We need a single permute mask vector that has the form: |
9017 | |
9018 | { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... } |
9019 | |
9020 | In other words, the original n-element permute in PERM is |
9021 | "unrolled" to fill a full vector. The stepped vector encoding |
9022 | that we use for permutes requires 3n elements. */ |
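| /* For instance (illustrative values only), a two-lane reverse |
| { X1, X2 } = { 1, 0 } unrolls to { 1, 0, 3, 2, 5, 4, ... }, encoded |
| with NPATTERNS == 2 and NELTS_PER_PATTERN == 3, independently of the |
| (possibly variable) number of vector elements.  */ |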
9023 | npatterns = SLP_TREE_LANES (node); |
9024 | nelts_per_pattern = ncopies = 3; |
9025 | noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
9026 | } |
9027 | else |
9028 | { |
9029 | /* Calculate every element of every permute mask vector explicitly, |
9030 | instead of relying on the pattern described above. */ |
9031 | if (!nunits.is_constant (&npatterns) |
9032 | || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ()) |
9033 | return -1; |
9034 | nelts_per_pattern = ncopies = 1; |
9035 | if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo)) |
9036 | if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies)) |
9037 | return -1; |
9038 | noutputs_per_mask = 1; |
9039 | } |
9040 | unsigned olanes = ncopies * SLP_TREE_LANES (node); |
9041 | gcc_assert (repeating_p || multiple_p (olanes, nunits)); |
9042 | |
9043 | /* Compute the { { SLP operand, vector index}, lane } permutation sequence |
9044 | from the { SLP operand, scalar lane } permutation as recorded in the |
9045 | SLP node as intermediate step. This part should already work |
9046 | with SLP children with arbitrary number of lanes. */ |
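| /* A made-up example: a four-lane node with perm |
| { op0[0], op0[1], op1[0], op1[1] }, two-lane V2SI children and |
| ncopies == 1 yields the sequence |
| { { 0, 0 }, 0 } { { 0, 0 }, 1 } { { 1, 0 }, 0 } { { 1, 0 }, 1 }, |
| i.e. a concat of vector 0 of child 0 and vector 0 of child 1.  */ |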
9047 | auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm; |
9048 | auto_vec<unsigned> active_lane; |
9049 | vperm.create (olanes); |
9050 | active_lane.safe_grow_cleared (children.length (), true); |
9051 | for (unsigned i = 0; i < ncopies; ++i) |
9052 | { |
9053 | for (unsigned pi = 0; pi < perm.length (); ++pi) |
9054 | { |
9055 | std::pair<unsigned, unsigned> p = perm[pi]; |
9056 | tree vtype = SLP_TREE_VECTYPE (children[p.first]); |
9057 | if (repeating_p) |
9058 | vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]}); |
9059 | else |
9060 | { |
9061 | /* We checked above that the vectors are constant-length. */ |
9062 | unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant (); |
9063 | unsigned vi = (active_lane[p.first] + p.second) / vnunits; |
9064 | unsigned vl = (active_lane[p.first] + p.second) % vnunits; |
9065 | vperm.quick_push ({{p.first, vi}, vl}); |
9066 | } |
9067 | } |
9068 | /* Advance to the next group. */ |
9069 | for (unsigned j = 0; j < children.length (); ++j) |
9070 | active_lane[j] += SLP_TREE_LANES (children[j]); |
9071 | } |
9072 | |
9073 | if (dump_p) |
9074 | { |
9075 | dump_printf_loc (MSG_NOTE, vect_location, |
9076 | "vectorizing permutation" ); |
9077 | for (unsigned i = 0; i < perm.length (); ++i) |
9078 | dump_printf (MSG_NOTE, " op%u[%u]" , perm[i].first, perm[i].second); |
9079 | if (repeating_p) |
9080 | dump_printf (MSG_NOTE, " (repeat %d)\n" , SLP_TREE_LANES (node)); |
9081 | dump_printf (MSG_NOTE, "\n" ); |
9082 | dump_printf_loc (MSG_NOTE, vect_location, "as" ); |
9083 | for (unsigned i = 0; i < vperm.length (); ++i) |
9084 | { |
9085 | if (i != 0 |
9086 | && (repeating_p |
9087 | ? multiple_p (a: i, b: npatterns) |
9088 | : multiple_p (a: i, b: TYPE_VECTOR_SUBPARTS (node: vectype)))) |
9089 | dump_printf (MSG_NOTE, "," ); |
9090 | dump_printf (MSG_NOTE, " vops%u[%u][%u]" , |
9091 | vperm[i].first.first, vperm[i].first.second, |
9092 | vperm[i].second); |
9093 | } |
9094 | dump_printf (MSG_NOTE, "\n" ); |
9095 | } |
9096 | |
9097 | /* We can only handle two-vector permutes, everything else should |
9098 | be lowered on the SLP level. The following is closely inspired |
9099 | by vect_transform_slp_perm_load and is supposed to eventually |
9100 | replace it. |
9101 | ??? As intermediate step do code-gen in the SLP tree representation |
9102 | somehow? */ |
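| /* E.g. an output vector that wants lanes from vops0[0], vops0[1] and |
| vops1[0] draws from three distinct input vectors and is rejected |
| below (hypothetical case for illustration).  */ |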
9103 | std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U); |
9104 | std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U); |
9105 | unsigned int index = 0; |
9106 | poly_uint64 mask_element; |
9107 | vec_perm_builder mask; |
9108 | mask.new_vector (nunits, npatterns, nelts_per_pattern); |
9109 | unsigned int count = mask.encoded_nelts (); |
9110 | mask.quick_grow (count); |
9111 | vec_perm_indices indices; |
9112 | unsigned nperms = 0; |
9113 | for (unsigned i = 0; i < vperm.length (); ++i) |
9114 | { |
9115 | mask_element = vperm[i].second; |
9116 | if (first_vec.first == -1U |
9117 | || first_vec == vperm[i].first) |
9118 | first_vec = vperm[i].first; |
9119 | else if (second_vec.first == -1U |
9120 | || second_vec == vperm[i].first) |
9121 | { |
9122 | second_vec = vperm[i].first; |
9123 | mask_element += nunits; |
9124 | } |
9125 | else |
9126 | { |
9127 | if (dump_p) |
9128 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9129 | "permutation requires at " |
9130 | "least three vectors\n" ); |
9131 | gcc_assert (!gsi); |
9132 | return -1; |
9133 | } |
9134 | |
9135 | mask[index++] = mask_element; |
9136 | |
9137 | if (index == count) |
9138 | { |
9139 | indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, |
9140 | TYPE_VECTOR_SUBPARTS (op_vectype)); |
9141 | bool identity_p = (indices.series_p (0, 1, mask[0], 1) |
9142 | && constant_multiple_p (mask[0], nunits)); |
9143 | machine_mode vmode = TYPE_MODE (vectype); |
9144 | machine_mode op_vmode = TYPE_MODE (op_vectype); |
9145 | unsigned HOST_WIDE_INT c; |
9146 | if ((!identity_p |
9147 | && !can_vec_perm_const_p (vmode, op_vmode, indices)) |
9148 | || (identity_p |
9149 | && !known_le (nunits, |
9150 | TYPE_VECTOR_SUBPARTS (op_vectype)) |
9151 | && (!constant_multiple_p (nunits, |
9152 | TYPE_VECTOR_SUBPARTS (op_vectype), |
9153 | &c) || c != 2))) |
9154 | { |
9155 | if (dump_p) |
9156 | { |
9157 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, |
9158 | vect_location, |
9159 | "unsupported vect permute { " ); |
9160 | for (i = 0; i < count; ++i) |
9161 | { |
9162 | dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); |
9163 | dump_printf (MSG_MISSED_OPTIMIZATION, " " ); |
9164 | } |
9165 | dump_printf (MSG_MISSED_OPTIMIZATION, "}\n" ); |
9166 | } |
9167 | gcc_assert (!gsi); |
9168 | return -1; |
9169 | } |
9170 | |
9171 | if (!identity_p) |
9172 | nperms++; |
9173 | if (gsi) |
9174 | { |
9175 | if (second_vec.first == -1U) |
9176 | second_vec = first_vec; |
9177 | |
9178 | slp_tree |
9179 | first_node = children[first_vec.first], |
9180 | second_node = children[second_vec.first]; |
9181 | |
9182 | tree mask_vec = NULL_TREE; |
9183 | if (!identity_p) |
9184 | mask_vec = vect_gen_perm_mask_checked (vectype, indices); |
9185 | |
9186 | for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi) |
9187 | { |
9188 | tree first_def |
9189 | = vect_get_slp_vect_def (first_node, |
9190 | first_vec.second + vi); |
9191 | tree second_def |
9192 | = vect_get_slp_vect_def (second_node, |
9193 | second_vec.second + vi); |
9194 | vect_add_slp_permutation (vinfo, gsi, node, first_def, |
9195 | second_def, mask_vec, mask[0]); |
9196 | } |
9197 | } |
9198 | |
9199 | index = 0; |
9200 | first_vec = std::make_pair (-1U, -1U); |
9201 | second_vec = std::make_pair (-1U, -1U); |
9202 | } |
9203 | } |
9204 | |
9205 | return nperms; |
9206 | } |
9207 | |
9208 | /* Vectorize the SLP permutations in NODE as specified |
9209 | in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP |
9210 | child number and lane number. |
9211 | Interleaving of two two-lane two-child SLP subtrees (not supported): |
9212 | [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ] |
9213 | A blend of two four-lane two-child SLP subtrees: |
9214 | [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ] |
9215 | Highpart of a four-lane one-child SLP subtree (not supported): |
9216 | [ { 0, 2 }, { 0, 3 } ] |
9217 | Currently only a subset of these is supported by the code generated below. */ |
9218 | |
9219 | static bool |
9220 | vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi, |
9221 | slp_tree node, stmt_vector_for_cost *cost_vec) |
9222 | { |
9223 | tree vectype = SLP_TREE_VECTYPE (node); |
9224 | lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node); |
9225 | int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm, |
9226 | SLP_TREE_CHILDREN (node), |
9227 | dump_enabled_p ()); |
9228 | if (nperms < 0) |
9229 | return false; |
9230 | |
9231 | if (!gsi) |
9232 | record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body); |
9233 | |
9234 | return true; |
9235 | } |
9236 | |
9237 | /* Vectorize SLP NODE. */ |
9238 | |
9239 | static void |
9240 | vect_schedule_slp_node (vec_info *vinfo, |
9241 | slp_tree node, slp_instance instance) |
9242 | { |
9243 | gimple_stmt_iterator si; |
9244 | int i; |
9245 | slp_tree child; |
9246 | |
9247 | /* Vectorize externals and constants. */ |
9248 | if (SLP_TREE_DEF_TYPE (node) == vect_constant_def |
9249 | || SLP_TREE_DEF_TYPE (node) == vect_external_def) |
9250 | { |
9251 | /* ??? vectorizable_shift can end up using a scalar operand which is |
9252 | currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the |
9253 | node in this case. */ |
9254 | if (!SLP_TREE_VECTYPE (node)) |
9255 | return; |
9256 | |
9257 | /* There are two reasons vector defs might already exist. The first |
9258 | is that we are vectorizing an existing vector def. The second is |
9259 | when performing BB vectorization shared constant/external nodes |
9260 | are not split apart during partitioning so during the code-gen |
9261 | DFS walk we can end up visiting them twice. */ |
9262 | if (! SLP_TREE_VEC_DEFS (node).exists ()) |
9263 | vect_create_constant_vectors (vinfo, node); |
9264 | return; |
9265 | } |
9266 | |
9267 | gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ()); |
9268 | |
9269 | stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); |
9270 | |
9271 | gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0); |
9272 | SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); |
9273 | |
9274 | if (dump_enabled_p ()) |
9275 | dump_printf_loc (MSG_NOTE, vect_location, |
9276 | "------>vectorizing SLP node starting from: %G" , |
9277 | stmt_info->stmt); |
9278 | |
9279 | if (STMT_VINFO_DATA_REF (stmt_info) |
9280 | && SLP_TREE_CODE (node) != VEC_PERM_EXPR) |
9281 | { |
9282 | /* Vectorized loads go before the first scalar load to make it |
9283 | ready early, vectorized stores go before the last scalar |
9284 | stmt which is where all uses are ready. */ |
9285 | stmt_vec_info last_stmt_info = NULL; |
9286 | if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
9287 | last_stmt_info = vect_find_first_scalar_stmt_in_slp (node); |
9288 | else /* DR_IS_WRITE */ |
9289 | last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); |
9290 | si = gsi_for_stmt (last_stmt_info->stmt); |
9291 | } |
9292 | else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type |
9293 | || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type |
9294 | || STMT_VINFO_TYPE (stmt_info) == phi_info_type) |
9295 | && SLP_TREE_CODE (node) != VEC_PERM_EXPR) |
9296 | { |
9297 | /* For PHI node vectorization we do not use the insertion iterator. */ |
9298 | si = gsi_none (); |
9299 | } |
9300 | else |
9301 | { |
9302 | /* Emit other stmts after the children's vectorized defs, which is |
9303 | the earliest position possible. */ |
9304 | gimple *last_stmt = NULL; |
9305 | if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) |
9306 | if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
9307 | || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) |
9308 | { |
9309 | /* But avoid scheduling internal defs outside of the loop when |
9310 | we might have only implicitly tracked loop mask/len defs. */ |
9311 | gimple_stmt_iterator si |
9312 | = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header); |
9313 | last_stmt = *si; |
9314 | } |
9315 | bool seen_vector_def = false; |
9316 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
9317 | if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
9318 | { |
9319 | /* For fold-left reductions we are retaining the scalar |
9320 | reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS |
9321 | set so the representation isn't perfect. Resort to the |
9322 | last scalar def here. */ |
9323 | if (SLP_TREE_VEC_DEFS (child).is_empty ()) |
9324 | { |
9325 | gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child)) |
9326 | == cycle_phi_info_type); |
9327 | gphi *phi = as_a <gphi *> |
9328 | (vect_find_last_scalar_stmt_in_slp (child)->stmt); |
9329 | if (!last_stmt |
9330 | || vect_stmt_dominates_stmt_p (last_stmt, phi)) |
9331 | last_stmt = phi; |
9332 | } |
9333 | /* We are emitting all vectorized stmts in the same place and |
9334 | the last one is the last. |
9335 | ??? Unless we have a load permutation applied and that |
9336 | figures to re-use an earlier generated load. */ |
9337 | unsigned j; |
9338 | tree vdef; |
9339 | FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef) |
9340 | { |
9341 | gimple *vstmt = SSA_NAME_DEF_STMT (vdef); |
9342 | if (!last_stmt |
9343 | || vect_stmt_dominates_stmt_p (last_stmt, vstmt)) |
9344 | last_stmt = vstmt; |
9345 | } |
9346 | } |
9347 | else if (!SLP_TREE_VECTYPE (child)) |
9348 | { |
9349 | /* For externals that are used unvectorized look at all their scalar defs. */ |
9350 | unsigned j; |
9351 | tree def; |
9352 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def) |
9353 | if (TREE_CODE (def) == SSA_NAME |
9354 | && !SSA_NAME_IS_DEFAULT_DEF (def)) |
9355 | { |
9356 | gimple *stmt = SSA_NAME_DEF_STMT (def); |
9357 | if (!last_stmt |
9358 | || vect_stmt_dominates_stmt_p (last_stmt, stmt)) |
9359 | last_stmt = stmt; |
9360 | } |
9361 | } |
9362 | else |
9363 | { |
9364 | /* For externals we have to look at all defs since their |
9365 | insertion place is decided per vector. But beware |
9366 | of pre-existing vectors where we need to make sure |
9367 | we do not insert before the region boundary. */ |
9368 | if (SLP_TREE_SCALAR_OPS (child).is_empty () |
9369 | && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0])) |
9370 | seen_vector_def = true; |
9371 | else |
9372 | { |
9373 | unsigned j; |
9374 | tree vdef; |
9375 | FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef) |
9376 | if (TREE_CODE (vdef) == SSA_NAME |
9377 | && !SSA_NAME_IS_DEFAULT_DEF (vdef)) |
9378 | { |
9379 | gimple *vstmt = SSA_NAME_DEF_STMT (vdef); |
9380 | if (!last_stmt |
9381 | || vect_stmt_dominates_stmt_p (last_stmt, vstmt)) |
9382 | last_stmt = vstmt; |
9383 | } |
9384 | } |
9385 | } |
9386 | /* This can happen when all children are pre-existing vectors or |
9387 | constants. */ |
9388 | if (!last_stmt) |
9389 | last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt; |
9390 | if (!last_stmt) |
9391 | { |
9392 | gcc_assert (seen_vector_def); |
9393 | si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]); |
9394 | } |
9395 | else if (is_ctrl_altering_stmt (last_stmt)) |
9396 | { |
9397 | /* We split regions to vectorize at control altering stmts |
9398 | with a definition so this must be an external which |
9399 | we can insert at the start of the region. */ |
9400 | si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]); |
9401 | } |
9402 | else if (is_a <bb_vec_info> (vinfo) |
9403 | && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt) |
9404 | && gimple_could_trap_p (stmt_info->stmt)) |
9405 | { |
9406 | /* We've constrained possibly trapping operations to all come |
9407 | from the same basic-block; if vectorized defs would allow earlier |
9408 | scheduling, still force the vectorized stmts to the original block. |
9409 | This is only necessary for BB vectorization since for loop vect |
9410 | all operations are in a single BB and scalar stmt based |
9411 | placement doesn't play well with epilogue vectorization. */ |
9412 | gcc_assert (dominated_by_p (CDI_DOMINATORS, |
9413 | gimple_bb (stmt_info->stmt), |
9414 | gimple_bb (last_stmt))); |
9415 | si = gsi_after_labels (gimple_bb (stmt_info->stmt)); |
9416 | } |
9417 | else if (is_a <gphi *> (last_stmt)) |
9418 | si = gsi_after_labels (gimple_bb (last_stmt)); |
9419 | else |
9420 | { |
9421 | si = gsi_for_stmt (last_stmt); |
9422 | gsi_next (&si); |
9423 | } |
9424 | } |
9425 | |
9426 | /* Handle purely internal nodes. */ |
9427 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
9428 | { |
9429 | /* ??? the transform kind is stored to STMT_VINFO_TYPE which might |
9430 | be shared with different SLP nodes (but usually it's the same |
9431 | operation apart from the case the stmt is only there for denoting |
9432 | the actual scalar lane defs ...). So do not call vect_transform_stmt |
9433 | but open-code it here (partly). */ |
9434 | bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL); |
9435 | gcc_assert (done); |
9436 | stmt_vec_info slp_stmt_info; |
9437 | unsigned int i; |
9438 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info) |
9439 | if (STMT_VINFO_LIVE_P (slp_stmt_info)) |
9440 | { |
9441 | done = vectorizable_live_operation (vinfo, slp_stmt_info, node, |
9442 | instance, i, true, NULL); |
9443 | gcc_assert (done); |
9444 | } |
9445 | } |
9446 | else |
9447 | vect_transform_stmt (vinfo, stmt_info, &si, node, instance); |
9448 | } |
9449 | |
9450 | /* Replace scalar calls from SLP node NODE with setting of their lhs to zero. |
9451 | For loop vectorization this is done in vectorizable_call, but for SLP |
9452 | it needs to be deferred until end of vect_schedule_slp, because multiple |
9453 | SLP instances may refer to the same scalar stmt. */ |
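| /* Illustrative example: a pure-SLP scalar call  x_1 = foo (y_2);  is |
| rewritten to  x_1 = 0;  below, leaving it to DCE to remove the dead |
| assignment (hypothetical names).  */ |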
9454 | |
9455 | static void |
9456 | vect_remove_slp_scalar_calls (vec_info *vinfo, |
9457 | slp_tree node, hash_set<slp_tree> &visited) |
9458 | { |
9459 | gimple *new_stmt; |
9460 | gimple_stmt_iterator gsi; |
9461 | int i; |
9462 | slp_tree child; |
9463 | tree lhs; |
9464 | stmt_vec_info stmt_info; |
9465 | |
9466 | if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
9467 | return; |
9468 | |
9469 | if (visited.add (node)) |
9470 | return; |
9471 | |
9472 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
9473 | vect_remove_slp_scalar_calls (vinfo, child, visited); |
9474 | |
9475 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
9476 | { |
9477 | gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt); |
9478 | if (!stmt || gimple_bb (stmt) == NULL) |
9479 | continue; |
9480 | if (is_pattern_stmt_p (stmt_info) |
9481 | || !PURE_SLP_STMT (stmt_info)) |
9482 | continue; |
9483 | lhs = gimple_call_lhs (stmt); |
9484 | if (lhs) |
9485 | new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs))); |
9486 | else |
9487 | { |
9488 | new_stmt = gimple_build_nop (); |
9489 | unlink_stmt_vdef (stmt_info->stmt); |
9490 | } |
9491 | gsi = gsi_for_stmt (stmt); |
9492 | vinfo->replace_stmt (&gsi, stmt_info, new_stmt); |
9493 | if (lhs) |
9494 | SSA_NAME_DEF_STMT (lhs) = new_stmt; |
9495 | } |
9496 | } |
9497 | |
9498 | static void |
9499 | vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node) |
9500 | { |
9501 | hash_set<slp_tree> visited; |
9502 | vect_remove_slp_scalar_calls (vinfo, node, visited); |
9503 | } |
9504 | |
9505 | /* Vectorize the instance root. */ |
9506 | |
9507 | void |
9508 | vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance) |
9509 | { |
9510 | gassign *rstmt = NULL; |
9511 | |
9512 | if (instance->kind == slp_inst_kind_ctor) |
9513 | { |
9514 | if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1) |
9515 | { |
9516 | tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0]; |
9517 | tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt); |
9518 | if (!useless_type_conversion_p (TREE_TYPE (root_lhs), |
9519 | TREE_TYPE (vect_lhs))) |
9520 | vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs), |
9521 | vect_lhs); |
9522 | rstmt = gimple_build_assign (root_lhs, vect_lhs); |
9523 | } |
9524 | else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1) |
9525 | { |
9526 | int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
9527 | tree child_def; |
9528 | int j; |
9529 | vec<constructor_elt, va_gc> *v; |
9530 | vec_alloc (v, nelts); |
9531 | |
9532 | /* A CTOR can handle V16HI composition from VNx8HI so we |
9533 | do not need to convert vector elements if the types |
9534 | do not match. */ |
9535 | FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def) |
9536 | CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def); |
9537 | tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt); |
9538 | tree rtype |
9539 | = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt)); |
9540 | tree r_constructor = build_constructor (rtype, v); |
9541 | rstmt = gimple_build_assign (lhs, r_constructor); |
9542 | } |
9543 | } |
9544 | else if (instance->kind == slp_inst_kind_bb_reduc) |
9545 | { |
9546 | /* Largely inspired by reduction chain epilogue handling in |
9547 | vect_create_epilog_for_reduction. */ |
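| /* A rough sketch with hypothetical SSA names: for two vector defs v_1 |
| and v_2 of a signed PLUS reduction the code below builds |
| u_1 = VIEW_CONVERT_EXPR <unsigned vectype> (v_1); |
| u_2 = VIEW_CONVERT_EXPR <unsigned vectype> (v_2); |
| u_3 = u_1 + u_2; |
| s_4 = .REDUC_PLUS (u_3); |
| and finally converts s_4 back to the scalar element type.  */ |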
9548 | vec<tree> vec_defs = vNULL; |
9549 | vect_get_slp_defs (node, &vec_defs); |
9550 | enum tree_code reduc_code |
9551 | = gimple_assign_rhs_code (instance->root_stmts[0]->stmt); |
9552 | /* ??? We actually have to reflect signs somewhere. */ |
9553 | if (reduc_code == MINUS_EXPR) |
9554 | reduc_code = PLUS_EXPR; |
9555 | gimple_seq epilogue = NULL; |
9556 | /* We may end up with more than one vector result, reduce them |
9557 | to one vector. */ |
9558 | tree vec_def = vec_defs[0]; |
9559 | tree vectype = TREE_TYPE (vec_def); |
9560 | tree compute_vectype = vectype; |
9561 | bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype) |
9562 | && TYPE_OVERFLOW_UNDEFINED (vectype) |
9563 | && operation_can_overflow (reduc_code)); |
9564 | if (pun_for_overflow_p) |
9565 | { |
9566 | compute_vectype = unsigned_type_for (vectype); |
9567 | vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR, |
9568 | compute_vectype, vec_def); |
9569 | } |
9570 | for (unsigned i = 1; i < vec_defs.length (); ++i) |
9571 | { |
9572 | tree def = vec_defs[i]; |
9573 | if (pun_for_overflow_p) |
9574 | def = gimple_build (&epilogue, VIEW_CONVERT_EXPR, |
9575 | compute_vectype, def); |
9576 | vec_def = gimple_build (&epilogue, reduc_code, compute_vectype, |
9577 | vec_def, def); |
9578 | } |
9579 | vec_defs.release (); |
9580 | /* ??? Support other schemes than direct internal fn. */ |
9581 | internal_fn reduc_fn; |
9582 | if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn) |
9583 | || reduc_fn == IFN_LAST) |
9584 | gcc_unreachable (); |
9585 | tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn), |
9586 | TREE_TYPE (compute_vectype), vec_def); |
9587 | if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ()) |
9588 | { |
9589 | tree rem_def = NULL_TREE; |
9590 | for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance)) |
9591 | { |
9592 | def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def); |
9593 | if (!rem_def) |
9594 | rem_def = def; |
9595 | else |
9596 | rem_def = gimple_build (&epilogue, reduc_code, |
9597 | TREE_TYPE (scalar_def), |
9598 | rem_def, def); |
9599 | } |
9600 | scalar_def = gimple_build (&epilogue, reduc_code, |
9601 | TREE_TYPE (scalar_def), |
9602 | scalar_def, rem_def); |
9603 | } |
9604 | scalar_def = gimple_convert (&epilogue, |
9605 | TREE_TYPE (vectype), scalar_def); |
9606 | gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt); |
9607 | gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT); |
9608 | gimple_assign_set_rhs_from_tree (&rgsi, scalar_def); |
9609 | update_stmt (gsi_stmt (rgsi)); |
9610 | return; |
9611 | } |
9612 | else |
9613 | gcc_unreachable (); |
9614 | |
9615 | gcc_assert (rstmt); |
9616 | |
9617 | gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt); |
9618 | gsi_replace (&rgsi, rstmt, true); |
9619 | } |
9620 | |
9621 | struct slp_scc_info |
9622 | { |
9623 | bool on_stack; |
9624 | int dfs; |
9625 | int lowlink; |
9626 | }; |
9627 | |
9628 | /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */ |
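| /* The walk below is Tarjan's SCC algorithm: DFS and LOWLINK are the usual |
| discovery index and minimum reachable index, and a node whose LOWLINK |
| equals its own DFS index closes an SCC which is popped from STACK.  */ |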
9629 | |
9630 | static void |
9631 | vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance, |
9632 | hash_map<slp_tree, slp_scc_info> &scc_info, |
9633 | int &maxdfs, vec<slp_tree> &stack) |
9634 | { |
9635 | bool existed_p; |
9636 | slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p); |
9637 | gcc_assert (!existed_p); |
9638 | info->dfs = maxdfs; |
9639 | info->lowlink = maxdfs; |
9640 | maxdfs++; |
9641 | |
9642 | /* Leaf. */ |
9643 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
9644 | { |
9645 | info->on_stack = false; |
9646 | vect_schedule_slp_node (vinfo, node, instance); |
9647 | return; |
9648 | } |
9649 | |
9650 | info->on_stack = true; |
9651 | stack.safe_push (node); |
9652 | |
9653 | unsigned i; |
9654 | slp_tree child; |
9655 | /* DFS recurse. */ |
9656 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
9657 | { |
9658 | if (!child) |
9659 | continue; |
9660 | slp_scc_info *child_info = scc_info.get (child); |
9661 | if (!child_info) |
9662 | { |
9663 | vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack); |
9664 | /* Recursion might have re-allocated the node. */ |
9665 | info = scc_info.get (node); |
9666 | child_info = scc_info.get (child); |
9667 | info->lowlink = MIN (info->lowlink, child_info->lowlink); |
9668 | } |
9669 | else if (child_info->on_stack) |
9670 | info->lowlink = MIN (info->lowlink, child_info->dfs); |
9671 | } |
9672 | if (info->lowlink != info->dfs) |
9673 | return; |
9674 | |
9675 | auto_vec<slp_tree, 4> phis_to_fixup; |
9676 | |
9677 | /* Singleton. */ |
9678 | if (stack.last () == node) |
9679 | { |
9680 | stack.pop (); |
9681 | info->on_stack = false; |
9682 | vect_schedule_slp_node (vinfo, node, instance); |
9683 | if (SLP_TREE_CODE (node) != VEC_PERM_EXPR |
9684 | && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt)) |
9685 | phis_to_fixup.quick_push (node); |
9686 | } |
9687 | else |
9688 | { |
9689 | /* SCC. */ |
9690 | int last_idx = stack.length () - 1; |
9691 | while (stack[last_idx] != node) |
9692 | last_idx--; |
9693 | /* We can break the cycle at PHIs who have at least one child |
9694 | code generated. Then we could re-start the DFS walk until |
9695 | all nodes in the SCC are covered (we might have new entries |
9696 | for only back-reachable nodes). But it's simpler to just |
9697 | iterate and schedule those that are ready. */ |
9698 | unsigned todo = stack.length () - last_idx; |
9699 | do |
9700 | { |
9701 | for (int idx = stack.length () - 1; idx >= last_idx; --idx) |
9702 | { |
9703 | slp_tree entry = stack[idx]; |
9704 | if (!entry) |
9705 | continue; |
9706 | bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR |
9707 | && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt)); |
9708 | bool ready = !phi; |
9709 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child) |
9710 | if (!child) |
9711 | { |
9712 | gcc_assert (phi); |
9713 | ready = true; |
9714 | break; |
9715 | } |
9716 | else if (scc_info.get (child)->on_stack) |
9717 | { |
9718 | if (!phi) |
9719 | { |
9720 | ready = false; |
9721 | break; |
9722 | } |
9723 | } |
9724 | else |
9725 | { |
9726 | if (phi) |
9727 | { |
9728 | ready = true; |
9729 | break; |
9730 | } |
9731 | } |
9732 | if (ready) |
9733 | { |
9734 | vect_schedule_slp_node (vinfo, entry, instance); |
9735 | scc_info.get (entry)->on_stack = false; |
9736 | stack[idx] = NULL; |
9737 | todo--; |
9738 | if (phi) |
9739 | phis_to_fixup.safe_push (entry); |
9740 | } |
9741 | } |
9742 | } |
9743 | while (todo != 0); |
9744 | |
9745 | /* Pop the SCC. */ |
9746 | stack.truncate (last_idx); |
9747 | } |
9748 | |
9749 | /* Now fixup the backedge def of the vectorized PHIs in this SCC. */ |
9750 | slp_tree phi_node; |
9751 | FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node) |
9752 | { |
9753 | gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt); |
9754 | edge_iterator ei; |
9755 | edge e; |
9756 | FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds) |
9757 | { |
9758 | unsigned dest_idx = e->dest_idx; |
9759 | child = SLP_TREE_CHILDREN (phi_node)[dest_idx]; |
9760 | if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def) |
9761 | continue; |
9762 | unsigned n = SLP_TREE_VEC_DEFS (phi_node).length (); |
9763 | /* Simply fill all args. */ |
9764 | if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node)) |
9765 | != vect_first_order_recurrence) |
9766 | for (unsigned i = 0; i < n; ++i) |
9767 | { |
9768 | tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i]; |
9769 | gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef)); |
9770 | add_phi_arg (phi, vect_get_slp_vect_def (child, i), |
9771 | e, gimple_phi_arg_location (phi, dest_idx)); |
9772 | } |
9773 | else |
9774 | { |
9775 | /* Unless it is a first order recurrence which needs |
9776 | args filled in for both the PHI node and the permutes. */ |
9777 | gimple *perm |
9778 | = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]); |
9779 | gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm)); |
9780 | add_phi_arg (as_a <gphi *> (rphi), |
9781 | vect_get_slp_vect_def (child, n - 1), |
9782 | e, gimple_phi_arg_location (phi, dest_idx)); |
9783 | for (unsigned i = 0; i < n; ++i) |
9784 | { |
9785 | gimple *perm |
9786 | = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]); |
9787 | if (i > 0) |
9788 | gimple_assign_set_rhs1 (perm, |
9789 | vect_get_slp_vect_def (child, i - 1)); |
9790 | gimple_assign_set_rhs2 (perm, |
9791 | vect_get_slp_vect_def (child, i)); |
9792 | update_stmt (perm); |
9793 | } |
9794 | } |
9795 | } |
9796 | } |
9797 | } |
9798 | |
9799 | /* Generate vector code for SLP_INSTANCES in the loop/basic block. */ |
9800 | |
9801 | void |
9802 | vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances) |
9803 | { |
9804 | slp_instance instance; |
9805 | unsigned int i; |
9806 | |
9807 | hash_map<slp_tree, slp_scc_info> scc_info; |
9808 | int maxdfs = 0; |
9809 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
9810 | { |
9811 | slp_tree node = SLP_INSTANCE_TREE (instance); |
9812 | if (dump_enabled_p ()) |
9813 | { |
9814 | dump_printf_loc (MSG_NOTE, vect_location, |
9815 | "Vectorizing SLP tree:\n" ); |
9816 | /* ??? Dump all? */ |
9817 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
9818 | dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G" , |
9819 | SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt); |
9820 | vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location, |
9821 | SLP_INSTANCE_TREE (instance)); |
9822 | } |
9823 | /* Schedule the tree of INSTANCE, scheduling SCCs in a way to |
9824 | have a PHI be the node breaking the cycle. */ |
9825 | auto_vec<slp_tree> stack; |
9826 | if (!scc_info.get (node)) |
9827 | vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack); |
9828 | |
9829 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
9830 | vectorize_slp_instance_root_stmt (node, instance); |
9831 | |
9832 | if (dump_enabled_p ()) |
9833 | dump_printf_loc (MSG_NOTE, vect_location, |
9834 | "vectorizing stmts using SLP.\n" ); |
9835 | } |
9836 | |
9837 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
9838 | { |
9839 | slp_tree root = SLP_INSTANCE_TREE (instance); |
9840 | stmt_vec_info store_info; |
9841 | unsigned int j; |
9842 | |
9843 | /* Remove scalar call stmts. Do not do this for basic-block |
9844 | vectorization as not all uses may be vectorized. |
9845 | ??? Why should this be necessary? DCE should be able to |
9846 | remove the stmts itself. |
9847 | ??? For BB vectorization we can as well remove scalar |
9848 | stmts starting from the SLP tree root if they have no |
9849 | uses. */ |
9850 | if (is_a <loop_vec_info> (vinfo)) |
9851 | vect_remove_slp_scalar_calls (vinfo, root); |
9852 | |
9853 | /* Remove the vectorized stores' original scalar stmts. */ |
9854 | for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++) |
9855 | { |
9856 | if (!STMT_VINFO_DATA_REF (store_info) |
9857 | || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info))) |
9858 | break; |
9859 | |
9860 | store_info = vect_orig_stmt (store_info); |
9861 | /* Free the attached stmt_vec_info and remove the stmt. */ |
9862 | vinfo->remove_stmt (store_info); |
9863 | |
9864 | /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it |
9865 | to not crash in vect_free_slp_tree later. */ |
9866 | if (SLP_TREE_REPRESENTATIVE (root) == store_info) |
9867 | SLP_TREE_REPRESENTATIVE (root) = NULL; |
9868 | } |
9869 | } |
9870 | } |
9871 | |