1/* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#define INCLUDE_ALGORITHM
24#include "system.h"
25#include "coretypes.h"
26#include "backend.h"
27#include "target.h"
28#include "rtl.h"
29#include "tree.h"
30#include "gimple.h"
31#include "tree-pass.h"
32#include "ssa.h"
33#include "optabs-tree.h"
34#include "insn-config.h"
35#include "recog.h" /* FIXME: for insn_data */
36#include "fold-const.h"
37#include "stor-layout.h"
38#include "gimple-iterator.h"
39#include "cfgloop.h"
40#include "tree-vectorizer.h"
41#include "langhooks.h"
42#include "gimple-walk.h"
43#include "dbgcnt.h"
44#include "tree-vector-builder.h"
45#include "vec-perm-indices.h"
46#include "gimple-fold.h"
47#include "internal-fn.h"
48#include "dump-context.h"
49#include "cfganal.h"
50#include "tree-eh.h"
51#include "tree-cfg.h"
52#include "alloc-pool.h"
53#include "sreal.h"
54#include "predict.h"
55
56static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
70
71static object_allocator<_slp_tree> *slp_tree_pool;
72static slp_tree slp_first_node;
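/* Note: every live SLP node is kept on an intrusive doubly-linked list
   headed by slp_first_node; the _slp_tree constructor and destructor
   below link and unlink nodes so that vect_slp_fini can reclaim any
   nodes still alive before the pool itself is destroyed.  */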
73
74void
75vect_slp_init (void)
76{
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
78}
79
80void
81vect_slp_fini (void)
82{
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
87}
88
89void *
90_slp_tree::operator new (size_t n)
91{
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
94}
95
96void
97_slp_tree::operator delete (void *node, size_t n)
98{
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (object: node);
101}
102
103
104/* Initialize an SLP node. */
105
106_slp_tree::_slp_tree ()
107{
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
129}
130
131/* Tear down an SLP node. */
132
133_slp_tree::~_slp_tree ()
134{
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (ptr: failed);
150}
151
152/* Push the single SSA definition in DEF to the vector of vector defs. */
153
154void
155_slp_tree::push_vec_def (gimple *def)
156{
157 if (gphi *phi = dyn_cast <gphi *> (p: def))
158 vec_defs.quick_push (obj: gimple_phi_result (gs: phi));
159 else
160 {
161 def_operand_p defop = single_ssa_def_operand (stmt: def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (obj: get_def_from_ptr (def: defop));
163 }
164}
165
166/* Recursively free the memory allocated for the SLP tree rooted at NODE. */
167
168void
169vect_free_slp_tree (slp_tree node)
170{
171 int i;
172 slp_tree child;
173
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
176
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (node: child);
180
181 /* If the node defines any SLP-only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
185 {
186 stmt_vec_info stmt_info = vect_orig_stmt (stmt_info: rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
189 }
190
191 delete node;
192}
193
194/* Return a location suitable for dumps related to the SLP instance. */
195
196dump_user_location_t
197_slp_instance::location () const
198{
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
203}
204
205
206/* Free the memory allocated for the SLP instance. */
207
208void
209vect_free_slp_instance (slp_instance instance)
210{
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (ptr: instance);
218}
219
220
221/* Create an SLP node for SCALAR_STMTS. */
222
223slp_tree
224vect_create_new_slp_node (unsigned nops, tree_code code)
225{
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nelems: nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
232}
233/* Create an SLP node for SCALAR_STMTS. */
234
235static slp_tree
236vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
238{
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nelems: nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
245}
246
247/* Create an SLP node for SCALAR_STMTS. */
248
249static slp_tree
250vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
251{
252 return vect_create_new_slp_node (node: new _slp_tree, scalar_stmts, nops);
253}
254
255/* Create an SLP node for OPS. */
256
257static slp_tree
258vect_create_new_slp_node (slp_tree node, vec<tree> ops)
259{
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
264}
265
266/* Create an SLP node for OPS. */
267
268static slp_tree
269vect_create_new_slp_node (vec<tree> ops)
270{
271 return vect_create_new_slp_node (node: new _slp_tree, ops);
272}
273
274
275/* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278typedef struct _slp_oprnd_info
279{
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement: its vector def-type, its operand
285 type, the operand itself in case it's constant, whether it is a pattern
286 stmt, and its gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292} *slp_oprnd_info;
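/* As an illustration: for an SLP node built from four scalar additions
   a[i] = b[i] + c[i], vect_create_oprnd_info below yields two
   slp_oprnd_info entries (one per operand position), each of which
   collects the four def-stmts/operands of the b[] respectively c[]
   lanes.  */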
293
294
295/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297static vec<slp_oprnd_info>
298vect_create_oprnd_info (int nops, int group_size)
299{
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
303
304 oprnds_info.create (nelems: nops);
305 for (i = 0; i < nops; i++)
306 {
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (nelems: group_size);
309 oprnd_info->ops.create (nelems: group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (obj: oprnd_info);
315 }
316
317 return oprnds_info;
318}
319
320
321/* Free operands info. */
322
323static void
324vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
325{
326 int i;
327 slp_oprnd_info oprnd_info;
328
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
330 {
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
334 }
335
336 oprnds_info.release ();
337}
338
339/* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
341
342static sreal
343vect_slp_node_weight (slp_tree node)
344{
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (g: stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
348}
349
350/* Return true if STMTS contains a pattern statement. */
351
352static bool
353vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
354{
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
361}
362
363/* Return true when all lanes in the external or constant NODE have
364 the same value. */
365
366static bool
367vect_slp_tree_uniform_p (slp_tree node)
368{
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
371
372 /* Pre-existing vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
375
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, flags: 0))
382 return false;
383
384 return true;
385}
386
387/* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
390
391int
392vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
394{
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
397
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
400
401 do
402 {
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
408 }
409 while (next_stmt_info);
410
411 return -1;
412}
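/* Note the place returned above is accumulated from DR_GROUP_GAP of the
   intermediate group members, so it is the element offset of STMT_INFO
   within the interleaving chain (gaps included) rather than simply the
   ordinal number of the stmt in the group.  */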
413
414/* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
419
420bool
421can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
425{
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
429
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
434 {
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (size: elt_bits, limit: 1).exists (mode: &int_mode))
438 {
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (mode: int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (a: GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 b: 2, multiple: &half_nelts))
451 {
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
458
459 for (unsigned int i = 0; i < 3; ++i)
460 {
461 sel1.quick_push (obj: i);
462 sel1.quick_push (obj: i + nelts);
463 sel2.quick_push (obj: half_nelts + i);
464 sel2.quick_push (obj: half_nelts + i + nelts);
465 }
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
471 {
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
477 {
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
482 }
483 return true;
484 }
485 }
486 }
487 if (!multiple_p (a: elt_bytes, b: 2, multiple: &elt_bytes))
488 return false;
489 nvectors *= 2;
490 }
491}
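/* Roughly: ELT_BYTES starts as COUNT elements' worth of bytes; each
   iteration of the loop above tries to fuse that many bytes into a single
   integer element of a vector the same size as the base vector type and
   checks that the two interleaving (low/high) permutes are supported by
   the target; otherwise ELT_BYTES is halved and NVECTORS doubled until
   ELT_BYTES can no longer be split.  */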
492
493/* Return true if DTA and DTB match. */
494
495static bool
496vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
497{
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
501}
502
503static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
507};
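/* The three rows above correspond to the SWAP values computed by
   vect_build_slp_tree_1 for COND_EXPRs: row 0 keeps the embedded
   comparison and the then/else arms as-is, row 1 swaps the two operands
   of the comparison (-2 before -1) and row 2 swaps the then/else arms
   (operand 2 before operand 1).  */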
508static const int arg0_map[] = { 1, 0 };
509static const int arg1_map[] = { 1, 1 };
510static const int arg2_map[] = { 1, 2 };
511static const int arg1_arg4_map[] = { 2, 1, 4 };
512static const int arg3_arg2_map[] = { 2, 3, 2 };
513static const int op1_op0_map[] = { 2, 1, 0 };
514static const int off_map[] = { 1, -3 };
515static const int off_op0_map[] = { 2, -3, 0 };
516static const int off_arg2_map[] = { 2, -3, 2 };
517static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
518static const int mask_call_maps[6][7] = {
519 { 1, 1, },
520 { 2, 1, 2, },
521 { 3, 1, 2, 3, },
522 { 4, 1, 2, 3, 4, },
523 { 5, 1, 2, 3, 4, 5, },
524 { 6, 1, 2, 3, 4, 5, 6 },
525};
526
527/* For most SLP statements, there is a one-to-one mapping between
528 gimple arguments and child nodes. If that is not true for STMT,
529 return an array that contains:
530
531 - the number of child nodes, followed by
532 - for each child node, the index of the argument associated with that node.
533 The special index -1 is the first operand of an embedded comparison and
534 the special index -2 is the second operand of an embedded comparison.
535 The special index -3 is the offset of a gather as analyzed by
536 vect_check_gather_scatter.
537
538 SWAP is as for vect_get_and_check_slp_defs. */
539
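/* For example, arg2_map above is { 1, 2 }: an IFN_MASK_LOAD gets a single
   SLP child corresponding to call argument 2 (the mask), while the
   pointer and alignment arguments get no child.  Likewise arg1_arg4_map
   gives IFN_MASK_GATHER_LOAD two children taken from arguments 1 and 4
   (the offset and the mask).  */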
540static const int *
541vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
542 unsigned char swap = 0)
543{
544 if (auto assign = dyn_cast<const gassign *> (p: stmt))
545 {
546 if (gimple_assign_rhs_code (gs: assign) == COND_EXPR
547 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
548 return cond_expr_maps[swap];
549 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
550 && swap)
551 return op1_op0_map;
552 if (gather_scatter_p)
553 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
554 ? off_op0_map : off_map);
555 }
556 gcc_assert (!swap);
557 if (auto call = dyn_cast<const gcall *> (p: stmt))
558 {
559 if (gimple_call_internal_p (gs: call))
560 switch (gimple_call_internal_fn (gs: call))
561 {
562 case IFN_MASK_LOAD:
563 return gather_scatter_p ? off_arg2_map : arg2_map;
564
565 case IFN_GATHER_LOAD:
566 return arg1_map;
567
568 case IFN_MASK_GATHER_LOAD:
569 case IFN_MASK_LEN_GATHER_LOAD:
570 return arg1_arg4_map;
571
572 case IFN_MASK_STORE:
573 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
574
575 case IFN_MASK_CALL:
576 {
577 unsigned nargs = gimple_call_num_args (gs: call);
578 if (nargs >= 2 && nargs <= 7)
579 return mask_call_maps[nargs-2];
580 else
581 return nullptr;
582 }
583
584 case IFN_CLZ:
585 case IFN_CTZ:
586 return arg0_map;
587
588 default:
589 break;
590 }
591 }
592 return nullptr;
593}
594
595/* Return the SLP node child index for operand OP of STMT. */
596
597int
598vect_slp_child_index_for_operand (const gimple *stmt, int op,
599 bool gather_scatter_p)
600{
601 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
602 if (!opmap)
603 return op;
604 for (int i = 1; i < 1 + opmap[0]; ++i)
605 if (opmap[i] == op)
606 return i - 1;
607 gcc_unreachable ();
608}
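/* For instance, with arg3_arg2_map an IFN_MASK_STORE's stored value
   (call argument 3) is SLP child 0 and its mask (argument 2) is child 1,
   so vect_slp_child_index_for_operand (stmt, 3, false) returns 0.  */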
609
610/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
611 they are of a valid type and that they match the defs of the first stmt of
612 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
613 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
614 indicates swap is required for cond_expr stmts. Specifically, SWAP
615 is 1 if STMT is cond and operands of comparison need to be swapped;
616 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
617
618 If there was a fatal error return -1; if the error could be corrected by
619 swapping operands of the parent node of this one, return 1; if everything is
620 ok return 0. */
621static int
622vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
623 bool *skip_args,
624 vec<stmt_vec_info> stmts, unsigned stmt_num,
625 vec<slp_oprnd_info> *oprnds_info)
626{
627 stmt_vec_info stmt_info = stmts[stmt_num];
628 tree oprnd;
629 unsigned int i, number_of_oprnds;
630 enum vect_def_type dt = vect_uninitialized_def;
631 slp_oprnd_info oprnd_info;
632 gather_scatter_info gs_info;
633 unsigned int gs_op = -1u;
634 unsigned int commutative_op = -1U;
635 bool first = stmt_num == 0;
636
637 if (!is_a<gcall *> (p: stmt_info->stmt)
638 && !is_a<gassign *> (p: stmt_info->stmt)
639 && !is_a<gphi *> (p: stmt_info->stmt))
640 return -1;
641
642 number_of_oprnds = gimple_num_args (gs: stmt_info->stmt);
643 const int *map
644 = vect_get_operand_map (stmt: stmt_info->stmt,
645 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
646 if (map)
647 number_of_oprnds = *map++;
648 if (gcall *stmt = dyn_cast <gcall *> (p: stmt_info->stmt))
649 {
650 if (gimple_call_internal_p (gs: stmt))
651 {
652 internal_fn ifn = gimple_call_internal_fn (gs: stmt);
653 commutative_op = first_commutative_argument (ifn);
654 }
655 }
656 else if (gassign *stmt = dyn_cast <gassign *> (p: stmt_info->stmt))
657 {
658 if (commutative_tree_code (gimple_assign_rhs_code (gs: stmt)))
659 commutative_op = 0;
660 }
661
662 bool swapped = (swap != 0);
663 bool backedge = false;
664 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
665 for (i = 0; i < number_of_oprnds; i++)
666 {
667 oprnd_info = (*oprnds_info)[i];
668 int opno = map ? map[i] : int (i);
669 if (opno == -3)
670 {
671 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
672 if (!is_a <loop_vec_info> (p: vinfo)
673 || !vect_check_gather_scatter (stmt_info,
674 as_a <loop_vec_info> (p: vinfo),
675 first ? &oprnd_info->first_gs_info
676 : &gs_info))
677 return -1;
678
679 if (first)
680 {
681 oprnd_info->first_gs_p = true;
682 oprnd = oprnd_info->first_gs_info.offset;
683 }
684 else
685 {
686 gs_op = i;
687 oprnd = gs_info.offset;
688 }
689 }
690 else if (opno < 0)
691 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
692 else
693 {
694 oprnd = gimple_arg (gs: stmt_info->stmt, i: opno);
695 if (gphi *stmt = dyn_cast <gphi *> (p: stmt_info->stmt))
696 {
697 edge e = gimple_phi_arg_edge (phi: stmt, i: opno);
698 backedge = (is_a <bb_vec_info> (p: vinfo)
699 ? e->flags & EDGE_DFS_BACK
700 : dominated_by_p (CDI_DOMINATORS, e->src,
701 gimple_bb (g: stmt_info->stmt)));
702 }
703 }
704 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
705 oprnd = TREE_OPERAND (oprnd, 0);
706
707 stmt_vec_info def_stmt_info;
708 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
709 {
710 if (dump_enabled_p ())
711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 "Build SLP failed: can't analyze def for %T\n",
713 oprnd);
714
715 return -1;
716 }
717
718 if (skip_args[i])
719 {
720 oprnd_info->def_stmts.quick_push (NULL);
721 oprnd_info->ops.quick_push (NULL_TREE);
722 oprnd_info->first_dt = vect_uninitialized_def;
723 continue;
724 }
725
726 oprnd_info->def_stmts.quick_push (obj: def_stmt_info);
727 oprnd_info->ops.quick_push (obj: oprnd);
728
729 if (def_stmt_info
730 && is_pattern_stmt_p (stmt_info: def_stmt_info))
731 {
732 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
733 != def_stmt_info)
734 oprnd_info->any_pattern = true;
735 else
736 /* If we promote this to external use the original stmt def. */
737 oprnd_info->ops.last ()
738 = gimple_get_lhs (vect_orig_stmt (stmt_info: def_stmt_info)->stmt);
739 }
740
741 /* If there's an extern def on a backedge make sure we can
742 code-generate at the region start.
743 ??? This is another case that could be fixed by adjusting
744 how we split the function but at the moment we'd have conflicting
745 goals there. */
746 if (backedge
747 && dts[i] == vect_external_def
748 && is_a <bb_vec_info> (p: vinfo)
749 && TREE_CODE (oprnd) == SSA_NAME
750 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
751 && !dominated_by_p (CDI_DOMINATORS,
752 as_a <bb_vec_info> (p: vinfo)->bbs[0],
753 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
754 {
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "Build SLP failed: extern def %T only defined "
758 "on backedge\n", oprnd);
759 return -1;
760 }
761
762 if (first)
763 {
764 tree type = TREE_TYPE (oprnd);
765 dt = dts[i];
766
767 /* For the swapping logic below force vect_reduction_def
768 for the reduction op in a SLP reduction group. */
769 if (!STMT_VINFO_DATA_REF (stmt_info)
770 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
771 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
772 && def_stmt_info)
773 dts[i] = dt = vect_reduction_def;
774
775 /* Check the types of the definition. */
776 switch (dt)
777 {
778 case vect_external_def:
779 case vect_constant_def:
780 case vect_internal_def:
781 case vect_reduction_def:
782 case vect_induction_def:
783 case vect_nested_cycle:
784 case vect_first_order_recurrence:
785 break;
786
787 default:
788 /* FORNOW: Not supported. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "Build SLP failed: illegal type of def %T\n",
792 oprnd);
793 return -1;
794 }
795
796 oprnd_info->first_dt = dt;
797 oprnd_info->first_op_type = type;
798 }
799 }
800 if (first)
801 return 0;
802
803 /* Now match the operand definition types to those of the first stmt. */
804 for (i = 0; i < number_of_oprnds;)
805 {
806 if (skip_args[i])
807 {
808 ++i;
809 continue;
810 }
811
812 oprnd_info = (*oprnds_info)[i];
813 dt = dts[i];
814 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
815 oprnd = oprnd_info->ops[stmt_num];
816 tree type = TREE_TYPE (oprnd);
817
818 if (!types_compatible_p (type1: oprnd_info->first_op_type, type2: type))
819 {
820 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Build SLP failed: different operand types\n");
823 return 1;
824 }
825
826 if ((gs_op == i) != oprnd_info->first_gs_p)
827 {
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
830 "Build SLP failed: mixed gather and non-gather\n");
831 return 1;
832 }
833 else if (gs_op == i)
834 {
835 if (!operand_equal_p (oprnd_info->first_gs_info.base,
836 gs_info.base))
837 {
838 if (dump_enabled_p ())
839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
840 "Build SLP failed: different gather base\n");
841 return 1;
842 }
843 if (oprnd_info->first_gs_info.scale != gs_info.scale)
844 {
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 "Build SLP failed: different gather scale\n");
848 return 1;
849 }
850 }
851
852 /* Not first stmt of the group, check that the def-stmt/s match
853 the def-stmt/s of the first stmt. Allow different definition
854 types for reduction chains: the first stmt must be a
855 vect_reduction_def (a phi node), and the rest
856 end in the reduction chain. */
857 if ((!vect_def_types_match (dta: oprnd_info->first_dt, dtb: dt)
858 && !(oprnd_info->first_dt == vect_reduction_def
859 && !STMT_VINFO_DATA_REF (stmt_info)
860 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
861 && def_stmt_info
862 && !STMT_VINFO_DATA_REF (def_stmt_info)
863 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
864 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
865 || (!STMT_VINFO_DATA_REF (stmt_info)
866 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
867 && ((!def_stmt_info
868 || STMT_VINFO_DATA_REF (def_stmt_info)
869 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
870 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
871 != (oprnd_info->first_dt != vect_reduction_def))))
872 {
873 /* Try swapping operands if we got a mismatch. For BB
874 vectorization only in case it will clearly improve things. */
875 if (i == commutative_op && !swapped
876 && (!is_a <bb_vec_info> (p: vinfo)
877 || (!vect_def_types_match (dta: (*oprnds_info)[i+1]->first_dt,
878 dtb: dts[i+1])
879 && (vect_def_types_match (dta: oprnd_info->first_dt, dtb: dts[i+1])
880 || vect_def_types_match
881 (dta: (*oprnds_info)[i+1]->first_dt, dtb: dts[i])))))
882 {
883 if (dump_enabled_p ())
884 dump_printf_loc (MSG_NOTE, vect_location,
885 "trying swapped operands\n");
886 std::swap (a&: dts[i], b&: dts[i+1]);
887 std::swap (a&: (*oprnds_info)[i]->def_stmts[stmt_num],
888 b&: (*oprnds_info)[i+1]->def_stmts[stmt_num]);
889 std::swap (a&: (*oprnds_info)[i]->ops[stmt_num],
890 b&: (*oprnds_info)[i+1]->ops[stmt_num]);
891 /* After swapping some operands we lost track whether an
892 operand has any pattern defs so be conservative here. */
893 if ((*oprnds_info)[i]->any_pattern
894 || (*oprnds_info)[i+1]->any_pattern)
895 (*oprnds_info)[i]->any_pattern
896 = (*oprnds_info)[i+1]->any_pattern = true;
897 swapped = true;
898 continue;
899 }
900
901 if (is_a <bb_vec_info> (p: vinfo)
902 && !oprnd_info->any_pattern)
903 {
904 /* Now for commutative ops we should see whether we can
905 make the other operand match. */
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
908 "treating operand as external\n");
909 oprnd_info->first_dt = dt = vect_external_def;
910 }
911 else
912 {
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: different types\n");
916 return 1;
917 }
918 }
919
920 /* Make sure to demote the overall operand to external. */
921 if (dt == vect_external_def)
922 oprnd_info->first_dt = vect_external_def;
923 /* For an SLP reduction chain we want to duplicate the reduction to
924 each of the chain members. That gets us a sane SLP graph (though
925 the stmts are not 100% correct wrt the initial values). */
926 else if ((dt == vect_internal_def
927 || dt == vect_reduction_def)
928 && oprnd_info->first_dt == vect_reduction_def
929 && !STMT_VINFO_DATA_REF (stmt_info)
930 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
931 && !STMT_VINFO_DATA_REF (def_stmt_info)
932 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
933 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
934 {
935 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
936 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
937 }
938
939 ++i;
940 }
941
942 /* Swap operands. */
943 if (swapped)
944 {
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "swapped operands to match def types in %G",
948 stmt_info->stmt);
949 }
950
951 return 0;
952}
953
954/* Return true if call statements CALL1 and CALL2 are similar enough
955 to be combined into the same SLP group. */
956
957bool
958compatible_calls_p (gcall *call1, gcall *call2)
959{
960 unsigned int nargs = gimple_call_num_args (gs: call1);
961 if (nargs != gimple_call_num_args (gs: call2))
962 return false;
963
964 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
965 return false;
966
967 if (gimple_call_internal_p (gs: call1))
968 {
969 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
970 TREE_TYPE (gimple_call_lhs (call2))))
971 return false;
972 for (unsigned int i = 0; i < nargs; ++i)
973 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
974 TREE_TYPE (gimple_call_arg (call2, i))))
975 return false;
976 }
977 else
978 {
979 if (!operand_equal_p (gimple_call_fn (gs: call1),
980 gimple_call_fn (gs: call2), flags: 0))
981 return false;
982
983 if (gimple_call_fntype (gs: call1) != gimple_call_fntype (gs: call2))
984 return false;
985 }
986
987 /* Check that any unvectorized arguments are equal. */
988 if (const int *map = vect_get_operand_map (stmt: call1))
989 {
990 unsigned int nkept = *map++;
991 unsigned int mapi = 0;
992 for (unsigned int i = 0; i < nargs; ++i)
993 if (mapi < nkept && map[mapi] == int (i))
994 mapi += 1;
995 else if (!operand_equal_p (gimple_call_arg (gs: call1, index: i),
996 gimple_call_arg (gs: call2, index: i)))
997 return false;
998 }
999
1000 return true;
1001}
1002
1003/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1004 caller's attempt to find the vector type in STMT_INFO with the narrowest
1005 element type. Return true if VECTYPE is nonnull and if it is valid
1006 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1007 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1008 vect_build_slp_tree. */
1009
1010static bool
1011vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1012 unsigned int group_size,
1013 tree vectype, poly_uint64 *max_nunits)
1014{
1015 if (!vectype)
1016 {
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Build SLP failed: unsupported data-type in %G\n",
1020 stmt_info->stmt);
1021 /* Fatal mismatch. */
1022 return false;
1023 }
1024
1025 /* If populating the vector type requires unrolling then fail
1026 before adjusting *max_nunits for basic-block vectorization. */
1027 if (is_a <bb_vec_info> (p: vinfo)
1028 && !multiple_p (a: group_size, b: TYPE_VECTOR_SUBPARTS (node: vectype)))
1029 {
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "Build SLP failed: unrolling required "
1033 "in basic block SLP\n");
1034 /* Fatal mismatch. */
1035 return false;
1036 }
1037
1038 /* In case of multiple types we need to detect the smallest type. */
1039 vect_update_max_nunits (max_nunits, vectype);
1040 return true;
1041}
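/* For example a basic-block SLP group of six stmts with a V4SI vectype is
   rejected above since covering the group would require unrolling (6 is
   not a multiple of 4), whereas for loop vectorization the number of
   units is simply recorded in MAX_NUNITS and accounted for by the
   vectorization factor.  */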
1042
1043/* Check whether the scalar stmts STMTS are isomorphic, whether they require
1044 data permutation and whether they use supported operations. Return
1045 true if they can form an SLP node, otherwise return false and indicate in *MATCHES
1046 which stmts are not isomorphic to the first one. If MATCHES[0]
1047 is false then this indicates the comparison could not be
1048 carried out or the stmts will never be vectorized by SLP.
1049
1050 Note COND_EXPR is possibly isomorphic to another one after swapping its
1051 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1052 the first stmt by swapping the two operands of comparison; set SWAP[i]
1053 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1054 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
1055 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1056
1057static bool
1058vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1059 vec<stmt_vec_info> stmts, unsigned int group_size,
1060 poly_uint64 *max_nunits, bool *matches,
1061 bool *two_operators, tree *node_vectype)
1062{
1063 unsigned int i;
1064 stmt_vec_info first_stmt_info = stmts[0];
1065 code_helper first_stmt_code = ERROR_MARK;
1066 code_helper alt_stmt_code = ERROR_MARK;
1067 code_helper rhs_code = ERROR_MARK;
1068 code_helper first_cond_code = ERROR_MARK;
1069 tree lhs;
1070 bool need_same_oprnds = false;
1071 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1072 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1073 bool first_stmt_ldst_p = false, ldst_p = false;
1074 bool first_stmt_phi_p = false, phi_p = false;
1075 bool maybe_soft_fail = false;
1076 tree soft_fail_nunits_vectype = NULL_TREE;
1077
1078 /* For every stmt in NODE find its def stmt/s. */
1079 stmt_vec_info stmt_info;
1080 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1081 {
1082 gimple *stmt = stmt_info->stmt;
1083 swap[i] = 0;
1084 matches[i] = false;
1085
1086 if (dump_enabled_p ())
1087 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1088
1089 /* Fail to vectorize statements marked as unvectorizable, statements
1090 that can throw internally, and statements with volatile operands. */
1091 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1092 || stmt_can_throw_internal (cfun, stmt)
1093 || gimple_has_volatile_ops (stmt))
1094 {
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097 "Build SLP failed: unvectorizable statement %G",
1098 stmt);
1099 /* ??? For BB vectorization we want to commutate operands in a way
1100 to shuffle all unvectorizable defs into one operand and have
1101 the other still vectorized. The following doesn't reliably
1102 work for this though but it's the easiest we can do here. */
1103 if (is_a <bb_vec_info> (p: vinfo) && i != 0)
1104 continue;
1105 /* Fatal mismatch. */
1106 matches[0] = false;
1107 return false;
1108 }
1109
1110 gcall *call_stmt = dyn_cast <gcall *> (p: stmt);
1111 lhs = gimple_get_lhs (stmt);
1112 if (lhs == NULL_TREE
1113 && (!call_stmt
1114 || !gimple_call_internal_p (gs: stmt)
1115 || !internal_store_fn_p (gimple_call_internal_fn (gs: stmt))))
1116 {
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "Build SLP failed: not GIMPLE_ASSIGN nor "
1120 "GIMPLE_CALL %G", stmt);
1121 if (is_a <bb_vec_info> (p: vinfo) && i != 0)
1122 continue;
1123 /* Fatal mismatch. */
1124 matches[0] = false;
1125 return false;
1126 }
1127
1128 tree nunits_vectype;
1129 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1130 &nunits_vectype, group_size))
1131 {
1132 if (is_a <bb_vec_info> (p: vinfo) && i != 0)
1133 continue;
1134 /* Fatal mismatch. */
1135 matches[0] = false;
1136 return false;
1137 }
1138 /* Record nunits required but continue analysis, producing matches[]
1139 as if nunits was not an issue. This allows splitting of groups
1140 to happen. */
1141 if (nunits_vectype
1142 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1143 vectype: nunits_vectype, max_nunits))
1144 {
1145 gcc_assert (is_a <bb_vec_info> (vinfo));
1146 maybe_soft_fail = true;
1147 soft_fail_nunits_vectype = nunits_vectype;
1148 }
1149
1150 gcc_assert (vectype);
1151
1152 if (call_stmt)
1153 {
1154 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1155 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1156 rhs_code = cfn;
1157 else
1158 rhs_code = CALL_EXPR;
1159
1160 if (cfn == CFN_MASK_LOAD
1161 || cfn == CFN_GATHER_LOAD
1162 || cfn == CFN_MASK_GATHER_LOAD
1163 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1164 ldst_p = true;
1165 else if (cfn == CFN_MASK_STORE)
1166 {
1167 ldst_p = true;
1168 rhs_code = CFN_MASK_STORE;
1169 }
1170 else if ((cfn != CFN_LAST
1171 && cfn != CFN_MASK_CALL
1172 && internal_fn_p (code: cfn)
1173 && !vectorizable_internal_fn_p (fn: as_internal_fn (code: cfn)))
1174 || gimple_call_tail_p (s: call_stmt)
1175 || gimple_call_noreturn_p (s: call_stmt)
1176 || gimple_call_chain (gs: call_stmt))
1177 {
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "Build SLP failed: unsupported call type %G",
1181 (gimple *) call_stmt);
1182 if (is_a <bb_vec_info> (p: vinfo) && i != 0)
1183 continue;
1184 /* Fatal mismatch. */
1185 matches[0] = false;
1186 return false;
1187 }
1188 }
1189 else if (gimple_code (g: stmt) == GIMPLE_PHI)
1190 {
1191 rhs_code = ERROR_MARK;
1192 phi_p = true;
1193 }
1194 else
1195 {
1196 rhs_code = gimple_assign_rhs_code (gs: stmt);
1197 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1198 }
1199
1200 /* Check the operation. */
1201 if (i == 0)
1202 {
1203 *node_vectype = vectype;
1204 first_stmt_code = rhs_code;
1205 first_stmt_ldst_p = ldst_p;
1206 first_stmt_phi_p = phi_p;
1207
1208 /* Shift arguments should be equal in all the packed stmts for a
1209 vector shift with scalar shift operand. */
1210 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1211 || rhs_code == LROTATE_EXPR
1212 || rhs_code == RROTATE_EXPR)
1213 {
1214 /* First see if we have a vector/vector shift. */
1215 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1216 {
1217 /* No vector/vector shift, try for a vector/scalar shift. */
1218 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1219 {
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "Build SLP failed: "
1223 "op not supported by target.\n");
1224 if (is_a <bb_vec_info> (p: vinfo) && i != 0)
1225 continue;
1226 /* Fatal mismatch. */
1227 matches[0] = false;
1228 return false;
1229 }
1230 need_same_oprnds = true;
1231 first_op1 = gimple_assign_rhs2 (gs: stmt);
1232 }
1233 }
1234 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1235 {
1236 need_same_oprnds = true;
1237 first_op1 = gimple_assign_rhs2 (gs: stmt);
1238 }
1239 else if (!ldst_p
1240 && rhs_code == BIT_FIELD_REF)
1241 {
1242 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1243 if (!is_a <bb_vec_info> (p: vinfo)
1244 || TREE_CODE (vec) != SSA_NAME
1245 /* When the element types are not compatible we pun the
1246 source to the target vectype which requires equal size. */
1247 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1248 || !types_compatible_p (TREE_TYPE (vectype),
1249 TREE_TYPE (TREE_TYPE (vec))))
1250 && !operand_equal_p (TYPE_SIZE (vectype),
1251 TYPE_SIZE (TREE_TYPE (vec)))))
1252 {
1253 if (dump_enabled_p ())
1254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1255 "Build SLP failed: "
1256 "BIT_FIELD_REF not supported\n");
1257 /* Fatal mismatch. */
1258 matches[0] = false;
1259 return false;
1260 }
1261 }
1262 else if (rhs_code == CFN_DIV_POW2)
1263 {
1264 need_same_oprnds = true;
1265 first_op1 = gimple_call_arg (gs: call_stmt, index: 1);
1266 }
1267 }
1268 else
1269 {
1270 if (first_stmt_code != rhs_code
1271 && alt_stmt_code == ERROR_MARK)
1272 alt_stmt_code = rhs_code;
1273 if ((first_stmt_code != rhs_code
1274 && (first_stmt_code != IMAGPART_EXPR
1275 || rhs_code != REALPART_EXPR)
1276 && (first_stmt_code != REALPART_EXPR
1277 || rhs_code != IMAGPART_EXPR)
1278 /* Handle mismatches in plus/minus by computing both
1279 and merging the results. */
1280 && !((first_stmt_code == PLUS_EXPR
1281 || first_stmt_code == MINUS_EXPR)
1282 && (alt_stmt_code == PLUS_EXPR
1283 || alt_stmt_code == MINUS_EXPR)
1284 && rhs_code == alt_stmt_code)
1285 && !(first_stmt_code.is_tree_code ()
1286 && rhs_code.is_tree_code ()
1287 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1288 == tcc_comparison)
1289 && (swap_tree_comparison (tree_code (first_stmt_code))
1290 == tree_code (rhs_code)))
1291 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1292 && (first_stmt_code == ARRAY_REF
1293 || first_stmt_code == BIT_FIELD_REF
1294 || first_stmt_code == INDIRECT_REF
1295 || first_stmt_code == COMPONENT_REF
1296 || first_stmt_code == MEM_REF)
1297 && (rhs_code == ARRAY_REF
1298 || rhs_code == BIT_FIELD_REF
1299 || rhs_code == INDIRECT_REF
1300 || rhs_code == COMPONENT_REF
1301 || rhs_code == MEM_REF)))
1302 || (ldst_p
1303 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1304 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1305 || (ldst_p
1306 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1307 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1308 || first_stmt_ldst_p != ldst_p
1309 || first_stmt_phi_p != phi_p)
1310 {
1311 if (dump_enabled_p ())
1312 {
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "Build SLP failed: different operation "
1315 "in stmt %G", stmt);
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "original stmt %G", first_stmt_info->stmt);
1318 }
1319 /* Mismatch. */
1320 continue;
1321 }
1322
1323 if (!ldst_p
1324 && first_stmt_code == BIT_FIELD_REF
1325 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1326 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1327 {
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "Build SLP failed: different BIT_FIELD_REF "
1331 "arguments in %G", stmt);
1332 /* Mismatch. */
1333 continue;
1334 }
1335
1336 if (call_stmt
1337 && first_stmt_code != CFN_MASK_LOAD
1338 && first_stmt_code != CFN_MASK_STORE)
1339 {
1340 if (!compatible_calls_p (call1: as_a <gcall *> (p: stmts[0]->stmt),
1341 call2: call_stmt))
1342 {
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "Build SLP failed: different calls in %G",
1346 stmt);
1347 /* Mismatch. */
1348 continue;
1349 }
1350 }
1351
1352 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1353 && (gimple_bb (g: first_stmt_info->stmt)
1354 != gimple_bb (g: stmt_info->stmt)))
1355 {
1356 if (dump_enabled_p ())
1357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1358 "Build SLP failed: different BB for PHI "
1359 "or possibly trapping operation in %G", stmt);
1360 /* Mismatch. */
1361 continue;
1362 }
1363
1364 if (need_same_oprnds)
1365 {
1366 tree other_op1 = gimple_arg (gs: stmt, i: 1);
1367 if (!operand_equal_p (first_op1, other_op1, flags: 0))
1368 {
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "Build SLP failed: different shift "
1372 "arguments in %G", stmt);
1373 /* Mismatch. */
1374 continue;
1375 }
1376 }
1377
1378 if (!types_compatible_p (type1: vectype, type2: *node_vectype))
1379 {
1380 if (dump_enabled_p ())
1381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1382 "Build SLP failed: different vector type "
1383 "in %G", stmt);
1384 /* Mismatch. */
1385 continue;
1386 }
1387 }
1388
1389 /* Grouped store or load. */
1390 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1391 {
1392 gcc_assert (ldst_p);
1393 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1394 {
1395 /* Store. */
1396 gcc_assert (rhs_code == CFN_MASK_STORE
1397 || REFERENCE_CLASS_P (lhs)
1398 || DECL_P (lhs));
1399 }
1400 else
1401 {
1402 /* Load. */
1403 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1404 if (prev_first_load)
1405 {
1406 /* Check that there are no loads from different interleaving
1407 chains in the same node. */
1408 if (prev_first_load != first_load)
1409 {
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1412 vect_location,
1413 "Build SLP failed: different "
1414 "interleaving chains in one node %G",
1415 stmt);
1416 /* Mismatch. */
1417 continue;
1418 }
1419 }
1420 else
1421 prev_first_load = first_load;
1422 }
1423 }
1424 /* Non-grouped store or load. */
1425 else if (ldst_p)
1426 {
1427 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1428 && rhs_code != CFN_GATHER_LOAD
1429 && rhs_code != CFN_MASK_GATHER_LOAD
1430 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1431 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1432 /* Non-grouped loads are handled as externals for BB
1433 vectorization. For loop vectorization we can handle
1434 splats the same way we handle single element interleaving. */
1435 && (is_a <bb_vec_info> (p: vinfo)
1436 || stmt_info != first_stmt_info))
1437 {
1438 /* Non-grouped load. */
1439 if (dump_enabled_p ())
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1441 "Build SLP failed: not grouped load %G", stmt);
1442
1443 if (i != 0)
1444 continue;
1445 /* Fatal mismatch. */
1446 matches[0] = false;
1447 return false;
1448 }
1449 }
1450 /* Not a memory operation. */
1451 else
1452 {
1453 if (!phi_p
1454 && rhs_code.is_tree_code ()
1455 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1456 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1457 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1458 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1459 && rhs_code != VIEW_CONVERT_EXPR
1460 && rhs_code != CALL_EXPR
1461 && rhs_code != BIT_FIELD_REF)
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "Build SLP failed: operation unsupported %G",
1466 stmt);
1467 if (is_a <bb_vec_info> (p: vinfo) && i != 0)
1468 continue;
1469 /* Fatal mismatch. */
1470 matches[0] = false;
1471 return false;
1472 }
1473
1474 if (rhs_code == COND_EXPR)
1475 {
1476 tree cond_expr = gimple_assign_rhs1 (gs: stmt);
1477 enum tree_code cond_code = TREE_CODE (cond_expr);
1478 enum tree_code swap_code = ERROR_MARK;
1479 enum tree_code invert_code = ERROR_MARK;
1480
1481 if (i == 0)
1482 first_cond_code = TREE_CODE (cond_expr);
1483 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1484 {
1485 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1486 swap_code = swap_tree_comparison (cond_code);
1487 invert_code = invert_tree_comparison (cond_code, honor_nans);
1488 }
1489
1490 if (first_cond_code == cond_code)
1491 ;
1492 /* Isomorphic can be achieved by swapping. */
1493 else if (first_cond_code == swap_code)
1494 swap[i] = 1;
1495 /* Isomorphic can be achieved by inverting. */
1496 else if (first_cond_code == invert_code)
1497 swap[i] = 2;
1498 else
1499 {
1500 if (dump_enabled_p ())
1501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1502 "Build SLP failed: different"
1503 " operation %G", stmt);
1504 /* Mismatch. */
1505 continue;
1506 }
1507 }
1508
1509 if (rhs_code.is_tree_code ()
1510 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1511 && (swap_tree_comparison ((tree_code)first_stmt_code)
1512 == (tree_code)rhs_code))
1513 swap[i] = 1;
1514 }
1515
1516 matches[i] = true;
1517 }
1518
1519 for (i = 0; i < group_size; ++i)
1520 if (!matches[i])
1521 return false;
1522
1523 /* If we allowed a two-operation SLP node verify the target can cope
1524 with the permute we are going to use. */
1525 if (alt_stmt_code != ERROR_MARK
1526 && (!alt_stmt_code.is_tree_code ()
1527 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1528 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1529 {
1530 *two_operators = true;
1531 }
1532
1533 if (maybe_soft_fail)
1534 {
1535 unsigned HOST_WIDE_INT const_nunits;
1536 if (!TYPE_VECTOR_SUBPARTS
1537 (node: soft_fail_nunits_vectype).is_constant (const_value: &const_nunits)
1538 || const_nunits > group_size)
1539 matches[0] = false;
1540 else
1541 {
1542 /* With constant vector elements simulate a mismatch at the
1543 point we need to split. */
1544 unsigned tail = group_size & (const_nunits - 1);
1545 memset (s: &matches[group_size - tail], c: 0, n: sizeof (bool) * tail);
1546 }
1547 return false;
1548 }
1549
1550 return true;
1551}
1552
1553/* Traits for the hash_set to record failed SLP builds for a stmt set.
1554 Note we never remove apart from at destruction time so we do not
1555 need a special value for deleted that differs from empty. */
1556struct bst_traits
1557{
1558 typedef vec <stmt_vec_info> value_type;
1559 typedef vec <stmt_vec_info> compare_type;
1560 static inline hashval_t hash (value_type);
1561 static inline bool equal (value_type existing, value_type candidate);
1562 static inline bool is_empty (value_type x) { return !x.exists (); }
1563 static inline bool is_deleted (value_type x) { return !x.exists (); }
1564 static const bool empty_zero_p = true;
1565 static inline void mark_empty (value_type &x) { x.release (); }
1566 static inline void mark_deleted (value_type &x) { x.release (); }
1567 static inline void remove (value_type &x) { x.release (); }
1568};
1569inline hashval_t
1570bst_traits::hash (value_type x)
1571{
1572 inchash::hash h;
1573 for (unsigned i = 0; i < x.length (); ++i)
1574 h.add_int (v: gimple_uid (g: x[i]->stmt));
1575 return h.end ();
1576}
1577inline bool
1578bst_traits::equal (value_type existing, value_type candidate)
1579{
1580 if (existing.length () != candidate.length ())
1581 return false;
1582 for (unsigned i = 0; i < existing.length (); ++i)
1583 if (existing[i] != candidate[i])
1584 return false;
1585 return true;
1586}
1587
1588/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1589 but then vec::insert does memmove and that's not compatible with
1590 std::pair. */
1591struct chain_op_t
1592{
1593 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1594 : code (code_), dt (dt_), op (op_) {}
1595 tree_code code;
1596 vect_def_type dt;
1597 tree op;
1598};
1599
1600/* Comparator for sorting associatable chains. */
1601
1602static int
1603dt_sort_cmp (const void *op1_, const void *op2_, void *)
1604{
1605 auto *op1 = (const chain_op_t *) op1_;
1606 auto *op2 = (const chain_op_t *) op2_;
1607 if (op1->dt != op2->dt)
1608 return (int)op1->dt - (int)op2->dt;
1609 return (int)op1->code - (int)op2->code;
1610}
1611
1612/* Linearize the associatable expression chain at START with the
1613 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1614 filling CHAIN with the result and using WORKLIST as intermediate storage.
1615 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1616 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1617 stmts, starting with START. */
1618
1619static void
1620vect_slp_linearize_chain (vec_info *vinfo,
1621 vec<std::pair<tree_code, gimple *> > &worklist,
1622 vec<chain_op_t> &chain,
1623 enum tree_code code, gimple *start,
1624 gimple *&code_stmt, gimple *&alt_code_stmt,
1625 vec<gimple *> *chain_stmts)
1626{
1627 /* For each lane linearize the addition/subtraction (or other
1628 uniform associatable operation) expression tree. */
1629 worklist.safe_push (obj: std::make_pair (x&: code, y&: start));
1630 while (!worklist.is_empty ())
1631 {
1632 auto entry = worklist.pop ();
1633 gassign *stmt = as_a <gassign *> (p: entry.second);
1634 enum tree_code in_code = entry.first;
1635 enum tree_code this_code = gimple_assign_rhs_code (gs: stmt);
1636 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1637 if (!code_stmt
1638 && gimple_assign_rhs_code (gs: stmt) == code)
1639 code_stmt = stmt;
1640 else if (!alt_code_stmt
1641 && gimple_assign_rhs_code (gs: stmt) == MINUS_EXPR)
1642 alt_code_stmt = stmt;
1643 if (chain_stmts)
1644 chain_stmts->safe_push (obj: stmt);
1645 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1646 {
1647 tree op = gimple_op (gs: stmt, i: opnum);
1648 vect_def_type dt;
1649 stmt_vec_info def_stmt_info;
1650 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1651 gcc_assert (res);
1652 if (dt == vect_internal_def
1653 && is_pattern_stmt_p (stmt_info: def_stmt_info))
1654 op = gimple_get_lhs (def_stmt_info->stmt);
1655 gimple *use_stmt;
1656 use_operand_p use_p;
1657 if (dt == vect_internal_def
1658 && single_imm_use (var: op, use_p: &use_p, stmt: &use_stmt)
1659 && is_gimple_assign (gs: def_stmt_info->stmt)
1660 && (gimple_assign_rhs_code (gs: def_stmt_info->stmt) == code
1661 || (code == PLUS_EXPR
1662 && (gimple_assign_rhs_code (gs: def_stmt_info->stmt)
1663 == MINUS_EXPR))))
1664 {
1665 tree_code op_def_code = this_code;
1666 if (op_def_code == MINUS_EXPR && opnum == 1)
1667 op_def_code = PLUS_EXPR;
1668 if (in_code == MINUS_EXPR)
1669 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1670 worklist.safe_push (obj: std::make_pair (x&: op_def_code,
1671 y&: def_stmt_info->stmt));
1672 }
1673 else
1674 {
1675 tree_code op_def_code = this_code;
1676 if (op_def_code == MINUS_EXPR && opnum == 1)
1677 op_def_code = PLUS_EXPR;
1678 if (in_code == MINUS_EXPR)
1679 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1680 chain.safe_push (obj: chain_op_t (op_def_code, dt, op));
1681 }
1682 }
1683 }
1684}
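/* As a sketch of the linearization: for a lane computing
   ((a + b) - c) + d the chain ends up containing the leaves as
   { +, a } { +, b } { -, c } { +, d } (modulo visitation order), i.e.
   the effective sign of each leaf with nested MINUS_EXPRs folded away
   as done above.  */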
1685
1686typedef hash_map <vec <stmt_vec_info>, slp_tree,
1687 simple_hashmap_traits <bst_traits, slp_tree> >
1688 scalar_stmts_to_slp_tree_map_t;
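/* This map (the "bst_map") caches the result of SLP discovery for a given
   stmt sequence: successfully built nodes are re-used by bumping their
   reference count, while failed attempts keep the per-lane matches
   information in the node's failed array so the same discovery is not
   retried.  */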
1689
1690static slp_tree
1691vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1692 vec<stmt_vec_info> stmts, unsigned int group_size,
1693 poly_uint64 *max_nunits,
1694 bool *matches, unsigned *limit, unsigned *tree_size,
1695 scalar_stmts_to_slp_tree_map_t *bst_map);
1696
1697static slp_tree
1698vect_build_slp_tree (vec_info *vinfo,
1699 vec<stmt_vec_info> stmts, unsigned int group_size,
1700 poly_uint64 *max_nunits,
1701 bool *matches, unsigned *limit, unsigned *tree_size,
1702 scalar_stmts_to_slp_tree_map_t *bst_map)
1703{
1704 if (slp_tree *leader = bst_map->get (k: stmts))
1705 {
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1708 !(*leader)->failed ? "" : "failed ",
1709 (void *) *leader);
1710 if (!(*leader)->failed)
1711 {
1712 SLP_TREE_REF_COUNT (*leader)++;
1713 vect_update_max_nunits (max_nunits, nunits: (*leader)->max_nunits);
1714 stmts.release ();
1715 return *leader;
1716 }
1717 memcpy (dest: matches, src: (*leader)->failed, n: sizeof (bool) * group_size);
1718 return NULL;
1719 }
1720
1721 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1722 so we can pick up backedge destinations during discovery. */
1723 slp_tree res = new _slp_tree;
1724 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1725 SLP_TREE_SCALAR_STMTS (res) = stmts;
1726 bst_map->put (k: stmts.copy (), v: res);
1727
1728 if (*limit == 0)
1729 {
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "SLP discovery limit exceeded\n");
1733 /* Mark the node invalid so we can detect those when still in use
1734 as backedge destinations. */
1735 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1736 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1737 res->failed = XNEWVEC (bool, group_size);
1738 memset (s: res->failed, c: 0, n: sizeof (bool) * group_size);
1739 memset (s: matches, c: 0, n: sizeof (bool) * group_size);
1740 return NULL;
1741 }
1742 --*limit;
1743
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_NOTE, vect_location,
1746 "starting SLP discovery for node %p\n", (void *) res);
1747
1748 poly_uint64 this_max_nunits = 1;
1749 slp_tree res_ = vect_build_slp_tree_2 (vinfo, node: res, stmts, group_size,
1750 max_nunits: &this_max_nunits,
1751 matches, limit, tree_size, bst_map);
1752 if (!res_)
1753 {
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_NOTE, vect_location,
1756 "SLP discovery for node %p failed\n", (void *) res);
1757 /* Mark the node invalid so we can detect those when still in use
1758 as backedge destinations. */
1759 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1760 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1761 res->failed = XNEWVEC (bool, group_size);
1762 if (flag_checking)
1763 {
1764 unsigned i;
1765 for (i = 0; i < group_size; ++i)
1766 if (!matches[i])
1767 break;
1768 gcc_assert (i < group_size);
1769 }
1770 memcpy (dest: res->failed, src: matches, n: sizeof (bool) * group_size);
1771 }
1772 else
1773 {
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "SLP discovery for node %p succeeded\n",
1777 (void *) res);
1778 gcc_assert (res_ == res);
1779 res->max_nunits = this_max_nunits;
1780 vect_update_max_nunits (max_nunits, nunits: this_max_nunits);
1781 /* Keep a reference for the bst_map use. */
1782 SLP_TREE_REF_COUNT (res)++;
1783 }
1784 return res_;
1785}
1786
1787/* Helper for building an associated SLP node chain. */
1788
1789static void
1790vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1791 slp_tree op0, slp_tree op1,
1792 stmt_vec_info oper1, stmt_vec_info oper2,
1793 vec<std::pair<unsigned, unsigned> > lperm)
1794{
1795 unsigned group_size = SLP_TREE_LANES (op1);
1796
1797 slp_tree child1 = new _slp_tree;
1798 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1799 SLP_TREE_VECTYPE (child1) = vectype;
1800 SLP_TREE_LANES (child1) = group_size;
1801 SLP_TREE_CHILDREN (child1).create (nelems: 2);
1802 SLP_TREE_CHILDREN (child1).quick_push (obj: op0);
1803 SLP_TREE_CHILDREN (child1).quick_push (obj: op1);
1804 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1805
1806 slp_tree child2 = new _slp_tree;
1807 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1808 SLP_TREE_VECTYPE (child2) = vectype;
1809 SLP_TREE_LANES (child2) = group_size;
1810 SLP_TREE_CHILDREN (child2).create (nelems: 2);
1811 SLP_TREE_CHILDREN (child2).quick_push (obj: op0);
1812 SLP_TREE_REF_COUNT (op0)++;
1813 SLP_TREE_CHILDREN (child2).quick_push (obj: op1);
1814 SLP_TREE_REF_COUNT (op1)++;
1815 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1816
1817 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1818 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1819 SLP_TREE_VECTYPE (perm) = vectype;
1820 SLP_TREE_LANES (perm) = group_size;
1821 /* ??? We should set this NULL but that's not expected. */
1822 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1823 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1824 SLP_TREE_CHILDREN (perm).quick_push (obj: child1);
1825 SLP_TREE_CHILDREN (perm).quick_push (obj: child2);
1826}
1827
/* Recursively build an SLP tree for the group of scalar stmts STMTS,
   filling in the stub NODE.  Return NULL if the def-stmts are not
   isomorphic, require a data permutation we cannot handle or are of an
   unsupported kind of operation; in that case MATCHES records which
   lanes failed (MATCHES[0] set to false signals a fatal mismatch).
   Otherwise return the filled-in NODE.  */
1834
1835static slp_tree
1836vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1837 vec<stmt_vec_info> stmts, unsigned int group_size,
1838 poly_uint64 *max_nunits,
1839 bool *matches, unsigned *limit, unsigned *tree_size,
1840 scalar_stmts_to_slp_tree_map_t *bst_map)
1841{
1842 unsigned nops, i, this_tree_size = 0;
1843 poly_uint64 this_max_nunits = *max_nunits;
1844
1845 matches[0] = false;
1846
1847 stmt_vec_info stmt_info = stmts[0];
1848 if (!is_a<gcall *> (p: stmt_info->stmt)
1849 && !is_a<gassign *> (p: stmt_info->stmt)
1850 && !is_a<gphi *> (p: stmt_info->stmt))
1851 return NULL;
1852
1853 nops = gimple_num_args (gs: stmt_info->stmt);
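  /* If the stmt has an operand map its first entry gives the number of
     operands that participate in SLP discovery.  */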
1854 if (const int *map = vect_get_operand_map (stmt: stmt_info->stmt,
1855 STMT_VINFO_GATHER_SCATTER_P
1856 (stmt_info)))
1857 nops = map[0];
1858
1859 /* If the SLP node is a PHI (induction or reduction), terminate
1860 the recursion. */
1861 bool *skip_args = XALLOCAVEC (bool, nops);
1862 memset (s: skip_args, c: 0, n: sizeof (bool) * nops);
1863 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
1864 if (gphi *stmt = dyn_cast <gphi *> (p: stmt_info->stmt))
1865 {
1866 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1867 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1868 group_size);
1869 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1870 max_nunits))
1871 return NULL;
1872
1873 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1874 if (def_type == vect_induction_def)
1875 {
	      /* Induction PHIs are not cycles but walk the initial
		 value.  Only for inner loops though; for outer loops
		 we need to pick up the value from the actual PHIs
		 to more easily support peeling and epilogue vectorization.  */
1880 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1881 if (!nested_in_vect_loop_p (loop, stmt_info))
1882 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1883 else
1884 loop = loop->inner;
1885 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1886 }
1887 else if (def_type == vect_reduction_def
1888 || def_type == vect_double_reduction_def
1889 || def_type == vect_nested_cycle
1890 || def_type == vect_first_order_recurrence)
1891 {
1892 /* Else def types have to match. */
1893 stmt_vec_info other_info;
1894 bool all_same = true;
1895 FOR_EACH_VEC_ELT (stmts, i, other_info)
1896 {
1897 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1898 return NULL;
1899 if (other_info != stmt_info)
1900 all_same = false;
1901 }
1902 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
	      /* Reduction initial values are not explicitly represented.  */
1904 if (def_type != vect_first_order_recurrence
1905 && !nested_in_vect_loop_p (loop, stmt_info))
1906 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1907 /* Reduction chain backedge defs are filled manually.
1908 ??? Need a better way to identify a SLP reduction chain PHI.
1909 Or a better overall way to SLP match those. */
1910 if (all_same && def_type == vect_reduction_def)
1911 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1912 }
1913 else if (def_type != vect_internal_def)
1914 return NULL;
1915 }
1916
1917
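  /* Verify the scalar stmts are isomorphic enough to be vectorized
     together, computing the common vector type and whether we have a
     mix of two operation codes (like the a+b / a-b case handled via a
     blend below).  */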
1918 bool two_operators = false;
1919 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1920 tree vectype = NULL_TREE;
1921 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1922 max_nunits: &this_max_nunits, matches, two_operators: &two_operators,
1923 node_vectype: &vectype))
1924 return NULL;
1925
1926 /* If the SLP node is a load, terminate the recursion unless masked. */
1927 if (STMT_VINFO_DATA_REF (stmt_info)
1928 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1929 {
1930 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1931 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1932 else
1933 {
1934 *max_nunits = this_max_nunits;
1935 (*tree_size)++;
1936 node = vect_create_new_slp_node (node, scalar_stmts: stmts, nops: 0);
1937 SLP_TREE_VECTYPE (node) = vectype;
1938 /* And compute the load permutation. Whether it is actually
1939 a permutation depends on the unrolling factor which is
1940 decided later. */
1941 vec<unsigned> load_permutation;
1942 int j;
1943 stmt_vec_info load_info;
1944 load_permutation.create (nelems: group_size);
1945 stmt_vec_info first_stmt_info
1946 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1947 bool any_permute = false;
1948 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1949 {
1950 int load_place;
1951 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1952 load_place = vect_get_place_in_interleaving_chain
1953 (stmt_info: load_info, first_stmt_info);
1954 else
1955 load_place = 0;
1956 gcc_assert (load_place != -1);
1957 any_permute |= load_place != j;
1958 load_permutation.quick_push (obj: load_place);
1959 }
1960
1961 if (gcall *stmt = dyn_cast <gcall *> (p: stmt_info->stmt))
1962 {
1963 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1964 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1965 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1966 || gimple_call_internal_p (stmt,
1967 IFN_MASK_LEN_GATHER_LOAD));
1968 load_permutation.release ();
1969 /* We cannot handle permuted masked loads, see PR114375. */
1970 if (any_permute
1971 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1972 && DR_GROUP_SIZE (first_stmt_info) != group_size)
1973 || STMT_VINFO_STRIDED_P (stmt_info))
1974 {
1975 matches[0] = false;
1976 return NULL;
1977 }
1978 }
1979 else
1980 {
1981 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1982 return node;
1983 }
1984 }
1985 }
1986 else if (gimple_assign_single_p (gs: stmt_info->stmt)
1987 && !gimple_vuse (g: stmt_info->stmt)
1988 && gimple_assign_rhs_code (gs: stmt_info->stmt) == BIT_FIELD_REF)
1989 {
      /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
	 the same SSA name vector of a type compatible with vectype.  */
1992 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1993 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1994 stmt_vec_info estmt_info;
1995 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1996 {
1997 gassign *estmt = as_a <gassign *> (p: estmt_info->stmt);
1998 tree bfref = gimple_assign_rhs1 (gs: estmt);
1999 HOST_WIDE_INT lane;
2000 if (!known_eq (bit_field_size (bfref),
2001 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2002 || !constant_multiple_p (a: bit_field_offset (t: bfref),
2003 b: bit_field_size (t: bfref), multiple: &lane))
2004 {
2005 lperm.release ();
2006 matches[0] = false;
2007 return NULL;
2008 }
2009 lperm.safe_push (obj: std::make_pair (x: 0, y: (unsigned)lane));
2010 }
2011 slp_tree vnode = vect_create_new_slp_node (ops: vNULL);
2012 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2013 /* ??? We record vectype here but we hide eventually necessary
2014 punning and instead rely on code generation to materialize
2015 VIEW_CONVERT_EXPRs as necessary. We instead should make
2016 this explicit somehow. */
2017 SLP_TREE_VECTYPE (vnode) = vectype;
2018 else
2019 {
2020 /* For different size but compatible elements we can still
2021 use VEC_PERM_EXPR without punning. */
2022 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2023 && types_compatible_p (TREE_TYPE (vectype),
2024 TREE_TYPE (TREE_TYPE (vec))));
2025 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2026 }
2027 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2028 unsigned HOST_WIDE_INT const_nunits;
2029 if (nunits.is_constant (const_value: &const_nunits))
2030 SLP_TREE_LANES (vnode) = const_nunits;
2031 SLP_TREE_VEC_DEFS (vnode).safe_push (obj: vec);
2032 /* We are always building a permutation node even if it is an identity
2033 permute to shield the rest of the vectorizer from the odd node
2034 representing an actual vector without any scalar ops.
2035 ??? We could hide it completely with making the permute node
2036 external? */
2037 node = vect_create_new_slp_node (node, scalar_stmts: stmts, nops: 1);
2038 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2039 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2040 SLP_TREE_VECTYPE (node) = vectype;
2041 SLP_TREE_CHILDREN (node).quick_push (obj: vnode);
2042 return node;
2043 }
2044 /* When discovery reaches an associatable operation see whether we can
2045 improve that to match up lanes in a way superior to the operand
2046 swapping code which at most looks at two defs.
2047 ??? For BB vectorization we cannot do the brute-force search
2048 for matching as we can succeed by means of builds from scalars
2049 and have no good way to "cost" one build against another. */
2050 else if (is_a <loop_vec_info> (p: vinfo)
2051 /* ??? We don't handle !vect_internal_def defs below. */
2052 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2053 && is_gimple_assign (gs: stmt_info->stmt)
2054 && (associative_tree_code (gimple_assign_rhs_code (gs: stmt_info->stmt))
2055 || gimple_assign_rhs_code (gs: stmt_info->stmt) == MINUS_EXPR)
2056 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2057 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2058 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2059 {
2060 /* See if we have a chain of (mixed) adds or subtracts or other
2061 associatable ops. */
2062 enum tree_code code = gimple_assign_rhs_code (gs: stmt_info->stmt);
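      /* Linearize mixed plus/minus chains as PLUS_EXPR chains; whether an
	 element is added or subtracted is recorded per chain element
	 below.  */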
2063 if (code == MINUS_EXPR)
2064 code = PLUS_EXPR;
2065 stmt_vec_info other_op_stmt_info = NULL;
2066 stmt_vec_info op_stmt_info = NULL;
2067 unsigned chain_len = 0;
2068 auto_vec<chain_op_t> chain;
2069 auto_vec<std::pair<tree_code, gimple *> > worklist;
2070 auto_vec<vec<chain_op_t> > chains (group_size);
2071 auto_vec<slp_tree, 4> children;
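      /* Whether a failure below should fail discovery of the whole node
	 or whether we may fall through to the regular operand processing
	 after the chain handling.  */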
2072 bool hard_fail = true;
2073 for (unsigned lane = 0; lane < group_size; ++lane)
2074 {
2075 /* For each lane linearize the addition/subtraction (or other
2076 uniform associatable operation) expression tree. */
2077 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2078 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2079 start: stmts[lane]->stmt, code_stmt&: op_stmt, alt_code_stmt&: other_op_stmt,
2080 NULL);
2081 if (!op_stmt_info && op_stmt)
2082 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2083 if (!other_op_stmt_info && other_op_stmt)
2084 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2085 if (chain.length () == 2)
2086 {
2087 /* In a chain of just two elements resort to the regular
2088 operand swapping scheme. If we run into a length
2089 mismatch still hard-FAIL. */
2090 if (chain_len == 0)
2091 hard_fail = false;
2092 else
2093 {
2094 matches[lane] = false;
2095 /* ??? We might want to process the other lanes, but
2096 make sure to not give false matching hints to the
2097 caller for lanes we did not process. */
2098 if (lane != group_size - 1)
2099 matches[0] = false;
2100 }
2101 break;
2102 }
2103 else if (chain_len == 0)
2104 chain_len = chain.length ();
2105 else if (chain.length () != chain_len)
2106 {
2107 /* ??? Here we could slip in magic to compensate with
2108 neutral operands. */
2109 matches[lane] = false;
2110 if (lane != group_size - 1)
2111 matches[0] = false;
2112 break;
2113 }
2114 chains.quick_push (obj: chain.copy ());
2115 chain.truncate (size: 0);
2116 }
2117 if (chains.length () == group_size)
2118 {
2119 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2120 if (!op_stmt_info)
2121 {
2122 hard_fail = false;
2123 goto out;
2124 }
2125 /* Now we have a set of chains with the same length. */
2126 /* 1. pre-sort according to def_type and operation. */
2127 for (unsigned lane = 0; lane < group_size; ++lane)
2128 chains[lane].stablesort (cmp: dt_sort_cmp, data: vinfo);
2129 if (dump_enabled_p ())
2130 {
2131 dump_printf_loc (MSG_NOTE, vect_location,
2132 "pre-sorted chains of %s\n",
2133 get_tree_code_name (code));
2134 for (unsigned lane = 0; lane < group_size; ++lane)
2135 {
2136 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2137 dump_printf (MSG_NOTE, "%s %T ",
2138 get_tree_code_name (chains[lane][opnum].code),
2139 chains[lane][opnum].op);
2140 dump_printf (MSG_NOTE, "\n");
2141 }
2142 }
2143 /* 2. try to build children nodes, associating as necessary. */
2144 for (unsigned n = 0; n < chain_len; ++n)
2145 {
2146 vect_def_type dt = chains[0][n].dt;
2147 unsigned lane;
2148 for (lane = 0; lane < group_size; ++lane)
2149 if (chains[lane][n].dt != dt)
2150 {
2151 if (dt == vect_constant_def
2152 && chains[lane][n].dt == vect_external_def)
2153 dt = vect_external_def;
2154 else if (dt == vect_external_def
2155 && chains[lane][n].dt == vect_constant_def)
2156 ;
2157 else
2158 break;
2159 }
2160 if (lane != group_size)
2161 {
2162 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_NOTE, vect_location,
2164 "giving up on chain due to mismatched "
2165 "def types\n");
2166 matches[lane] = false;
2167 if (lane != group_size - 1)
2168 matches[0] = false;
2169 goto out;
2170 }
2171 if (dt == vect_constant_def
2172 || dt == vect_external_def)
2173 {
2174 /* Check whether we can build the invariant. If we can't
2175 we never will be able to. */
2176 tree type = TREE_TYPE (chains[0][n].op);
2177 if (!GET_MODE_SIZE (mode: vinfo->vector_mode).is_constant ()
2178 && (TREE_CODE (type) == BOOLEAN_TYPE
2179 || !can_duplicate_and_interleave_p (vinfo, count: group_size,
2180 elt_type: type)))
2181 {
2182 matches[0] = false;
2183 goto out;
2184 }
2185 vec<tree> ops;
2186 ops.create (nelems: group_size);
2187 for (lane = 0; lane < group_size; ++lane)
2188 ops.quick_push (obj: chains[lane][n].op);
2189 slp_tree child = vect_create_new_slp_node (ops);
2190 SLP_TREE_DEF_TYPE (child) = dt;
2191 children.safe_push (obj: child);
2192 }
2193 else if (dt != vect_internal_def)
2194 {
	      /* Not sure; we might need something special here.
2196 gcc.dg/vect/pr96854.c,
2197 gfortran.dg/vect/fast-math-pr37021.f90
2198 and gfortran.dg/vect/pr61171.f trigger. */
2199 /* Soft-fail for now. */
2200 hard_fail = false;
2201 goto out;
2202 }
2203 else
2204 {
2205 vec<stmt_vec_info> op_stmts;
2206 op_stmts.create (nelems: group_size);
2207 slp_tree child = NULL;
2208 /* Brute-force our way. We have to consider a lane
2209 failing after fixing an earlier fail up in the
2210 SLP discovery recursion. So track the current
2211 permute per lane. */
2212 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2213 memset (s: perms, c: 0, n: sizeof (unsigned) * group_size);
2214 do
2215 {
2216 op_stmts.truncate (size: 0);
2217 for (lane = 0; lane < group_size; ++lane)
2218 op_stmts.quick_push
2219 (obj: vinfo->lookup_def (chains[lane][n].op));
2220 child = vect_build_slp_tree (vinfo, stmts: op_stmts,
2221 group_size, max_nunits: &this_max_nunits,
2222 matches, limit,
2223 tree_size: &this_tree_size, bst_map);
2224 /* ??? We're likely getting too many fatal mismatches
2225 here so maybe we want to ignore them (but then we
2226 have no idea which lanes fatally mismatched). */
2227 if (child || !matches[0])
2228 break;
2229 /* Swap another lane we have not yet matched up into
2230 lanes that did not match. If we run out of
2231 permute possibilities for a lane terminate the
2232 search. */
2233 bool term = false;
2234 for (lane = 1; lane < group_size; ++lane)
2235 if (!matches[lane])
2236 {
2237 if (n + perms[lane] + 1 == chain_len)
2238 {
2239 term = true;
2240 break;
2241 }
2242 std::swap (a&: chains[lane][n],
2243 b&: chains[lane][n + perms[lane] + 1]);
2244 perms[lane]++;
2245 }
2246 if (term)
2247 break;
2248 }
2249 while (1);
2250 if (!child)
2251 {
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "failed to match up op %d\n", n);
2255 op_stmts.release ();
2256 if (lane != group_size - 1)
2257 matches[0] = false;
2258 else
2259 matches[lane] = false;
2260 goto out;
2261 }
2262 if (dump_enabled_p ())
2263 {
2264 dump_printf_loc (MSG_NOTE, vect_location,
2265 "matched up op %d to\n", n);
2266 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2267 }
2268 children.safe_push (obj: child);
2269 }
2270 }
2271 /* 3. build SLP nodes to combine the chain. */
2272 for (unsigned lane = 0; lane < group_size; ++lane)
2273 if (chains[lane][0].code != code)
2274 {
2275 /* See if there's any alternate all-PLUS entry. */
2276 unsigned n;
2277 for (n = 1; n < chain_len; ++n)
2278 {
2279 for (lane = 0; lane < group_size; ++lane)
2280 if (chains[lane][n].code != code)
2281 break;
2282 if (lane == group_size)
2283 break;
2284 }
2285 if (n != chain_len)
2286 {
2287 /* Swap that in at first position. */
2288 std::swap (a&: children[0], b&: children[n]);
2289 for (lane = 0; lane < group_size; ++lane)
2290 std::swap (a&: chains[lane][0], b&: chains[lane][n]);
2291 }
2292 else
2293 {
2294 /* ??? When this triggers and we end up with two
2295 vect_constant/external_def up-front things break (ICE)
2296 spectacularly finding an insertion place for the
2297 all-constant op. We should have a fully
2298 vect_internal_def operand though(?) so we can swap
2299 that into first place and then prepend the all-zero
2300 constant. */
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "inserting constant zero to compensate "
2304 "for (partially) negated first "
2305 "operand\n");
2306 chain_len++;
2307 for (lane = 0; lane < group_size; ++lane)
2308 chains[lane].safe_insert
2309 (ix: 0, obj: chain_op_t (code, vect_constant_def, NULL_TREE));
2310 vec<tree> zero_ops;
2311 zero_ops.create (nelems: group_size);
2312 zero_ops.quick_push (obj: build_zero_cst (TREE_TYPE (vectype)));
2313 for (lane = 1; lane < group_size; ++lane)
2314 zero_ops.quick_push (obj: zero_ops[0]);
2315 slp_tree zero = vect_create_new_slp_node (ops: zero_ops);
2316 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2317 children.safe_insert (ix: 0, obj: zero);
2318 }
2319 break;
2320 }
2321 for (unsigned i = 1; i < children.length (); ++i)
2322 {
2323 slp_tree op0 = children[i - 1];
2324 slp_tree op1 = children[i];
2325 bool this_two_op = false;
2326 for (unsigned lane = 0; lane < group_size; ++lane)
2327 if (chains[lane][i].code != chains[0][i].code)
2328 {
2329 this_two_op = true;
2330 break;
2331 }
2332 slp_tree child;
2333 if (i == children.length () - 1)
2334 child = vect_create_new_slp_node (node, scalar_stmts: stmts, nops: 2);
2335 else
2336 child = vect_create_new_slp_node (nops: 2, code: ERROR_MARK);
2337 if (this_two_op)
2338 {
2339 vec<std::pair<unsigned, unsigned> > lperm;
2340 lperm.create (nelems: group_size);
2341 for (unsigned lane = 0; lane < group_size; ++lane)
2342 lperm.quick_push (obj: std::make_pair
2343 (x: chains[lane][i].code != chains[0][i].code, y&: lane));
2344 vect_slp_build_two_operator_nodes (perm: child, vectype, op0, op1,
2345 oper1: (chains[0][i].code == code
2346 ? op_stmt_info
2347 : other_op_stmt_info),
2348 oper2: (chains[0][i].code == code
2349 ? other_op_stmt_info
2350 : op_stmt_info),
2351 lperm);
2352 }
2353 else
2354 {
2355 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2356 SLP_TREE_VECTYPE (child) = vectype;
2357 SLP_TREE_LANES (child) = group_size;
2358 SLP_TREE_CHILDREN (child).quick_push (obj: op0);
2359 SLP_TREE_CHILDREN (child).quick_push (obj: op1);
2360 SLP_TREE_REPRESENTATIVE (child)
2361 = (chains[0][i].code == code
2362 ? op_stmt_info : other_op_stmt_info);
2363 }
2364 children[i] = child;
2365 }
2366 *tree_size += this_tree_size + 1;
2367 *max_nunits = this_max_nunits;
2368 while (!chains.is_empty ())
2369 chains.pop ().release ();
2370 return node;
2371 }
2372out:
2373 while (!children.is_empty ())
2374 vect_free_slp_tree (node: children.pop ());
2375 while (!chains.is_empty ())
2376 chains.pop ().release ();
2377 /* Hard-fail, otherwise we might run into quadratic processing of the
2378 chains starting one stmt into the chain again. */
2379 if (hard_fail)
2380 return NULL;
2381 /* Fall thru to normal processing. */
2382 }
2383
2384 /* Get at the operands, verifying they are compatible. */
2385 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2386 slp_oprnd_info oprnd_info;
2387 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2388 {
2389 int res = vect_get_and_check_slp_defs (vinfo, swap: swap[i], skip_args,
2390 stmts, stmt_num: i, oprnds_info: &oprnds_info);
2391 if (res != 0)
2392 matches[(res == -1) ? 0 : i] = false;
2393 if (!matches[0])
2394 break;
2395 }
2396 for (i = 0; i < group_size; ++i)
2397 if (!matches[i])
2398 {
2399 vect_free_oprnd_info (oprnds_info);
2400 return NULL;
2401 }
2402 swap = NULL;
2403
2404 auto_vec<slp_tree, 4> children;
2405
2406 stmt_info = stmts[0];
2407
2408 /* Create SLP_TREE nodes for the definition node/s. */
2409 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2410 {
2411 slp_tree child = nullptr;
2412 unsigned int j;
2413
2414 /* We're skipping certain operands from processing, for example
2415 outer loop reduction initial defs. */
2416 if (skip_args[i])
2417 {
2418 children.safe_push (NULL);
2419 continue;
2420 }
2421
2422 if (oprnd_info->first_dt == vect_uninitialized_def)
2423 {
	  /* COND_EXPRs have one operand too many when the condition
	     is an SSA name.  */
2426 gcc_assert (i == 3 && nops == 4);
2427 continue;
2428 }
2429
2430 if (is_a <bb_vec_info> (p: vinfo)
2431 && oprnd_info->first_dt == vect_internal_def
2432 && !oprnd_info->any_pattern)
2433 {
2434 /* For BB vectorization, if all defs are the same do not
2435 bother to continue the build along the single-lane
2436 graph but use a splat of the scalar value. */
2437 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2438 for (j = 1; j < group_size; ++j)
2439 if (oprnd_info->def_stmts[j] != first_def)
2440 break;
2441 if (j == group_size
2442 /* But avoid doing this for loads where we may be
2443 able to CSE things, unless the stmt is not
2444 vectorizable. */
2445 && (!STMT_VINFO_VECTORIZABLE (first_def)
2446 || !gimple_vuse (g: first_def->stmt)))
2447 {
2448 if (dump_enabled_p ())
2449 dump_printf_loc (MSG_NOTE, vect_location,
2450 "Using a splat of the uniform operand %G",
2451 first_def->stmt);
2452 oprnd_info->first_dt = vect_external_def;
2453 }
2454 }
2455
2456 if (oprnd_info->first_dt == vect_external_def
2457 || oprnd_info->first_dt == vect_constant_def)
2458 {
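	  /* For variable-length vectors a non-uniform invariant operand
	     can only be built if we can duplicate and interleave its
	     elements, so verify that first.  */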
2459 if (!GET_MODE_SIZE (mode: vinfo->vector_mode).is_constant ())
2460 {
2461 tree op0;
2462 tree uniform_val = op0 = oprnd_info->ops[0];
2463 for (j = 1; j < oprnd_info->ops.length (); ++j)
2464 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2465 {
2466 uniform_val = NULL_TREE;
2467 break;
2468 }
2469 if (!uniform_val
2470 && !can_duplicate_and_interleave_p (vinfo,
2471 count: oprnd_info->ops.length (),
2472 TREE_TYPE (op0)))
2473 {
2474 matches[j] = false;
2475 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2477 "Build SLP failed: invalid type of def "
2478 "for variable-length SLP %T\n", op0);
2479 goto fail;
2480 }
2481 }
2482 slp_tree invnode = vect_create_new_slp_node (ops: oprnd_info->ops);
2483 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2484 oprnd_info->ops = vNULL;
2485 children.safe_push (obj: invnode);
2486 continue;
2487 }
2488
2489 if ((child = vect_build_slp_tree (vinfo, stmts: oprnd_info->def_stmts,
2490 group_size, max_nunits: &this_max_nunits,
2491 matches, limit,
2492 tree_size: &this_tree_size, bst_map)) != NULL)
2493 {
2494 oprnd_info->def_stmts = vNULL;
2495 children.safe_push (obj: child);
2496 continue;
2497 }
2498
      /* If the SLP build for operand zero failed and operands zero
	 and one can be commuted, try that for the scalar stmts
	 that failed the match.  */
2502 if (i == 0
2503 /* A first scalar stmt mismatch signals a fatal mismatch. */
2504 && matches[0]
2505 /* ??? For COND_EXPRs we can swap the comparison operands
2506 as well as the arms under some constraints. */
2507 && nops == 2
2508 && oprnds_info[1]->first_dt == vect_internal_def
2509 && is_gimple_assign (gs: stmt_info->stmt)
2510 /* Swapping operands for reductions breaks assumptions later on. */
2511 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2512 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2513 {
2514 /* See whether we can swap the matching or the non-matching
2515 stmt operands. */
2516 bool swap_not_matching = true;
2517 do
2518 {
2519 for (j = 0; j < group_size; ++j)
2520 {
2521 if (matches[j] != !swap_not_matching)
2522 continue;
2523 stmt_vec_info stmt_info = stmts[j];
2524 /* Verify if we can swap operands of this stmt. */
2525 gassign *stmt = dyn_cast <gassign *> (p: stmt_info->stmt);
2526 if (!stmt
2527 || !commutative_tree_code (gimple_assign_rhs_code (gs: stmt)))
2528 {
2529 if (!swap_not_matching)
2530 goto fail;
2531 swap_not_matching = false;
2532 break;
2533 }
2534 }
2535 }
2536 while (j != group_size);
2537
2538 /* Swap mismatched definition stmts. */
2539 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_NOTE, vect_location,
2541 "Re-trying with swapped operands of stmts ");
2542 for (j = 0; j < group_size; ++j)
2543 if (matches[j] == !swap_not_matching)
2544 {
2545 std::swap (a&: oprnds_info[0]->def_stmts[j],
2546 b&: oprnds_info[1]->def_stmts[j]);
2547 std::swap (a&: oprnds_info[0]->ops[j],
2548 b&: oprnds_info[1]->ops[j]);
2549 if (dump_enabled_p ())
2550 dump_printf (MSG_NOTE, "%d ", j);
2551 }
2552 if (dump_enabled_p ())
2553 dump_printf (MSG_NOTE, "\n");
2554 /* After swapping some operands we lost track whether an
2555 operand has any pattern defs so be conservative here. */
2556 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2557 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2558 /* And try again with scratch 'matches' ... */
2559 bool *tem = XALLOCAVEC (bool, group_size);
2560 if ((child = vect_build_slp_tree (vinfo, stmts: oprnd_info->def_stmts,
2561 group_size, max_nunits: &this_max_nunits,
2562 matches: tem, limit,
2563 tree_size: &this_tree_size, bst_map)) != NULL)
2564 {
2565 oprnd_info->def_stmts = vNULL;
2566 children.safe_push (obj: child);
2567 continue;
2568 }
2569 }
2570fail:
2571
2572 /* If the SLP build failed and we analyze a basic-block
2573 simply treat nodes we fail to build as externally defined
2574 (and thus build vectors from the scalar defs).
2575 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
2577 fails (or we don't try permutation below). Ideally we'd
2578 even compute a permutation that will end up with the maximum
2579 SLP tree size... */
2580 if (is_a <bb_vec_info> (p: vinfo)
2581 /* ??? Rejecting patterns this way doesn't work. We'd have to
2582 do extra work to cancel the pattern so the uses see the
2583 scalar version. */
2584 && !is_pattern_stmt_p (stmt_info)
2585 && !oprnd_info->any_pattern)
2586 {
2587 /* But if there's a leading vector sized set of matching stmts
2588 fail here so we can split the group. This matches the condition
2589 vect_analyze_slp_instance uses. */
2590 /* ??? We might want to split here and combine the results to support
2591 multiple vector sizes better. */
2592 for (j = 0; j < group_size; ++j)
2593 if (!matches[j])
2594 break;
2595 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2596 {
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE, vect_location,
2599 "Building vector operands from scalars\n");
2600 this_tree_size++;
2601 child = vect_create_new_slp_node (ops: oprnd_info->ops);
2602 children.safe_push (obj: child);
2603 oprnd_info->ops = vNULL;
2604 continue;
2605 }
2606 }
2607
2608 gcc_assert (child == NULL);
2609 FOR_EACH_VEC_ELT (children, j, child)
2610 if (child)
2611 vect_free_slp_tree (node: child);
2612 vect_free_oprnd_info (oprnds_info);
2613 return NULL;
2614 }
2615
2616 vect_free_oprnd_info (oprnds_info);
2617
  /* If all children of this node are built up from uniform scalars,
     or if building it requires more than one possibly expensive
     vector construction, just throw it away so it is instead built
     up from scalars.  The exception is the SLP node for the vector
     store.  */
2622 if (is_a <bb_vec_info> (p: vinfo)
2623 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2624 /* ??? Rejecting patterns this way doesn't work. We'd have to
2625 do extra work to cancel the pattern so the uses see the
2626 scalar version. */
2627 && !is_pattern_stmt_p (stmt_info))
2628 {
2629 slp_tree child;
2630 unsigned j;
2631 bool all_uniform_p = true;
2632 unsigned n_vector_builds = 0;
2633 FOR_EACH_VEC_ELT (children, j, child)
2634 {
2635 if (!child)
2636 ;
2637 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2638 all_uniform_p = false;
2639 else if (!vect_slp_tree_uniform_p (node: child))
2640 {
2641 all_uniform_p = false;
2642 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2643 n_vector_builds++;
2644 }
2645 }
2646 if (all_uniform_p
2647 || n_vector_builds > 1
2648 || (n_vector_builds == children.length ()
2649 && is_a <gphi *> (p: stmt_info->stmt)))
2650 {
2651 /* Roll back. */
2652 matches[0] = false;
2653 FOR_EACH_VEC_ELT (children, j, child)
2654 if (child)
2655 vect_free_slp_tree (node: child);
2656
2657 if (dump_enabled_p ())
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "Building parent vector operands from "
2660 "scalars instead\n");
2661 return NULL;
2662 }
2663 }
2664
2665 *tree_size += this_tree_size + 1;
2666 *max_nunits = this_max_nunits;
2667
2668 if (two_operators)
2669 {
2670 /* ??? We'd likely want to either cache in bst_map sth like
2671 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2672 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2673 explicit stmts to put in so the keying on 'stmts' doesn't
2674 work (but we have the same issue with nodes that use 'ops'). */
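      /* For example, with scalar stmts { a0+b0, a1-b1, a2+b2, a3-b3 }
	 node ONE computes all lanes with the first code (plus) and node
	 TWO with the alternate code (minus); the VEC_PERM_EXPR node built
	 here blends them using the lane permutation { 0[0] 1[1] 0[2] 1[3] },
	 i.e. lane i is taken from ONE for the '+' lanes and from TWO for
	 the '-' lanes.  */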
2675 slp_tree one = new _slp_tree;
2676 slp_tree two = new _slp_tree;
2677 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2678 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2679 SLP_TREE_VECTYPE (one) = vectype;
2680 SLP_TREE_VECTYPE (two) = vectype;
2681 SLP_TREE_CHILDREN (one).safe_splice (src: children);
2682 SLP_TREE_CHILDREN (two).safe_splice (src: children);
2683 slp_tree child;
2684 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2685 SLP_TREE_REF_COUNT (child)++;
2686
2687 /* Here we record the original defs since this
2688 node represents the final lane configuration. */
2689 node = vect_create_new_slp_node (node, scalar_stmts: stmts, nops: 2);
2690 SLP_TREE_VECTYPE (node) = vectype;
2691 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2692 SLP_TREE_CHILDREN (node).quick_push (obj: one);
2693 SLP_TREE_CHILDREN (node).quick_push (obj: two);
2694 gassign *stmt = as_a <gassign *> (p: stmts[0]->stmt);
2695 enum tree_code code0 = gimple_assign_rhs_code (gs: stmt);
2696 enum tree_code ocode = ERROR_MARK;
2697 stmt_vec_info ostmt_info;
2698 unsigned j = 0;
2699 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2700 {
2701 gassign *ostmt = as_a <gassign *> (p: ostmt_info->stmt);
2702 if (gimple_assign_rhs_code (gs: ostmt) != code0)
2703 {
2704 SLP_TREE_LANE_PERMUTATION (node).safe_push (obj: std::make_pair (x: 1, y&: i));
2705 ocode = gimple_assign_rhs_code (gs: ostmt);
2706 j = i;
2707 }
2708 else
2709 SLP_TREE_LANE_PERMUTATION (node).safe_push (obj: std::make_pair (x: 0, y&: i));
2710 }
2711 SLP_TREE_CODE (one) = code0;
2712 SLP_TREE_CODE (two) = ocode;
2713 SLP_TREE_LANES (one) = stmts.length ();
2714 SLP_TREE_LANES (two) = stmts.length ();
2715 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2716 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2717 return node;
2718 }
2719
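  /* Finally create the internal SLP node for the group, recording the
     scalar stmts and wiring up the discovered children.  */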
2720 node = vect_create_new_slp_node (node, scalar_stmts: stmts, nops);
2721 SLP_TREE_VECTYPE (node) = vectype;
2722 SLP_TREE_CHILDREN (node).splice (src: children);
2723 return node;
2724}
2725
2726/* Dump a single SLP tree NODE. */
2727
2728static void
2729vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2730 slp_tree node)
2731{
2732 unsigned i, j;
2733 slp_tree child;
2734 stmt_vec_info stmt_info;
2735 tree op;
2736
2737 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2738 dump_user_location_t user_loc = loc.get_user_location ();
2739 dump_printf_loc (metadata, user_loc,
2740 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2741 ", refcnt=%u)",
2742 SLP_TREE_DEF_TYPE (node) == vect_external_def
2743 ? " (external)"
2744 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2745 ? " (constant)"
2746 : ""), (void *) node,
2747 estimated_poly_value (x: node->max_nunits),
2748 SLP_TREE_REF_COUNT (node));
2749 if (SLP_TREE_VECTYPE (node))
2750 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2751 dump_printf (metadata, "\n");
2752 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2753 {
2754 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2755 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2756 else
2757 dump_printf_loc (metadata, user_loc, "op template: %G",
2758 SLP_TREE_REPRESENTATIVE (node)->stmt);
2759 }
2760 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2761 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2762 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2763 else
2764 {
2765 dump_printf_loc (metadata, user_loc, "\t{ ");
2766 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2767 dump_printf (metadata, "%T%s ", op,
2768 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2769 dump_printf (metadata, "}\n");
2770 }
2771 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2772 {
2773 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2774 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2775 dump_printf (dump_kind, " %u", j);
2776 dump_printf (dump_kind, " }\n");
2777 }
2778 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2779 {
2780 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2781 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2782 dump_printf (dump_kind, " %u[%u]",
2783 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2784 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2785 dump_printf (dump_kind, " }\n");
2786 }
2787 if (SLP_TREE_CHILDREN (node).is_empty ())
2788 return;
2789 dump_printf_loc (metadata, user_loc, "\tchildren");
2790 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2791 dump_printf (dump_kind, " %p", (void *)child);
2792 dump_printf (dump_kind, "\n");
2793}
2794
2795DEBUG_FUNCTION void
2796debug (slp_tree node)
2797{
2798 debug_dump_context ctx;
2799 vect_print_slp_tree (dump_kind: MSG_NOTE,
2800 loc: dump_location_t::from_location_t (UNKNOWN_LOCATION),
2801 node);
2802}
2803
2804/* Recursive helper for the dot producer below. */
2805
2806static void
2807dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2808{
2809 if (visited.add (k: node))
2810 return;
2811
2812 fprintf (stream: f, format: "\"%p\" [label=\"", (void *)node);
2813 vect_print_slp_tree (dump_kind: MSG_NOTE,
2814 loc: dump_location_t::from_location_t (UNKNOWN_LOCATION),
2815 node);
2816 fprintf (stream: f, format: "\"];\n");
2817
2818
2819 for (slp_tree child : SLP_TREE_CHILDREN (node))
2820 fprintf (stream: f, format: "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2821
2822 for (slp_tree child : SLP_TREE_CHILDREN (node))
2823 if (child)
2824 dot_slp_tree (f, node: child, visited);
2825}
2826
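/* Write the SLP graph rooted at NODE in graphviz dot format to the file
   FNAME, e.g. for rendering with 'dot -Tpdf'.  */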
2827DEBUG_FUNCTION void
2828dot_slp_tree (const char *fname, slp_tree node)
2829{
2830 FILE *f = fopen (filename: fname, modes: "w");
2831 fprintf (stream: f, format: "digraph {\n");
2832 fflush (stream: f);
2833 {
2834 debug_dump_context ctx (f);
2835 hash_set<slp_tree> visited;
2836 dot_slp_tree (f, node, visited);
2837 }
2838 fflush (stream: f);
2839 fprintf (stream: f, format: "}\n");
2840 fclose (stream: f);
2841}
2842
2843/* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2844
2845static void
2846vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2847 slp_tree node, hash_set<slp_tree> &visited)
2848{
2849 unsigned i;
2850 slp_tree child;
2851
2852 if (visited.add (k: node))
2853 return;
2854
2855 vect_print_slp_tree (dump_kind, loc, node);
2856
2857 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2858 if (child)
2859 vect_print_slp_graph (dump_kind, loc, node: child, visited);
2860}
2861
2862static void
2863vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2864 slp_tree entry)
2865{
2866 hash_set<slp_tree> visited;
2867 vect_print_slp_graph (dump_kind, loc, node: entry, visited);
2868}
2869
2870/* Mark the tree rooted at NODE with PURE_SLP. */
2871
2872static void
2873vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2874{
2875 int i;
2876 stmt_vec_info stmt_info;
2877 slp_tree child;
2878
2879 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2880 return;
2881
2882 if (visited.add (k: node))
2883 return;
2884
2885 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2886 STMT_SLP_TYPE (stmt_info) = pure_slp;
2887
2888 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2889 if (child)
2890 vect_mark_slp_stmts (node: child, visited);
2891}
2892
2893static void
2894vect_mark_slp_stmts (slp_tree node)
2895{
2896 hash_set<slp_tree> visited;
2897 vect_mark_slp_stmts (node, visited);
2898}
2899
2900/* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2901
2902static void
2903vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2904{
2905 int i;
2906 stmt_vec_info stmt_info;
2907 slp_tree child;
2908
2909 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2910 return;
2911
2912 if (visited.add (k: node))
2913 return;
2914
2915 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2916 {
2917 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2918 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2919 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2920 }
2921
2922 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2923 if (child)
2924 vect_mark_slp_stmts_relevant (node: child, visited);
2925}
2926
2927static void
2928vect_mark_slp_stmts_relevant (slp_tree node)
2929{
2930 hash_set<slp_tree> visited;
2931 vect_mark_slp_stmts_relevant (node, visited);
2932}
2933
2934
/* Gather loads in the SLP graph NODE and populate the LOADS array.  */
2936
2937static void
2938vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2939 hash_set<slp_tree> &visited)
2940{
2941 if (!node || visited.add (k: node))
2942 return;
2943
2944 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2945 return;
2946
2947 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2948 {
2949 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2950 if (STMT_VINFO_DATA_REF (stmt_info)
2951 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2952 loads.safe_push (obj: node);
2953 }
2954
2955 unsigned i;
2956 slp_tree child;
2957 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2958 vect_gather_slp_loads (loads, node: child, visited);
2959}
2960
2961
/* Find the last scalar stmt in NODE; for the root of a store SLP
   instance this is the last store.  */
2963
2964stmt_vec_info
2965vect_find_last_scalar_stmt_in_slp (slp_tree node)
2966{
2967 stmt_vec_info last = NULL;
2968 stmt_vec_info stmt_vinfo;
2969
2970 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (ix: i, ptr: &stmt_vinfo); i++)
2971 {
2972 stmt_vinfo = vect_orig_stmt (stmt_info: stmt_vinfo);
2973 last = last ? get_later_stmt (stmt1_info: stmt_vinfo, stmt2_info: last) : stmt_vinfo;
2974 }
2975
2976 return last;
2977}
2978
2979/* Find the first stmt in NODE. */
2980
2981stmt_vec_info
2982vect_find_first_scalar_stmt_in_slp (slp_tree node)
2983{
2984 stmt_vec_info first = NULL;
2985 stmt_vec_info stmt_vinfo;
2986
2987 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (ix: i, ptr: &stmt_vinfo); i++)
2988 {
2989 stmt_vinfo = vect_orig_stmt (stmt_info: stmt_vinfo);
2990 if (!first
2991 || get_later_stmt (stmt1_info: stmt_vinfo, stmt2_info: first) == first)
2992 first = stmt_vinfo;
2993 }
2994
2995 return first;
2996}
2997
2998/* Splits a group of stores, currently beginning at FIRST_VINFO, into
2999 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3000 (also containing the first GROUP1_SIZE stmts, since stores are
3001 consecutive), the second containing the remainder.
3002 Return the first stmt in the second group. */
3003
3004static stmt_vec_info
3005vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3006{
3007 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3008 gcc_assert (group1_size > 0);
3009 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3010 gcc_assert (group2_size > 0);
3011 DR_GROUP_SIZE (first_vinfo) = group1_size;
3012
3013 stmt_vec_info stmt_info = first_vinfo;
3014 for (unsigned i = group1_size; i > 1; i--)
3015 {
3016 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3017 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3018 }
3019 /* STMT is now the last element of the first group. */
3020 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3021 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3022
3023 DR_GROUP_SIZE (group2) = group2_size;
3024 for (stmt_info = group2; stmt_info;
3025 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3026 {
3027 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3028 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3029 }
3030
3031 /* For the second group, the DR_GROUP_GAP is that before the original group,
3032 plus skipping over the first vector. */
3033 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3034
3035 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3036 DR_GROUP_GAP (first_vinfo) += group2_size;
3037
3038 if (dump_enabled_p ())
3039 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3040 group1_size, group2_size);
3041
3042 return group2;
3043}
3044
3045/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3046 statements and a vector of NUNITS elements. */
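/* For example, a group of 3 stores with 4-element vectors covers
   common_multiple (4, 3) = 12 lanes and thus needs an unrolling
   factor of 12 / 3 = 4.  */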
3047
3048static poly_uint64
3049calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3050{
3051 return exact_div (a: common_multiple (a: nunits, b: group_size), b: group_size);
3052}
3053
3054/* Helper that checks to see if a node is a load node. */
3055
3056static inline bool
3057vect_is_slp_load_node (slp_tree root)
3058{
3059 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3060 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3061 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3062}
3063
3064
3065/* Helper function of optimize_load_redistribution that performs the operation
3066 recursively. */
3067
3068static slp_tree
3069optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3070 vec_info *vinfo, unsigned int group_size,
3071 hash_map<slp_tree, slp_tree> *load_map,
3072 slp_tree root)
3073{
3074 if (slp_tree *leader = load_map->get (k: root))
3075 return *leader;
3076
3077 slp_tree node;
3078 unsigned i;
3079
3080 /* For now, we don't know anything about externals so do not do anything. */
3081 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3082 return NULL;
3083 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3084 {
      /* First convert this node into a load node and add it to the leaves
	 list, flattening the permute from a lane permutation into a load
	 permutation.  If it's unneeded it will be elided later.  */
3088 vec<stmt_vec_info> stmts;
3089 stmts.create (SLP_TREE_LANES (root));
3090 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3091 for (unsigned j = 0; j < lane_perm.length (); j++)
3092 {
3093 std::pair<unsigned, unsigned> perm = lane_perm[j];
3094 node = SLP_TREE_CHILDREN (root)[perm.first];
3095
3096 if (!vect_is_slp_load_node (root: node)
3097 || SLP_TREE_CHILDREN (node).exists ())
3098 {
3099 stmts.release ();
3100 goto next;
3101 }
3102
3103 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3104 }
3105
3106 if (dump_enabled_p ())
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "converting stmts on permute node %p\n",
3109 (void *) root);
3110
3111 bool *matches = XALLOCAVEC (bool, group_size);
3112 poly_uint64 max_nunits = 1;
3113 unsigned tree_size = 0, limit = 1;
3114 node = vect_build_slp_tree (vinfo, stmts, group_size, max_nunits: &max_nunits,
3115 matches, limit: &limit, tree_size: &tree_size, bst_map);
3116 if (!node)
3117 stmts.release ();
3118
3119 load_map->put (k: root, v: node);
3120 return node;
3121 }
3122
3123next:
3124 load_map->put (k: root, NULL);
3125
3126 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3127 {
3128 slp_tree value
3129 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3130 root: node);
3131 if (value)
3132 {
3133 SLP_TREE_REF_COUNT (value)++;
3134 SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leaves of the replaced nodes will
3136 be referenced by bst_map, only the permutes created by
3137 pattern matching are not. */
3138 if (SLP_TREE_REF_COUNT (node) == 1)
3139 load_map->remove (k: node);
3140 vect_free_slp_tree (node);
3141 }
3142 }
3143
3144 return NULL;
3145}
3146
3147/* Temporary workaround for loads not being CSEd during SLP build. This
3148 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3149 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3150 same DR such that the final operation is equal to a permuted load. Such
3151 NODES are then directly converted into LOADS themselves. The nodes are
3152 CSEd using BST_MAP. */
3153
3154static void
3155optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3156 vec_info *vinfo, unsigned int group_size,
3157 hash_map<slp_tree, slp_tree> *load_map,
3158 slp_tree root)
3159{
3160 slp_tree node;
3161 unsigned i;
3162
3163 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3164 {
3165 slp_tree value
3166 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3167 root: node);
3168 if (value)
3169 {
3170 SLP_TREE_REF_COUNT (value)++;
3171 SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leaves of the replaced nodes will
3173 be referenced by bst_map, only the permutes created by
3174 pattern matching are not. */
3175 if (SLP_TREE_REF_COUNT (node) == 1)
3176 load_map->remove (k: node);
3177 vect_free_slp_tree (node);
3178 }
3179 }
3180}
3181
3182/* Helper function of vect_match_slp_patterns.
3183
3184 Attempts to match patterns against the slp tree rooted in REF_NODE using
3185 VINFO. Patterns are matched in post-order traversal.
3186
   If matching is successful the node referenced by REF_NODE is updated in
   place.  Return true if any pattern was matched, false otherwise.  */
3189
3190static bool
3191vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3192 slp_tree_to_load_perm_map_t *perm_cache,
3193 slp_compat_nodes_map_t *compat_cache,
3194 hash_set<slp_tree> *visited)
3195{
3196 unsigned i;
3197 slp_tree node = *ref_node;
3198 bool found_p = false;
3199 if (!node || visited->add (k: node))
3200 return false;
3201
3202 slp_tree child;
3203 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3204 found_p |= vect_match_slp_patterns_2 (ref_node: &SLP_TREE_CHILDREN (node)[i],
3205 vinfo, perm_cache, compat_cache,
3206 visited);
3207
3208 for (unsigned x = 0; x < num__slp_patterns; x++)
3209 {
3210 vect_pattern *pattern
3211 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3212 if (pattern)
3213 {
3214 pattern->build (vinfo);
3215 delete pattern;
3216 found_p = true;
3217 }
3218 }
3219
3220 return found_p;
3221}
3222
3223/* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3224 vec_info VINFO.
3225
   Patterns are tried in order and multiple patterns may match; the tree
   rooted in the instance is modified in place.  Return true if any
   pattern matched.  */
3228
3229static bool
3230vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3231 hash_set<slp_tree> *visited,
3232 slp_tree_to_load_perm_map_t *perm_cache,
3233 slp_compat_nodes_map_t *compat_cache)
3234{
3235 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3236 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3237
3238 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_NOTE, vect_location,
3240 "Analyzing SLP tree %p for patterns\n",
3241 (void *) SLP_INSTANCE_TREE (instance));
3242
3243 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3244 visited);
3245}
3246
3247/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3248 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3249 Return true if we could use IFN_STORE_LANES instead and if that appears
3250 to be the better approach. */
3251
3252static bool
3253vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3254 unsigned int group_size,
3255 unsigned int new_group_size)
3256{
3257 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3258 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3259 if (!vectype)
3260 return false;
3261 /* Allow the split if one of the two new groups would operate on full
3262 vectors *within* rather than across one scalar loop iteration.
3263 This is purely a heuristic, but it should work well for group
3264 sizes of 3 and 4, where the possible splits are:
3265
3266 3->2+1: OK if the vector has exactly two elements
3267 4->2+2: Likewise
3268 4->3+1: Less clear-cut. */
3269 if (multiple_p (a: group_size - new_group_size, b: TYPE_VECTOR_SUBPARTS (node: vectype))
3270 || multiple_p (a: new_group_size, b: TYPE_VECTOR_SUBPARTS (node: vectype)))
3271 return false;
3272 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3273}
3274
3275/* Analyze an SLP instance starting from a group of grouped stores. Call
3276 vect_build_slp_tree to build a tree of packed stmts if possible.
3277 Return FALSE if it's impossible to SLP any stmt in the loop. */
3278
3279static bool
3280vect_analyze_slp_instance (vec_info *vinfo,
3281 scalar_stmts_to_slp_tree_map_t *bst_map,
3282 stmt_vec_info stmt_info, slp_instance_kind kind,
3283 unsigned max_tree_size, unsigned *limit);
3284
3285/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3286 of KIND. Return true if successful. */
3287
3288static bool
3289vect_build_slp_instance (vec_info *vinfo,
3290 slp_instance_kind kind,
3291 vec<stmt_vec_info> &scalar_stmts,
3292 vec<stmt_vec_info> &root_stmt_infos,
3293 vec<tree> &remain,
3294 unsigned max_tree_size, unsigned *limit,
3295 scalar_stmts_to_slp_tree_map_t *bst_map,
3296 /* ??? We need stmt_info for group splitting. */
3297 stmt_vec_info stmt_info_)
3298{
3299 if (kind == slp_inst_kind_ctor)
3300 {
3301 if (dump_enabled_p ())
3302 dump_printf_loc (MSG_NOTE, vect_location,
3303 "Analyzing vectorizable constructor: %G\n",
3304 root_stmt_infos[0]->stmt);
3305 }
3306
3307 if (dump_enabled_p ())
3308 {
3309 dump_printf_loc (MSG_NOTE, vect_location,
3310 "Starting SLP discovery for\n");
3311 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3312 dump_printf_loc (MSG_NOTE, vect_location,
3313 " %G", scalar_stmts[i]->stmt);
3314 }
3315
3316 /* Build the tree for the SLP instance. */
3317 unsigned int group_size = scalar_stmts.length ();
3318 bool *matches = XALLOCAVEC (bool, group_size);
3319 poly_uint64 max_nunits = 1;
3320 unsigned tree_size = 0;
3321 unsigned i;
3322 slp_tree node = vect_build_slp_tree (vinfo, stmts: scalar_stmts, group_size,
3323 max_nunits: &max_nunits, matches, limit,
3324 tree_size: &tree_size, bst_map);
3325 if (node != NULL)
3326 {
3327 /* Calculate the unrolling factor based on the smallest type. */
3328 poly_uint64 unrolling_factor
3329 = calculate_unrolling_factor (nunits: max_nunits, group_size);
3330
3331 if (maybe_ne (a: unrolling_factor, b: 1U)
3332 && is_a <bb_vec_info> (p: vinfo))
3333 {
3334 unsigned HOST_WIDE_INT const_max_nunits;
3335 if (!max_nunits.is_constant (const_value: &const_max_nunits)
3336 || const_max_nunits > group_size)
3337 {
3338 if (dump_enabled_p ())
3339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3340 "Build SLP failed: store group "
3341 "size not a multiple of the vector size "
3342 "in basic block SLP\n");
3343 vect_free_slp_tree (node);
3344 return false;
3345 }
3346 /* Fatal mismatch. */
3347 if (dump_enabled_p ())
3348 dump_printf_loc (MSG_NOTE, vect_location,
3349 "SLP discovery succeeded but node needs "
3350 "splitting\n");
3351 memset (s: matches, c: true, n: group_size);
3352 matches[group_size / const_max_nunits * const_max_nunits] = false;
3353 vect_free_slp_tree (node);
3354 }
3355 else
3356 {
3357 /* Create a new SLP instance. */
3358 slp_instance new_instance = XNEW (class _slp_instance);
3359 SLP_INSTANCE_TREE (new_instance) = node;
3360 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3361 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3362 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3363 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3364 SLP_INSTANCE_KIND (new_instance) = kind;
3365 new_instance->reduc_phis = NULL;
3366 new_instance->cost_vec = vNULL;
3367 new_instance->subgraph_entries = vNULL;
3368
3369 if (dump_enabled_p ())
3370 dump_printf_loc (MSG_NOTE, vect_location,
3371 "SLP size %u vs. limit %u.\n",
3372 tree_size, max_tree_size);
3373
3374 /* Fixup SLP reduction chains. */
3375 if (kind == slp_inst_kind_reduc_chain)
3376 {
3377 /* If this is a reduction chain with a conversion in front
3378 amend the SLP tree with a node for that. */
3379 gimple *scalar_def
3380 = vect_orig_stmt (stmt_info: scalar_stmts[group_size - 1])->stmt;
3381 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3382 {
3383 /* Get at the conversion stmt - we know it's the single use
3384 of the last stmt of the reduction chain. */
3385 use_operand_p use_p;
3386 bool r = single_imm_use (var: gimple_assign_lhs (gs: scalar_def),
3387 use_p: &use_p, stmt: &scalar_def);
3388 gcc_assert (r);
3389 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3390 next_info = vect_stmt_to_vectorize (stmt_info: next_info);
3391 scalar_stmts = vNULL;
3392 scalar_stmts.create (nelems: group_size);
3393 for (unsigned i = 0; i < group_size; ++i)
3394 scalar_stmts.quick_push (obj: next_info);
3395 slp_tree conv = vect_create_new_slp_node (scalar_stmts, nops: 1);
3396 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3397 SLP_TREE_CHILDREN (conv).quick_push (obj: node);
3398 SLP_INSTANCE_TREE (new_instance) = conv;
3399 /* We also have to fake this conversion stmt as SLP reduction
3400 group so we don't have to mess with too much code
3401 elsewhere. */
3402 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3403 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3404 }
3405 /* Fill the backedge child of the PHI SLP node. The
3406 general matching code cannot find it because the
3407 scalar code does not reflect how we vectorize the
3408 reduction. */
3409 use_operand_p use_p;
3410 imm_use_iterator imm_iter;
3411 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3412 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3413 gimple_get_lhs (scalar_def))
3414 /* There are exactly two non-debug uses, the reduction
3415 PHI and the loop-closed PHI node. */
3416 if (!is_gimple_debug (USE_STMT (use_p))
3417 && gimple_bb (USE_STMT (use_p)) == loop->header)
3418 {
3419 auto_vec<stmt_vec_info, 64> phis (group_size);
3420 stmt_vec_info phi_info
3421 = vinfo->lookup_stmt (USE_STMT (use_p));
3422 for (unsigned i = 0; i < group_size; ++i)
3423 phis.quick_push (obj: phi_info);
3424 slp_tree *phi_node = bst_map->get (k: phis);
3425 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3426 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3427 = SLP_INSTANCE_TREE (new_instance);
3428 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3429 }
3430 }
3431
3432 vinfo->slp_instances.safe_push (obj: new_instance);
3433
3434 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3435 the number of scalar stmts in the root in a few places.
3436 Verify that assumption holds. */
3437 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3438 .length () == group_size);
3439
3440 if (dump_enabled_p ())
3441 {
3442 dump_printf_loc (MSG_NOTE, vect_location,
3443 "Final SLP tree for instance %p:\n",
3444 (void *) new_instance);
3445 vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location,
3446 SLP_INSTANCE_TREE (new_instance));
3447 }
3448
3449 return true;
3450 }
3451 }
3452 else
3453 {
3454 /* Failed to SLP. */
3455 /* Free the allocated memory. */
3456 scalar_stmts.release ();
3457 }
3458
3459 stmt_vec_info stmt_info = stmt_info_;
3460 /* Try to break the group up into pieces. */
3461 if (kind == slp_inst_kind_store)
3462 {
3463 /* ??? We could delay all the actual splitting of store-groups
3464 until after SLP discovery of the original group completed.
3465 Then we can recurse to vect_build_slp_instance directly. */
3466 for (i = 0; i < group_size; i++)
3467 if (!matches[i])
3468 break;
3469
3470 /* For basic block SLP, try to break the group up into multiples of
3471 a vector size. */
3472 if (is_a <bb_vec_info> (p: vinfo)
3473 && (i > 1 && i < group_size))
3474 {
3475 tree scalar_type
3476 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3477 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3478 1 << floor_log2 (x: i));
3479 unsigned HOST_WIDE_INT const_nunits;
3480 if (vectype
3481 && TYPE_VECTOR_SUBPARTS (node: vectype).is_constant (const_value: &const_nunits))
3482 {
3483 /* Split into two groups at the first vector boundary. */
3484 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3485 unsigned group1_size = i & ~(const_nunits - 1);
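	      /* For example (hypothetical numbers): if the first mismatched
		 lane is I == 6 and CONST_NUNITS == 4, then
		 GROUP1_SIZE == (6 & ~3) == 4, so the first subgroup covers
		 exactly one vector and the remaining lanes are split off and
		 re-analyzed below.  */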
3486
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "Splitting SLP group at stmt %u\n", i);
3490 stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info,
3491 group1_size);
3492 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3493 kind, max_tree_size,
3494 limit);
3495 /* Split the rest at the failure point and possibly
3496 re-analyze the remaining matching part if it has
3497 at least two lanes. */
3498 if (group1_size < i
3499 && (i + 1 < group_size
3500 || i - group1_size > 1))
3501 {
3502 stmt_vec_info rest2 = rest;
3503 rest = vect_split_slp_store_group (first_vinfo: rest, group1_size: i - group1_size);
3504 if (i - group1_size > 1)
3505 res |= vect_analyze_slp_instance (vinfo, bst_map, stmt_info: rest2,
3506 kind, max_tree_size,
3507 limit);
3508 }
3509 /* Re-analyze the non-matching tail if it has at least
3510 two lanes. */
3511 if (i + 1 < group_size)
3512 res |= vect_analyze_slp_instance (vinfo, bst_map,
3513 stmt_info: rest, kind, max_tree_size,
3514 limit);
3515 return res;
3516 }
3517 }
3518
3519 /* For loop vectorization split into arbitrary pieces of size > 1. */
3520 if (is_a <loop_vec_info> (p: vinfo)
3521 && (i > 1 && i < group_size)
3522 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, new_group_size: i))
3523 {
3524 unsigned group1_size = i;
3525
3526 if (dump_enabled_p ())
3527 dump_printf_loc (MSG_NOTE, vect_location,
3528 "Splitting SLP group at stmt %u\n", i);
3529
3530 stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info,
3531 group1_size);
3532	  /* Loop vectorization cannot handle gaps in stores; make sure
3533 the split group appears as strided. */
3534 STMT_VINFO_STRIDED_P (rest) = 1;
3535 DR_GROUP_GAP (rest) = 0;
3536 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3537 DR_GROUP_GAP (stmt_info) = 0;
3538
3539 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3540 kind, max_tree_size, limit);
3541 if (i + 1 < group_size)
3542 res |= vect_analyze_slp_instance (vinfo, bst_map,
3543 stmt_info: rest, kind, max_tree_size, limit);
3544
3545 return res;
3546 }
3547
3548      /* Even though not all lanes of the first vector matched, we might be
3549	 able to SLP (some) of the remainder.  FORNOW ignore this possibility.  */
3550 }
3551
3552 /* Failed to SLP. */
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3555 return false;
3556}
3557
3558
3559/* Analyze an SLP instance starting from a group of grouped stores. Call
3560 vect_build_slp_tree to build a tree of packed stmts if possible.
3561 Return FALSE if it's impossible to SLP any stmt in the loop. */
3562
3563static bool
3564vect_analyze_slp_instance (vec_info *vinfo,
3565 scalar_stmts_to_slp_tree_map_t *bst_map,
3566 stmt_vec_info stmt_info,
3567 slp_instance_kind kind,
3568 unsigned max_tree_size, unsigned *limit)
3569{
3570 unsigned int i;
3571 vec<stmt_vec_info> scalar_stmts;
3572
3573 if (is_a <bb_vec_info> (p: vinfo))
3574 vect_location = stmt_info->stmt;
3575
3576 stmt_vec_info next_info = stmt_info;
3577 if (kind == slp_inst_kind_store)
3578 {
3579 /* Collect the stores and store them in scalar_stmts. */
3580 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3581 while (next_info)
3582 {
3583 scalar_stmts.quick_push (obj: vect_stmt_to_vectorize (stmt_info: next_info));
3584 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3585 }
3586 }
3587 else if (kind == slp_inst_kind_reduc_chain)
3588 {
3589 /* Collect the reduction stmts and store them in scalar_stmts. */
3590 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3591 while (next_info)
3592 {
3593 scalar_stmts.quick_push (obj: vect_stmt_to_vectorize (stmt_info: next_info));
3594 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3595 }
3596 /* Mark the first element of the reduction chain as reduction to properly
3597 transform the node. In the reduction analysis phase only the last
3598 element of the chain is marked as reduction. */
3599 STMT_VINFO_DEF_TYPE (stmt_info)
3600 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3601 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3602 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3603 }
3604 else if (kind == slp_inst_kind_reduc_group)
3605 {
3606 /* Collect reduction statements. */
3607 const vec<stmt_vec_info> &reductions
3608 = as_a <loop_vec_info> (p: vinfo)->reductions;
3609 scalar_stmts.create (nelems: reductions.length ());
3610 for (i = 0; reductions.iterate (ix: i, ptr: &next_info); i++)
3611 if ((STMT_VINFO_RELEVANT_P (next_info)
3612 || STMT_VINFO_LIVE_P (next_info))
3613 /* ??? Make sure we didn't skip a conversion around a reduction
3614 path. In that case we'd have to reverse engineer that conversion
3615 stmt following the chain using reduc_idx and from the PHI
3616 using reduc_def. */
3617 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3618 scalar_stmts.quick_push (obj: next_info);
3619 /* If less than two were relevant/live there's nothing to SLP. */
3620 if (scalar_stmts.length () < 2)
3621 return false;
3622 }
3623 else
3624 gcc_unreachable ();
3625
3626 vec<stmt_vec_info> roots = vNULL;
3627 vec<tree> remain = vNULL;
3628 /* Build the tree for the SLP instance. */
3629 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3630 root_stmt_infos&: roots, remain,
3631 max_tree_size, limit, bst_map,
3632 stmt_info_: kind == slp_inst_kind_store
3633 ? stmt_info : NULL);
3634
3635 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3636 where we should do store group splitting. */
3637
3638 return res;
3639}
3640
3641/* Check if there are stmts in the loop that can be vectorized using SLP.
3642   Build SLP trees of packed scalar stmts if SLP is possible.  */
3643
3644opt_result
3645vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3646{
3647 unsigned int i;
3648 stmt_vec_info first_element;
3649 slp_instance instance;
3650
3651 DUMP_VECT_SCOPE ("vect_analyze_slp");
3652
3653 unsigned limit = max_tree_size;
3654
3655 scalar_stmts_to_slp_tree_map_t *bst_map
3656 = new scalar_stmts_to_slp_tree_map_t ();
3657
3658 /* Find SLP sequences starting from groups of grouped stores. */
3659 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3660 vect_analyze_slp_instance (vinfo, bst_map, stmt_info: first_element,
3661 kind: slp_inst_kind_store, max_tree_size, limit: &limit);
3662
3663 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo))
3664 {
3665 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3666 {
3667 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3668 /* Apply patterns. */
3669 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3670 bb_vinfo->roots[i].stmts[j]
3671 = vect_stmt_to_vectorize (stmt_info: bb_vinfo->roots[i].stmts[j]);
3672 if (vect_build_slp_instance (vinfo: bb_vinfo, kind: bb_vinfo->roots[i].kind,
3673 scalar_stmts&: bb_vinfo->roots[i].stmts,
3674 root_stmt_infos&: bb_vinfo->roots[i].roots,
3675 remain&: bb_vinfo->roots[i].remain,
3676 max_tree_size, limit: &limit, bst_map, NULL))
3677 {
3678 bb_vinfo->roots[i].stmts = vNULL;
3679 bb_vinfo->roots[i].roots = vNULL;
3680 bb_vinfo->roots[i].remain = vNULL;
3681 }
3682 }
3683 }
3684
3685 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
3686 {
3687 /* Find SLP sequences starting from reduction chains. */
3688 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3689 if (! STMT_VINFO_RELEVANT_P (first_element)
3690 && ! STMT_VINFO_LIVE_P (first_element))
3691 ;
3692 else if (! vect_analyze_slp_instance (vinfo, bst_map, stmt_info: first_element,
3693 kind: slp_inst_kind_reduc_chain,
3694 max_tree_size, limit: &limit))
3695 {
3696 /* Dissolve reduction chain group. */
3697 stmt_vec_info vinfo = first_element;
3698 stmt_vec_info last = NULL;
3699 while (vinfo)
3700 {
3701 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3702 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3703 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3704 last = vinfo;
3705 vinfo = next;
3706 }
3707 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3708	  /* It can still be vectorized as part of an SLP reduction.  */
3709 loop_vinfo->reductions.safe_push (obj: last);
3710 }
3711
3712 /* Find SLP sequences starting from groups of reductions. */
3713 if (loop_vinfo->reductions.length () > 1)
3714 vect_analyze_slp_instance (vinfo, bst_map, stmt_info: loop_vinfo->reductions[0],
3715 kind: slp_inst_kind_reduc_group, max_tree_size,
3716 limit: &limit);
3717 }
3718
3719 hash_set<slp_tree> visited_patterns;
3720 slp_tree_to_load_perm_map_t perm_cache;
3721 slp_compat_nodes_map_t compat_cache;
3722
3723 /* See if any patterns can be found in the SLP tree. */
3724 bool pattern_found = false;
3725 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3726 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3727 visited: &visited_patterns, perm_cache: &perm_cache,
3728 compat_cache: &compat_cache);
3729
3730 /* If any were found optimize permutations of loads. */
3731 if (pattern_found)
3732 {
3733 hash_map<slp_tree, slp_tree> load_map;
3734 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3735 {
3736 slp_tree root = SLP_INSTANCE_TREE (instance);
3737 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3738 load_map: &load_map, root);
3739 }
3740 }
3741
3742
3743
3744 /* The map keeps a reference on SLP nodes built, release that. */
3745 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3746 it != bst_map->end (); ++it)
3747 if ((*it).second)
3748 vect_free_slp_tree (node: (*it).second);
3749 delete bst_map;
3750
3751 if (pattern_found && dump_enabled_p ())
3752 {
3753 dump_printf_loc (MSG_NOTE, vect_location,
3754 "Pattern matched SLP tree\n");
3755 hash_set<slp_tree> visited;
3756 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3757 vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location,
3758 SLP_INSTANCE_TREE (instance), visited);
3759 }
3760
3761 return opt_result::success ();
3762}
3763
3764/* Estimates the cost of inserting layout changes into the SLP graph.
3765 It can also say that the insertion is impossible. */
3766
3767struct slpg_layout_cost
3768{
3769 slpg_layout_cost () = default;
3770 slpg_layout_cost (sreal, bool);
3771
3772 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3773 bool is_possible () const { return depth != sreal::max (); }
3774
3775 bool operator== (const slpg_layout_cost &) const;
3776 bool operator!= (const slpg_layout_cost &) const;
3777
3778 bool is_better_than (const slpg_layout_cost &, bool) const;
3779
3780 void add_parallel_cost (const slpg_layout_cost &);
3781 void add_serial_cost (const slpg_layout_cost &);
3782 void split (unsigned int);
3783
3784 /* The longest sequence of layout changes needed during any traversal
3785 of the partition dag, weighted by execution frequency.
3786
3787 This is the most important metric when optimizing for speed, since
3788 it helps to ensure that we keep the number of operations on
3789 critical paths to a minimum. */
3790 sreal depth = 0;
3791
3792 /* An estimate of the total number of operations needed. It is weighted by
3793 execution frequency when optimizing for speed but not when optimizing for
3794 size. In order to avoid double-counting, a node with a fanout of N will
3795 distribute 1/N of its total cost to each successor.
3796
3797 This is the most important metric when optimizing for size, since
3798     it helps to keep the total number of operations to a minimum.  */
3799 sreal total = 0;
3800};
3801
3802/* Construct costs for a node with weight WEIGHT. A higher weight
3803 indicates more frequent execution. IS_FOR_SIZE is true if we are
3804 optimizing for size rather than speed. */
3805
3806slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3807 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3808{
3809}
3810
3811bool
3812slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3813{
3814 return depth == other.depth && total == other.total;
3815}
3816
3817bool
3818slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3819{
3820 return !operator== (other);
3821}
3822
3823/* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3824 true if we are optimizing for size rather than speed. */
3825
3826bool
3827slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3828 bool is_for_size) const
3829{
3830 if (is_for_size)
3831 {
3832 if (total != other.total)
3833 return total < other.total;
3834 return depth < other.depth;
3835 }
3836 else
3837 {
3838 if (depth != other.depth)
3839 return depth < other.depth;
3840 return total < other.total;
3841 }
3842}
3843
3844/* Increase the costs to account for something with cost INPUT_COST
3845 happening in parallel with the current costs. */
3846
3847void
3848slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3849{
3850 depth = std::max (a: depth, b: input_cost.depth);
3851 total += input_cost.total;
3852}
3853
3854/* Increase the costs to account for something with cost INPUT_COST
3855 happening in series with the current costs. */
3856
3857void
3858slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3859{
3860 depth += other.depth;
3861 total += other.total;
3862}
3863
3864/* Split the total cost among TIMES successors or predecessors. */
3865
3866void
3867slpg_layout_cost::split (unsigned int times)
3868{
3869 if (times > 1)
3870 total /= times;
3871}
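/* A small usage sketch of the cost combinators above (hypothetical values,
   for illustration only, assuming we are optimizing for speed):

     slpg_layout_cost a (sreal (8), false);   depth 8, total 8
     slpg_layout_cost b (sreal (2), false);   depth 2, total 2
     a.add_parallel_cost (b);                 depth max (8, 2) == 8, total 10
     slpg_layout_cost c (sreal (1), false);   depth 1, total 1
     a.add_serial_cost (c);                   depth 9, total 11
     a.split (2);                             depth 9, total 5.5

   That is, parallel composition takes the maximum depth and sums the totals,
   serial composition sums both, and split distributes the total (but not the
   depth) over the given number of successors or predecessors.  */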
3872
3873/* Information about one node in the SLP graph, for use during
3874 vect_optimize_slp_pass. */
3875
3876struct slpg_vertex
3877{
3878 slpg_vertex (slp_tree node_) : node (node_) {}
3879
3880 /* The node itself. */
3881 slp_tree node;
3882
3883 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3884 partitions are flexible; they can have whichever layout consumers
3885 want them to have. */
3886 int partition = -1;
3887
3888 /* The number of nodes that directly use the result of this one
3889 (i.e. the number of nodes that count this one as a child). */
3890 unsigned int out_degree = 0;
3891
3892 /* The execution frequency of the node. */
3893 sreal weight = 0;
3894
3895 /* The total execution frequency of all nodes that directly use the
3896 result of this one. */
3897 sreal out_weight = 0;
3898};
3899
3900/* Information about one partition of the SLP graph, for use during
3901 vect_optimize_slp_pass. */
3902
3903struct slpg_partition_info
3904{
3905 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3906 of m_partitioned_nodes. */
3907 unsigned int node_begin = 0;
3908 unsigned int node_end = 0;
3909
3910 /* Which layout we've chosen to use for this partition, or -1 if
3911 we haven't picked one yet. */
3912 int layout = -1;
3913
3914 /* The number of predecessors and successors in the partition dag.
3915 The predecessors always have lower partition numbers and the
3916 successors always have higher partition numbers.
3917
3918 Note that the directions of these edges are not necessarily the
3919 same as in the data flow graph. For example, if an SCC has separate
3920 partitions for an inner loop and an outer loop, the inner loop's
3921 partition will have at least two incoming edges from the outer loop's
3922 partition: one for a live-in value and one for a live-out value.
3923 In data flow terms, one of these edges would also be from the outer loop
3924 to the inner loop, but the other would be in the opposite direction. */
3925 unsigned int in_degree = 0;
3926 unsigned int out_degree = 0;
3927};
3928
3929/* Information about the costs of using a particular layout for a
3930 particular partition. It can also say that the combination is
3931 impossible. */
3932
3933struct slpg_partition_layout_costs
3934{
3935 bool is_possible () const { return internal_cost.is_possible (); }
3936 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3937
3938 /* The costs inherited from predecessor partitions. */
3939 slpg_layout_cost in_cost;
3940
3941 /* The inherent cost of the layout within the node itself. For example,
3942 this is nonzero for a load if choosing a particular layout would require
3943 the load to permute the loaded elements. It is nonzero for a
3944 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3945 to full-vector moves. */
3946 slpg_layout_cost internal_cost;
3947
3948 /* The costs inherited from successor partitions. */
3949 slpg_layout_cost out_cost;
3950};
3951
3952/* This class tries to optimize the layout of vectors in order to avoid
3953   unnecessary shuffling.  At the moment, the set of possible layouts is
3954 restricted to bijective permutations.
3955
3956 The goal of the pass depends on whether we're optimizing for size or
3957 for speed. When optimizing for size, the goal is to reduce the overall
3958 number of layout changes (including layout changes implied by things
3959 like load permutations). When optimizing for speed, the goal is to
3960 reduce the maximum latency attributable to layout changes on any
3961 non-cyclical path through the data flow graph.
3962
3963 For example, when optimizing a loop nest for speed, we will prefer
3964 to make layout changes outside of a loop rather than inside of a loop,
3965 and will prefer to make layout changes in parallel rather than serially,
3966 even if that increases the overall number of layout changes.
3967
3968 The high-level procedure is:
3969
3970 (1) Build a graph in which edges go from uses (parents) to definitions
3971 (children).
3972
3973 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3974
3975 (3) When optimizing for speed, partition the nodes in each SCC based
3976 on their containing cfg loop. When optimizing for size, treat
3977 each SCC as a single partition.
3978
3979 This gives us a dag of partitions. The goal is now to assign a
3980 layout to each partition.
3981
3982 (4) Construct a set of vector layouts that are worth considering.
3983 Record which nodes must keep their current layout.
3984
3985 (5) Perform a forward walk over the partition dag (from loads to stores)
3986 accumulating the "forward" cost of using each layout. When visiting
3987 each partition, assign a tentative choice of layout to the partition
3988 and use that choice when calculating the cost of using a different
3989 layout in successor partitions.
3990
3991 (6) Perform a backward walk over the partition dag (from stores to loads),
3992 accumulating the "backward" cost of using each layout. When visiting
3993 each partition, make a final choice of layout for that partition based
3994 on the accumulated forward costs (from (5)) and backward costs
3995 (from (6)).
3996
3997 (7) Apply the chosen layouts to the SLP graph.
3998
3999 For example, consider the SLP statements:
4000
4001 S1: a_1 = load
4002 loop:
4003 S2: a_2 = PHI<a_1, a_3>
4004 S3: b_1 = load
4005 S4: a_3 = a_2 + b_1
4006 exit:
4007 S5: a_4 = PHI<a_3>
4008 S6: store a_4
4009
4010 S2 and S4 form an SCC and are part of the same loop. Every other
4011 statement is in a singleton SCC. In this example there is a one-to-one
4012   mapping between SCCs and partitions and the partition dag looks like this:
4013
4014 S1 S3
4015 \ /
4016 S2+S4
4017 |
4018 S5
4019 |
4020 S6
4021
4022 S2, S3 and S4 will have a higher execution frequency than the other
4023 statements, so when optimizing for speed, the goal is to avoid any
4024 layout changes:
4025
4026 - within S3
4027 - within S2+S4
4028 - on the S3->S2+S4 edge
4029
4030 For example, if S3 was originally a reversing load, the goal of the
4031 pass is to make it an unreversed load and change the layout on the
4032 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
4033 on S1->S2+S4 and S5->S6 would also be acceptable.)
4034
4035 The difference between SCCs and partitions becomes important if we
4036 add an outer loop:
4037
4038 S1: a_1 = ...
4039 loop1:
4040 S2: a_2 = PHI<a_1, a_6>
4041 S3: b_1 = load
4042 S4: a_3 = a_2 + b_1
4043 loop2:
4044 S5: a_4 = PHI<a_3, a_5>
4045 S6: c_1 = load
4046 S7: a_5 = a_4 + c_1
4047 exit2:
4048 S8: a_6 = PHI<a_5>
4049 S9: store a_6
4050 exit1:
4051
4052 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4053 for speed, we usually do not want restrictions in the outer loop to "infect"
4054 the decision for the inner loop. For example, if an outer-loop node
4055 in the SCC contains a statement with a fixed layout, that should not
4056 prevent the inner loop from using a different layout. Conversely,
4057 the inner loop should not dictate a layout to the outer loop: if the
4058 outer loop does a lot of computation, then it may not be efficient to
4059 do all of that computation in the inner loop's preferred layout.
4060
4061 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4062 and S5+S7 (inner). We also try to arrange partitions so that:
4063
4064 - the partition for an outer loop comes before the partition for
4065 an inner loop
4066
4067 - if a sibling loop A dominates a sibling loop B, A's partition
4068 comes before B's
4069
4070 This gives the following partition dag for the example above:
4071
4072 S1 S3
4073 \ /
4074 S2+S4+S8 S6
4075 | \\ /
4076 | S5+S7
4077 |
4078 S9
4079
4080 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4081 one for a reversal of the edge S7->S8.
4082
4083 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4084 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4085 preferred layout against the cost of changing the layout on entry to the
4086 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4087
4088 Although this works well when optimizing for speed, it has the downside
4089 when optimizing for size that the choice of layout for S5+S7 is completely
4090 independent of S9, which lessens the chance of reducing the overall number
4091 of permutations. We therefore do not partition SCCs when optimizing
4092 for size.
4093
4094 To give a concrete example of the difference between optimizing
4095 for size and speed, consider:
4096
4097 a[0] = (b[1] << c[3]) - d[1];
4098 a[1] = (b[0] << c[2]) - d[0];
4099 a[2] = (b[3] << c[1]) - d[3];
4100 a[3] = (b[2] << c[0]) - d[2];
4101
4102 There are three different layouts here: one for a, one for b and d,
4103 and one for c. When optimizing for speed it is better to permute each
4104 of b, c and d into the order required by a, since those permutations
4105 happen in parallel. But when optimizing for size, it is better to:
4106
4107 - permute c into the same order as b
4108 - do the arithmetic
4109 - permute the result into the order required by a
4110
4111 This gives 2 permutations rather than 3. */
4112
4113class vect_optimize_slp_pass
4114{
4115public:
4116 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4117 void run ();
4118
4119private:
4120 /* Graph building. */
4121 struct loop *containing_loop (slp_tree);
4122 bool is_cfg_latch_edge (graph_edge *);
4123 void build_vertices (hash_set<slp_tree> &, slp_tree);
4124 void build_vertices ();
4125 void build_graph ();
4126
4127 /* Partitioning. */
4128 void create_partitions ();
4129 template<typename T> void for_each_partition_edge (unsigned int, T);
4130
4131 /* Layout selection. */
4132 bool is_compatible_layout (slp_tree, unsigned int);
4133 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4134 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4135 unsigned int);
4136 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4137 int, unsigned int);
4138 int internal_node_cost (slp_tree, int, unsigned int);
4139 void start_choosing_layouts ();
4140
4141 /* Cost propagation. */
4142 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4143 unsigned int, unsigned int);
4144 slpg_layout_cost total_in_cost (unsigned int);
4145 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4146 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4147 void forward_pass ();
4148 void backward_pass ();
4149
4150 /* Rematerialization. */
4151 slp_tree get_result_with_layout (slp_tree, unsigned int);
4152 void materialize ();
4153
4154 /* Clean-up. */
4155 void remove_redundant_permutations ();
4156
4157 void dump ();
4158
4159 vec_info *m_vinfo;
4160
4161 /* True if we should optimize the graph for size, false if we should
4162 optimize it for speed. (It wouldn't be easy to make this decision
4163 more locally.) */
4164 bool m_optimize_size;
4165
4166 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4167 In other words, a node's predecessors are its slp_tree parents and
4168 a node's successors are its slp_tree children. */
4169 graph *m_slpg = nullptr;
4170
4171 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4172 auto_vec<slpg_vertex> m_vertices;
4173
4174  /* The list of all leaves of M_SLPG, such as external definitions, constants,
4175 and loads. */
4176 auto_vec<int> m_leafs;
4177
4178 /* This array has one entry for every vector layout that we're considering.
4179 Element 0 is null and indicates "no change". Other entries describe
4180 permutations that are inherent in the current graph and that we would
4181 like to reverse if possible.
4182
4183 For example, a permutation { 1, 2, 3, 0 } means that something has
4184 effectively been permuted in that way, such as a load group
4185 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4186 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4187 in order to put things "back" in order. */
4188 auto_vec<vec<unsigned> > m_perms;
4189
4190 /* A partitioning of the nodes for which a layout must be chosen.
4191 Each partition represents an <SCC, cfg loop> pair; that is,
4192 nodes in different SCCs belong to different partitions, and nodes
4193 within an SCC can be further partitioned according to a containing
4194 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4195
4196 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4197 from leaves (such as loads) to roots (such as stores).
4198
4199 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4200 auto_vec<slpg_partition_info> m_partitions;
4201
4202 /* The list of all nodes for which a layout must be chosen. Nodes for
4203 partition P come before the nodes for partition P+1. Nodes within a
4204 partition are in reverse postorder. */
4205 auto_vec<unsigned int> m_partitioned_nodes;
4206
4207 /* Index P * num-layouts + L contains the cost of using layout L
4208 for partition P. */
4209 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4210
4211 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4212 original output of node N adjusted to have layout L. */
4213 auto_vec<slp_tree> m_node_layouts;
4214};
4215
4216/* Fill the vertices and leafs vector with all nodes in the SLP graph.
4217 Also record whether we should optimize anything for speed rather
4218 than size. */
4219
4220void
4221vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4222 slp_tree node)
4223{
4224 unsigned i;
4225 slp_tree child;
4226
4227 if (visited.add (k: node))
4228 return;
4229
4230 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4231 {
4232 basic_block bb = gimple_bb (g: vect_orig_stmt (stmt_info: rep)->stmt);
4233 if (optimize_bb_for_speed_p (bb))
4234 m_optimize_size = false;
4235 }
4236
4237 node->vertex = m_vertices.length ();
4238 m_vertices.safe_push (obj: slpg_vertex (node));
4239
4240 bool leaf = true;
4241 bool force_leaf = false;
4242 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4243 if (child)
4244 {
4245 leaf = false;
4246 build_vertices (visited, node: child);
4247 }
4248 else
4249 force_leaf = true;
4250  /* Since SLP discovery works along use-def edges, all cycles have an
4251     entry - but there's the exception of cycles where we do not handle
4252     the entry explicitly (but with a NULL SLP node), like some reductions
4253 and inductions. Force those SLP PHIs to act as leafs to make them
4254 backwards reachable. */
4255 if (leaf || force_leaf)
4256 m_leafs.safe_push (obj: node->vertex);
4257}
4258
4259/* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4260
4261void
4262vect_optimize_slp_pass::build_vertices ()
4263{
4264 hash_set<slp_tree> visited;
4265 unsigned i;
4266 slp_instance instance;
4267 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4268 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4269}
4270
4271/* Apply (reverse) bijective PERM to VEC.  */
4272
4273template <class T>
4274static void
4275vect_slp_permute (vec<unsigned> perm,
4276 vec<T> &vec, bool reverse)
4277{
4278 auto_vec<T, 64> saved;
4279 saved.create (vec.length ());
4280 for (unsigned i = 0; i < vec.length (); ++i)
4281 saved.quick_push (vec[i]);
4282
4283 if (reverse)
4284 {
4285 for (unsigned i = 0; i < vec.length (); ++i)
4286 vec[perm[i]] = saved[i];
4287 for (unsigned i = 0; i < vec.length (); ++i)
4288 gcc_assert (vec[perm[i]] == saved[i]);
4289 }
4290 else
4291 {
4292 for (unsigned i = 0; i < vec.length (); ++i)
4293 vec[i] = saved[perm[i]];
4294 for (unsigned i = 0; i < vec.length (); ++i)
4295 gcc_assert (vec[i] == saved[perm[i]]);
4296 }
4297}
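/* For example (illustrative): with PERM == { 1, 2, 3, 0 } and
   VEC == { a, b, c, d }, the forward application (REVERSE == false) yields
   { b, c, d, a } since vec[i] = saved[perm[i]], while the reverse
   application (REVERSE == true) yields { d, a, b, c } since
   vec[perm[i]] = saved[i].  Applying one after the other restores the
   original order.  */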
4298
4299/* Return the cfg loop that contains NODE. */
4300
4301struct loop *
4302vect_optimize_slp_pass::containing_loop (slp_tree node)
4303{
4304 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4305 if (!rep)
4306 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4307 return gimple_bb (g: vect_orig_stmt (stmt_info: rep)->stmt)->loop_father;
4308}
4309
4310/* Return true if UD (an edge from a use to a definition) is associated
4311 with a loop latch edge in the cfg. */
4312
4313bool
4314vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4315{
4316 slp_tree use = m_vertices[ud->src].node;
4317 slp_tree def = m_vertices[ud->dest].node;
4318 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
4319 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
4320 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4321 return false;
4322
4323 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4324 return (is_a<gphi *> (p: use_rep->stmt)
4325 && bb_loop_header_p (gimple_bb (g: use_rep->stmt))
4326 && containing_loop (node: def) == containing_loop (node: use));
4327}
4328
4329/* Build the graph. Mark edges that correspond to cfg loop latch edges with
4330 a nonnull data field. */
4331
4332void
4333vect_optimize_slp_pass::build_graph ()
4334{
4335 m_optimize_size = true;
4336 build_vertices ();
4337
4338 m_slpg = new_graph (m_vertices.length ());
4339 for (slpg_vertex &v : m_vertices)
4340 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4341 if (child)
4342 {
4343 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4344 if (is_cfg_latch_edge (ud))
4345 ud->data = this;
4346 }
4347}
4348
4349/* Return true if E corresponds to a loop latch edge in the cfg. */
4350
4351static bool
4352skip_cfg_latch_edges (graph_edge *e)
4353{
4354 return e->data;
4355}
4356
4357/* Create the node partitions. */
4358
4359void
4360vect_optimize_slp_pass::create_partitions ()
4361{
4362 /* Calculate a postorder of the graph, ignoring edges that correspond
4363 to natural latch edges in the cfg. Reading the vector from the end
4364 to the beginning gives the reverse postorder. */
4365 auto_vec<int> initial_rpo;
4366 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4367 false, NULL, skip_cfg_latch_edges);
4368 gcc_assert (initial_rpo.length () == m_vertices.length ());
4369
4370 /* Calculate the strongly connected components of the graph. */
4371 auto_vec<int> scc_grouping;
4372 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4373
4374 /* Create a new index order in which all nodes from the same SCC are
4375 consecutive. Use scc_pos to record the index of the first node in
4376 each SCC. */
4377 auto_vec<unsigned int> scc_pos (num_sccs);
4378 int last_component = -1;
4379 unsigned int node_count = 0;
4380 for (unsigned int node_i : scc_grouping)
4381 {
4382 if (last_component != m_slpg->vertices[node_i].component)
4383 {
4384 last_component = m_slpg->vertices[node_i].component;
4385 gcc_assert (last_component == int (scc_pos.length ()));
4386 scc_pos.quick_push (obj: node_count);
4387 }
4388 node_count += 1;
4389 }
4390 gcc_assert (node_count == initial_rpo.length ()
4391 && last_component + 1 == int (num_sccs));
4392
4393 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4394 inside each SCC following the RPO we calculated above. The fact that
4395 we ignored natural latch edges when calculating the RPO should ensure
4396 that, for natural loop nests:
4397
4398 - the first node that we encounter in a cfg loop is the loop header phi
4399 - the loop header phis are in dominance order
4400
4401 Arranging for this is an optimization (see below) rather than a
4402 correctness issue. Unnatural loops with a tangled mess of backedges
4403 will still work correctly, but might give poorer results.
4404
4405 Also update scc_pos so that it gives 1 + the index of the last node
4406 in the SCC. */
4407 m_partitioned_nodes.safe_grow (len: node_count);
4408 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4409 {
4410 unsigned int node_i = initial_rpo[old_i];
4411 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4412 m_partitioned_nodes[new_i] = node_i;
4413 }
4414
4415 /* When optimizing for speed, partition each SCC based on the containing
4416 cfg loop. The order we constructed above should ensure that, for natural
4417 cfg loops, we'll create sub-SCC partitions for outer loops before
4418 the corresponding sub-SCC partitions for inner loops. Similarly,
4419 when one sibling loop A dominates another sibling loop B, we should
4420 create a sub-SCC partition for A before a sub-SCC partition for B.
4421
4422 As above, nothing depends for correctness on whether this achieves
4423 a natural nesting, but we should get better results when it does. */
4424 m_partitions.reserve (nelems: m_vertices.length ());
4425 unsigned int next_partition_i = 0;
4426 hash_map<struct loop *, int> loop_partitions;
4427 unsigned int rpo_begin = 0;
4428 unsigned int num_partitioned_nodes = 0;
4429 for (unsigned int rpo_end : scc_pos)
4430 {
4431 loop_partitions.empty ();
4432 unsigned int partition_i = next_partition_i;
4433 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4434 {
4435 /* Handle externals and constants optimistically throughout.
4436 But treat existing vectors as fixed since we do not handle
4437 permuting them. */
4438 unsigned int node_i = m_partitioned_nodes[rpo_i];
4439 auto &vertex = m_vertices[node_i];
4440 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4441 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4442 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4443 vertex.partition = -1;
4444 else
4445 {
4446 bool existed;
4447 if (m_optimize_size)
4448 existed = next_partition_i > partition_i;
4449 else
4450 {
4451 struct loop *loop = containing_loop (node: vertex.node);
4452 auto &entry = loop_partitions.get_or_insert (k: loop, existed: &existed);
4453 if (!existed)
4454 entry = next_partition_i;
4455 partition_i = entry;
4456 }
4457 if (!existed)
4458 {
4459 m_partitions.quick_push (obj: slpg_partition_info ());
4460 next_partition_i += 1;
4461 }
4462 vertex.partition = partition_i;
4463 num_partitioned_nodes += 1;
4464 m_partitions[partition_i].node_end += 1;
4465 }
4466 }
4467 rpo_begin = rpo_end;
4468 }
4469
4470 /* Assign ranges of consecutive node indices to each partition,
4471 in partition order. Start with node_end being the same as
4472 node_begin so that the next loop can use it as a counter. */
4473 unsigned int node_begin = 0;
4474 for (auto &partition : m_partitions)
4475 {
4476 partition.node_begin = node_begin;
4477 node_begin += partition.node_end;
4478 partition.node_end = partition.node_begin;
4479 }
4480 gcc_assert (node_begin == num_partitioned_nodes);
4481
4482 /* Finally build the list of nodes in partition order. */
4483 m_partitioned_nodes.truncate (size: num_partitioned_nodes);
4484 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4485 {
4486 int partition_i = m_vertices[node_i].partition;
4487 if (partition_i >= 0)
4488 {
4489 unsigned int order_i = m_partitions[partition_i].node_end++;
4490 m_partitioned_nodes[order_i] = node_i;
4491 }
4492 }
4493}
4494
4495/* Look for edges from earlier partitions into node NODE_I and edges from
4496 node NODE_I into later partitions. Call:
4497
4498 FN (ud, other_node_i)
4499
4500 for each such use-to-def edge ud, where other_node_i is the node at the
4501 other end of the edge. */
4502
4503template<typename T>
4504void
4505vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4506{
4507 int partition_i = m_vertices[node_i].partition;
4508 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4509 pred; pred = pred->pred_next)
4510 {
4511 int src_partition_i = m_vertices[pred->src].partition;
4512 if (src_partition_i >= 0 && src_partition_i != partition_i)
4513 fn (pred, pred->src);
4514 }
4515 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4516 succ; succ = succ->succ_next)
4517 {
4518 int dest_partition_i = m_vertices[succ->dest].partition;
4519 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4520 fn (succ, succ->dest);
4521 }
4522}
4523
4524/* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4525 that NODE would operate on. This test is independent of NODE's actual
4526 operation. */
4527
4528bool
4529vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4530 unsigned int layout_i)
4531{
4532 if (layout_i == 0)
4533 return true;
4534
4535 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4536 return false;
4537
4538 return true;
4539}
4540
4541/* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4542 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4543 layouts is incompatible with NODE or if the change is not possible for
4544 some other reason.
4545
4546 The properties taken from NODE include the number of lanes and the
4547 vector type. The actual operation doesn't matter. */
4548
4549int
4550vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4551 unsigned int from_layout_i,
4552 unsigned int to_layout_i)
4553{
4554 if (!is_compatible_layout (node, layout_i: from_layout_i)
4555 || !is_compatible_layout (node, layout_i: to_layout_i))
4556 return -1;
4557
4558 if (from_layout_i == to_layout_i)
4559 return 0;
4560
4561 auto_vec<slp_tree, 1> children (1);
4562 children.quick_push (obj: node);
4563 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4564 if (from_layout_i > 0)
4565 for (unsigned int i : m_perms[from_layout_i])
4566 perm.quick_push (obj: { 0, i });
4567 else
4568 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4569 perm.quick_push (obj: { 0, i });
4570 if (to_layout_i > 0)
4571 vect_slp_permute (perm: m_perms[to_layout_i], vec&: perm, reverse: true);
4572 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4573 children, false);
4574 if (count >= 0)
4575 return MAX (count, 1);
4576
4577 /* ??? In principle we could try changing via layout 0, giving two
4578 layout changes rather than 1. Doing that would require
4579 corresponding support in get_result_with_layout. */
4580 return -1;
4581}
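/* A worked example for the above (hypothetical layouts): suppose
   m_perms[FROM_LAYOUT_I] is { 1, 0, 3, 2 } and m_perms[TO_LAYOUT_I] is
   { 2, 3, 0, 1 }.  The lane permutation built above starts as
   { 1, 0, 3, 2 } (selecting from the single child), and reverse-applying
   { 2, 3, 0, 1 } turns it into { 3, 2, 1, 0 }, i.e. the permutation that
   produces the TO layout from a vector currently in the FROM layout.
   vectorizable_slp_permutation_1 then estimates how many vector permutes
   the target needs for that.  */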
4582
4583/* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4584
4585inline slpg_partition_layout_costs &
4586vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4587 unsigned int layout_i)
4588{
4589 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4590}
4591
4592/* Change PERM in one of two ways:
4593
4594 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4595 chosen for child I of NODE.
4596
4597   - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4598
4599 In both cases, arrange for the output to have layout OUT_LAYOUT_I */
4600
4601void
4602vect_optimize_slp_pass::
4603change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4604 int in_layout_i, unsigned int out_layout_i)
4605{
4606 for (auto &entry : perm)
4607 {
4608 int this_in_layout_i = in_layout_i;
4609 if (this_in_layout_i < 0)
4610 {
4611 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4612 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4613 this_in_layout_i = m_partitions[in_partition_i].layout;
4614 }
4615 if (this_in_layout_i > 0)
4616 entry.second = m_perms[this_in_layout_i][entry.second];
4617 }
4618 if (out_layout_i > 0)
4619 vect_slp_permute (perm: m_perms[out_layout_i], vec&: perm, reverse: true);
4620}
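/* For example (illustrative): a two-lane VEC_PERM_EXPR with lane
   permutation { (0,1), (0,0) } reverses its single child.  With
   IN_LAYOUT_I == 0 (inputs keep the original layout) and OUT_LAYOUT_I
   referring to the layout { 1, 0 }, the entries are left alone and the
   reverse application of { 1, 0 } turns the permutation into
   { (0,0), (0,1) }, i.e. a pass-through, which can let internal_node_cost
   report such a layout as free.  */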
4621
4622/* Check whether the target allows NODE to be rearranged so that the node's
4623 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4624 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4625
4626 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4627 NODE can adapt to the layout changes that have (perhaps provisionally)
4628 been chosen for NODE's children, so that no extra permutations are
4629 needed on either the input or the output of NODE.
4630
4631 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4632 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4633
4634 IN_LAYOUT_I has no meaning for other types of node.
4635
4636 Keeping the node as-is is always valid. If the target doesn't appear
4637 to support the node as-is, but might realistically support other layouts,
4638 then layout 0 instead has the cost of a worst-case permutation. On the
4639 one hand, this ensures that every node has at least one valid layout,
4640 avoiding what would otherwise be an awkward special case. On the other,
4641 it still encourages the pass to change an invalid pre-existing layout
4642 choice into a valid one. */
4643
4644int
4645vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4646 unsigned int out_layout_i)
4647{
4648 const int fallback_cost = 1;
4649
4650 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4651 {
4652 auto_lane_permutation_t tmp_perm;
4653 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4654
4655 /* Check that the child nodes support the chosen layout. Checking
4656 the first child is enough, since any second child would have the
4657 same shape. */
4658 auto first_child = SLP_TREE_CHILDREN (node)[0];
4659 if (in_layout_i > 0
4660 && !is_compatible_layout (node: first_child, layout_i: in_layout_i))
4661 return -1;
4662
4663 change_vec_perm_layout (node, perm&: tmp_perm, in_layout_i, out_layout_i);
4664 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4665 node, tmp_perm,
4666 SLP_TREE_CHILDREN (node),
4667 false);
4668 if (count < 0)
4669 {
4670 if (in_layout_i == 0 && out_layout_i == 0)
4671 {
4672 /* Use the fallback cost if the node could in principle support
4673 some nonzero layout for both the inputs and the outputs.
4674 Otherwise assume that the node will be rejected later
4675 and rebuilt from scalars. */
4676 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4677 return fallback_cost;
4678 return 0;
4679 }
4680 return -1;
4681 }
4682
4683 /* We currently have no way of telling whether the new layout is cheaper
4684 or more expensive than the old one. But at least in principle,
4685 it should be worth making zero permutations (whole-vector shuffles)
4686 cheaper than real permutations, in case the pass is able to remove
4687 the latter. */
4688 return count == 0 ? 0 : 1;
4689 }
4690
4691 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4692 if (rep
4693 && STMT_VINFO_DATA_REF (rep)
4694 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4695 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4696 {
4697 auto_load_permutation_t tmp_perm;
4698 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4699 if (out_layout_i > 0)
4700 vect_slp_permute (perm: m_perms[out_layout_i], vec&: tmp_perm, reverse: true);
4701
4702 poly_uint64 vf = 1;
4703 if (auto loop_vinfo = dyn_cast<loop_vec_info> (p: m_vinfo))
4704 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4705 unsigned int n_perms;
4706 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4707 nullptr, vf, true, false, &n_perms))
4708 {
4709 auto rep = SLP_TREE_REPRESENTATIVE (node);
4710 if (out_layout_i == 0)
4711 {
4712 /* Use the fallback cost if the load is an N-to-N permutation.
4713 Otherwise assume that the node will be rejected later
4714 and rebuilt from scalars. */
4715 if (STMT_VINFO_GROUPED_ACCESS (rep)
4716 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4717 == SLP_TREE_LANES (node)))
4718 return fallback_cost;
4719 return 0;
4720 }
4721 return -1;
4722 }
4723
4724 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4725 return n_perms == 0 ? 0 : 1;
4726 }
4727
4728 return 0;
4729}
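/* For example (illustrative): if a load node's load permutation is
   { 1, 0, 3, 2 } and OUT_LAYOUT_I refers to the layout { 1, 0, 3, 2 },
   the reverse application above turns the load permutation into the
   identity { 0, 1, 2, 3 }; vect_transform_slp_perm_load_1 can then find
   that no vector permutations are needed (N_PERMS == 0) and the layout
   is costed as free.  */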
4730
4731/* Decide which element layouts we should consider using. Calculate the
4732 weights associated with inserting layout changes on partition edges.
4733 Also mark partitions that cannot change layout, by setting their
4734 layout to zero. */
4735
4736void
4737vect_optimize_slp_pass::start_choosing_layouts ()
4738{
4739 /* Used to assign unique permutation indices. */
4740 using perm_hash = unbounded_hashmap_traits<
4741 vec_free_hash_base<int_hash_base<unsigned>>,
4742 int_hash<int, -1, -2>
4743 >;
4744 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4745
4746 /* Layout 0 is "no change". */
4747 m_perms.safe_push (obj: vNULL);
4748
4749 /* Create layouts from existing permutations. */
4750 auto_load_permutation_t tmp_perm;
4751 for (unsigned int node_i : m_partitioned_nodes)
4752 {
4753 /* Leafs also double as entries to the reverse graph. Allow the
4754 layout of those to be changed. */
4755 auto &vertex = m_vertices[node_i];
4756 auto &partition = m_partitions[vertex.partition];
4757 if (!m_slpg->vertices[node_i].succ)
4758 partition.layout = 0;
4759
4760 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4761 slp_tree node = vertex.node;
4762 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4763 slp_tree child;
4764 unsigned HOST_WIDE_INT imin, imax = 0;
4765 bool any_permute = false;
4766 tmp_perm.truncate (size: 0);
4767 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4768 {
4769 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4770 unpermuted, record a layout that reverses this permutation.
4771
4772 We would need more work to cope with loads that are internally
4773 permuted and also have inputs (such as masks for
4774 IFN_MASK_LOADs). */
4775 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4776 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4777 {
4778 partition.layout = -1;
4779 continue;
4780 }
4781 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4782 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4783 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4784 }
4785 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4786 && SLP_TREE_CHILDREN (node).length () == 1
4787 && (child = SLP_TREE_CHILDREN (node)[0])
4788 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4789 .is_constant (const_value: &imin)))
4790 {
4791 /* If the child has the same vector size as this node,
4792 reversing the permutation can make the permutation a no-op.
4793 In other cases it can change a true permutation into a
4794 full-vector extract. */
4795 tmp_perm.reserve (SLP_TREE_LANES (node));
4796 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4797 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4798 }
4799 else
4800 continue;
4801
4802 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4803 {
4804 unsigned idx = tmp_perm[j];
4805 imin = MIN (imin, idx);
4806 imax = MAX (imax, idx);
4807 if (idx - tmp_perm[0] != j)
4808 any_permute = true;
4809 }
4810 /* If the span doesn't match we'd disrupt VF computation, avoid
4811 that for now. */
4812 if (imax - imin + 1 != SLP_TREE_LANES (node))
4813 continue;
4814      /* If there's no permute, no need to split one out.  In this case
4815 we can consider turning a load into a permuted load, if that
4816 turns out to be cheaper than alternatives. */
4817 if (!any_permute)
4818 {
4819 partition.layout = -1;
4820 continue;
4821 }
4822
4823 /* For now only handle true permutes, like
4824 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4825	 when permuting constants and invariants, keeping the permute
4826 bijective. */
4827 auto_sbitmap load_index (SLP_TREE_LANES (node));
4828 bitmap_clear (load_index);
4829 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4830 bitmap_set_bit (map: load_index, bitno: tmp_perm[j] - imin);
4831 unsigned j;
4832 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4833 if (!bitmap_bit_p (map: load_index, bitno: j))
4834 break;
4835 if (j != SLP_TREE_LANES (node))
4836 continue;
4837
4838 vec<unsigned> perm = vNULL;
4839 perm.safe_grow (SLP_TREE_LANES (node), exact: true);
4840 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4841 perm[j] = tmp_perm[j] - imin;
4842
4843 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4844 {
4845 /* Continue to use existing layouts, but don't add any more. */
4846 int *entry = layout_ids.get (k: perm);
4847 partition.layout = entry ? *entry : 0;
4848 perm.release ();
4849 }
4850 else
4851 {
4852 bool existed;
4853 int &layout_i = layout_ids.get_or_insert (k: perm, existed: &existed);
4854 if (existed)
4855 perm.release ();
4856 else
4857 {
4858 layout_i = m_perms.length ();
4859 m_perms.safe_push (obj: perm);
4860 }
4861 partition.layout = layout_i;
4862 }
4863 }
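  /* Illustrative example (hypothetical load group): a node whose load
     permutation is { 5, 4, 7, 6 } gets imin == 4 and imax == 7, so the span
     (imax - imin + 1 == 4) matches SLP_TREE_LANES and the candidate layout
     recorded above is { 1, 0, 3, 2 }.  Reverse-applying that layout to the
     load permutation would make the load contiguous ({ 4, 5, 6, 7 }).  */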
4864
4865 /* Initially assume that every layout is possible and has zero cost
4866 in every partition. */
4867 m_partition_layout_costs.safe_grow_cleared (len: m_partitions.length ()
4868 * m_perms.length ());
4869
4870  /* Outgoing permutations facing non-associating-reduction graph entries
4871     that are not represented in the graph have to be marked as to be
4872     materialized.  slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
4873 for (slp_instance instance : m_vinfo->slp_instances)
4874 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4875 {
4876 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4877 m_partitions[m_vertices[node_i].partition].layout = 0;
4878 }
4879 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4880 {
4881 stmt_vec_info stmt_info
4882 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4883 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4884 if (needs_fold_left_reduction_p (TREE_TYPE
4885 (gimple_get_lhs (stmt_info->stmt)),
4886 STMT_VINFO_REDUC_CODE (reduc_info)))
4887 {
4888 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4889 m_partitions[m_vertices[node_i].partition].layout = 0;
4890 }
4891 }
4892
4893 /* Check which layouts each node and partition can handle. Calculate the
4894 weights associated with inserting layout changes on edges. */
4895 for (unsigned int node_i : m_partitioned_nodes)
4896 {
4897 auto &vertex = m_vertices[node_i];
4898 auto &partition = m_partitions[vertex.partition];
4899 slp_tree node = vertex.node;
4900
4901 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4902 {
4903 vertex.weight = vect_slp_node_weight (node);
4904
4905 /* We do not handle stores with a permutation, so all
4906 incoming permutations must have been materialized.
4907
4908 We also don't handle masked grouped loads, which lack a
4909 permutation vector. In this case the memory locations
4910 form an implicit second input to the loads, on top of the
4911 explicit mask input, and the memory input's layout cannot
4912 be changed.
4913
4914 On the other hand, we do support permuting gather loads and
4915 masked gather loads, where each scalar load is independent
4916 of the others. This can be useful if the address/index input
4917 benefits from permutation. */
4918 if (STMT_VINFO_DATA_REF (rep)
4919 && STMT_VINFO_GROUPED_ACCESS (rep)
4920 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4921 partition.layout = 0;
4922
4923 /* We cannot change the layout of an operation that is
4924	     not independent of lanes.  Note this is an explicit
4925 negative list since that's much shorter than the respective
4926 positive one but it's critical to keep maintaining it. */
4927 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4928 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4929 {
4930 case CFN_COMPLEX_ADD_ROT90:
4931 case CFN_COMPLEX_ADD_ROT270:
4932 case CFN_COMPLEX_MUL:
4933 case CFN_COMPLEX_MUL_CONJ:
4934 case CFN_VEC_ADDSUB:
4935 case CFN_VEC_FMADDSUB:
4936 case CFN_VEC_FMSUBADD:
4937 partition.layout = 0;
4938 default:;
4939 }
4940 }
4941
4942 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4943 {
4944 auto &other_vertex = m_vertices[other_node_i];
4945
4946 /* Count the number of edges from earlier partitions and the number
4947 of edges to later partitions. */
4948 if (other_vertex.partition < vertex.partition)
4949 partition.in_degree += 1;
4950 else
4951 partition.out_degree += 1;
4952
4953 /* If the current node uses the result of OTHER_NODE_I, accumulate
4954 the effects of that. */
4955 if (ud->src == int (node_i))
4956 {
4957 other_vertex.out_weight += vertex.weight;
4958 other_vertex.out_degree += 1;
4959 }
4960 };
4961 for_each_partition_edge (node_i, fn: process_edge);
4962 }
4963}
4964
4965/* Return the incoming costs for node NODE_I, assuming that each input keeps
4966 its current (provisional) choice of layout. The inputs do not necessarily
4967 have the same layout as each other. */
4968
4969slpg_layout_cost
4970vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4971{
4972 auto &vertex = m_vertices[node_i];
4973 slpg_layout_cost cost;
4974 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4975 {
4976 auto &other_vertex = m_vertices[other_node_i];
4977 if (other_vertex.partition < vertex.partition)
4978 {
4979 auto &other_partition = m_partitions[other_vertex.partition];
4980 auto &other_costs = partition_layout_costs (partition_i: other_vertex.partition,
4981 layout_i: other_partition.layout);
4982 slpg_layout_cost this_cost = other_costs.in_cost;
4983 this_cost.add_serial_cost (other: other_costs.internal_cost);
4984 this_cost.split (times: other_partition.out_degree);
4985 cost.add_parallel_cost (input_cost: this_cost);
4986 }
4987 };
4988 for_each_partition_edge (node_i, fn: add_cost);
4989 return cost;
4990}
4991
4992/* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4993 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4994 slpg_layout_cost::impossible () if the change isn't possible. */
4995
4996slpg_layout_cost
4997vect_optimize_slp_pass::
4998edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4999 unsigned int layout2_i)
5000{
5001 auto &def_vertex = m_vertices[ud->dest];
5002 auto &use_vertex = m_vertices[ud->src];
5003 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
5004 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
5005 auto factor = change_layout_cost (node: def_vertex.node, from_layout_i: def_layout_i,
5006 to_layout_i: use_layout_i);
5007 if (factor < 0)
5008 return slpg_layout_cost::impossible ();
5009
5010 /* We have a choice of putting the layout change at the site of the
5011 definition or at the site of the use. Prefer the former when
5012 optimizing for size or when the execution frequency of the
5013 definition is no greater than the combined execution frequencies of
5014 the uses. When putting the layout change at the site of the definition,
5015 divvy up the cost among all consumers. */
5016 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
5017 {
5018 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
5019 cost.split (times: def_vertex.out_degree);
5020 return cost;
5021 }
5022 return { use_vertex.weight * factor, m_optimize_size };
5023}
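
/* Worked example for the choice above (hypothetical numbers, weights are
   execution-frequency based): suppose the definition has weight 2, its
   three consumers have combined out_weight 6, and the layout change costs
   FACTOR == 1 permutation per vector.  Since 2 <= 6 we place the change at
   the definition: the cost {2 * 1} is then split three ways, so each
   use-def edge carries roughly a third of it.  If instead the definition
   had weight 8, we would charge the change to this use alone as
   {use_weight * 1}.  */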
5024
5025/* UD represents a use-def link between FROM_NODE_I and a node in a later
5026 partition; FROM_NODE_I could be the definition node or the use node.
5027 The node at the other end of the link wants to use layout TO_LAYOUT_I.
5028 Return the cost of any necessary fix-ups on edge UD, or return
5029 slpg_layout_cost::impossible () if the change isn't possible.
5030
5031 At this point, FROM_NODE_I's partition has chosen the cheapest
5032 layout based on the information available so far, but this choice
5033 is only provisional. */
5034
5035slpg_layout_cost
5036vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5037 unsigned int to_layout_i)
5038{
5039 auto &from_vertex = m_vertices[from_node_i];
5040 unsigned int from_partition_i = from_vertex.partition;
5041 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5042 gcc_assert (from_partition.layout >= 0);
5043
5044 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5045 with its current layout preference. */
5046 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5047 auto edge_cost = edge_layout_cost (ud, node1_i: from_node_i,
5048 layout1_i: from_partition.layout, layout2_i: to_layout_i);
5049 if (edge_cost.is_possible ())
5050 {
5051 auto &from_costs = partition_layout_costs (partition_i: from_partition_i,
5052 layout_i: from_partition.layout);
5053 cost = from_costs.in_cost;
5054 cost.add_serial_cost (other: from_costs.internal_cost);
5055 cost.split (times: from_partition.out_degree);
5056 cost.add_serial_cost (other: edge_cost);
5057 }
5058 else if (from_partition.layout == 0)
5059 /* We must allow the source partition to have layout 0 as a fallback,
5060 in case all other options turn out to be impossible. */
5061 return cost;
5062
5063 /* Take the minimum of that cost and the cost that applies if
5064 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5065 auto &direct_layout_costs = partition_layout_costs (partition_i: from_partition_i,
5066 layout_i: to_layout_i);
5067 if (direct_layout_costs.is_possible ())
5068 {
5069 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5070 direct_cost.add_serial_cost (other: direct_layout_costs.internal_cost);
5071 direct_cost.split (times: from_partition.out_degree);
5072 if (!cost.is_possible ()
5073 || direct_cost.is_better_than (other: cost, is_for_size: m_optimize_size))
5074 cost = direct_cost;
5075 }
5076
5077 return cost;
5078}
5079
5080/* UD represents a use-def link between TO_NODE_I and a node in an earlier
5081 partition; TO_NODE_I could be the definition node or the use node.
5082 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5083 return the cost of any necessary fix-ups on edge UD, or
5084 slpg_layout_cost::impossible () if the choice cannot be made.
5085
5086 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5087
5088slpg_layout_cost
5089vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5090 unsigned int from_layout_i)
5091{
5092 auto &to_vertex = m_vertices[to_node_i];
5093 unsigned int to_partition_i = to_vertex.partition;
5094 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5095 gcc_assert (to_partition.layout >= 0);
5096
5097 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5098 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5099 any other inputs keep their current choice of layout. */
5100 auto &to_costs = partition_layout_costs (partition_i: to_partition_i,
5101 layout_i: to_partition.layout);
5102 if (ud->src == int (to_node_i)
5103 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5104 {
5105 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5106 auto old_layout = from_partition.layout;
5107 from_partition.layout = from_layout_i;
5108 int factor = internal_node_cost (node: to_vertex.node, in_layout_i: -1,
5109 out_layout_i: to_partition.layout);
5110 from_partition.layout = old_layout;
5111 if (factor >= 0)
5112 {
5113 slpg_layout_cost cost = to_costs.out_cost;
5114 cost.add_serial_cost (other: { to_vertex.weight * factor,
5115 m_optimize_size });
5116 cost.split (times: to_partition.in_degree);
5117 return cost;
5118 }
5119 }
5120
5121 /* Compute the cost if we insert any necessary layout change on edge UD. */
5122 auto edge_cost = edge_layout_cost (ud, node1_i: to_node_i,
5123 layout1_i: to_partition.layout, layout2_i: from_layout_i);
5124 if (edge_cost.is_possible ())
5125 {
5126 slpg_layout_cost cost = to_costs.out_cost;
5127 cost.add_serial_cost (other: to_costs.internal_cost);
5128 cost.split (times: to_partition.in_degree);
5129 cost.add_serial_cost (other: edge_cost);
5130 return cost;
5131 }
5132
5133 return slpg_layout_cost::impossible ();
5134}
5135
5136/* Make a forward pass through the partitions, accumulating input costs.
5137 Make a tentative (provisional) choice of layout for each partition,
5138 ensuring that this choice still allows later partitions to keep
5139 their original layout. */
5140
5141void
5142vect_optimize_slp_pass::forward_pass ()
5143{
5144 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5145 ++partition_i)
5146 {
5147 auto &partition = m_partitions[partition_i];
5148
5149 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5150 the incoming cost that would apply if every predecessor partition
5151 keeps its current layout. This is used within the loop below. */
5152 slpg_layout_cost in_cost;
5153 slp_tree single_node = nullptr;
5154 if (partition.node_end == partition.node_begin + 1)
5155 {
5156 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5157 single_node = m_vertices[node_i].node;
5158 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5159 in_cost = total_in_cost (node_i);
5160 }
5161
5162 /* Go through the possible layouts. Decide which ones are valid
5163 for this partition and record which of the valid layouts has
5164 the lowest cost. */
5165 unsigned int min_layout_i = 0;
5166 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5167 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5168 {
5169 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5170 if (!layout_costs.is_possible ())
5171 continue;
5172
5173 /* If the recorded layout is already 0 then the layout cannot
5174 change. */
5175 if (partition.layout == 0 && layout_i != 0)
5176 {
5177 layout_costs.mark_impossible ();
5178 continue;
5179 }
5180
5181 bool is_possible = true;
5182 for (unsigned int order_i = partition.node_begin;
5183 order_i < partition.node_end; ++order_i)
5184 {
5185 unsigned int node_i = m_partitioned_nodes[order_i];
5186 auto &vertex = m_vertices[node_i];
5187
5188 /* Reject the layout if it is individually incompatible
5189 with any node in the partition. */
5190 if (!is_compatible_layout (node: vertex.node, layout_i))
5191 {
5192 is_possible = false;
5193 break;
5194 }
5195
5196 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5197 {
5198 auto &other_vertex = m_vertices[other_node_i];
5199 if (other_vertex.partition < vertex.partition)
5200 {
5201 /* Accumulate the incoming costs from earlier
5202 partitions, plus the cost of any layout changes
5203 on UD itself. */
5204 auto cost = forward_cost (ud, from_node_i: other_node_i, to_layout_i: layout_i);
5205 if (!cost.is_possible ())
5206 is_possible = false;
5207 else
5208 layout_costs.in_cost.add_parallel_cost (input_cost: cost);
5209 }
5210 else
5211 /* Reject the layout if it would make layout 0 impossible
5212 for later partitions. This amounts to testing that the
5213 target supports reversing the layout change on edges
5214 to later partitions.
5215
5216 In principle, it might be possible to push a layout
5217 change all the way down a graph, so that it never
5218 needs to be reversed and so that the target doesn't
5219 need to support the reverse operation. But it would
5220 be awkward to bail out if we hit a partition that
5221 does not support the new layout, especially since
5222 we are not dealing with a lattice. */
5223 is_possible &= edge_layout_cost (ud, node1_i: other_node_i, layout1_i: 0,
5224 layout2_i: layout_i).is_possible ();
5225 };
5226 for_each_partition_edge (node_i, add_cost);
5227
5228 /* Accumulate the cost of using LAYOUT_I within NODE,
5229 both for the inputs and the outputs. */
5230 int factor = internal_node_cost (node: vertex.node, in_layout_i: layout_i,
5231 out_layout_i: layout_i);
5232 if (factor < 0)
5233 {
5234 is_possible = false;
5235 break;
5236 }
5237 else if (factor)
5238 layout_costs.internal_cost.add_serial_cost
5239 (other: { vertex.weight * factor, m_optimize_size });
5240 }
5241 if (!is_possible)
5242 {
5243 layout_costs.mark_impossible ();
5244 continue;
5245 }
5246
5247 /* Combine the incoming and partition-internal costs. */
5248 slpg_layout_cost combined_cost = layout_costs.in_cost;
5249 combined_cost.add_serial_cost (other: layout_costs.internal_cost);
5250
5251 /* If this partition consists of a single VEC_PERM_EXPR, see
5252 if the VEC_PERM_EXPR can be changed to support output layout
5253 LAYOUT_I while keeping all the provisional choices of input
5254 layout. */
5255 if (single_node
5256 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5257 {
5258 int factor = internal_node_cost (node: single_node, in_layout_i: -1, out_layout_i: layout_i);
5259 if (factor >= 0)
5260 {
5261 auto weight = m_vertices[single_node->vertex].weight;
5262 slpg_layout_cost internal_cost
5263 = { weight * factor, m_optimize_size };
5264
5265 slpg_layout_cost alt_cost = in_cost;
5266 alt_cost.add_serial_cost (other: internal_cost);
5267 if (alt_cost.is_better_than (other: combined_cost, is_for_size: m_optimize_size))
5268 {
5269 combined_cost = alt_cost;
5270 layout_costs.in_cost = in_cost;
5271 layout_costs.internal_cost = internal_cost;
5272 }
5273 }
5274 }
5275
5276 /* Record the layout with the lowest cost. Prefer layout 0 in
5277 the event of a tie between it and another layout. */
5278 if (!min_layout_cost.is_possible ()
5279 || combined_cost.is_better_than (other: min_layout_cost,
5280 is_for_size: m_optimize_size))
5281 {
5282 min_layout_i = layout_i;
5283 min_layout_cost = combined_cost;
5284 }
5285 }
5286
5287 /* This loop's handling of earlier partitions should ensure that
5288 choosing the original layout for the current partition is no
5289 less valid than it was in the original graph, even with the
5290 provisional layout choices for those earlier partitions. */
5291 gcc_assert (min_layout_cost.is_possible ());
5292 partition.layout = min_layout_i;
5293 }
5294}
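
/* A minimal sketch of the shape of this forward pass, using plain types
   rather than the real slpg_* structures (all names here are invented for
   illustration):

     // For each partition in topological order, try every candidate layout
     // and keep the cheapest one that later partitions can still undo
     // (i.e. layout 0 remains reachable on outgoing edges).
     for (size_t p = 0; p < partitions.size (); ++p)
       {
         double best = INFINITY;
         int best_layout = 0;
         for (int l = 0; l < num_layouts; ++l)
           {
             double c = incoming_cost (p, l) + internal_cost (p, l);
             if (c < best)
               best = c, best_layout = l;
           }
         partitions[p].layout = best_layout;
       }

   The real code additionally tracks {depth, total} pairs, rejects layouts
   that any node in the partition cannot support, and special-cases
   partitions that consist of a single VEC_PERM_EXPR.  */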
5295
5296/* Make a backward pass through the partitions, accumulating output costs.
5297 Make a final choice of layout for each partition. */
5298
5299void
5300vect_optimize_slp_pass::backward_pass ()
5301{
5302 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5303 {
5304 auto &partition = m_partitions[partition_i];
5305
5306 unsigned int min_layout_i = 0;
5307 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5308 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5309 {
5310 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5311 if (!layout_costs.is_possible ())
5312 continue;
5313
5314 /* Accumulate the costs from successor partitions. */
5315 bool is_possible = true;
5316 for (unsigned int order_i = partition.node_begin;
5317 order_i < partition.node_end; ++order_i)
5318 {
5319 unsigned int node_i = m_partitioned_nodes[order_i];
5320 auto &vertex = m_vertices[node_i];
5321 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5322 {
5323 auto &other_vertex = m_vertices[other_node_i];
5324 auto &other_partition = m_partitions[other_vertex.partition];
5325 if (other_vertex.partition > vertex.partition)
5326 {
5327 /* Accumulate the incoming costs from later
5328 partitions, plus the cost of any layout changes
5329 on UD itself. */
5330 auto cost = backward_cost (ud, to_node_i: other_node_i, from_layout_i: layout_i);
5331 if (!cost.is_possible ())
5332 is_possible = false;
5333 else
5334 layout_costs.out_cost.add_parallel_cost (input_cost: cost);
5335 }
5336 else
5337 /* Make sure that earlier partitions can (if necessary
5338 or beneficial) keep the layout that they chose in
5339 the forward pass. This ensures that there is at
5340 least one valid choice of layout. */
5341 is_possible &= edge_layout_cost (ud, node1_i: other_node_i,
5342 layout1_i: other_partition.layout,
5343 layout2_i: layout_i).is_possible ();
5344 };
5345 for_each_partition_edge (node_i, add_cost);
5346 }
5347 if (!is_possible)
5348 {
5349 layout_costs.mark_impossible ();
5350 continue;
5351 }
5352
5353 /* Locally combine the costs from the forward and backward passes.
5354 (This combined cost is not passed on, since that would lead
5355 to double counting.) */
5356 slpg_layout_cost combined_cost = layout_costs.in_cost;
5357 combined_cost.add_serial_cost (other: layout_costs.internal_cost);
5358 combined_cost.add_serial_cost (other: layout_costs.out_cost);
5359
5360 /* Record the layout with the lowest cost. Prefer layout 0 in
5361 the event of a tie between it and another layout. */
5362 if (!min_layout_cost.is_possible ()
5363 || combined_cost.is_better_than (other: min_layout_cost,
5364 is_for_size: m_optimize_size))
5365 {
5366 min_layout_i = layout_i;
5367 min_layout_cost = combined_cost;
5368 }
5369 }
5370
5371 gcc_assert (min_layout_cost.is_possible ());
5372 partition.layout = min_layout_i;
5373 }
5374}
5375
5376/* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5377 NODE already has the layout that was selected for its partition. */
5378
5379slp_tree
5380vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5381 unsigned int to_layout_i)
5382{
5383 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5384 slp_tree result = m_node_layouts[result_i];
5385 if (result)
5386 return result;
5387
5388 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5389 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5390 /* We can't permute vector defs in place. */
5391 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5392 {
5393 /* If the vector is uniform or unchanged, there's nothing to do. */
5394 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5395 result = node;
5396 else
5397 {
5398 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5399 result = vect_create_new_slp_node (ops: scalar_ops);
5400 vect_slp_permute (perm: m_perms[to_layout_i], vec&: scalar_ops, reverse: true);
5401 }
5402 }
5403 else
5404 {
5405 unsigned int partition_i = m_vertices[node->vertex].partition;
5406 unsigned int from_layout_i = m_partitions[partition_i].layout;
5407 if (from_layout_i == to_layout_i)
5408 return node;
5409
5410 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5411 permutation instead of a serial one. Leave the new permutation
5412 in TMP_PERM on success. */
5413 auto_lane_permutation_t tmp_perm;
5414 unsigned int num_inputs = 1;
5415 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5416 {
5417 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5418 if (from_layout_i != 0)
5419 vect_slp_permute (perm: m_perms[from_layout_i], vec&: tmp_perm, reverse: false);
5420 if (to_layout_i != 0)
5421 vect_slp_permute (perm: m_perms[to_layout_i], vec&: tmp_perm, reverse: true);
5422 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5423 tmp_perm,
5424 SLP_TREE_CHILDREN (node),
5425 false) >= 0)
5426 num_inputs = SLP_TREE_CHILDREN (node).length ();
5427 else
5428 tmp_perm.truncate (size: 0);
5429 }
5430
5431 if (dump_enabled_p ())
5432 {
5433 if (tmp_perm.length () > 0)
5434 dump_printf_loc (MSG_NOTE, vect_location,
5435 "duplicating permutation node %p with"
5436 " layout %d\n",
5437 (void *) node, to_layout_i);
5438 else
5439 dump_printf_loc (MSG_NOTE, vect_location,
5440 "inserting permutation node in place of %p\n",
5441 (void *) node);
5442 }
5443
5444 unsigned int num_lanes = SLP_TREE_LANES (node);
5445 result = vect_create_new_slp_node (nops: num_inputs, code: VEC_PERM_EXPR);
5446 if (SLP_TREE_SCALAR_STMTS (node).length ())
5447 {
5448 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5449 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5450 if (from_layout_i != 0)
5451 vect_slp_permute (perm: m_perms[from_layout_i], vec&: stmts, reverse: false);
5452 if (to_layout_i != 0)
5453 vect_slp_permute (perm: m_perms[to_layout_i], vec&: stmts, reverse: true);
5454 }
5455 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5456 SLP_TREE_LANES (result) = num_lanes;
5457 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5458 result->vertex = -1;
5459
5460 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5461 if (tmp_perm.length ())
5462 {
5463 lane_perm.safe_splice (src: tmp_perm);
5464 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5465 }
5466 else
5467 {
5468 lane_perm.create (nelems: num_lanes);
5469 for (unsigned j = 0; j < num_lanes; ++j)
5470 lane_perm.quick_push (obj: { 0, j });
5471 if (from_layout_i != 0)
5472 vect_slp_permute (perm: m_perms[from_layout_i], vec&: lane_perm, reverse: false);
5473 if (to_layout_i != 0)
5474 vect_slp_permute (perm: m_perms[to_layout_i], vec&: lane_perm, reverse: true);
5475 SLP_TREE_CHILDREN (result).safe_push (obj: node);
5476 }
5477 for (slp_tree child : SLP_TREE_CHILDREN (result))
5478 child->refcnt++;
5479 }
5480 m_node_layouts[result_i] = result;
5481 return result;
5482}
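
/* Illustration (hypothetical data): if layout 2 is the permutation
   { 2, 3, 0, 1 } and a node's scalar lanes are { a, b, c, d } in the
   original order, then materializing layout 2 reorders the lanes to
   { c, d, a, b }.  A consumer that wants the original order receives a
   VEC_PERM_EXPR wrapper whose lane permutation undoes that change, i.e.
   it applies the inverse permutation (for { 2, 3, 0, 1 } the inverse is
   the same permutation).  */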
5483
5484/* Apply the chosen vector layouts to the SLP graph. */
5485
5486void
5487vect_optimize_slp_pass::materialize ()
5488{
5489 /* We no longer need the costs, so avoid having two O(N * P) arrays
5490 live at the same time. */
5491 m_partition_layout_costs.release ();
5492 m_node_layouts.safe_grow_cleared (len: m_vertices.length () * m_perms.length ());
5493
5494 auto_sbitmap fully_folded (m_vertices.length ());
5495 bitmap_clear (fully_folded);
5496 for (unsigned int node_i : m_partitioned_nodes)
5497 {
5498 auto &vertex = m_vertices[node_i];
5499 slp_tree node = vertex.node;
5500 int layout_i = m_partitions[vertex.partition].layout;
5501 gcc_assert (layout_i >= 0);
5502
5503 /* Rearrange the scalar statements to match the chosen layout. */
5504 if (layout_i > 0)
5505 vect_slp_permute (perm: m_perms[layout_i],
5506 SLP_TREE_SCALAR_STMTS (node), reverse: true);
5507
5508 /* Update load and lane permutations. */
5509 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5510 {
5511 /* First try to absorb the input vector layouts. If that fails,
5512 force the inputs to have layout LAYOUT_I too. We checked that
5513 that was possible before deciding to use nonzero output layouts.
5514 (Note that at this stage we don't really have any guarantee that
5515 the target supports the original VEC_PERM_EXPR.) */
5516 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5517 auto_lane_permutation_t tmp_perm;
5518 tmp_perm.safe_splice (src: perm);
5519 change_vec_perm_layout (node, perm&: tmp_perm, in_layout_i: -1, out_layout_i: layout_i);
5520 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5521 tmp_perm,
5522 SLP_TREE_CHILDREN (node),
5523 false) >= 0)
5524 {
5525 if (dump_enabled_p ()
5526 && !std::equal (first1: tmp_perm.begin (), last1: tmp_perm.end (),
5527 first2: perm.begin ()))
5528 dump_printf_loc (MSG_NOTE, vect_location,
5529 "absorbing input layouts into %p\n",
5530 (void *) node);
5531 std::copy (first: tmp_perm.begin (), last: tmp_perm.end (), result: perm.begin ());
5532 bitmap_set_bit (map: fully_folded, bitno: node_i);
5533 }
5534 else
5535 {
5536 /* Not MSG_MISSED because it would make no sense to users. */
5537 if (dump_enabled_p ())
5538 dump_printf_loc (MSG_NOTE, vect_location,
5539 "failed to absorb input layouts into %p\n",
5540 (void *) node);
5541 change_vec_perm_layout (node: nullptr, perm, in_layout_i: layout_i, out_layout_i: layout_i);
5542 }
5543 }
5544 else
5545 {
5546 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5547 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5548 if (layout_i > 0)
5549 /* ??? When we handle non-bijective permutes the idea
5550 is that we can force the load-permutation to be
5551 { min, min + 1, min + 2, ... max }. But then the
5552 scalar defs might no longer match the lane content
5553 which means wrong-code with live lane vectorization.
5554 So we possibly have to have NULL entries for those. */
5555 vect_slp_permute (perm: m_perms[layout_i], vec&: load_perm, reverse: true);
5556 }
5557 }
5558
5559 /* Do this before any nodes disappear, since it involves a walk
5560 over the leaves. */
5561 remove_redundant_permutations ();
5562
5563 /* Replace each child with a correctly laid-out version. */
5564 for (unsigned int node_i : m_partitioned_nodes)
5565 {
5566 /* Skip nodes that have already been handled above. */
5567 if (bitmap_bit_p (map: fully_folded, bitno: node_i))
5568 continue;
5569
5570 auto &vertex = m_vertices[node_i];
5571 int in_layout_i = m_partitions[vertex.partition].layout;
5572 gcc_assert (in_layout_i >= 0);
5573
5574 unsigned j;
5575 slp_tree child;
5576 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5577 {
5578 if (!child)
5579 continue;
5580
5581 slp_tree new_child = get_result_with_layout (node: child, to_layout_i: in_layout_i);
5582 if (new_child != child)
5583 {
5584 vect_free_slp_tree (node: child);
5585 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5586 new_child->refcnt += 1;
5587 }
5588 }
5589 }
5590}
5591
5592/* Elide load permutations that are not necessary. Such permutations might
5593 be pre-existing, rather than created by the layout optimizations. */
5594
5595void
5596vect_optimize_slp_pass::remove_redundant_permutations ()
5597{
5598 for (unsigned int node_i : m_leafs)
5599 {
5600 slp_tree node = m_vertices[node_i].node;
5601 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5602 continue;
5603
5604 /* In basic block vectorization we allow any subchain of an interleaving
5605 chain.
5606 FORNOW: not in loop SLP because of realignment complications. */
5607 if (is_a <bb_vec_info> (p: m_vinfo))
5608 {
5609 bool subchain_p = true;
5610 stmt_vec_info next_load_info = NULL;
5611 stmt_vec_info load_info;
5612 unsigned j;
5613 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5614 {
5615 if (j != 0
5616 && (next_load_info != load_info
5617 || DR_GROUP_GAP (load_info) != 1))
5618 {
5619 subchain_p = false;
5620 break;
5621 }
5622 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5623 }
5624 if (subchain_p)
5625 {
5626 SLP_TREE_LOAD_PERMUTATION (node).release ();
5627 continue;
5628 }
5629 }
5630 else
5631 {
5632 loop_vec_info loop_vinfo = as_a<loop_vec_info> (p: m_vinfo);
5633 stmt_vec_info load_info;
5634 bool this_load_permuted = false;
5635 unsigned j;
5636 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5637 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5638 {
5639 this_load_permuted = true;
5640 break;
5641 }
5642 /* When this isn't a grouped access we know it's a single-element,
5643 contiguous access. */
5644 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5645 {
5646 if (!this_load_permuted
5647 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5648 || SLP_TREE_LANES (node) == 1))
5649 SLP_TREE_LOAD_PERMUTATION (node).release ();
5650 continue;
5651 }
5652 stmt_vec_info first_stmt_info
5653 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5654 if (!this_load_permuted
5655 /* The load requires permutation when unrolling exposes
5656 a gap either because the group is larger than the SLP
5657 group-size or because there is a gap between the groups. */
5658 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5659 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5660 && DR_GROUP_GAP (first_stmt_info) == 0)))
5661 {
5662 SLP_TREE_LOAD_PERMUTATION (node).release ();
5663 continue;
5664 }
5665 }
5666 }
5667}
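
/* Example of the simplest case handled above (hypothetical loads): a loop
   SLP node whose scalar stmts load a[4*i+0], a[4*i+1], a[4*i+2], a[4*i+3]
   from a group of size 4 has the identity load permutation { 0, 1, 2, 3 }.
   When the SLP lanes cover the whole group and there is no gap (or the
   vectorization factor is 1), the permutation carries no information and
   is released, so the loads can be vectorized as a plain contiguous
   access.  */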
5668
5669/* Print the partition graph and layout information to the dump file. */
5670
5671void
5672vect_optimize_slp_pass::dump ()
5673{
5674 dump_printf_loc (MSG_NOTE, vect_location,
5675 "SLP optimize permutations:\n");
5676 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5677 {
5678 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5679 const char *sep = "";
5680 for (unsigned int idx : m_perms[layout_i])
5681 {
5682 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5683 sep = ", ";
5684 }
5685 dump_printf (MSG_NOTE, " }\n");
5686 }
5687 dump_printf_loc (MSG_NOTE, vect_location,
5688 "SLP optimize partitions:\n");
5689 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5690 ++partition_i)
5691 {
5692 auto &partition = m_partitions[partition_i];
5693 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 " partition %d (layout %d):\n",
5696 partition_i, partition.layout);
5697 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5698 for (unsigned int order_i = partition.node_begin;
5699 order_i < partition.node_end; ++order_i)
5700 {
5701 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5702 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5703 (void *) vertex.node);
5704 dump_printf_loc (MSG_NOTE, vect_location,
5705 " weight: %f\n",
5706 vertex.weight.to_double ());
5707 if (vertex.out_degree)
5708 dump_printf_loc (MSG_NOTE, vect_location,
5709 " out weight: %f (degree %d)\n",
5710 vertex.out_weight.to_double (),
5711 vertex.out_degree);
5712 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5713 dump_printf_loc (MSG_NOTE, vect_location,
5714 " op: VEC_PERM_EXPR\n");
5715 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5716 dump_printf_loc (MSG_NOTE, vect_location,
5717 " op template: %G", rep->stmt);
5718 }
5719 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5720 for (unsigned int order_i = partition.node_begin;
5721 order_i < partition.node_end; ++order_i)
5722 {
5723 unsigned int node_i = m_partitioned_nodes[order_i];
5724 auto &vertex = m_vertices[node_i];
5725 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5726 {
5727 auto &other_vertex = m_vertices[other_node_i];
5728 if (other_vertex.partition < vertex.partition)
5729 dump_printf_loc (MSG_NOTE, vect_location,
5730 " - %p [%d] --> %p\n",
5731 (void *) other_vertex.node,
5732 other_vertex.partition,
5733 (void *) vertex.node);
5734 else
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 " - %p --> [%d] %p\n",
5737 (void *) vertex.node,
5738 other_vertex.partition,
5739 (void *) other_vertex.node);
5740 };
5741 for_each_partition_edge (node_i, print_edge);
5742 }
5743
5744 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5745 {
5746 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5747 if (layout_costs.is_possible ())
5748 {
5749 dump_printf_loc (MSG_NOTE, vect_location,
5750 " layout %d:%s\n", layout_i,
5751 partition.layout == int (layout_i)
5752 ? " (*)" : "");
5753 slpg_layout_cost combined_cost = layout_costs.in_cost;
5754 combined_cost.add_serial_cost (other: layout_costs.internal_cost);
5755 combined_cost.add_serial_cost (other: layout_costs.out_cost);
5756#define TEMPLATE "{depth: %f, total: %f}"
5757 dump_printf_loc (MSG_NOTE, vect_location,
5758 " " TEMPLATE "\n",
5759 layout_costs.in_cost.depth.to_double (),
5760 layout_costs.in_cost.total.to_double ());
5761 dump_printf_loc (MSG_NOTE, vect_location,
5762 " + " TEMPLATE "\n",
5763 layout_costs.internal_cost.depth.to_double (),
5764 layout_costs.internal_cost.total.to_double ());
5765 dump_printf_loc (MSG_NOTE, vect_location,
5766 " + " TEMPLATE "\n",
5767 layout_costs.out_cost.depth.to_double (),
5768 layout_costs.out_cost.total.to_double ());
5769 dump_printf_loc (MSG_NOTE, vect_location,
5770 " = " TEMPLATE "\n",
5771 combined_cost.depth.to_double (),
5772 combined_cost.total.to_double ());
5773#undef TEMPLATE
5774 }
5775 else
5776 dump_printf_loc (MSG_NOTE, vect_location,
5777 " layout %d: rejected\n", layout_i);
5778 }
5779 }
5780}
5781
5782/* Main entry point for the SLP graph optimization pass. */
5783
5784void
5785vect_optimize_slp_pass::run ()
5786{
5787 build_graph ();
5788 create_partitions ();
5789 start_choosing_layouts ();
5790 if (m_perms.length () > 1)
5791 {
5792 forward_pass ();
5793 backward_pass ();
5794 if (dump_enabled_p ())
5795 dump ();
5796 materialize ();
5797 while (!m_perms.is_empty ())
5798 m_perms.pop ().release ();
5799 }
5800 else
5801 remove_redundant_permutations ();
5802 free_graph (g: m_slpg);
5803}
5804
5805/* Optimize the SLP graph of VINFO. */
5806
5807void
5808vect_optimize_slp (vec_info *vinfo)
5809{
5810 if (vinfo->slp_instances.is_empty ())
5811 return;
5812 vect_optimize_slp_pass (vinfo).run ();
5813}
5814
5815/* Gather loads reachable from the individual SLP graph entries. */
5816
5817void
5818vect_gather_slp_loads (vec_info *vinfo)
5819{
5820 unsigned i;
5821 slp_instance instance;
5822 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5823 {
5824 hash_set<slp_tree> visited;
5825 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5826 SLP_INSTANCE_TREE (instance), visited);
5827 }
5828}
5829
5830
5831/* For each possible SLP instance decide whether to SLP it and calculate the
5832 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
5833 to SLP at least one instance. */
5834
5835bool
5836vect_make_slp_decision (loop_vec_info loop_vinfo)
5837{
5838 unsigned int i;
5839 poly_uint64 unrolling_factor = 1;
5840 const vec<slp_instance> &slp_instances
5841 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5842 slp_instance instance;
5843 int decided_to_slp = 0;
5844
5845 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5846
5847 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5848 {
5849 /* FORNOW: SLP if you can. */
5850 /* All unroll factors have the form:
5851
5852 GET_MODE_SIZE (vinfo->vector_mode) * X
5853
5854 for some rational X, so they must have a common multiple. */
5855 unrolling_factor
5856 = force_common_multiple (a: unrolling_factor,
5857 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5858
5859 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5860 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5861 loop-based vectorization. Such stmts will be marked as HYBRID. */
5862 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5863 decided_to_slp++;
5864 }
5865
5866 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5867
5868 if (decided_to_slp && dump_enabled_p ())
5869 {
5870 dump_printf_loc (MSG_NOTE, vect_location,
5871 "Decided to SLP %d instances. Unrolling factor ",
5872 decided_to_slp);
5873 dump_dec (MSG_NOTE, unrolling_factor);
5874 dump_printf (MSG_NOTE, "\n");
5875 }
5876
5877 return (decided_to_slp > 0);
5878}
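
/* Worked example (hypothetical instances): with V4SI vectors, an SLP
   instance of group size 2 needs unrolling factor 4/2 = 2, while an
   instance of group size 4 needs unrolling factor 1; force_common_multiple
   over the instances gives 2, so the loop as a whole is unrolled twice for
   SLP.  With variable-length vectors the factors are poly_uint64 values of
   the form described above, which is why force_common_multiple rather than
   a plain integer lcm is used.  */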
5879
5880/* Private data for vect_detect_hybrid_slp. */
5881struct vdhs_data
5882{
5883 loop_vec_info loop_vinfo;
5884 vec<stmt_vec_info> *worklist;
5885};
5886
5887/* Walker for walk_gimple_op. */
5888
5889static tree
5890vect_detect_hybrid_slp (tree *tp, int *, void *data)
5891{
5892 walk_stmt_info *wi = (walk_stmt_info *)data;
5893 vdhs_data *dat = (vdhs_data *)wi->info;
5894
5895 if (wi->is_lhs)
5896 return NULL_TREE;
5897
5898 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5899 if (!def_stmt_info)
5900 return NULL_TREE;
5901 def_stmt_info = vect_stmt_to_vectorize (stmt_info: def_stmt_info);
5902 if (PURE_SLP_STMT (def_stmt_info))
5903 {
5904 if (dump_enabled_p ())
5905 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5906 def_stmt_info->stmt);
5907 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5908 dat->worklist->safe_push (obj: def_stmt_info);
5909 }
5910
5911 return NULL_TREE;
5912}
5913
5914/* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5915 if so; otherwise push it to WORKLIST. */
5916
5917static void
5918maybe_push_to_hybrid_worklist (vec_info *vinfo,
5919 vec<stmt_vec_info> &worklist,
5920 stmt_vec_info stmt_info)
5921{
5922 if (dump_enabled_p ())
5923 dump_printf_loc (MSG_NOTE, vect_location,
5924 "Processing hybrid candidate : %G", stmt_info->stmt);
5925 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5926 imm_use_iterator iter2;
5927 ssa_op_iter iter1;
5928 use_operand_p use_p;
5929 def_operand_p def_p;
5930 bool any_def = false;
5931 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5932 {
5933 any_def = true;
5934 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5935 {
5936 if (is_gimple_debug (USE_STMT (use_p)))
5937 continue;
5938 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5939 /* An out-of-loop use means this is a loop_vect sink. */
5940 if (!use_info)
5941 {
5942 if (dump_enabled_p ())
5943 dump_printf_loc (MSG_NOTE, vect_location,
5944 "Found loop_vect sink: %G", stmt_info->stmt);
5945 worklist.safe_push (obj: stmt_info);
5946 return;
5947 }
5948 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5949 {
5950 if (dump_enabled_p ())
5951 dump_printf_loc (MSG_NOTE, vect_location,
5952 "Found loop_vect use: %G", use_info->stmt);
5953 worklist.safe_push (obj: stmt_info);
5954 return;
5955 }
5956 }
5957 }
5958 /* No def means this is a loop_vect sink. */
5959 if (!any_def)
5960 {
5961 if (dump_enabled_p ())
5962 dump_printf_loc (MSG_NOTE, vect_location,
5963 "Found loop_vect sink: %G", stmt_info->stmt);
5964 worklist.safe_push (obj: stmt_info);
5965 return;
5966 }
5967 if (dump_enabled_p ())
5968 dump_printf_loc (MSG_NOTE, vect_location,
5969 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5970 STMT_SLP_TYPE (stmt_info) = pure_slp;
5971}
5972
5973/* Find stmts that must be both vectorized and SLPed. */
5974
5975void
5976vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5977{
5978 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5979
5980 /* All stmts participating in SLP are marked pure_slp, all other
5981 stmts are loop_vect.
5982 First collect all loop_vect stmts into a worklist.
5983 SLP patterns cause not all original scalar stmts to appear in
5984 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5985 Rectify this here and do a backward walk over the IL, only considering
5986 stmts as loop_vect when they are used by a loop_vect stmt, and otherwise
5987 marking them as pure_slp. */
5988 auto_vec<stmt_vec_info> worklist;
5989 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5990 {
5991 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5992 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi);
5993 gsi_next (i: &gsi))
5994 {
5995 gphi *phi = gsi.phi ();
5996 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5997 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5998 maybe_push_to_hybrid_worklist (vinfo: loop_vinfo,
5999 worklist, stmt_info);
6000 }
6001 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (i: gsi);
6002 gsi_prev (i: &gsi))
6003 {
6004 gimple *stmt = gsi_stmt (i: gsi);
6005 if (is_gimple_debug (gs: stmt))
6006 continue;
6007 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
6008 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
6009 {
6010 for (gimple_stmt_iterator gsi2
6011 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
6012 !gsi_end_p (i: gsi2); gsi_next (i: &gsi2))
6013 {
6014 stmt_vec_info patt_info
6015 = loop_vinfo->lookup_stmt (gsi_stmt (i: gsi2));
6016 if (!STMT_SLP_TYPE (patt_info)
6017 && STMT_VINFO_RELEVANT (patt_info))
6018 maybe_push_to_hybrid_worklist (vinfo: loop_vinfo,
6019 worklist, stmt_info: patt_info);
6020 }
6021 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6022 }
6023 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
6024 maybe_push_to_hybrid_worklist (vinfo: loop_vinfo,
6025 worklist, stmt_info);
6026 }
6027 }
6028
6029 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
6030 mark any SLP vectorized stmt as hybrid.
6031 ??? We're visiting def stmts N times (once for each non-SLP and
6032 once for each hybrid-SLP use). */
6033 walk_stmt_info wi;
6034 vdhs_data dat;
6035 dat.worklist = &worklist;
6036 dat.loop_vinfo = loop_vinfo;
6037 memset (&wi, 0, sizeof (wi));
6038 wi.info = (void *)&dat;
6039 while (!worklist.is_empty ())
6040 {
6041 stmt_vec_info stmt_info = worklist.pop ();
6042 /* Since SSA operands are not set up for pattern stmts we need
6043 to use walk_gimple_op. */
6044 wi.is_lhs = 0;
6045 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6046 /* For gather/scatter make sure to walk the offset operand, that
6047 can be a scaling and conversion away. */
6048 gather_scatter_info gs_info;
6049 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6050 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6051 {
6052 int dummy;
6053 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6054 }
6055 }
6056}
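
/* Roughly, the worklist propagation above is a reverse reachability
   computation: every pure_slp definition feeding a worklist statement is
   flipped to hybrid and itself enqueued, so the hybrid marking spreads
   along use->def chains until a fixed point is reached.  For example
   (hypothetical, simplified):

     t1 = a[i] * 2;      // pure_slp
     t2 = t1 + b[i];     // pure_slp
     sum += t2;          // loop_vect seed (reduction not covered by SLP)

   The seed marks t2 hybrid and enqueues it; processing t2 then marks t1
   (and any other pure-SLP definitions t2 uses) hybrid as well, since the
   loop-vectorized copy of t2 needs loop-vectorized definitions for its
   operands.  */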
6057
6058
6059/* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6060
6061_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6062 : vec_info (vec_info::bb, shared),
6063 bbs (_bbs),
6064 roots (vNULL)
6065{
6066 for (unsigned i = 0; i < bbs.length (); ++i)
6067 {
6068 if (i != 0)
6069 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (i: si);
6070 gsi_next (i: &si))
6071 {
6072 gphi *phi = si.phi ();
6073 gimple_set_uid (g: phi, uid: 0);
6074 add_stmt (phi);
6075 }
6076 for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bbs[i]);
6077 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
6078 {
6079 gimple *stmt = gsi_stmt (i: gsi);
6080 gimple_set_uid (g: stmt, uid: 0);
6081 if (is_gimple_debug (gs: stmt))
6082 continue;
6083 add_stmt (stmt);
6084 }
6085 }
6086}
6087
6088
6089/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6090 stmts in the basic block. */
6091
6092_bb_vec_info::~_bb_vec_info ()
6093{
6094 /* Reset region marker. */
6095 for (unsigned i = 0; i < bbs.length (); ++i)
6096 {
6097 if (i != 0)
6098 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (i: si);
6099 gsi_next (i: &si))
6100 {
6101 gphi *phi = si.phi ();
6102 gimple_set_uid (g: phi, uid: -1);
6103 }
6104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bbs[i]);
6105 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
6106 {
6107 gimple *stmt = gsi_stmt (i: gsi);
6108 gimple_set_uid (g: stmt, uid: -1);
6109 }
6110 }
6111
6112 for (unsigned i = 0; i < roots.length (); ++i)
6113 {
6114 roots[i].stmts.release ();
6115 roots[i].roots.release ();
6116 roots[i].remain.release ();
6117 }
6118 roots.release ();
6119}
6120
6121/* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6122 given that its child nodes have already been processed and that
6123 their def types currently match their SLP node's def type. */
6124
6125static bool
6126vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6127 slp_instance node_instance,
6128 stmt_vector_for_cost *cost_vec)
6129{
6130 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6131
6132 /* Calculate the number of vector statements to be created for the
6133 scalar stmts in this node. For SLP reductions it is equal to the
6134 number of vector statements in the children (which has already been
6135 calculated by the recursive call). Otherwise it is the number of
6136 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6137 VF divided by the number of elements in a vector. */
6138 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6139 && !STMT_VINFO_DATA_REF (stmt_info)
6140 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6141 {
6142 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6143 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6144 {
6145 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6146 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6147 break;
6148 }
6149 }
6150 else
6151 {
6152 poly_uint64 vf;
6153 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
6154 vf = loop_vinfo->vectorization_factor;
6155 else
6156 vf = 1;
6157 unsigned int group_size = SLP_TREE_LANES (node);
6158 tree vectype = SLP_TREE_VECTYPE (node);
6159 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6160 = vect_get_num_vectors (nunits: vf * group_size, vectype);
6161 }
6162
6163 /* Handle purely internal nodes. */
6164 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6165 {
6166 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6167 return false;
6168
6169 stmt_vec_info slp_stmt_info;
6170 unsigned int i;
6171 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6172 {
6173 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6174 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6175 node_instance, i,
6176 false, cost_vec))
6177 return false;
6178 }
6179 return true;
6180 }
6181
6182 bool dummy;
6183 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6184 node, node_instance, cost_vec);
6185}
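
/* Worked example for the SLP_TREE_NUMBER_OF_VEC_STMTS computation above
   (hypothetical numbers): with a vectorization factor of 4, an SLP node
   with 2 lanes of int and a V4SI vector type needs
   vect_get_num_vectors (4 * 2, V4SI) = 8 / 4 = 2 vector statements.
   In the basic-block case the factor is 1, so a 4-lane V4SI node needs
   exactly one vector statement.  */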
6186
6187/* Try to build NODE from scalars, returning true on success.
6188 NODE_INSTANCE is the SLP instance that contains NODE. */
6189
6190static bool
6191vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6192 slp_instance node_instance)
6193{
6194 stmt_vec_info stmt_info;
6195 unsigned int i;
6196
6197 if (!is_a <bb_vec_info> (p: vinfo)
6198 || node == SLP_INSTANCE_TREE (node_instance)
6199 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6200 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6201 /* Force the mask use to be built from scalars instead. */
6202 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6203 return false;
6204
6205 if (dump_enabled_p ())
6206 dump_printf_loc (MSG_NOTE, vect_location,
6207 "Building vector operands of %p from scalars instead\n",
6208 (void *) node);
6209
6210 /* Don't remove and free the child nodes here, since they could be
6211 referenced by other structures. The analysis and scheduling phases
6212 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6213 unsigned int group_size = SLP_TREE_LANES (node);
6214 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6215 /* Invariants get their vector type from the uses. */
6216 SLP_TREE_VECTYPE (node) = NULL_TREE;
6217 SLP_TREE_SCALAR_OPS (node).safe_grow (len: group_size, exact: true);
6218 SLP_TREE_LOAD_PERMUTATION (node).release ();
6219 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6220 {
6221 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6222 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6223 }
6224 return true;
6225}
6226
6227/* Return true if all elements of the slice are the same. */
6228bool
6229vect_scalar_ops_slice::all_same_p () const
6230{
6231 for (unsigned int i = 1; i < length; ++i)
6232 if (!operand_equal_p (op (i: 0), op (i)))
6233 return false;
6234 return true;
6235}
6236
6237hashval_t
6238vect_scalar_ops_slice_hash::hash (const value_type &s)
6239{
6240 hashval_t hash = 0;
6241 for (unsigned i = 0; i < s.length; ++i)
6242 hash = iterative_hash_expr (tree: s.op (i), seed: hash);
6243 return hash;
6244}
6245
6246bool
6247vect_scalar_ops_slice_hash::equal (const value_type &s1,
6248 const compare_type &s2)
6249{
6250 if (s1.length != s2.length)
6251 return false;
6252 for (unsigned i = 0; i < s1.length; ++i)
6253 if (!operand_equal_p (s1.op (i), s2.op (i)))
6254 return false;
6255 return true;
6256}
6257
6258/* Compute the prologue cost for invariant or constant operands represented
6259 by NODE. */
6260
6261static void
6262vect_prologue_cost_for_slp (slp_tree node,
6263 stmt_vector_for_cost *cost_vec)
6264{
6265 /* There's a special case of an existing vector, that costs nothing. */
6266 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6267 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6268 return;
6269 /* Without looking at the actual initializer a vector of
6270 constants can be implemented as load from the constant pool.
6271 When all elements are the same we can use a splat. */
6272 tree vectype = SLP_TREE_VECTYPE (node);
6273 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6274 unsigned HOST_WIDE_INT const_nunits;
6275 unsigned nelt_limit;
6276 auto ops = &SLP_TREE_SCALAR_OPS (node);
6277 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6278 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6279 && ! multiple_p (const_nunits, group_size))
6280 {
6281 nelt_limit = const_nunits;
6282 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6283 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6284 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6285 starts.quick_push (obj: i * const_nunits);
6286 }
6287 else
6288 {
6289 /* If either the vector has variable length or the vectors
6290 are composed of repeated whole groups we only need to
6291 cost construction once. All vectors will be the same. */
6292 nelt_limit = group_size;
6293 starts.quick_push (obj: 0);
6294 }
6295 /* ??? We're just tracking whether vectors in a single node are the same.
6296 Ideally we'd do something more global. */
6297 bool passed = false;
6298 for (unsigned int start : starts)
6299 {
6300 vect_cost_for_stmt kind;
6301 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6302 kind = vector_load;
6303 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6304 kind = scalar_to_vec;
6305 else
6306 kind = vec_construct;
6307 /* The target cost hook has no idea which part of the SLP node
6308 we are costing so avoid passing it down more than once. Pass
6309 it to the first vec_construct or scalar_to_vec part since for those
6310 the x86 backend tries to account for GPR to XMM register moves. */
6311 record_stmt_cost (cost_vec, 1, kind,
6312 (kind != vector_load && !passed) ? node : nullptr,
6313 vectype, 0, vect_prologue);
6314 if (kind != vector_load)
6315 passed = true;
6316 }
6317}
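
/* Illustration of the cost kinds chosen above (hypothetical operands):

     { 1, 2, 3, 4 }   constant node          -> vector_load (constant pool)
     { x, x, x, x }   invariant, all same    -> scalar_to_vec (splat)
     { x, y, z, w }   invariant, mixed       -> vec_construct

   When the vector element count is constant and does not divide the group
   size, only distinct whole-vector slices are costed; duplicates are
   filtered out through the vect_scalar_ops_slice_hash set.  */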
6318
6319/* Analyze statements contained in SLP tree NODE after recursively analyzing
6320 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6321
6322 Return true if the operations are supported. */
6323
6324static bool
6325vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6326 slp_instance node_instance,
6327 hash_set<slp_tree> &visited_set,
6328 vec<slp_tree> &visited_vec,
6329 stmt_vector_for_cost *cost_vec)
6330{
6331 int i, j;
6332 slp_tree child;
6333
6334 /* Assume we can code-generate all invariants. */
6335 if (!node
6336 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6337 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6338 return true;
6339
6340 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6341 {
6342 if (dump_enabled_p ())
6343 dump_printf_loc (MSG_NOTE, vect_location,
6344 "Failed cyclic SLP reference in %p\n", (void *) node);
6345 return false;
6346 }
6347 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6348
6349 /* If we already analyzed the exact same set of scalar stmts we're done.
6350 We share the generated vector stmts for those. */
6351 if (visited_set.add (k: node))
6352 return true;
6353 visited_vec.safe_push (obj: node);
6354
6355 bool res = true;
6356 unsigned visited_rec_start = visited_vec.length ();
6357 unsigned cost_vec_rec_start = cost_vec->length ();
6358 bool seen_non_constant_child = false;
6359 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6360 {
6361 res = vect_slp_analyze_node_operations (vinfo, node: child, node_instance,
6362 visited_set, visited_vec,
6363 cost_vec);
6364 if (!res)
6365 break;
6366 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6367 seen_non_constant_child = true;
6368 }
6369 /* We're having difficulties scheduling nodes with just constant
6370 operands and no scalar stmts since we then cannot compute a stmt
6371 insertion place. */
6372 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6373 {
6374 if (dump_enabled_p ())
6375 dump_printf_loc (MSG_NOTE, vect_location,
6376 "Cannot vectorize all-constant op node %p\n",
6377 (void *) node);
6378 res = false;
6379 }
6380
6381 if (res)
6382 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6383 cost_vec);
6384 /* If analysis failed we have to pop all recursive visited nodes
6385 plus ourselves. */
6386 if (!res)
6387 {
6388 while (visited_vec.length () >= visited_rec_start)
6389 visited_set.remove (k: visited_vec.pop ());
6390 cost_vec->truncate (size: cost_vec_rec_start);
6391 }
6392
6393 /* When the node can be vectorized, cost the invariant nodes it references.
6394 This is not done in DFS order so that the referring node's
6395 vectorizable_* calls can nail down the invariant nodes' vector type
6396 and possibly unshare it if it needs a different vector type than
6397 other referrers. */
6398 if (res)
6399 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6400 if (child
6401 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6402 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6403 /* Perform usual caching, note code-generation still
6404 code-gens these nodes multiple times but we expect
6405 to CSE them later. */
6406 && !visited_set.add (k: child))
6407 {
6408 visited_vec.safe_push (obj: child);
6409 /* ??? After auditing more code paths make a "default"
6410 and push the vector type from NODE to all children
6411 if it is not already set. */
6412 /* Compute the number of vectors to be generated. */
6413 tree vector_type = SLP_TREE_VECTYPE (child);
6414 if (!vector_type)
6415 {
6416 /* For shifts with a scalar argument we don't need
6417 to cost or code-generate anything.
6418 ??? Represent this more explicitly. */
6419 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6420 == shift_vec_info_type)
6421 && j == 1);
6422 continue;
6423 }
6424 unsigned group_size = SLP_TREE_LANES (child);
6425 poly_uint64 vf = 1;
6426 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
6427 vf = loop_vinfo->vectorization_factor;
6428 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6429 = vect_get_num_vectors (nunits: vf * group_size, vectype: vector_type);
6430 /* And cost them. */
6431 vect_prologue_cost_for_slp (node: child, cost_vec);
6432 }
6433
6434 /* If this node or any of its children can't be vectorized, try pruning
6435 the tree here rather than felling the whole thing. */
6436 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6437 {
6438 /* We'll need to revisit this for invariant costing and number
6439 of vectorized stmt setting. */
6440 res = true;
6441 }
6442
6443 return res;
6444}
6445
6446/* Given a definition DEF, analyze whether it will have any live scalar use
6447 after the SLP vectorization whose information is represented by BB_VINFO,
6448 and record the result in the hash map SCALAR_USE_MAP as a cache for later
6449 fast checks. If the recursion DEPTH exceeds a limit, stop the analysis and
6450 make a conservative assumption. Return 0 if there is no scalar use, 1 if
6451 there is one, and -1 if the recursion was limited. */
6452
6453static int
6454vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
6455 hash_map<tree, int> &scalar_use_map,
6456 int depth = 0)
6457{
6458 const int depth_limit = 2;
6459 imm_use_iterator use_iter;
6460 gimple *use_stmt;
6461
6462 if (int *res = scalar_use_map.get (k: def))
6463 return *res;
6464
6465 int scalar_use = 1;
6466
6467 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
6468 {
6469 if (is_gimple_debug (gs: use_stmt))
6470 continue;
6471
6472 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6473
6474 if (!use_stmt_info)
6475 break;
6476
6477 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6478 continue;
6479
6480 /* Do not step forward when encountering a PHI statement, since it may
6481 involve a cyclic reference and cause infinite recursion. */
6482 if (gimple_code (g: use_stmt) == GIMPLE_PHI)
6483 break;
6484
6485 /* When pattern recognition is involved, a statement whose definition is
6486 consumed in some pattern may not be included in the final replacement
6487 pattern statements, and so would be skipped when building the SLP graph.
6488
6489 * Original
6490 char a_c = *(char *) a;
6491 char b_c = *(char *) b;
6492 unsigned short a_s = (unsigned short) a_c;
6493 int a_i = (int) a_s;
6494 int b_i = (int) b_c;
6495 int r_i = a_i - b_i;
6496
6497 * After pattern replacement
6498 a_s = (unsigned short) a_c;
6499 a_i = (int) a_s;
6500
6501 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
6502 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
6503
6504 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
6505 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
6506
6507 The definitions of a_i (original statement) and b_i (pattern statement)
6508 are related to, but not actually part of, the widen_minus pattern.
6509 Vectorizing the pattern does not cause these definition statements to
6510 be marked as PURE_SLP. For this case, we need to recursively check
6511 whether their uses are all absorbed into vectorized code. There is
6512 one exception: a use may participate in a vectorized operation via an
6513 external SLP node containing that use as an element. The parameter
6514 "scalar_use_map" tags such SSA names as having a scalar use in
6515 advance. */
6516 tree lhs = gimple_get_lhs (use_stmt);
6517
6518 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
6519 break;
6520
6521 if (depth_limit && depth >= depth_limit)
6522 return -1;
6523
6524 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, def: lhs, scalar_use_map,
6525 depth: depth + 1)))
6526 break;
6527 }
6528
6529 if (end_imm_use_stmt_p (imm: &use_iter))
6530 scalar_use = 0;
6531
6532 /* If recursion is limited, do not cache result for non-root defs. */
6533 if (!depth || scalar_use >= 0)
6534 {
6535 bool added = scalar_use_map.put (k: def, v: scalar_use);
6536 gcc_assert (!added);
6537 }
6538
6539 return scalar_use;
6540}
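
/* The function above is essentially a memoized, depth-limited DFS over SSA
   uses.  A stripped-down sketch in plain C++ (all names invented for
   illustration, no GCC types; the real walk additionally skips pure-SLP
   and debug uses and stops at PHIs):

     #include <unordered_map>
     #include <vector>

     // Return 1 if any (transitive) use escapes, 0 if none does, and -1 if
     // the depth limit stopped the walk; -1 is only cached for the root.
     int has_escaping_use (int def,
                           const std::vector<std::vector<int>> &uses,
                           const std::vector<bool> &escapes,
                           std::unordered_map<int, int> &cache,
                           int depth = 0)
     {
       if (auto it = cache.find (def); it != cache.end ())
         return it->second;
       int result = 0;
       for (int use : uses[def])
         {
           if (escapes[use])
             { result = 1; break; }
           if (depth >= 2)
             return -1;
           if ((result = has_escaping_use (use, uses, escapes, cache,
                                           depth + 1)))
             break;
         }
       if (!depth || result >= 0)
         cache[def] = result;
       return result;
     }
*/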
6541
6542/* Mark lanes of NODE that are live outside of the basic-block vectorized
6543 region and that can be vectorized using vectorizable_live_operation
6544 with STMT_VINFO_LIVE_P. Live operations that are not handled cause the
6545 scalar code computing them to be retained. */
6546
6547static void
6548vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6549 slp_instance instance,
6550 stmt_vector_for_cost *cost_vec,
6551 hash_map<tree, int> &scalar_use_map,
6552 hash_set<stmt_vec_info> &svisited,
6553 hash_set<slp_tree> &visited)
6554{
6555 if (visited.add (k: node))
6556 return;
6557
6558 unsigned i;
6559 stmt_vec_info stmt_info;
6560 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6561 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6562 {
6563 if (svisited.contains (k: stmt_info))
6564 continue;
6565 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6566 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6567 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6568 /* Only the pattern root stmt computes the original scalar value. */
6569 continue;
6570 bool mark_visited = true;
6571 gimple *orig_stmt = orig_stmt_info->stmt;
6572 ssa_op_iter op_iter;
6573 def_operand_p def_p;
6574 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6575 {
6576 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
6577 scalar_use_map))
6578 {
6579 STMT_VINFO_LIVE_P (stmt_info) = true;
6580 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
6581 instance, i, false, cost_vec))
6582 /* ??? So we know we can vectorize the live stmt from one SLP
6583 node. If we cannot do so from all or none consistently
6584 we'd have to record which SLP node (and lane) we want to
6585 use for the live operation. So make sure we can
6586 code-generate from all nodes. */
6587 mark_visited = false;
6588 else
6589 STMT_VINFO_LIVE_P (stmt_info) = false;
6590 }
6591
6592 /* We have to verify whether we can insert the lane extract
6593 before all uses. The following is a conservative approximation.
6594 We cannot put this into vectorizable_live_operation because
6595 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6596 doesn't work.
6597 Note that while the fact that we emit code for loads at the
6598 first load should make this a non-problem, leafs we construct
6599 from scalars are vectorized after the last scalar def.
6600 ??? If we'd actually compute the insert location during
6601 analysis we could use sth less conservative than the last
6602 scalar stmt in the node for the dominance check. */
6603 /* ??? What remains is "live" uses in vector CTORs in the same
6604 SLP graph which is where those uses can end up code-generated
6605 right after their definition instead of close to their original
6606 use. But that would restrict us to code-generate lane-extracts
6607 from the latest stmt in a node. So we compensate for this
6608 during code-generation, simply not replacing uses for those
6609 hopefully rare cases. */
6610 imm_use_iterator use_iter;
6611 gimple *use_stmt;
6612 stmt_vec_info use_stmt_info;
6613
6614 if (STMT_VINFO_LIVE_P (stmt_info))
6615 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6616 if (!is_gimple_debug (gs: use_stmt)
6617 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6618 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6619 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6620 {
6621 if (dump_enabled_p ())
6622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6623 "Cannot determine insertion place for "
6624 "lane extract\n");
6625 STMT_VINFO_LIVE_P (stmt_info) = false;
6626 mark_visited = true;
6627 }
6628 }
6629 if (mark_visited)
6630 svisited.add (k: stmt_info);
6631 }
6632
6633 slp_tree child;
6634 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6635 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6636 vect_bb_slp_mark_live_stmts (bb_vinfo, node: child, instance, cost_vec,
6637 scalar_use_map, svisited, visited);
6638}
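
/* For illustration only (a hypothetical GIMPLE fragment, not taken from
   any testcase): assume the SLP graph covers

       tmp0_1 = x0_2 + y0_3;
       tmp1_4 = x1_5 + y1_6;
       a[0] = tmp0_1;
       a[1] = tmp1_4;

   and tmp0_1 additionally feeds a scalar statement outside the SLP graph,
   say  r_7 = tmp0_1 * 3;.  The lane computing tmp0_1 is then marked
   STMT_VINFO_LIVE_P and, if vectorizable_live_operation agrees and an
   insertion place dominating all such uses exists, the scalar value is
   later re-materialized via a lane extract from the vector def instead
   of keeping the scalar addition.  */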
6639
6640/* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
6641 are live outside of the basic-block vectorized region and that can be
6642 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
6643
6644static void
6645vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
6646{
6647 if (bb_vinfo->slp_instances.is_empty ())
6648 return;
6649
6650 hash_set<stmt_vec_info> svisited;
6651 hash_set<slp_tree> visited;
6652 hash_map<tree, int> scalar_use_map;
6653 auto_vec<slp_tree> worklist;
6654
6655 for (slp_instance instance : bb_vinfo->slp_instances)
6656 {
6657 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
6658 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
6659 if (TREE_CODE (op) == SSA_NAME)
6660 scalar_use_map.put (k: op, v: 1);
6661 if (!visited.add (SLP_INSTANCE_TREE (instance)))
6662 worklist.safe_push (SLP_INSTANCE_TREE (instance));
6663 }
6664
6665 do
6666 {
6667 slp_tree node = worklist.pop ();
6668
6669 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
6670 {
6671 for (tree op : SLP_TREE_SCALAR_OPS (node))
6672 if (TREE_CODE (op) == SSA_NAME)
6673 scalar_use_map.put (k: op, v: 1);
6674 }
6675 else
6676 {
6677 for (slp_tree child : SLP_TREE_CHILDREN (node))
6678 if (child && !visited.add (k: child))
6679 worklist.safe_push (obj: child);
6680 }
6681 }
6682 while (!worklist.is_empty ());
6683
6684 visited.empty ();
6685
6686 for (slp_instance instance : bb_vinfo->slp_instances)
6687 {
6688 vect_location = instance->location ();
6689 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6690 instance, cost_vec: &instance->cost_vec,
6691 scalar_use_map, svisited, visited);
6692 }
6693}
6694
6695/* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6696
6697static bool
6698vectorizable_bb_reduc_epilogue (slp_instance instance,
6699 stmt_vector_for_cost *cost_vec)
6700{
6701 gassign *stmt = as_a <gassign *> (p: instance->root_stmts[0]->stmt);
6702 enum tree_code reduc_code = gimple_assign_rhs_code (gs: stmt);
6703 if (reduc_code == MINUS_EXPR)
6704 reduc_code = PLUS_EXPR;
6705 internal_fn reduc_fn;
6706 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6707 if (!vectype
6708 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6709 || reduc_fn == IFN_LAST
6710 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6711 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6712 TREE_TYPE (vectype)))
6713 {
6714 if (dump_enabled_p ())
6715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6716 "not vectorized: basic block reduction epilogue "
6717 "operation unsupported.\n");
6718 return false;
6719 }
6720
6721 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6722 cost log2 vector operations plus shuffles and one extraction. */
6723 unsigned steps = floor_log2 (x: vect_nunits_for_cost (vec_type: vectype));
6724 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6725 vectype, 0, vect_body);
6726 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6727 vectype, 0, vect_body);
6728 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6729 vectype, 0, vect_body);
6730
6731 /* Since we replace all stmts of a possibly longer scalar reduction
6732 chain, account for the extra scalar stmts for that. */
6733 record_stmt_cost (body_cost_vec: cost_vec, count: instance->remain_defs.length (), kind: scalar_stmt,
6734 stmt_info: instance->root_stmts[0], misalign: 0, where: vect_body);
6735 return true;
6736}
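
/* As a concrete illustration of the cost accounting above (a hedged
   sketch, assuming a four-lane vector type such as V4SI): STEPS is
   floor_log2 (4) == 2, so the epilogue is costed as two vector_stmt
   reductions plus two vec_perm shuffles plus one vec_to_scalar extract,
   and additionally one scalar_stmt per entry in the instance's
   remain_defs vector.  */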
6737
6738/* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6739 and recurse to children. */
6740
6741static void
6742vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6743 hash_set<slp_tree> &visited)
6744{
6745 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6746 || visited.add (k: node))
6747 return;
6748
6749 stmt_vec_info stmt;
6750 unsigned i;
6751 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6752 roots.remove (k: vect_orig_stmt (stmt_info: stmt));
6753
6754 slp_tree child;
6755 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6756 if (child)
6757 vect_slp_prune_covered_roots (node: child, roots, visited);
6758}
6759
6760/* Analyze statements in SLP instances of VINFO. Return true if the
6761 operations are supported. */
6762
6763bool
6764vect_slp_analyze_operations (vec_info *vinfo)
6765{
6766 slp_instance instance;
6767 int i;
6768
6769 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6770
6771 hash_set<slp_tree> visited;
6772 for (i = 0; vinfo->slp_instances.iterate (ix: i, ptr: &instance); )
6773 {
6774 auto_vec<slp_tree> visited_vec;
6775 stmt_vector_for_cost cost_vec;
6776 cost_vec.create (nelems: 2);
6777 if (is_a <bb_vec_info> (p: vinfo))
6778 vect_location = instance->location ();
6779 if (!vect_slp_analyze_node_operations (vinfo,
6780 SLP_INSTANCE_TREE (instance),
6781 node_instance: instance, visited_set&: visited, visited_vec,
6782 cost_vec: &cost_vec)
6783 /* CTOR instances require vectorized defs for the SLP tree root. */
6784 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6785 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6786 != vect_internal_def
6787 /* Make sure we vectorized with the expected type. */
6788 || !useless_type_conversion_p
6789 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6790 (instance->root_stmts[0]->stmt))),
6791 TREE_TYPE (SLP_TREE_VECTYPE
6792 (SLP_INSTANCE_TREE (instance))))))
6793 /* Check we can vectorize the reduction. */
6794 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6795 && !vectorizable_bb_reduc_epilogue (instance, cost_vec: &cost_vec)))
6796 {
6797 slp_tree node = SLP_INSTANCE_TREE (instance);
6798 stmt_vec_info stmt_info;
6799 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6800 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6801 else
6802 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6803 if (dump_enabled_p ())
6804 dump_printf_loc (MSG_NOTE, vect_location,
6805 "removing SLP instance operations starting from: %G",
6806 stmt_info->stmt);
6807 vect_free_slp_instance (instance);
6808 vinfo->slp_instances.ordered_remove (ix: i);
6809 cost_vec.release ();
6810 while (!visited_vec.is_empty ())
6811 visited.remove (k: visited_vec.pop ());
6812 }
6813 else
6814 {
6815 i++;
6816 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (p: vinfo))
6817 {
6818 add_stmt_costs (costs: loop_vinfo->vector_costs, cost_vec: &cost_vec);
6819 cost_vec.release ();
6820 }
6821 else
6822 /* For BB vectorization remember the SLP graph entry
6823 cost for later. */
6824 instance->cost_vec = cost_vec;
6825 }
6826 }
6827
6828 /* Now look for SLP instances with a root that are covered by other
6829 instances and remove them. */
6830 hash_set<stmt_vec_info> roots;
6831 for (i = 0; vinfo->slp_instances.iterate (ix: i, ptr: &instance); ++i)
6832 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6833 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6834 if (!roots.is_empty ())
6835 {
6836 visited.empty ();
6837 for (i = 0; vinfo->slp_instances.iterate (ix: i, ptr: &instance); ++i)
6838 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6839 visited);
6840 for (i = 0; vinfo->slp_instances.iterate (ix: i, ptr: &instance); )
6841 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6842 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6843 {
6844 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6845 if (dump_enabled_p ())
6846 dump_printf_loc (MSG_NOTE, vect_location,
6847 "removing SLP instance operations starting "
6848 "from: %G", root->stmt);
6849 vect_free_slp_instance (instance);
6850 vinfo->slp_instances.ordered_remove (ix: i);
6851 }
6852 else
6853 ++i;
6854 }
6855
6856 /* Compute vectorizable live stmts. */
6857 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo))
6858 vect_bb_slp_mark_live_stmts (bb_vinfo);
6859
6860 return !vinfo->slp_instances.is_empty ();
6861}
6862
6863/* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
6864 transitively compressing the leader chain along the way. */
6865
6866static slp_instance
6867get_ultimate_leader (slp_instance instance,
6868 hash_map<slp_instance, slp_instance> &instance_leader)
6869{
6870 auto_vec<slp_instance *, 8> chain;
6871 slp_instance *tem;
6872 while (*(tem = instance_leader.get (k: instance)) != instance)
6873 {
6874 chain.safe_push (obj: tem);
6875 instance = *tem;
6876 }
6877 while (!chain.is_empty ())
6878 *chain.pop () = instance;
6879 return instance;
6880}
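
/* For illustration (hypothetical instances A, B and C): with
   INSTANCE_LEADER containing A -> B, B -> C and C -> C, calling
   get_ultimate_leader (A) walks the chain to C and on the way out
   rewrites the visited entries so that afterwards A -> C and B -> C,
   i.e. the usual union-find style path compression.  */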
6881
6882namespace {
6883/* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6884 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6885 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6886
6887 INSTANCE_LEADER is as for get_ultimate_leader. */
6888
6889template<typename T>
6890bool
6891vect_map_to_instance (slp_instance instance, T key,
6892 hash_map<T, slp_instance> &key_to_instance,
6893 hash_map<slp_instance, slp_instance> &instance_leader)
6894{
6895 bool existed_p;
6896 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6897 if (!existed_p)
6898 ;
6899 else if (key_instance != instance)
6900 {
6901 /* If we're running into a previously marked key make us the
6902 leader of the current ultimate leader. This keeps the
6903 leader chain acyclic and works even when the current instance
6904 connects two previously independent graph parts. */
6905 slp_instance key_leader
6906 = get_ultimate_leader (instance: key_instance, instance_leader);
6907 if (key_leader != instance)
6908 instance_leader.put (k: key_leader, v: instance);
6909 }
6910 key_instance = instance;
6911 return existed_p;
6912}
6913}
6914
6915/* Worker of vect_bb_partition_graph, recurse on NODE. */
6916
6917static void
6918vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6919 slp_instance instance, slp_tree node,
6920 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6921 hash_map<slp_tree, slp_instance> &node_to_instance,
6922 hash_map<slp_instance, slp_instance> &instance_leader)
6923{
6924 stmt_vec_info stmt_info;
6925 unsigned i;
6926
6927 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6928 vect_map_to_instance (instance, key: stmt_info, key_to_instance&: stmt_to_instance,
6929 instance_leader);
6930
6931 if (vect_map_to_instance (instance, key: node, key_to_instance&: node_to_instance,
6932 instance_leader))
6933 return;
6934
6935 slp_tree child;
6936 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6937 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6938 vect_bb_partition_graph_r (bb_vinfo, instance, node: child, stmt_to_instance,
6939 node_to_instance, instance_leader);
6940}
6941
6942/* Partition the SLP graph into pieces that can be costed independently. */
6943
6944static void
6945vect_bb_partition_graph (bb_vec_info bb_vinfo)
6946{
6947 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6948
6949 /* First walk the SLP graph assigning each involved scalar stmt a
6950 corresponding SLP graph entry and, upon visiting a previously
6951 marked stmt, make the stmt's leader the current SLP graph entry. */
6952 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6953 hash_map<slp_tree, slp_instance> node_to_instance;
6954 hash_map<slp_instance, slp_instance> instance_leader;
6955 slp_instance instance;
6956 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (ix: i, ptr: &instance); ++i)
6957 {
6958 instance_leader.put (k: instance, v: instance);
6959 vect_bb_partition_graph_r (bb_vinfo,
6960 instance, SLP_INSTANCE_TREE (instance),
6961 stmt_to_instance, node_to_instance,
6962 instance_leader);
6963 }
6964
6965 /* Then collect entries to each independent subgraph. */
6966 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (ix: i, ptr: &instance); ++i)
6967 {
6968 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6969 leader->subgraph_entries.safe_push (obj: instance);
6970 if (dump_enabled_p ()
6971 && leader != instance)
6972 dump_printf_loc (MSG_NOTE, vect_location,
6973 "instance %p is leader of %p\n",
6974 (void *) leader, (void *) instance);
6975 }
6976}
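
/* For illustration (hypothetical instances): suppose instance I1 stores
   a[0..3] and instance I2 stores b[0..3], both consuming lanes of the
   same scalar additions, while instance I3 is unrelated.  Walking I2
   finds those stmts already mapped to I1 and makes I1's ultimate leader
   point to I2; the second loop then pushes both I1 and I2 onto I2's
   subgraph_entries so they are costed as one subgraph, whereas I3 stays
   its own leader and forms a separate subgraph.  */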
6977
6978/* Compute the set of scalar stmts participating in internal and external
6979 nodes. */
6980
6981static void
6982vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6983 hash_set<slp_tree> &visited,
6984 hash_set<stmt_vec_info> &vstmts,
6985 hash_set<stmt_vec_info> &estmts)
6986{
6987 int i;
6988 stmt_vec_info stmt_info;
6989 slp_tree child;
6990
6991 if (visited.add (k: node))
6992 return;
6993
6994 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6995 {
6996 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6997 vstmts.add (k: stmt_info);
6998
6999 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7000 if (child)
7001 vect_slp_gather_vectorized_scalar_stmts (vinfo, node: child, visited,
7002 vstmts, estmts);
7003 }
7004 else
7005 for (tree def : SLP_TREE_SCALAR_OPS (node))
7006 {
7007 stmt_vec_info def_stmt = vinfo->lookup_def (def);
7008 if (def_stmt)
7009 estmts.add (k: def_stmt);
7010 }
7011}
7012
7013
7014/* Compute the scalar cost of the SLP node NODE and its children
7015 and record it in COST_VEC. Do not account defs that are marked in LIFE
7016 and update LIFE according to uses of NODE. */
7017
7018static void
7019vect_bb_slp_scalar_cost (vec_info *vinfo,
7020 slp_tree node, vec<bool, va_heap> *life,
7021 stmt_vector_for_cost *cost_vec,
7022 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
7023 hash_set<slp_tree> &visited)
7024{
7025 unsigned i;
7026 stmt_vec_info stmt_info;
7027 slp_tree child;
7028
7029 if (visited.add (k: node))
7030 return;
7031
7032 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7033 {
7034 ssa_op_iter op_iter;
7035 def_operand_p def_p;
7036
7037 if ((*life)[i])
7038 continue;
7039
7040 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7041 gimple *orig_stmt = orig_stmt_info->stmt;
7042
7043 /* If there is a non-vectorized use of the defs then the scalar
7044 stmt is kept live in which case we do not account it or any
7045 required defs in the SLP children in the scalar cost. This
7046 way we make the vectorization more costly when compared to
7047 the scalar cost. */
7048 if (!STMT_VINFO_LIVE_P (stmt_info))
7049 {
7050 auto_vec<gimple *, 8> worklist;
7051 hash_set<gimple *> *worklist_visited = NULL;
7052 worklist.quick_push (obj: orig_stmt);
7053 do
7054 {
7055 gimple *work_stmt = worklist.pop ();
7056 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
7057 {
7058 imm_use_iterator use_iter;
7059 gimple *use_stmt;
7060 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
7061 DEF_FROM_PTR (def_p))
7062 if (!is_gimple_debug (gs: use_stmt))
7063 {
7064 stmt_vec_info use_stmt_info
7065 = vinfo->lookup_stmt (use_stmt);
7066 if (!use_stmt_info
7067 || !vectorized_scalar_stmts.contains (k: use_stmt_info))
7068 {
7069 if (use_stmt_info
7070 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
7071 {
7072 /* For stmts participating in patterns we have
7073 to check their uses recursively. */
7074 if (!worklist_visited)
7075 worklist_visited = new hash_set<gimple *> ();
7076 if (!worklist_visited->add (k: use_stmt))
7077 worklist.safe_push (obj: use_stmt);
7078 continue;
7079 }
7080 (*life)[i] = true;
7081 goto next_lane;
7082 }
7083 }
7084 }
7085 }
7086 while (!worklist.is_empty ());
7087next_lane:
7088 if (worklist_visited)
7089 delete worklist_visited;
7090 if ((*life)[i])
7091 continue;
7092 }
7093
7094 /* Count scalar stmts only once. */
7095 if (gimple_visited_p (stmt: orig_stmt))
7096 continue;
7097 gimple_set_visited (stmt: orig_stmt, visited_p: true);
7098
7099 vect_cost_for_stmt kind;
7100 if (STMT_VINFO_DATA_REF (orig_stmt_info))
7101 {
7102 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
7103 kind = scalar_load;
7104 else
7105 kind = scalar_store;
7106 }
7107 else if (vect_nop_conversion_p (orig_stmt_info))
7108 continue;
7109 /* For single-argument PHIs assume coalescing which means zero cost
7110 for the scalar and the vector PHIs. This avoids artificially
7111 favoring the vector path (but may pessimize it in some cases). */
7112 else if (is_a <gphi *> (p: orig_stmt_info->stmt)
7113 && gimple_phi_num_args
7114 (gs: as_a <gphi *> (p: orig_stmt_info->stmt)) == 1)
7115 continue;
7116 else
7117 kind = scalar_stmt;
7118 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
7119 SLP_TREE_VECTYPE (node), 0, vect_body);
7120 }
7121
7122 auto_vec<bool, 20> subtree_life;
7123 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7124 {
7125 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7126 {
7127 /* Do not directly pass LIFE to the recursive call, copy it to
7128 confine changes in the callee to the current child/subtree. */
7129 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7130 {
7131 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), exact: true);
7132 for (unsigned j = 0;
7133 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
7134 {
7135 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
7136 if (perm.first == i)
7137 subtree_life[perm.second] = (*life)[j];
7138 }
7139 }
7140 else
7141 {
7142 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
7143 subtree_life.safe_splice (src: *life);
7144 }
7145 vect_bb_slp_scalar_cost (vinfo, node: child, life: &subtree_life, cost_vec,
7146 vectorized_scalar_stmts, visited);
7147 subtree_life.truncate (size: 0);
7148 }
7149 }
7150}
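
/* A small illustration of the LIFE propagation above (hypothetical
   two-lane nodes): for a VEC_PERM_EXPR node with lane permutation
   { (0, 1), (1, 0) } whose lane 0 is marked in LIFE, only lane 1 of
   child 0 inherits the mark (and lane 0 of child 1 inherits the state
   of lane 1); for non-permute nodes the life vector is simply copied
   to each internal child.  */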
7151
7152/* Comparator for the loop-index sorted cost vectors. */
7153
7154static int
7155li_cost_vec_cmp (const void *a_, const void *b_)
7156{
7157 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
7158 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
7159 if (a->first < b->first)
7160 return -1;
7161 else if (a->first == b->first)
7162 return 0;
7163 return 1;
7164}
7165
7166/* Check if vectorization of the basic block is profitable for the
7167 subgraph denoted by SLP_INSTANCES. */
7168
7169static bool
7170vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7171 vec<slp_instance> slp_instances,
7172 loop_p orig_loop)
7173{
7174 slp_instance instance;
7175 int i;
7176 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7177 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7178
7179 if (dump_enabled_p ())
7180 {
7181 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7182 hash_set<slp_tree> visited;
7183 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7184 vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location,
7185 SLP_INSTANCE_TREE (instance), visited);
7186 }
7187
7188 /* Compute the set of scalar stmts we know will go away 'locally' when
7189 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7190 not accurate for nodes promoted extern late or for scalar stmts that
7191 are used both in extern defs and in vectorized defs. */
7192 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7193 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7194 hash_set<slp_tree> visited;
7195 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7196 {
7197 vect_slp_gather_vectorized_scalar_stmts (vinfo: bb_vinfo,
7198 SLP_INSTANCE_TREE (instance),
7199 visited,
7200 vstmts&: vectorized_scalar_stmts,
7201 estmts&: scalar_stmts_in_externs);
7202 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7203 vectorized_scalar_stmts.add (k: rstmt);
7204 }
7205 /* Scalar stmts used as defs in external nodes need to be preserved, so
7206 remove them from vectorized_scalar_stmts. */
7207 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7208 vectorized_scalar_stmts.remove (k: stmt);
7209
7210 /* Calculate scalar cost and sum the cost for the vector stmts
7211 previously collected. */
7212 stmt_vector_for_cost scalar_costs = vNULL;
7213 stmt_vector_for_cost vector_costs = vNULL;
7214 visited.empty ();
7215 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7216 {
7217 auto_vec<bool, 20> life;
7218 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7219 exact: true);
7220 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7221 record_stmt_cost (body_cost_vec: &scalar_costs,
7222 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7223 kind: scalar_stmt,
7224 SLP_INSTANCE_ROOT_STMTS (instance)[0], misalign: 0, where: vect_body);
7225 vect_bb_slp_scalar_cost (vinfo: bb_vinfo,
7226 SLP_INSTANCE_TREE (instance),
7227 life: &life, cost_vec: &scalar_costs, vectorized_scalar_stmts,
7228 visited);
7229 vector_costs.safe_splice (src: instance->cost_vec);
7230 instance->cost_vec.release ();
7231 }
7232
7233 if (dump_enabled_p ())
7234 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7235
7236 /* When costing non-loop vectorization we need to consider each covered
7237 loop independently and make sure vectorization is profitable. For
7238 now we assume a loop may not be entered or may be executed an arbitrary
7239 number of iterations (??? static information can provide more
7240 precise info here) which means we can simply cost each containing
7241 loop's stmts separately. */
7242
7243 /* First produce cost vectors sorted by loop index. */
7244 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7245 li_scalar_costs (scalar_costs.length ());
7246 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7247 li_vector_costs (vector_costs.length ());
7248 stmt_info_for_cost *cost;
7249 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7250 {
7251 unsigned l = gimple_bb (g: cost->stmt_info->stmt)->loop_father->num;
7252 li_scalar_costs.quick_push (obj: std::make_pair (x&: l, y&: cost));
7253 }
7254 /* Use an arbitrary used loop as fallback in case the first vector_costs
7255 entry does not have a stmt_info associated with it. */
7256 unsigned l = li_scalar_costs[0].first;
7257 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7258 {
7259 /* We inherit the loop from the previous COST; invariants, externals and
7260 extracts immediately follow the cost for the related stmt. */
7261 if (cost->stmt_info)
7262 l = gimple_bb (g: cost->stmt_info->stmt)->loop_father->num;
7263 li_vector_costs.quick_push (obj: std::make_pair (x&: l, y&: cost));
7264 }
7265 li_scalar_costs.qsort (li_cost_vec_cmp);
7266 li_vector_costs.qsort (li_cost_vec_cmp);
7267
7268 /* Now cost the portions individually. */
7269 unsigned vi = 0;
7270 unsigned si = 0;
7271 bool profitable = true;
7272 while (si < li_scalar_costs.length ()
7273 && vi < li_vector_costs.length ())
7274 {
7275 unsigned sl = li_scalar_costs[si].first;
7276 unsigned vl = li_vector_costs[vi].first;
7277 if (sl != vl)
7278 {
7279 if (dump_enabled_p ())
7280 dump_printf_loc (MSG_NOTE, vect_location,
7281 "Scalar %d and vector %d loop part do not "
7282 "match up, skipping scalar part\n", sl, vl);
7283 /* Skip the scalar part, assuming zero cost on the vector side. */
7284 do
7285 {
7286 si++;
7287 }
7288 while (si < li_scalar_costs.length ()
7289 && li_scalar_costs[si].first == sl);
7290 continue;
7291 }
7292
7293 class vector_costs *scalar_target_cost_data = init_cost (vinfo: bb_vinfo, costing_for_scalar: true);
7294 do
7295 {
7296 add_stmt_cost (costs: scalar_target_cost_data, i: li_scalar_costs[si].second);
7297 si++;
7298 }
7299 while (si < li_scalar_costs.length ()
7300 && li_scalar_costs[si].first == sl);
7301 unsigned dummy;
7302 finish_cost (costs: scalar_target_cost_data, scalar_costs: nullptr,
7303 prologue_cost: &dummy, body_cost: &scalar_cost, epilogue_cost: &dummy);
7304
7305 /* Complete the target-specific vector cost calculation. */
7306 class vector_costs *vect_target_cost_data = init_cost (vinfo: bb_vinfo, costing_for_scalar: false);
7307 do
7308 {
7309 add_stmt_cost (costs: vect_target_cost_data, i: li_vector_costs[vi].second);
7310 vi++;
7311 }
7312 while (vi < li_vector_costs.length ()
7313 && li_vector_costs[vi].first == vl);
7314 finish_cost (costs: vect_target_cost_data, scalar_costs: scalar_target_cost_data,
7315 prologue_cost: &vec_prologue_cost, body_cost: &vec_inside_cost, epilogue_cost: &vec_epilogue_cost);
7316 delete scalar_target_cost_data;
7317 delete vect_target_cost_data;
7318
7319 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7320
7321 if (dump_enabled_p ())
7322 {
7323 dump_printf_loc (MSG_NOTE, vect_location,
7324 "Cost model analysis for part in loop %d:\n", sl);
7325 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7326 vec_inside_cost + vec_outside_cost);
7327 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7328 }
7329
7330 /* Vectorization is profitable if its cost is not more than the cost of the
7331 scalar version. Note that we err on the vector side for equal cost because
7332 the cost estimate is otherwise quite pessimistic (constant uses are
7333 free on the scalar side but cost a load on the vector side for
7334 example). */
7335 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7336 {
7337 profitable = false;
7338 break;
7339 }
7340 }
7341 if (profitable && vi < li_vector_costs.length ())
7342 {
7343 if (dump_enabled_p ())
7344 dump_printf_loc (MSG_NOTE, vect_location,
7345 "Excess vector cost for part in loop %d:\n",
7346 li_vector_costs[vi].first);
7347 profitable = false;
7348 }
7349
7350 /* Unset visited flag. This is delayed when the subgraph is profitable
7351 and we process the loop for remaining unvectorized if-converted code. */
7352 if (!orig_loop || !profitable)
7353 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7354 gimple_set_visited (stmt: cost->stmt_info->stmt, visited_p: false);
7355
7356 scalar_costs.release ();
7357 vector_costs.release ();
7358
7359 return profitable;
7360}
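
/* A worked illustration of the per-loop comparison above (hypothetical
   cost numbers, after the target cost hooks have been applied): with
   scalar costs of 8 for loop 1 and 4 for loop 2, and vector costs only
   for loop 1 of 1 (prologue) + 5 (body), loop 1 compares 6 against 8
   and is considered profitable, while the scalar part for loop 2 has no
   matching vector part and is skipped assuming zero vector cost.  Had
   any vector cost been left over without a scalar counterpart, the
   subgraph would have been rejected as having excess vector cost.  */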
7361
7362/* qsort comparator for lane defs. */
7363
7364static int
7365vld_cmp (const void *a_, const void *b_)
7366{
7367 auto *a = (const std::pair<unsigned, tree> *)a_;
7368 auto *b = (const std::pair<unsigned, tree> *)b_;
7369 return a->first - b->first;
7370}
7371
7372/* Return true if USE_STMT is a vector lane insert into VEC and set
7373 *THIS_LANE to the lane number that is set. */
7374
7375static bool
7376vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7377{
7378 gassign *use_ass = dyn_cast <gassign *> (p: use_stmt);
7379 if (!use_ass
7380 || gimple_assign_rhs_code (gs: use_ass) != BIT_INSERT_EXPR
7381 || (vec
7382 ? gimple_assign_rhs1 (gs: use_ass) != vec
7383 : ((vec = gimple_assign_rhs1 (gs: use_ass)), false))
7384 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7385 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7386 || !constant_multiple_p
7387 (a: tree_to_poly_uint64 (gimple_assign_rhs3 (gs: use_ass)),
7388 b: tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7389 multiple: this_lane))
7390 return false;
7391 return true;
7392}
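
/* For illustration (hypothetical GIMPLE, assuming 32-bit vector
   elements): the statement

       vec_5 = BIT_INSERT_EXPR <vec_4, s_2, 64>;

   matches with *THIS_LANE set to 2 because the bit position 64 is twice
   the element size, while an insert at a position that is not a multiple
   of the element size does not match.  */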
7393
7394/* Find any vectorizable constructors and add them to the grouped_store
7395 array. */
7396
7397static void
7398vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7399{
7400 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7401 for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bb_vinfo->bbs[i]);
7402 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
7403 {
7404 gassign *assign = dyn_cast<gassign *> (p: gsi_stmt (i: gsi));
7405 if (!assign)
7406 continue;
7407
7408 tree rhs = gimple_assign_rhs1 (gs: assign);
7409 enum tree_code code = gimple_assign_rhs_code (gs: assign);
7410 use_operand_p use_p;
7411 gimple *use_stmt;
7412 if (code == CONSTRUCTOR)
7413 {
7414 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7415 || maybe_ne (a: TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7416 CONSTRUCTOR_NELTS (rhs))
7417 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7418 || uniform_vector_p (rhs))
7419 continue;
7420
7421 unsigned j;
7422 tree val;
7423 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7424 if (TREE_CODE (val) != SSA_NAME
7425 || !bb_vinfo->lookup_def (val))
7426 break;
7427 if (j != CONSTRUCTOR_NELTS (rhs))
7428 continue;
7429
7430 vec<stmt_vec_info> roots = vNULL;
7431 roots.safe_push (obj: bb_vinfo->lookup_stmt (assign));
7432 vec<stmt_vec_info> stmts;
7433 stmts.create (CONSTRUCTOR_NELTS (rhs));
7434 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7435 stmts.quick_push
7436 (obj: vect_stmt_to_vectorize (stmt_info: bb_vinfo->lookup_def (val)));
7437 bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_ctor,
7438 stmts, roots));
7439 }
7440 else if (code == BIT_INSERT_EXPR
7441 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7442 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7443 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7444 && integer_zerop (gimple_assign_rhs3 (gs: assign))
7445 && useless_type_conversion_p
7446 (TREE_TYPE (TREE_TYPE (rhs)),
7447 TREE_TYPE (gimple_assign_rhs2 (assign)))
7448 && bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: assign)))
7449 {
7450 /* We start to match on an insert to lane zero, but since the
7451 inserts need not be ordered we'd have to search both
7452 the def and the use chains. */
7453 tree vectype = TREE_TYPE (rhs);
7454 unsigned nlanes = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant ();
7455 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7456 auto_sbitmap lanes (nlanes);
7457 bitmap_clear (lanes);
7458 bitmap_set_bit (map: lanes, bitno: 0);
7459 tree def = gimple_assign_lhs (gs: assign);
7460 lane_defs.quick_push
7461 (obj: std::make_pair (x: 0, y: gimple_assign_rhs2 (gs: assign)));
7462 unsigned lanes_found = 1;
7463 /* Start with the use chains, the last stmt will be the root. */
7464 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7465 vec<stmt_vec_info> roots = vNULL;
7466 roots.safe_push (obj: last);
7467 do
7468 {
7469 use_operand_p use_p;
7470 gimple *use_stmt;
7471 if (!single_imm_use (var: def, use_p: &use_p, stmt: &use_stmt))
7472 break;
7473 unsigned this_lane;
7474 if (!bb_vinfo->lookup_stmt (use_stmt)
7475 || !vect_slp_is_lane_insert (use_stmt, vec: def, this_lane: &this_lane)
7476 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: use_stmt)))
7477 break;
7478 if (bitmap_bit_p (map: lanes, bitno: this_lane))
7479 break;
7480 lanes_found++;
7481 bitmap_set_bit (map: lanes, bitno: this_lane);
7482 gassign *use_ass = as_a <gassign *> (p: use_stmt);
7483 lane_defs.quick_push (obj: std::make_pair
7484 (x&: this_lane, y: gimple_assign_rhs2 (gs: use_ass)));
7485 last = bb_vinfo->lookup_stmt (use_ass);
7486 roots.safe_push (obj: last);
7487 def = gimple_assign_lhs (gs: use_ass);
7488 }
7489 while (lanes_found < nlanes);
7490 if (roots.length () > 1)
7491 std::swap(a&: roots[0], b&: roots[roots.length () - 1]);
7492 if (lanes_found < nlanes)
7493 {
7494 /* Now search the def chain. */
7495 def = gimple_assign_rhs1 (gs: assign);
7496 do
7497 {
7498 if (TREE_CODE (def) != SSA_NAME
7499 || !has_single_use (var: def))
7500 break;
7501 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7502 unsigned this_lane;
7503 if (!bb_vinfo->lookup_stmt (def_stmt)
7504 || !vect_slp_is_lane_insert (use_stmt: def_stmt,
7505 NULL_TREE, this_lane: &this_lane)
7506 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: def_stmt)))
7507 break;
7508 if (bitmap_bit_p (map: lanes, bitno: this_lane))
7509 break;
7510 lanes_found++;
7511 bitmap_set_bit (map: lanes, bitno: this_lane);
7512 lane_defs.quick_push (obj: std::make_pair
7513 (x&: this_lane,
7514 y: gimple_assign_rhs2 (gs: def_stmt)));
7515 roots.safe_push (obj: bb_vinfo->lookup_stmt (def_stmt));
7516 def = gimple_assign_rhs1 (gs: def_stmt);
7517 }
7518 while (lanes_found < nlanes);
7519 }
7520 if (lanes_found == nlanes)
7521 {
7522 /* Sort lane_defs by lane index and register the root. */
7523 lane_defs.qsort (vld_cmp);
7524 vec<stmt_vec_info> stmts;
7525 stmts.create (nelems: nlanes);
7526 for (unsigned i = 0; i < nlanes; ++i)
7527 stmts.quick_push (obj: bb_vinfo->lookup_def (lane_defs[i].second));
7528 bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_ctor,
7529 stmts, roots));
7530 }
7531 else
7532 roots.release ();
7533 }
7534 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7535 && (associative_tree_code (code) || code == MINUS_EXPR)
7536 /* ??? This pessimizes a two-element reduction. PR54400.
7537 ??? In-order reduction could be handled if we only
7538 traverse one operand chain in vect_slp_linearize_chain. */
7539 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7540 /* Ops with constants at the tail can be stripped here. */
7541 && TREE_CODE (rhs) == SSA_NAME
7542 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7543 /* Should be the chain end. */
7544 && (!single_imm_use (var: gimple_assign_lhs (gs: assign),
7545 use_p: &use_p, stmt: &use_stmt)
7546 || !is_gimple_assign (gs: use_stmt)
7547 || (gimple_assign_rhs_code (gs: use_stmt) != code
7548 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7549 || (gimple_assign_rhs_code (gs: use_stmt)
7550 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7551 {
7552 /* We start the match at the end of a possible association
7553 chain. */
7554 auto_vec<chain_op_t> chain;
7555 auto_vec<std::pair<tree_code, gimple *> > worklist;
7556 auto_vec<gimple *> chain_stmts;
7557 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7558 if (code == MINUS_EXPR)
7559 code = PLUS_EXPR;
7560 internal_fn reduc_fn;
7561 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7562 || reduc_fn == IFN_LAST)
7563 continue;
7564 vect_slp_linearize_chain (vinfo: bb_vinfo, worklist, chain, code, start: assign,
7565 /* ??? */
7566 code_stmt, alt_code_stmt, chain_stmts: &chain_stmts);
7567 if (chain.length () > 1)
7568 {
7569 /* Sort the chain according to def_type and operation. */
7570 chain.sort (cmp: dt_sort_cmp, data: bb_vinfo);
7571 /* ??? Now we'd want to strip externals and constants
7572 but record those to be handled in the epilogue. */
7573 /* ??? For now do not allow mixing ops or externs/constants. */
7574 bool invalid = false;
7575 unsigned remain_cnt = 0;
7576 unsigned last_idx = 0;
7577 for (unsigned i = 0; i < chain.length (); ++i)
7578 {
7579 if (chain[i].code != code)
7580 {
7581 invalid = true;
7582 break;
7583 }
7584 if (chain[i].dt != vect_internal_def
7585 /* Avoid stmts where the def is not the LHS, like
7586 ASMs. */
7587 || (gimple_get_lhs (bb_vinfo->lookup_def
7588 (chain[i].op)->stmt)
7589 != chain[i].op))
7590 remain_cnt++;
7591 else
7592 last_idx = i;
7593 }
7594 /* Make sure to have an even number of lanes as we later do
7595 all-or-nothing discovery, not trying to split further. */
7596 if ((chain.length () - remain_cnt) & 1)
7597 remain_cnt++;
7598 if (!invalid && chain.length () - remain_cnt > 1)
7599 {
7600 vec<stmt_vec_info> stmts;
7601 vec<tree> remain = vNULL;
7602 stmts.create (nelems: chain.length ());
7603 if (remain_cnt > 0)
7604 remain.create (nelems: remain_cnt);
7605 for (unsigned i = 0; i < chain.length (); ++i)
7606 {
7607 stmt_vec_info stmt_info;
7608 if (chain[i].dt == vect_internal_def
7609 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7610 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
7611 && (i != last_idx
7612 || (stmts.length () & 1)))
7613 stmts.quick_push (obj: stmt_info);
7614 else
7615 remain.quick_push (obj: chain[i].op);
7616 }
7617 vec<stmt_vec_info> roots;
7618 roots.create (nelems: chain_stmts.length ());
7619 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7620 roots.quick_push (obj: bb_vinfo->lookup_stmt (chain_stmts[i]));
7621 bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_bb_reduc,
7622 stmts, roots, remain));
7623 }
7624 }
7625 }
7626 }
7627}
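
/* For illustration, the three root shapes matched above (hypothetical
   GIMPLE with all element defs defined inside the region, assuming
   32-bit elements for the lane inserts):

     * a vector CONSTRUCTOR:
	 v_1 = {a_2, b_3, c_4, d_5};

     * a chain of lane inserts starting at lane zero:
	 v_1 = BIT_INSERT_EXPR <v_0, a_2, 0>;
	 v_3 = BIT_INSERT_EXPR <v_1, b_4, 32>;
	 ...

     * an associative reduction chain ending in ASSIGN:
	 t_1 = a_2 + b_3;
	 t_4 = t_1 + c_5;
	 r_6 = t_4 + d_7;

   The first two are recorded as slp_inst_kind_ctor roots, the last as a
   slp_inst_kind_bb_reduc root.  */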
7628
7629/* Walk the grouped store chains and replace entries with their
7630 pattern variant if any. */
7631
7632static void
7633vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7634{
7635 stmt_vec_info first_element;
7636 unsigned i;
7637
7638 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7639 {
7640 /* We also have CTORs in this array. */
7641 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7642 continue;
7643 if (STMT_VINFO_IN_PATTERN_P (first_element))
7644 {
7645 stmt_vec_info orig = first_element;
7646 first_element = STMT_VINFO_RELATED_STMT (first_element);
7647 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7648 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7649 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7650 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7651 vinfo->grouped_stores[i] = first_element;
7652 }
7653 stmt_vec_info prev = first_element;
7654 while (DR_GROUP_NEXT_ELEMENT (prev))
7655 {
7656 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7657 if (STMT_VINFO_IN_PATTERN_P (elt))
7658 {
7659 stmt_vec_info orig = elt;
7660 elt = STMT_VINFO_RELATED_STMT (elt);
7661 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7662 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7663 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7664 }
7665 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7666 prev = elt;
7667 }
7668 }
7669}
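
/* For illustration (a hypothetical group): if a grouped store chain has
   elements S1 -> S2 -> S3 and S2 was replaced by pattern statement P2,
   the walk above relinks the chain to S1 -> P2 -> S3, copying
   DR_GROUP_GAP and DR_GROUP_NEXT_ELEMENT from S2 to P2 and pointing each
   element's DR_GROUP_FIRST_ELEMENT at the (possibly also replaced) group
   head.  */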
7670
7671/* Check if the region described by BB_VINFO can be vectorized, returning
7672 true if so. When returning false, set FATAL to true if the same failure
7673 would prevent vectorization at other vector sizes, false if it is still
7674 worth trying other sizes. N_STMTS is the number of statements in the
7675 region. */
7676
7677static bool
7678vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7679 vec<int> *dataref_groups)
7680{
7681 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7682
7683 slp_instance instance;
7684 int i;
7685 poly_uint64 min_vf = 2;
7686
7687 /* The first group of checks is independent of the vector size. */
7688 fatal = true;
7689
7690 /* Analyze the data references. */
7691
7692 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7693 {
7694 if (dump_enabled_p ())
7695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7696 "not vectorized: unhandled data-ref in basic "
7697 "block.\n");
7698 return false;
7699 }
7700
7701 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7702 {
7703 if (dump_enabled_p ())
7704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7705 "not vectorized: unhandled data access in "
7706 "basic block.\n");
7707 return false;
7708 }
7709
7710 vect_slp_check_for_roots (bb_vinfo);
7711
7712 /* If there are no grouped stores and no constructors in the region
7713 there is no need to continue with pattern recog as vect_analyze_slp
7714 will fail anyway. */
7715 if (bb_vinfo->grouped_stores.is_empty ()
7716 && bb_vinfo->roots.is_empty ())
7717 {
7718 if (dump_enabled_p ())
7719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7720 "not vectorized: no grouped stores in "
7721 "basic block.\n");
7722 return false;
7723 }
7724
7725 /* The rest of the analysis below depends on the vector size, so failures are no longer fatal. */
7726 fatal = false;
7727
7728 vect_pattern_recog (bb_vinfo);
7729
7730 /* Update store groups from pattern processing. */
7731 vect_fixup_store_groups_with_patterns (vinfo: bb_vinfo);
7732
7733 /* Check the SLP opportunities in the basic block, analyze and build SLP
7734 trees. */
7735 if (!vect_analyze_slp (vinfo: bb_vinfo, max_tree_size: n_stmts))
7736 {
7737 if (dump_enabled_p ())
7738 {
7739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7740 "Failed to SLP the basic block.\n");
7741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7742 "not vectorized: failed to find SLP opportunities "
7743 "in basic block.\n");
7744 }
7745 return false;
7746 }
7747
7748 /* Optimize permutations. */
7749 vect_optimize_slp (vinfo: bb_vinfo);
7750
7751 /* Gather the loads reachable from the SLP graph entries. */
7752 vect_gather_slp_loads (vinfo: bb_vinfo);
7753
7754 vect_record_base_alignments (bb_vinfo);
7755
7756 /* Analyze and verify the alignment of data references and the
7757 dependence in the SLP instances. */
7758 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (ix: i, ptr: &instance); )
7759 {
7760 vect_location = instance->location ();
7761 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7762 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7763 {
7764 slp_tree node = SLP_INSTANCE_TREE (instance);
7765 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7766 if (dump_enabled_p ())
7767 dump_printf_loc (MSG_NOTE, vect_location,
7768 "removing SLP instance operations starting from: %G",
7769 stmt_info->stmt);
7770 vect_free_slp_instance (instance);
7771 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (ix: i);
7772 continue;
7773 }
7774
7775 /* Mark all the statements that we want to vectorize as pure SLP and
7776 relevant. */
7777 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7778 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7779 unsigned j;
7780 stmt_vec_info root;
7781 /* Likewise consider instance root stmts as vectorized. */
7782 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7783 STMT_SLP_TYPE (root) = pure_slp;
7784
7785 i++;
7786 }
7787 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7788 return false;
7789
7790 if (!vect_slp_analyze_operations (vinfo: bb_vinfo))
7791 {
7792 if (dump_enabled_p ())
7793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7794 "not vectorized: bad operation in basic block.\n");
7795 return false;
7796 }
7797
7798 vect_bb_partition_graph (bb_vinfo);
7799
7800 return true;
7801}
7802
7803/* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7804 basic blocks in BBS, returning true on success.
7805 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7806
7807static bool
7808vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7809 vec<int> *dataref_groups, unsigned int n_stmts,
7810 loop_p orig_loop)
7811{
7812 bb_vec_info bb_vinfo;
7813 auto_vector_modes vector_modes;
7814
7815 /* Autodetect first vector size we try. */
7816 machine_mode next_vector_mode = VOIDmode;
7817 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7818 unsigned int mode_i = 0;
7819
7820 vec_info_shared shared;
7821
7822 machine_mode autodetected_vector_mode = VOIDmode;
7823 while (1)
7824 {
7825 bool vectorized = false;
7826 bool fatal = false;
7827 bb_vinfo = new _bb_vec_info (bbs, &shared);
7828
7829 bool first_time_p = shared.datarefs.is_empty ();
7830 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7831 if (first_time_p)
7832 bb_vinfo->shared->save_datarefs ();
7833 else
7834 bb_vinfo->shared->check_datarefs ();
7835 bb_vinfo->vector_mode = next_vector_mode;
7836
7837 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7838 {
7839 if (dump_enabled_p ())
7840 {
7841 dump_printf_loc (MSG_NOTE, vect_location,
7842 "***** Analysis succeeded with vector mode"
7843 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7844 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7845 }
7846
7847 bb_vinfo->shared->check_datarefs ();
7848
7849 bool force_clear = false;
7850 auto_vec<slp_instance> profitable_subgraphs;
7851 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7852 {
7853 if (instance->subgraph_entries.is_empty ())
7854 continue;
7855
7856 dump_user_location_t saved_vect_location = vect_location;
7857 vect_location = instance->location ();
7858 if (!unlimited_cost_model (NULL)
7859 && !vect_bb_vectorization_profitable_p
7860 (bb_vinfo, slp_instances: instance->subgraph_entries, orig_loop))
7861 {
7862 if (dump_enabled_p ())
7863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7864 "not vectorized: vectorization is not "
7865 "profitable.\n");
7866 vect_location = saved_vect_location;
7867 continue;
7868 }
7869
7870 vect_location = saved_vect_location;
7871 if (!dbg_cnt (index: vect_slp))
7872 {
7873 force_clear = true;
7874 continue;
7875 }
7876
7877 profitable_subgraphs.safe_push (obj: instance);
7878 }
7879
7880 /* When we're vectorizing an if-converted loop body make sure
7881 we vectorized all if-converted code. */
7882 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7883 {
7884 gcc_assert (bb_vinfo->bbs.length () == 1);
7885 for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bb_vinfo->bbs[0]);
7886 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
7887 {
7888 /* The costing above left us with DCEable vectorized scalar
7889 stmts having the visited flag set on profitable
7890 subgraphs. Do the delayed clearing of the flag here. */
7891 if (gimple_visited_p (stmt: gsi_stmt (i: gsi)))
7892 {
7893 gimple_set_visited (stmt: gsi_stmt (i: gsi), visited_p: false);
7894 continue;
7895 }
7896 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7897 continue;
7898
7899 if (gassign *ass = dyn_cast <gassign *> (p: gsi_stmt (i: gsi)))
7900 if (gimple_assign_rhs_code (gs: ass) == COND_EXPR)
7901 {
7902 if (!profitable_subgraphs.is_empty ()
7903 && dump_enabled_p ())
7904 dump_printf_loc (MSG_NOTE, vect_location,
7905 "not profitable because of "
7906 "unprofitable if-converted scalar "
7907 "code\n");
7908 profitable_subgraphs.truncate (size: 0);
7909 }
7910 }
7911 }
7912
7913 /* Finally schedule the profitable subgraphs. */
7914 for (slp_instance instance : profitable_subgraphs)
7915 {
7916 if (!vectorized && dump_enabled_p ())
7917 dump_printf_loc (MSG_NOTE, vect_location,
7918 "Basic block will be vectorized "
7919 "using SLP\n");
7920 vectorized = true;
7921
7922 /* Dump before scheduling as store vectorization will remove
7923 the original stores and mess with the instance tree
7924 so querying its location will eventually ICE. */
7925 if (flag_checking)
7926 for (slp_instance sub : instance->subgraph_entries)
7927 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7928 unsigned HOST_WIDE_INT bytes;
7929 if (dump_enabled_p ())
7930 for (slp_instance sub : instance->subgraph_entries)
7931 {
7932 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7933 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (const_value: &bytes))
7934 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7935 sub->location (),
7936 "basic block part vectorized using %wu "
7937 "byte vectors\n", bytes);
7938 else
7939 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7940 sub->location (),
7941 "basic block part vectorized using "
7942 "variable length vectors\n");
7943 }
7944
7945 dump_user_location_t saved_vect_location = vect_location;
7946 vect_location = instance->location ();
7947
7948 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7949
7950 vect_location = saved_vect_location;
7951 }
7952 }
7953 else
7954 {
7955 if (dump_enabled_p ())
7956 dump_printf_loc (MSG_NOTE, vect_location,
7957 "***** Analysis failed with vector mode %s\n",
7958 GET_MODE_NAME (bb_vinfo->vector_mode));
7959 }
7960
7961 if (mode_i == 0)
7962 autodetected_vector_mode = bb_vinfo->vector_mode;
7963
7964 if (!fatal)
7965 while (mode_i < vector_modes.length ()
7966 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7967 {
7968 if (dump_enabled_p ())
7969 dump_printf_loc (MSG_NOTE, vect_location,
7970 "***** The result for vector mode %s would"
7971 " be the same\n",
7972 GET_MODE_NAME (vector_modes[mode_i]));
7973 mode_i += 1;
7974 }
7975
7976 delete bb_vinfo;
7977
7978 if (mode_i < vector_modes.length ()
7979 && VECTOR_MODE_P (autodetected_vector_mode)
7980 && (related_vector_mode (vector_modes[mode_i],
7981 GET_MODE_INNER (autodetected_vector_mode))
7982 == autodetected_vector_mode)
7983 && (related_vector_mode (autodetected_vector_mode,
7984 GET_MODE_INNER (vector_modes[mode_i]))
7985 == vector_modes[mode_i]))
7986 {
7987 if (dump_enabled_p ())
7988 dump_printf_loc (MSG_NOTE, vect_location,
7989 "***** Skipping vector mode %s, which would"
7990 " repeat the analysis for %s\n",
7991 GET_MODE_NAME (vector_modes[mode_i]),
7992 GET_MODE_NAME (autodetected_vector_mode));
7993 mode_i += 1;
7994 }
7995
7996 if (vectorized
7997 || mode_i == vector_modes.length ()
7998 || autodetected_vector_mode == VOIDmode
7999 /* If vect_slp_analyze_bb_1 signaled that analysis for all
8000 vector sizes will fail do not bother iterating. */
8001 || fatal)
8002 return vectorized;
8003
8004 /* Try the next biggest vector size. */
8005 next_vector_mode = vector_modes[mode_i++];
8006 if (dump_enabled_p ())
8007 dump_printf_loc (MSG_NOTE, vect_location,
8008 "***** Re-trying analysis with vector mode %s\n",
8009 GET_MODE_NAME (next_vector_mode));
8010 }
8011}
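
/* An illustration of the mode skipping above (a hedged sketch; actual
   target behavior may differ): if the autodetected mode was V4SI and the
   next candidate mode is V16QI of the same size, then converting each
   mode to the other's element mode round-trips, so the candidate is
   skipped since analyzing it would just repeat the V4SI analysis.  */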
8012
8013
8014/* Main entry for the BB vectorizer. Analyze and transform BBS, returns
8015 true if anything in the basic-block was vectorized. */
8016
8017static bool
8018vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
8019{
8020 vec<data_reference_p> datarefs = vNULL;
8021 auto_vec<int> dataref_groups;
8022 int insns = 0;
8023 int current_group = 0;
8024
8025 for (unsigned i = 0; i < bbs.length (); i++)
8026 {
8027 basic_block bb = bbs[i];
8028 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (i: gsi);
8029 gsi_next (i: &gsi))
8030 {
8031 gimple *stmt = gsi_stmt (i: gsi);
8032 if (is_gimple_debug (gs: stmt))
8033 continue;
8034
8035 insns++;
8036
8037 if (gimple_location (g: stmt) != UNKNOWN_LOCATION)
8038 vect_location = stmt;
8039
8040 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
8041 &dataref_groups, current_group))
8042 ++current_group;
8043 }
8044 /* New BBs always start a new DR group. */
8045 ++current_group;
8046 }
8047
8048 return vect_slp_region (bbs, datarefs, dataref_groups: &dataref_groups, n_stmts: insns, orig_loop);
8049}
8050
8051/* Special entry for the BB vectorizer. Analyze and transform a single
8052 if-converted BB with ORIG_LOOP's body being the not-if-converted
8053 representation. Returns true if anything in the basic-block was
8054 vectorized. */
8055
8056bool
8057vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
8058{
8059 auto_vec<basic_block> bbs;
8060 bbs.safe_push (obj: bb);
8061 return vect_slp_bbs (bbs, orig_loop);
8062}
8063
8064/* Main entry for the BB vectorizer. Analyze and transform the basic blocks
8065 of function FUN, returning true if anything was vectorized. */
8066
8067bool
8068vect_slp_function (function *fun)
8069{
8070 bool r = false;
8071 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
8072 auto_bitmap exit_bbs;
8073 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
8074 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
8075 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
8076 true, rpo, NULL);
8077
8078 /* For the moment split the function into pieces to avoid making
8079 the iteration on the vector mode moot. Split at points we know
8080 we do not handle well, which are CFG merges (SLP discovery doesn't
8081 handle non-loop-header PHIs) and loop exits. Since pattern
8082 recog requires reverse iteration to visit uses before defs,
8083 simply chop the RPO into pieces. */
8084 auto_vec<basic_block> bbs;
8085 for (unsigned i = 0; i < n; i++)
8086 {
8087 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
8088 bool split = false;
8089
8090 /* Split when a BB is not dominated by the first block. */
8091 if (!bbs.is_empty ()
8092 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
8093 {
8094 if (dump_enabled_p ())
8095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8096 "splitting region at dominance boundary bb%d\n",
8097 bb->index);
8098 split = true;
8099 }
8100 /* Split when the loop determined by the first block
8101 is exited. This is because we eventually insert
8102 invariants at region begin. */
8103 else if (!bbs.is_empty ()
8104 && bbs[0]->loop_father != bb->loop_father
8105 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
8106 {
8107 if (dump_enabled_p ())
8108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8109 "splitting region at loop %d exit at bb%d\n",
8110 bbs[0]->loop_father->num, bb->index);
8111 split = true;
8112 }
8113 else if (!bbs.is_empty ()
8114 && bb->loop_father->header == bb
8115 && bb->loop_father->dont_vectorize)
8116 {
8117 if (dump_enabled_p ())
8118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8119 "splitting region at dont-vectorize loop %d "
8120 "entry at bb%d\n",
8121 bb->loop_father->num, bb->index);
8122 split = true;
8123 }
8124
8125 if (split && !bbs.is_empty ())
8126 {
8127 r |= vect_slp_bbs (bbs, NULL);
8128 bbs.truncate (size: 0);
8129 }
8130
8131 if (bbs.is_empty ())
8132 {
8133 /* We need to be able to insert at the head of the region, which
8134 we cannot do for a region starting with a returns-twice call. */
8135 if (gcall *first = safe_dyn_cast <gcall *> (p: first_stmt (bb)))
8136 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
8137 {
8138 if (dump_enabled_p ())
8139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8140 "skipping bb%d as start of region as it "
8141 "starts with returns-twice call\n",
8142 bb->index);
8143 continue;
8144 }
8145 /* If the loop this BB belongs to is marked as not to be vectorized
8146 honor that also for BB vectorization. */
8147 if (bb->loop_father->dont_vectorize)
8148 continue;
8149 }
8150
8151 bbs.safe_push (obj: bb);
8152
8153 /* When we have a stmt ending this block and defining a
8154 value, we would have to insert on edges when inserting after it
8155 a vector containing its definition. Avoid this for now. */
8156 if (gimple *last = *gsi_last_bb (bb))
8157 if (gimple_get_lhs (last)
8158 && is_ctrl_altering_stmt (last))
8159 {
8160 if (dump_enabled_p ())
8161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8162 "splitting region at control altering "
8163 "definition %G", last);
8164 r |= vect_slp_bbs (bbs, NULL);
8165 bbs.truncate (size: 0);
8166 }
8167 }
8168
8169 if (!bbs.is_empty ())
8170 r |= vect_slp_bbs (bbs, NULL);
8171
8172 free (ptr: rpo);
8173
8174 return r;
8175}
8176
8177/* Build a variable-length vector in which the elements in ELTS are repeated
8178 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
8179 RESULTS and add any new instructions to SEQ.
8180
8181 The approach we use is:
8182
8183 (1) Find a vector mode VM with integer elements of mode IM.
8184
8185 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8186 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
8187 from small vectors to IM.
8188
8189 (3) Duplicate each ELTS'[I] into a vector of mode VM.
8190
8191 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8192 correct byte contents.
8193
8194 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8195
8196 We try to find the largest IM for which this sequence works, in order
8197 to cut down on the number of interleaves. */
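/* For illustration of the steps above (a hedged sketch, assuming the
   target allows IM = DImode for this case): with ELTS = {a, b, c, d} of
   32-bit elements and NVECTORS = 2, step (2) builds the two V2SI vectors
   {a, b} and {c, d} and view-converts each to a DImode value, step (3)
   duplicates each DImode value across a vector of mode VM, step (4)
   interleaves the two duplicates so the byte contents repeat the
   sequence a, b, c, d, and step (5) view-converts the result to
   VECTOR_TYPE, yielding { a, b, c, d, a, b, c, d, ... }.  */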
8198
8199void
8200duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8201 const vec<tree> &elts, unsigned int nresults,
8202 vec<tree> &results)
8203{
8204 unsigned int nelts = elts.length ();
8205 tree element_type = TREE_TYPE (vector_type);
8206
8207 /* (1) Find a vector mode VM with integer elements of mode IM. */
8208 unsigned int nvectors = 1;
8209 tree new_vector_type;
8210 tree permutes[2];
8211 if (!can_duplicate_and_interleave_p (vinfo, count: nelts, elt_type: element_type,
8212 nvectors_out: &nvectors, vector_type_out: &new_vector_type,
8213 permutes))
8214 gcc_unreachable ();
8215
8216 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8217 unsigned int partial_nelts = nelts / nvectors;
8218 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8219
8220 tree_vector_builder partial_elts;
8221 auto_vec<tree, 32> pieces (nvectors * 2);
8222 pieces.quick_grow_cleared (len: nvectors * 2);
8223 for (unsigned int i = 0; i < nvectors; ++i)
8224 {
8225 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8226 ELTS' has mode IM. */
8227 partial_elts.new_vector (type: partial_vector_type, npatterns: partial_nelts, nelts_per_pattern: 1);
8228 for (unsigned int j = 0; j < partial_nelts; ++j)
8229 partial_elts.quick_push (obj: elts[i * partial_nelts + j]);
8230 tree t = gimple_build_vector (seq, builder: &partial_elts);
8231 t = gimple_build (seq, code: VIEW_CONVERT_EXPR,
8232 TREE_TYPE (new_vector_type), ops: t);
8233
8234 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8235 pieces[i] = gimple_build_vector_from_val (seq, type: new_vector_type, op: t);
8236 }
8237
8238 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8239 correct byte contents.
8240
8241 Conceptually, we need to repeat the following operation log2(nvectors)
8242 times, where hi_start = nvectors / 2:
8243
8244 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8245 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8246
8247 However, if each input repeats every N elements and the VF is
8248 a multiple of N * 2, the HI result is the same as the LO result.
8249 This will be true for the first N1 iterations of the outer loop,
8250 followed by N2 iterations for which both the LO and HI results
8251 are needed. I.e.:
8252
8253 N1 + N2 = log2(nvectors)
8254
8255 Each "N1 iteration" doubles the number of redundant vectors and the
8256 effect of the process as a whole is to have a sequence of nvectors/2**N1
8257 vectors that repeats 2**N1 times. Rather than generate these redundant
8258 vectors, we halve the number of vectors for each N1 iteration. */
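  /* For instance (illustrative): with nvectors == 4 there are two rounds
     below.  If the first round is an "N1 iteration" and the second is not,
     only the two LO outputs of the first round are kept (new_nvectors drops
     from 4 to 2) and the end result is a sequence of two vectors repeated
     twice, matching the nvectors/2**N1 vectors repeating 2**N1 times
     described above.  */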
8259 unsigned int in_start = 0;
8260 unsigned int out_start = nvectors;
8261 unsigned int new_nvectors = nvectors;
8262 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8263 {
8264 unsigned int hi_start = new_nvectors / 2;
8265 unsigned int out_i = 0;
8266 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8267 {
8268 if ((in_i & 1) != 0
8269 && multiple_p (a: TYPE_VECTOR_SUBPARTS (node: new_vector_type),
8270 b: 2 * in_repeat))
8271 continue;
8272
8273 tree output = make_ssa_name (var: new_vector_type);
8274 tree input1 = pieces[in_start + (in_i / 2)];
8275 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8276 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8277 input1, input2,
8278 permutes[in_i & 1]);
8279 gimple_seq_add_stmt (seq, stmt);
8280 pieces[out_start + out_i] = output;
8281 out_i += 1;
8282 }
8283 std::swap (a&: in_start, b&: out_start);
8284 new_nvectors = out_i;
8285 }
8286
8287 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8288 results.reserve (nelems: nresults);
8289 for (unsigned int i = 0; i < nresults; ++i)
8290 if (i < new_nvectors)
8291 results.quick_push (obj: gimple_build (seq, code: VIEW_CONVERT_EXPR, type: vector_type,
8292 ops: pieces[in_start + i]));
8293 else
8294 results.quick_push (obj: results[i - new_nvectors]);
8295}
8296
8297
8298/* For constant and loop invariant defs in OP_NODE this function creates
8299 vector defs that will be used in the vectorized stmts and stores them
8300 to SLP_TREE_VEC_DEFS of OP_NODE. */
8301
8302static void
8303vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8304{
8305 unsigned HOST_WIDE_INT nunits;
8306 tree vec_cst;
8307 unsigned j, number_of_places_left_in_vector;
8308 tree vector_type;
8309 tree vop;
8310 int group_size = op_node->ops.length ();
8311 unsigned int vec_num, i;
8312 unsigned number_of_copies = 1;
8313 bool constant_p;
8314 gimple_seq ctor_seq = NULL;
8315 auto_vec<tree, 16> permute_results;
8316
8317 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8318 vector_type = SLP_TREE_VECTYPE (op_node);
8319
8320 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8321 SLP_TREE_VEC_DEFS (op_node).create (nelems: number_of_vectors);
8322 auto_vec<tree> voprnds (number_of_vectors);
8323
8324 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8325 created vectors. It is greater than 1 if unrolling is performed.
8326
8327 For example, we have two scalar operands, s1 and s2 (e.g., group of
8328 strided accesses of size two), while NUNITS is four (i.e., four scalars
8329 of this type can be packed in a vector). The output vector will contain
8330 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8331 will be 2).
8332
8333 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8334 containing the operands.
8335
8336 For example, NUNITS is four as before, and the group size is 8
8337 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8338 {s5, s6, s7, s8}. */
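
  /* Concretely (illustrative), NUMBER_OF_COPIES is computed below as
     NUNITS * NUMBER_OF_VECTORS / GROUP_SIZE: with NUNITS == 4,
     NUMBER_OF_VECTORS == 1 and GROUP_SIZE == 2 this yields 2 copies,
     matching the {s1, s2, s1, s2} example above; with GROUP_SIZE == 8 and
     NUMBER_OF_VECTORS == 2 it yields 1 copy split over two vectors.  */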
8339
8340 /* When using duplicate_and_interleave, we just need one element for
8341 each scalar statement. */
8342 if (!TYPE_VECTOR_SUBPARTS (node: vector_type).is_constant (const_value: &nunits))
8343 nunits = group_size;
8344
8345 number_of_copies = nunits * number_of_vectors / group_size;
8346
8347 number_of_places_left_in_vector = nunits;
8348 constant_p = true;
8349 tree uniform_elt = NULL_TREE;
8350 tree_vector_builder elts (vector_type, nunits, 1);
8351 elts.quick_grow (len: nunits);
8352 stmt_vec_info insert_after = NULL;
8353 for (j = 0; j < number_of_copies; j++)
8354 {
8355 tree op;
8356 for (i = group_size - 1; op_node->ops.iterate (ix: i, ptr: &op); i--)
8357 {
8358 /* Create 'vect_ = {op0,op1,...,opn}'. */
8359 tree orig_op = op;
8360 if (number_of_places_left_in_vector == nunits)
8361 uniform_elt = op;
8362 else if (uniform_elt && operand_equal_p (uniform_elt, op))
8363 op = elts[number_of_places_left_in_vector];
8364 else
8365 uniform_elt = NULL_TREE;
8366 number_of_places_left_in_vector--;
8367 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8368 {
8369 if (CONSTANT_CLASS_P (op))
8370 {
8371 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8372 {
8373 /* Can't use VIEW_CONVERT_EXPR for booleans because
8374 of possibly different sizes of scalar value and
8375 vector element. */
8376 if (integer_zerop (op))
8377 op = build_int_cst (TREE_TYPE (vector_type), 0);
8378 else if (integer_onep (op))
8379 op = build_all_ones_cst (TREE_TYPE (vector_type));
8380 else
8381 gcc_unreachable ();
8382 }
8383 else
8384 op = fold_unary (VIEW_CONVERT_EXPR,
8385 TREE_TYPE (vector_type), op);
8386 gcc_assert (op && CONSTANT_CLASS_P (op));
8387 }
8388 else
8389 {
8390 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8391 gimple *init_stmt;
8392 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8393 {
8394 tree true_val
8395 = build_all_ones_cst (TREE_TYPE (vector_type));
8396 tree false_val
8397 = build_zero_cst (TREE_TYPE (vector_type));
8398 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8399 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8400 op, true_val,
8401 false_val);
8402 }
8403 else
8404 {
8405 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8406 op);
8407 init_stmt
8408 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8409 op);
8410 }
8411 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8412 op = new_temp;
8413 }
8414 }
8415 elts[number_of_places_left_in_vector] = op;
8416 if (!CONSTANT_CLASS_P (op))
8417 constant_p = false;
8418 /* For BB vectorization we have to compute an insert location
8419 when a def is inside the analyzed region since we cannot
8420 simply insert at the BB start in this case. */
8421 stmt_vec_info opdef;
8422 if (TREE_CODE (orig_op) == SSA_NAME
8423 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8424 && is_a <bb_vec_info> (p: vinfo)
8425 && (opdef = vinfo->lookup_def (orig_op)))
8426 {
8427 if (!insert_after)
8428 insert_after = opdef;
8429 else
8430 insert_after = get_later_stmt (stmt1_info: insert_after, stmt2_info: opdef);
8431 }
8432
8433 if (number_of_places_left_in_vector == 0)
8434 {
8435 auto type_nunits = TYPE_VECTOR_SUBPARTS (node: vector_type);
8436 if (uniform_elt)
8437 vec_cst = gimple_build_vector_from_val (seq: &ctor_seq, type: vector_type,
8438 op: elts[0]);
8439 else if (constant_p
8440 ? multiple_p (a: type_nunits, b: nunits)
8441 : known_eq (type_nunits, nunits))
8442 vec_cst = gimple_build_vector (seq: &ctor_seq, builder: &elts);
8443 else
8444 {
8445 if (permute_results.is_empty ())
8446 duplicate_and_interleave (vinfo, seq: &ctor_seq, vector_type,
8447 elts, nresults: number_of_vectors,
8448 results&: permute_results);
8449 vec_cst = permute_results[number_of_vectors - j - 1];
8450 }
8451 if (!gimple_seq_empty_p (s: ctor_seq))
8452 {
8453 if (insert_after)
8454 {
8455 gimple_stmt_iterator gsi;
8456 if (gimple_code (g: insert_after->stmt) == GIMPLE_PHI)
8457 {
8458 gsi = gsi_after_labels (bb: gimple_bb (g: insert_after->stmt));
8459 gsi_insert_seq_before (&gsi, ctor_seq,
8460 GSI_CONTINUE_LINKING);
8461 }
8462 else if (!stmt_ends_bb_p (insert_after->stmt))
8463 {
8464 gsi = gsi_for_stmt (insert_after->stmt);
8465 gsi_insert_seq_after (&gsi, ctor_seq,
8466 GSI_CONTINUE_LINKING);
8467 }
8468 else
8469 {
8470 /* When we want to insert after a def where the
8471 defining stmt throws then insert on the fallthru
8472 edge. */
8473 edge e = find_fallthru_edge
8474 (edges: gimple_bb (g: insert_after->stmt)->succs);
8475 basic_block new_bb
8476 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8477 gcc_assert (!new_bb);
8478 }
8479 }
8480 else
8481 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8482 ctor_seq = NULL;
8483 }
8484 voprnds.quick_push (obj: vec_cst);
8485 insert_after = NULL;
8486 number_of_places_left_in_vector = nunits;
8487 constant_p = true;
8488 elts.new_vector (type: vector_type, npatterns: nunits, nelts_per_pattern: 1);
8489 elts.quick_grow (len: nunits);
8490 }
8491 }
8492 }
8493
8494 /* Since the vectors are created in reverse order, we have to reverse
8495 them here. */
8496 vec_num = voprnds.length ();
8497 for (j = vec_num; j != 0; j--)
8498 {
8499 vop = voprnds[j - 1];
8500 SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop);
8501 }
8502
8503 /* If the VF is greater than the unrolling factor needed for the SLP
8504 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8505 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8506 to replicate the vectors. */
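  /* E.g. (illustrative): if NUMBER_OF_VECTORS is 4 but only two distinct
     defs v0, v1 were created above, the loop below pushes them again,
     yielding v0, v1, v0, v1.  */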
8507 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8508 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (ix: i, ptr: &vop) && i < vec_num;
8509 i++)
8510 SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop);
8511}
8512
8513/* Get the Ith vectorized definition from SLP_NODE. */
8514
8515tree
8516vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8517{
8518 return SLP_TREE_VEC_DEFS (slp_node)[i];
8519}
8520
8521/* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8522
8523void
8524vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8525{
8526 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8527 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8528}
8529
8530/* Get N vectorized definitions for SLP_NODE. */
8531
8532void
8533vect_get_slp_defs (vec_info *,
8534 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8535{
8536 if (n == -1U)
8537 n = SLP_TREE_CHILDREN (slp_node).length ();
8538
8539 for (unsigned i = 0; i < n; ++i)
8540 {
8541 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8542 vec<tree> vec_defs = vNULL;
8543 vect_get_slp_defs (slp_node: child, vec_defs: &vec_defs);
8544 vec_oprnds->quick_push (obj: vec_defs);
8545 }
8546}
8547
8548/* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8549 - PERM gives the permutation that the caller wants to use for NODE,
8550 which might be different from SLP_TREE_LOAD_PERMUTATION.
8551 - DUMP_P controls whether the function dumps information. */
8552
8553static bool
8554vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8555 load_permutation_t &perm,
8556 const vec<tree> &dr_chain,
8557 gimple_stmt_iterator *gsi, poly_uint64 vf,
8558 bool analyze_only, bool dump_p,
8559 unsigned *n_perms, unsigned int *n_loads,
8560 bool dce_chain)
8561{
8562 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8563 int vec_index = 0;
8564 tree vectype = SLP_TREE_VECTYPE (node);
8565 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8566 unsigned int mask_element;
8567 unsigned dr_group_size;
8568 machine_mode mode;
8569
8570 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8571 dr_group_size = 1;
8572 else
8573 {
8574 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8575 dr_group_size = DR_GROUP_SIZE (stmt_info);
8576 }
8577
8578 mode = TYPE_MODE (vectype);
8579 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype);
8580 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8581
8582 /* Initialize the vect stmts of NODE to properly insert the generated
8583 stmts later. */
8584 if (! analyze_only)
8585 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8586 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8587
8588 /* Generate permutation masks for every NODE. Number of masks for each NODE
8589 is equal to GROUP_SIZE.
8590 E.g., we have a group of three nodes with three loads from the same
8591 location in each node, and the vector size is 4. I.e., we have an
8592 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8593 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8594 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8595 ...
8596
8597 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8598 The last mask is illegal since we assume two operands for a permute
8599 operation, so the mask element values can't index outside those two inputs.
8600 Hence, the last mask must be converted into {2,5,5,5}.
8601 For the first two permutations we need the first and the second input
8602 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8603 we need the second and the third vectors: {b1,c1,a2,b2} and
8604 {c2,a3,b3,c3}. */
8605
8606 int vect_stmts_counter = 0;
8607 unsigned int index = 0;
8608 int first_vec_index = -1;
8609 int second_vec_index = -1;
8610 bool noop_p = true;
8611 *n_perms = 0;
8612
8613 vec_perm_builder mask;
8614 unsigned int nelts_to_build;
8615 unsigned int nvectors_per_build;
8616 unsigned int in_nlanes;
8617 bool repeating_p = (group_size == dr_group_size
8618 && multiple_p (a: nunits, b: group_size));
8619 if (repeating_p)
8620 {
8621 /* A single vector contains a whole number of copies of the node, so:
8622 (a) all permutes can use the same mask; and
8623 (b) the permutes only need a single vector input. */
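      /* E.g. (illustrative): for GROUP_SIZE == 2 the encoding below uses
	 2 patterns of 3 elements each, i.e. 6 encoded mask elements,
	 independently of the (possibly variable) number of vector units.  */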
8624 mask.new_vector (full_nelts: nunits, npatterns: group_size, nelts_per_pattern: 3);
8625 nelts_to_build = mask.encoded_nelts ();
8626 /* It's possible to obtain zero nstmts during analyze_only, so make
8627 it at least one to ensure the later computation for n_perms
8628 proceeds. */
8629 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8630 in_nlanes = dr_group_size * 3;
8631 }
8632 else
8633 {
8634 /* We need to construct a separate mask for each vector statement. */
8635 unsigned HOST_WIDE_INT const_nunits, const_vf;
8636 if (!nunits.is_constant (const_value: &const_nunits)
8637 || !vf.is_constant (const_value: &const_vf))
8638 return false;
8639 mask.new_vector (full_nelts: const_nunits, npatterns: const_nunits, nelts_per_pattern: 1);
8640 nelts_to_build = const_vf * group_size;
8641 nvectors_per_build = 1;
8642 in_nlanes = const_vf * dr_group_size;
8643 }
8644 auto_sbitmap used_in_lanes (in_nlanes);
8645 bitmap_clear (used_in_lanes);
8646 auto_bitmap used_defs;
8647
8648 unsigned int count = mask.encoded_nelts ();
8649 mask.quick_grow (len: count);
8650 vec_perm_indices indices;
8651
8652 for (unsigned int j = 0; j < nelts_to_build; j++)
8653 {
8654 unsigned int iter_num = j / group_size;
8655 unsigned int stmt_num = j % group_size;
8656 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8657 bitmap_set_bit (map: used_in_lanes, bitno: i);
8658 if (repeating_p)
8659 {
8660 first_vec_index = 0;
8661 mask_element = i;
8662 }
8663 else
8664 {
8665 /* Enforced before the loop when !repeating_p. */
8666 unsigned int const_nunits = nunits.to_constant ();
8667 vec_index = i / const_nunits;
8668 mask_element = i % const_nunits;
8669 if (vec_index == first_vec_index
8670 || first_vec_index == -1)
8671 {
8672 first_vec_index = vec_index;
8673 }
8674 else if (vec_index == second_vec_index
8675 || second_vec_index == -1)
8676 {
8677 second_vec_index = vec_index;
8678 mask_element += const_nunits;
8679 }
8680 else
8681 {
8682 if (dump_p)
8683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8684 "permutation requires at "
8685 "least three vectors %G",
8686 stmt_info->stmt);
8687 gcc_assert (analyze_only);
8688 return false;
8689 }
8690
8691 gcc_assert (mask_element < 2 * const_nunits);
8692 }
8693
8694 if (mask_element != index)
8695 noop_p = false;
8696 mask[index++] = mask_element;
8697
8698 if (index == count)
8699 {
8700 if (!noop_p)
8701 {
8702 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8703 if (!can_vec_perm_const_p (mode, mode, indices))
8704 {
8705 if (dump_p)
8706 {
8707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8708 "unsupported vect permute { ");
8709 for (i = 0; i < count; ++i)
8710 {
8711 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8712 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8713 }
8714 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8715 }
8716 gcc_assert (analyze_only);
8717 return false;
8718 }
8719
8720 tree mask_vec = NULL_TREE;
8721 if (!analyze_only)
8722 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8723
8724 if (second_vec_index == -1)
8725 second_vec_index = first_vec_index;
8726
8727 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8728 {
8729 ++*n_perms;
8730 if (analyze_only)
8731 continue;
8732 /* Generate the permute statement if necessary. */
8733 tree first_vec = dr_chain[first_vec_index + ri];
8734 tree second_vec = dr_chain[second_vec_index + ri];
8735 gassign *stmt = as_a<gassign *> (p: stmt_info->stmt);
8736 tree perm_dest
8737 = vect_create_destination_var (gimple_assign_lhs (gs: stmt),
8738 vectype);
8739 perm_dest = make_ssa_name (var: perm_dest);
8740 gimple *perm_stmt
8741 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8742 second_vec, mask_vec);
8743 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8744 gsi);
8745 if (dce_chain)
8746 {
8747 bitmap_set_bit (used_defs, first_vec_index + ri);
8748 bitmap_set_bit (used_defs, second_vec_index + ri);
8749 }
8750
8751 /* Store the vector statement in NODE. */
8752 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8753 }
8754 }
8755 else if (!analyze_only)
8756 {
8757 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8758 {
8759 tree first_vec = dr_chain[first_vec_index + ri];
8760 /* If mask was NULL_TREE generate the requested
8761 identity transform. */
8762 if (dce_chain)
8763 bitmap_set_bit (used_defs, first_vec_index + ri);
8764
8765 /* Store the vector statement in NODE. */
8766 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8767 }
8768 }
8769
8770 index = 0;
8771 first_vec_index = -1;
8772 second_vec_index = -1;
8773 noop_p = true;
8774 }
8775 }
8776
8777 if (n_loads)
8778 {
8779 if (repeating_p)
8780 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8781 else
8782 {
8783 /* Enforced above when !repeating_p. */
8784 unsigned int const_nunits = nunits.to_constant ();
8785 *n_loads = 0;
8786 bool load_seen = false;
8787 for (unsigned i = 0; i < in_nlanes; ++i)
8788 {
8789 if (i % const_nunits == 0)
8790 {
8791 if (load_seen)
8792 *n_loads += 1;
8793 load_seen = false;
8794 }
8795 if (bitmap_bit_p (map: used_in_lanes, bitno: i))
8796 load_seen = true;
8797 }
8798 if (load_seen)
8799 *n_loads += 1;
8800 }
8801 }
8802
8803 if (dce_chain)
8804 for (unsigned i = 0; i < dr_chain.length (); ++i)
8805 if (!bitmap_bit_p (used_defs, i))
8806 {
8807 tree def = dr_chain[i];
8808 do
8809 {
8810 gimple *stmt = SSA_NAME_DEF_STMT (def);
8811 if (is_gimple_assign (gs: stmt)
8812 && (gimple_assign_rhs_code (gs: stmt) == VIEW_CONVERT_EXPR
8813 || gimple_assign_rhs_code (gs: stmt) == CONSTRUCTOR))
8814 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8815 else
8816 def = NULL;
8817 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8818 gsi_remove (&rgsi, true);
8819 release_defs (stmt);
8820 }
8821 while (def);
8822 }
8823
8824 return true;
8825}
8826
8827/* Generate vector permute statements from a list of loads in DR_CHAIN.
8828 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8829 permute statements for the SLP node NODE. Store the number of vector
8830 permute instructions in *N_PERMS and the number of vector load
8831 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8832 that were not needed. */
8833
8834bool
8835vect_transform_slp_perm_load (vec_info *vinfo,
8836 slp_tree node, const vec<tree> &dr_chain,
8837 gimple_stmt_iterator *gsi, poly_uint64 vf,
8838 bool analyze_only, unsigned *n_perms,
8839 unsigned int *n_loads, bool dce_chain)
8840{
8841 return vect_transform_slp_perm_load_1 (vinfo, node,
8842 SLP_TREE_LOAD_PERMUTATION (node),
8843 dr_chain, gsi, vf, analyze_only,
8844 dump_p: dump_enabled_p (), n_perms, n_loads,
8845 dce_chain);
8846}
8847
8848/* Produce the next vector result for SLP permutation NODE by adding a vector
8849 statement at GSI. If MASK_VEC is nonnull, add:
8850
8851 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8852
8853 otherwise add:
8854
8855 <new SSA name> = FIRST_DEF. */
8856
8857static void
8858vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8859 slp_tree node, tree first_def, tree second_def,
8860 tree mask_vec, poly_uint64 identity_offset)
8861{
8862 tree vectype = SLP_TREE_VECTYPE (node);
8863
8864 /* ??? We SLP match existing vector element extracts but
8865 allow punning which we need to re-instantiate at uses
8866 but have no good way of explicitly representing. */
8867 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8868 && !types_compatible_p (TREE_TYPE (first_def), type2: vectype))
8869 {
8870 gassign *conv_stmt
8871 = gimple_build_assign (make_ssa_name (var: vectype),
8872 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8873 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8874 first_def = gimple_assign_lhs (gs: conv_stmt);
8875 }
8876 gassign *perm_stmt;
8877 tree perm_dest = make_ssa_name (var: vectype);
8878 if (mask_vec)
8879 {
8880 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8881 TYPE_SIZE (vectype))
8882 && !types_compatible_p (TREE_TYPE (second_def), type2: vectype))
8883 {
8884 gassign *conv_stmt
8885 = gimple_build_assign (make_ssa_name (var: vectype),
8886 build1 (VIEW_CONVERT_EXPR,
8887 vectype, second_def));
8888 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8889 second_def = gimple_assign_lhs (gs: conv_stmt);
8890 }
8891 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8892 first_def, second_def,
8893 mask_vec);
8894 }
8895 else if (!types_compatible_p (TREE_TYPE (first_def), type2: vectype))
8896 {
8897 /* For identity permutes we still need to handle the case
8898 of offsetted extracts or concats. */
8899 unsigned HOST_WIDE_INT c;
8900 auto first_def_nunits
8901 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8902 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8903 {
8904 unsigned HOST_WIDE_INT elsz
8905 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8906 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8907 TYPE_SIZE (vectype),
8908 bitsize_int (identity_offset * elsz));
8909 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8910 }
8911 else if (constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vectype),
8912 b: first_def_nunits, multiple: &c) && c == 2)
8913 {
8914 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8915 NULL_TREE, second_def);
8916 perm_stmt = gimple_build_assign (perm_dest, ctor);
8917 }
8918 else
8919 gcc_unreachable ();
8920 }
8921 else
8922 {
8923 /* We need a copy here in case the def was external. */
8924 perm_stmt = gimple_build_assign (perm_dest, first_def);
8925 }
8926 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8927 /* Store the vector statement in NODE. */
8928 node->push_vec_def (def: perm_stmt);
8929}
8930
8931/* Subroutine of vectorizable_slp_permutation. Check whether the target
8932 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8933 If GSI is nonnull, emit the permutation there.
8934
8935 When GSI is null, the only purpose of NODE is to give properties
8936 of the result, such as the vector type and number of SLP lanes.
8937 The node does not need to be a VEC_PERM_EXPR.
8938
8939 If the target supports the operation, return the number of individual
8940 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8941 dump file if DUMP_P is true. */
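
/* For example (illustrative): a repeating two-lane lane-reversal of a
   single child needs just one shared permute mask, so this function would
   typically return 1 when the target supports the corresponding
   VEC_PERM_EXPR.  */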
8942
8943static int
8944vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8945 slp_tree node, lane_permutation_t &perm,
8946 vec<slp_tree> &children, bool dump_p)
8947{
8948 tree vectype = SLP_TREE_VECTYPE (node);
8949
8950 /* ??? We currently only support all same vector input types
8951 while the SLP IL should really do a concat + select and thus accept
8952 arbitrary mismatches. */
8953 slp_tree child;
8954 unsigned i;
8955 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype);
8956 bool repeating_p = multiple_p (a: nunits, SLP_TREE_LANES (node));
8957 tree op_vectype = NULL_TREE;
8958 FOR_EACH_VEC_ELT (children, i, child)
8959 if (SLP_TREE_VECTYPE (child))
8960 {
8961 op_vectype = SLP_TREE_VECTYPE (child);
8962 break;
8963 }
8964 if (!op_vectype)
8965 op_vectype = vectype;
8966 FOR_EACH_VEC_ELT (children, i, child)
8967 {
8968 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8969 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8970 || !types_compatible_p (SLP_TREE_VECTYPE (child), type2: op_vectype)
8971 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8972 {
8973 if (dump_p)
8974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8975 "Unsupported vector types in lane permutation\n");
8976 return -1;
8977 }
8978 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8979 repeating_p = false;
8980 }
8981
8982 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8983 if (dump_p)
8984 {
8985 dump_printf_loc (MSG_NOTE, vect_location,
8986 "vectorizing permutation");
8987 for (unsigned i = 0; i < perm.length (); ++i)
8988 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8989 if (repeating_p)
8990 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8991 dump_printf (MSG_NOTE, "\n");
8992 }
8993
8994 /* REPEATING_P is true if every output vector is guaranteed to use the
8995 same permute vector. We can handle that case for both variable-length
8996 and constant-length vectors, but we only handle other cases for
8997 constant-length vectors.
8998
8999 Set:
9000
9001 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
9002 mask vector that we want to build.
9003
9004 - NCOPIES to the number of copies of PERM that we need in order
9005 to build the necessary permute mask vectors.
9006
9007 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
9008 for each permute mask vector. This is only relevant when GSI is
9009 nonnull. */
9010 uint64_t npatterns;
9011 unsigned nelts_per_pattern;
9012 uint64_t ncopies;
9013 unsigned noutputs_per_mask;
9014 if (repeating_p)
9015 {
9016 /* We need a single permute mask vector that has the form:
9017
9018 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
9019
9020 In other words, the original n-element permute in PERM is
9021 "unrolled" to fill a full vector. The stepped vector encoding
9022 that we use for permutes requires 3n elements. */
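      /* E.g. (illustrative): a two-lane reversal PERM = { op0[1], op0[0] }
	 is encoded as the 3n == 6 element mask { 1, 0, 3, 2, 5, 4 }.  */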
9023 npatterns = SLP_TREE_LANES (node);
9024 nelts_per_pattern = ncopies = 3;
9025 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9026 }
9027 else
9028 {
9029 /* Calculate every element of every permute mask vector explicitly,
9030 instead of relying on the pattern described above. */
9031 if (!nunits.is_constant (const_value: &npatterns)
9032 || !TYPE_VECTOR_SUBPARTS (node: op_vectype).is_constant ())
9033 return -1;
9034 nelts_per_pattern = ncopies = 1;
9035 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (p: vinfo))
9036 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (const_value: &ncopies))
9037 return -1;
9038 noutputs_per_mask = 1;
9039 }
9040 unsigned olanes = ncopies * SLP_TREE_LANES (node);
9041 gcc_assert (repeating_p || multiple_p (olanes, nunits));
9042
9043 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
9044 from the { SLP operand, scalar lane } permutation as recorded in the
9045 SLP node as an intermediate step. This part should already work
9046 with SLP children with an arbitrary number of lanes. */
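  /* E.g. (illustrative): a repeating two-lane reversal { op0[1], op0[0] }
     of a single child yields vperm = {{0,0},1} {{0,0},0} {{0,0},3}
     {{0,0},2} {{0,0},5} {{0,0},4}: always vector 0 of operand 0, with the
     lane numbers advanced by the child's lane count for each of the three
     encoded copies.  */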
9047 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
9048 auto_vec<unsigned> active_lane;
9049 vperm.create (nelems: olanes);
9050 active_lane.safe_grow_cleared (len: children.length (), exact: true);
9051 for (unsigned i = 0; i < ncopies; ++i)
9052 {
9053 for (unsigned pi = 0; pi < perm.length (); ++pi)
9054 {
9055 std::pair<unsigned, unsigned> p = perm[pi];
9056 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
9057 if (repeating_p)
9058 vperm.quick_push (obj: {{p.first, 0}, p.second + active_lane[p.first]});
9059 else
9060 {
9061 /* We checked above that the vectors are constant-length. */
9062 unsigned vnunits = TYPE_VECTOR_SUBPARTS (node: vtype).to_constant ();
9063 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
9064 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
9065 vperm.quick_push (obj: {{p.first, vi}, vl});
9066 }
9067 }
9068 /* Advance to the next group. */
9069 for (unsigned j = 0; j < children.length (); ++j)
9070 active_lane[j] += SLP_TREE_LANES (children[j]);
9071 }
9072
9073 if (dump_p)
9074 {
9075 dump_printf_loc (MSG_NOTE, vect_location,
9076 "vectorizing permutation");
9077 for (unsigned i = 0; i < perm.length (); ++i)
9078 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9079 if (repeating_p)
9080 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9081 dump_printf (MSG_NOTE, "\n");
9082 dump_printf_loc (MSG_NOTE, vect_location, "as");
9083 for (unsigned i = 0; i < vperm.length (); ++i)
9084 {
9085 if (i != 0
9086 && (repeating_p
9087 ? multiple_p (a: i, b: npatterns)
9088 : multiple_p (a: i, b: TYPE_VECTOR_SUBPARTS (node: vectype))))
9089 dump_printf (MSG_NOTE, ",");
9090 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
9091 vperm[i].first.first, vperm[i].first.second,
9092 vperm[i].second);
9093 }
9094 dump_printf (MSG_NOTE, "\n");
9095 }
9096
9097 /* We can only handle two-vector permutes; everything else should
9098 be lowered on the SLP level. The following is closely inspired
9099 by vect_transform_slp_perm_load and is supposed to eventually
9100 replace it.
9101 ??? As intermediate step do code-gen in the SLP tree representation
9102 somehow? */
9103 std::pair<unsigned, unsigned> first_vec = std::make_pair (x: -1U, y: -1U);
9104 std::pair<unsigned, unsigned> second_vec = std::make_pair (x: -1U, y: -1U);
9105 unsigned int index = 0;
9106 poly_uint64 mask_element;
9107 vec_perm_builder mask;
9108 mask.new_vector (full_nelts: nunits, npatterns, nelts_per_pattern);
9109 unsigned int count = mask.encoded_nelts ();
9110 mask.quick_grow (len: count);
9111 vec_perm_indices indices;
9112 unsigned nperms = 0;
9113 for (unsigned i = 0; i < vperm.length (); ++i)
9114 {
9115 mask_element = vperm[i].second;
9116 if (first_vec.first == -1U
9117 || first_vec == vperm[i].first)
9118 first_vec = vperm[i].first;
9119 else if (second_vec.first == -1U
9120 || second_vec == vperm[i].first)
9121 {
9122 second_vec = vperm[i].first;
9123 mask_element += nunits;
9124 }
9125 else
9126 {
9127 if (dump_p)
9128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9129 "permutation requires at "
9130 "least three vectors\n");
9131 gcc_assert (!gsi);
9132 return -1;
9133 }
9134
9135 mask[index++] = mask_element;
9136
9137 if (index == count)
9138 {
9139 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
9140 TYPE_VECTOR_SUBPARTS (node: op_vectype));
9141 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
9142 && constant_multiple_p (a: mask[0], b: nunits));
9143 machine_mode vmode = TYPE_MODE (vectype);
9144 machine_mode op_vmode = TYPE_MODE (op_vectype);
9145 unsigned HOST_WIDE_INT c;
9146 if ((!identity_p
9147 && !can_vec_perm_const_p (vmode, op_vmode, indices))
9148 || (identity_p
9149 && !known_le (nunits,
9150 TYPE_VECTOR_SUBPARTS (op_vectype))
9151 && (!constant_multiple_p (a: nunits,
9152 b: TYPE_VECTOR_SUBPARTS (node: op_vectype),
9153 multiple: &c) || c != 2)))
9154 {
9155 if (dump_p)
9156 {
9157 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
9158 vect_location,
9159 "unsupported vect permute { ");
9160 for (i = 0; i < count; ++i)
9161 {
9162 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9163 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9164 }
9165 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9166 }
9167 gcc_assert (!gsi);
9168 return -1;
9169 }
9170
9171 if (!identity_p)
9172 nperms++;
9173 if (gsi)
9174 {
9175 if (second_vec.first == -1U)
9176 second_vec = first_vec;
9177
9178 slp_tree
9179 first_node = children[first_vec.first],
9180 second_node = children[second_vec.first];
9181
9182 tree mask_vec = NULL_TREE;
9183 if (!identity_p)
9184 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9185
9186 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9187 {
9188 tree first_def
9189 = vect_get_slp_vect_def (slp_node: first_node,
9190 i: first_vec.second + vi);
9191 tree second_def
9192 = vect_get_slp_vect_def (slp_node: second_node,
9193 i: second_vec.second + vi);
9194 vect_add_slp_permutation (vinfo, gsi, node, first_def,
9195 second_def, mask_vec, identity_offset: mask[0]);
9196 }
9197 }
9198
9199 index = 0;
9200 first_vec = std::make_pair (x: -1U, y: -1U);
9201 second_vec = std::make_pair (x: -1U, y: -1U);
9202 }
9203 }
9204
9205 return nperms;
9206}
9207
9208/* Vectorize the SLP permutations in NODE as specified
9209 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9210 child number and lane number.
9211 Interleaving of two two-lane two-child SLP subtrees (not supported):
9212 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9213 A blend of two four-lane two-child SLP subtrees:
9214 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9215 Highpart of a four-lane one-child SLP subtree (not supported):
9216 [ { 0, 2 }, { 0, 3 } ]
9217 Currently only a subset of these is supported by the code generation below. */
9218
9219static bool
9220vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9221 slp_tree node, stmt_vector_for_cost *cost_vec)
9222{
9223 tree vectype = SLP_TREE_VECTYPE (node);
9224 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9225 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9226 SLP_TREE_CHILDREN (node),
9227 dump_p: dump_enabled_p ());
9228 if (nperms < 0)
9229 return false;
9230
9231 if (!gsi)
9232 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9233
9234 return true;
9235}
9236
9237/* Vectorize SLP NODE. */
9238
9239static void
9240vect_schedule_slp_node (vec_info *vinfo,
9241 slp_tree node, slp_instance instance)
9242{
9243 gimple_stmt_iterator si;
9244 int i;
9245 slp_tree child;
9246
9247 /* Vectorize externals and constants. */
9248 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9249 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9250 {
9251 /* ??? vectorizable_shift can end up using a scalar operand which is
9252 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9253 node in this case. */
9254 if (!SLP_TREE_VECTYPE (node))
9255 return;
9256
9257 /* There are two reasons vector defs might already exist. The first
9258 is that we are vectorizing an existing vector def. The second is
9259 that, when performing BB vectorization, shared constant/external nodes
9260 are not split apart during partitioning, so during the code-gen
9261 DFS walk we can end up visiting them twice. */
9262 if (! SLP_TREE_VEC_DEFS (node).exists ())
9263 vect_create_constant_vectors (vinfo, op_node: node);
9264 return;
9265 }
9266
9267 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9268
9269 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9270
9271 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9272 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9273
9274 if (dump_enabled_p ())
9275 dump_printf_loc (MSG_NOTE, vect_location,
9276 "------>vectorizing SLP node starting from: %G",
9277 stmt_info->stmt);
9278
9279 if (STMT_VINFO_DATA_REF (stmt_info)
9280 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9281 {
9282 /* Vectorized loads go before the first scalar load to make it
9283 ready early; vectorized stores go before the last scalar
9284 stmt, which is where all uses are ready. */
9285 stmt_vec_info last_stmt_info = NULL;
9286 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9287 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9288 else /* DR_IS_WRITE */
9289 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9290 si = gsi_for_stmt (last_stmt_info->stmt);
9291 }
9292 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9293 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9294 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9295 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9296 {
9297 /* For PHI node vectorization we do not use the insertion iterator. */
9298 si = gsi_none ();
9299 }
9300 else
9301 {
9302 /* Emit other stmts after the children's vectorized defs, which is
9303 the earliest possible position. */
9304 gimple *last_stmt = NULL;
9305 if (auto loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
9306 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9307 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9308 {
9309 /* But avoid scheduling internal defs outside of the loop when
9310 we might have only implicitly tracked loop mask/len defs. */
9311 gimple_stmt_iterator si
9312 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9313 last_stmt = *si;
9314 }
9315 bool seen_vector_def = false;
9316 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9317 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9318 {
9319 /* For fold-left reductions we are retaining the scalar
9320 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9321 set so the representation isn't perfect. Resort to the
9322 last scalar def here. */
9323 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9324 {
9325 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9326 == cycle_phi_info_type);
9327 gphi *phi = as_a <gphi *>
9328 (p: vect_find_last_scalar_stmt_in_slp (node: child)->stmt);
9329 if (!last_stmt
9330 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9331 last_stmt = phi;
9332 }
9333 /* We are emitting all vectorized stmts in the same place and
9334 the last one is the last.
9335 ??? Unless we have a load permutation applied and that
9336 figures to re-use an earlier generated load. */
9337 unsigned j;
9338 tree vdef;
9339 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9340 {
9341 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9342 if (!last_stmt
9343 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9344 last_stmt = vstmt;
9345 }
9346 }
9347 else if (!SLP_TREE_VECTYPE (child))
9348 {
9349 /* For externals without a vector type we use their scalar defs unvectorized. */
9350 unsigned j;
9351 tree def;
9352 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9353 if (TREE_CODE (def) == SSA_NAME
9354 && !SSA_NAME_IS_DEFAULT_DEF (def))
9355 {
9356 gimple *stmt = SSA_NAME_DEF_STMT (def);
9357 if (!last_stmt
9358 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9359 last_stmt = stmt;
9360 }
9361 }
9362 else
9363 {
9364 /* For externals we have to look at all defs since their
9365 insertion place is decided per vector. But beware
9366 of pre-existing vectors where we need to make sure
9367 we do not insert before the region boundary. */
9368 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9369 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9370 seen_vector_def = true;
9371 else
9372 {
9373 unsigned j;
9374 tree vdef;
9375 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9376 if (TREE_CODE (vdef) == SSA_NAME
9377 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9378 {
9379 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9380 if (!last_stmt
9381 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9382 last_stmt = vstmt;
9383 }
9384 }
9385 }
9386 /* This can happen when all children are pre-existing vectors or
9387 constants. */
9388 if (!last_stmt)
9389 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9390 if (!last_stmt)
9391 {
9392 gcc_assert (seen_vector_def);
9393 si = gsi_after_labels (bb: as_a <bb_vec_info> (p: vinfo)->bbs[0]);
9394 }
9395 else if (is_ctrl_altering_stmt (last_stmt))
9396 {
9397 /* We split regions to vectorize at control altering stmts
9398 with a definition so this must be an external which
9399 we can insert at the start of the region. */
9400 si = gsi_after_labels (bb: as_a <bb_vec_info> (p: vinfo)->bbs[0]);
9401 }
9402 else if (is_a <bb_vec_info> (p: vinfo)
9403 && gimple_bb (g: last_stmt) != gimple_bb (g: stmt_info->stmt)
9404 && gimple_could_trap_p (stmt_info->stmt))
9405 {
9406 /* We've constrained possibly trapping operations to all come
9407 from the same basic-block; if vectorized defs would allow earlier
9408 scheduling, still force vectorized stmts to the original block.
9409 This is only necessary for BB vectorization since for loop vect
9410 all operations are in a single BB and scalar stmt based
9411 placement doesn't play well with epilogue vectorization. */
9412 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9413 gimple_bb (stmt_info->stmt),
9414 gimple_bb (last_stmt)));
9415 si = gsi_after_labels (bb: gimple_bb (g: stmt_info->stmt));
9416 }
9417 else if (is_a <gphi *> (p: last_stmt))
9418 si = gsi_after_labels (bb: gimple_bb (g: last_stmt));
9419 else
9420 {
9421 si = gsi_for_stmt (last_stmt);
9422 gsi_next (i: &si);
9423 }
9424 }
9425
9426 /* Handle purely internal nodes. */
9427 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9428 {
9429 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
9430 be shared with different SLP nodes (but usually it's the same
9431 operation, apart from the case where the stmt is only there to denote
9432 the actual scalar lane defs ...). So do not call vect_transform_stmt
9433 but open-code it here (partly). */
9434 bool done = vectorizable_slp_permutation (vinfo, gsi: &si, node, NULL);
9435 gcc_assert (done);
9436 stmt_vec_info slp_stmt_info;
9437 unsigned int i;
9438 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9439 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9440 {
9441 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9442 instance, i, true, NULL);
9443 gcc_assert (done);
9444 }
9445 }
9446 else
9447 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9448}
9449
9450/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9451 For loop vectorization this is done in vectorizable_call, but for SLP
9452 it needs to be deferred until end of vect_schedule_slp, because multiple
9453 SLP instances may refer to the same scalar stmt. */
9454
9455static void
9456vect_remove_slp_scalar_calls (vec_info *vinfo,
9457 slp_tree node, hash_set<slp_tree> &visited)
9458{
9459 gimple *new_stmt;
9460 gimple_stmt_iterator gsi;
9461 int i;
9462 slp_tree child;
9463 tree lhs;
9464 stmt_vec_info stmt_info;
9465
9466 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9467 return;
9468
9469 if (visited.add (k: node))
9470 return;
9471
9472 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9473 vect_remove_slp_scalar_calls (vinfo, node: child, visited);
9474
9475 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9476 {
9477 gcall *stmt = dyn_cast <gcall *> (p: stmt_info->stmt);
9478 if (!stmt || gimple_bb (g: stmt) == NULL)
9479 continue;
9480 if (is_pattern_stmt_p (stmt_info)
9481 || !PURE_SLP_STMT (stmt_info))
9482 continue;
9483 lhs = gimple_call_lhs (gs: stmt);
9484 if (lhs)
9485 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9486 else
9487 {
9488 new_stmt = gimple_build_nop ();
9489 unlink_stmt_vdef (stmt_info->stmt);
9490 }
9491 gsi = gsi_for_stmt (stmt);
9492 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9493 if (lhs)
9494 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9495 }
9496}
9497
9498static void
9499vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9500{
9501 hash_set<slp_tree> visited;
9502 vect_remove_slp_scalar_calls (vinfo, node, visited);
9503}
9504
9505/* Vectorize the instance root. */
9506
9507void
9508vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9509{
9510 gassign *rstmt = NULL;
9511
9512 if (instance->kind == slp_inst_kind_ctor)
9513 {
9514 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9515 {
9516 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9517 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9518 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9519 TREE_TYPE (vect_lhs)))
9520 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9521 vect_lhs);
9522 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9523 }
9524 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9525 {
9526 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9527 tree child_def;
9528 int j;
9529 vec<constructor_elt, va_gc> *v;
9530 vec_alloc (v, nelems: nelts);
9531
9532 /* A CTOR can handle V16HI composition from VNx8HI so we
9533 do not need to convert vector elements if the types
9534 do not match. */
9535 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9536 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9537 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9538 tree rtype
9539 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9540 tree r_constructor = build_constructor (rtype, v);
9541 rstmt = gimple_build_assign (lhs, r_constructor);
9542 }
9543 }
9544 else if (instance->kind == slp_inst_kind_bb_reduc)
9545 {
9546 /* Largely inspired by reduction chain epilogue handling in
9547 vect_create_epilog_for_reduction. */
9548 vec<tree> vec_defs = vNULL;
9549 vect_get_slp_defs (slp_node: node, vec_defs: &vec_defs);
9550 enum tree_code reduc_code
9551 = gimple_assign_rhs_code (gs: instance->root_stmts[0]->stmt);
9552 /* ??? We actually have to reflect signs somewhere. */
9553 if (reduc_code == MINUS_EXPR)
9554 reduc_code = PLUS_EXPR;
9555 gimple_seq epilogue = NULL;
9556 /* We may end up with more than one vector result; reduce them
9557 to one vector. */
9558 tree vec_def = vec_defs[0];
9559 tree vectype = TREE_TYPE (vec_def);
9560 tree compute_vectype = vectype;
9561 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9562 && TYPE_OVERFLOW_UNDEFINED (vectype)
9563 && operation_can_overflow (reduc_code));
9564 if (pun_for_overflow_p)
9565 {
9566 compute_vectype = unsigned_type_for (vectype);
9567 vec_def = gimple_build (seq: &epilogue, code: VIEW_CONVERT_EXPR,
9568 type: compute_vectype, ops: vec_def);
9569 }
9570 for (unsigned i = 1; i < vec_defs.length (); ++i)
9571 {
9572 tree def = vec_defs[i];
9573 if (pun_for_overflow_p)
9574 def = gimple_build (seq: &epilogue, code: VIEW_CONVERT_EXPR,
9575 type: compute_vectype, ops: def);
9576 vec_def = gimple_build (seq: &epilogue, code: reduc_code, type: compute_vectype,
9577 ops: vec_def, ops: def);
9578 }
9579 vec_defs.release ();
9580 /* ??? Support other schemes than direct internal fn. */
9581 internal_fn reduc_fn;
9582 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9583 || reduc_fn == IFN_LAST)
9584 gcc_unreachable ();
9585 tree scalar_def = gimple_build (seq: &epilogue, fn: as_combined_fn (fn: reduc_fn),
9586 TREE_TYPE (compute_vectype), args: vec_def);
9587 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9588 {
9589 tree rem_def = NULL_TREE;
9590 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9591 {
9592 def = gimple_convert (seq: &epilogue, TREE_TYPE (scalar_def), op: def);
9593 if (!rem_def)
9594 rem_def = def;
9595 else
9596 rem_def = gimple_build (seq: &epilogue, code: reduc_code,
9597 TREE_TYPE (scalar_def),
9598 ops: rem_def, ops: def);
9599 }
9600 scalar_def = gimple_build (seq: &epilogue, code: reduc_code,
9601 TREE_TYPE (scalar_def),
9602 ops: scalar_def, ops: rem_def);
9603 }
9604 scalar_def = gimple_convert (seq: &epilogue,
9605 TREE_TYPE (vectype), op: scalar_def);
9606 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9607 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9608 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9609 update_stmt (s: gsi_stmt (i: rgsi));
9610 return;
9611 }
9612 else
9613 gcc_unreachable ();
9614
9615 gcc_assert (rstmt);
9616
9617 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9618 gsi_replace (&rgsi, rstmt, true);
9619}
9620
9621struct slp_scc_info
9622{
9623 bool on_stack;
9624 int dfs;
9625 int lowlink;
9626};
9627
9628/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
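
/* Descriptive note: this is essentially Tarjan's SCC algorithm -- the
   dfs/lowlink/on_stack fields of slp_scc_info identify an SCC root when
   lowlink == dfs.  Leaves and singletons are scheduled directly; cycles
   are broken at PHIs as described in the body below.  */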
9629
9630static void
9631vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9632 hash_map<slp_tree, slp_scc_info> &scc_info,
9633 int &maxdfs, vec<slp_tree> &stack)
9634{
9635 bool existed_p;
9636 slp_scc_info *info = &scc_info.get_or_insert (k: node, existed: &existed_p);
9637 gcc_assert (!existed_p);
9638 info->dfs = maxdfs;
9639 info->lowlink = maxdfs;
9640 maxdfs++;
9641
9642 /* Leaf. */
9643 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9644 {
9645 info->on_stack = false;
9646 vect_schedule_slp_node (vinfo, node, instance);
9647 return;
9648 }
9649
9650 info->on_stack = true;
9651 stack.safe_push (obj: node);
9652
9653 unsigned i;
9654 slp_tree child;
9655 /* DFS recurse. */
9656 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9657 {
9658 if (!child)
9659 continue;
9660 slp_scc_info *child_info = scc_info.get (k: child);
9661 if (!child_info)
9662 {
9663 vect_schedule_scc (vinfo, node: child, instance, scc_info, maxdfs, stack);
9664 /* Recursion might have re-allocated the node. */
9665 info = scc_info.get (k: node);
9666 child_info = scc_info.get (k: child);
9667 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9668 }
9669 else if (child_info->on_stack)
9670 info->lowlink = MIN (info->lowlink, child_info->dfs);
9671 }
9672 if (info->lowlink != info->dfs)
9673 return;
9674
9675 auto_vec<slp_tree, 4> phis_to_fixup;
9676
9677 /* Singleton. */
9678 if (stack.last () == node)
9679 {
9680 stack.pop ();
9681 info->on_stack = false;
9682 vect_schedule_slp_node (vinfo, node, instance);
9683 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9684 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9685 phis_to_fixup.quick_push (obj: node);
9686 }
9687 else
9688 {
9689 /* SCC. */
9690 int last_idx = stack.length () - 1;
9691 while (stack[last_idx] != node)
9692 last_idx--;
9693 /* We can break the cycle at PHIs that have at least one child
9694 already code generated. Then we could re-start the DFS walk until
9695 all nodes in the SCC are covered (we might have new entries
9696 for only back-reachable nodes). But it's simpler to just
9697 iterate and schedule those that are ready. */
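      /* E.g. (illustrative): for a simple reduction cycle PHI -> add -> PHI,
	 the PHI becomes ready first because its preheader argument is
	 already code generated; scheduling it then makes the add ready, and
	 the PHI's backedge argument is filled in by the fixup loop below.  */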
9698 unsigned todo = stack.length () - last_idx;
9699 do
9700 {
9701 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9702 {
9703 slp_tree entry = stack[idx];
9704 if (!entry)
9705 continue;
9706 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9707 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9708 bool ready = !phi;
9709 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9710 if (!child)
9711 {
9712 gcc_assert (phi);
9713 ready = true;
9714 break;
9715 }
9716 else if (scc_info.get (k: child)->on_stack)
9717 {
9718 if (!phi)
9719 {
9720 ready = false;
9721 break;
9722 }
9723 }
9724 else
9725 {
9726 if (phi)
9727 {
9728 ready = true;
9729 break;
9730 }
9731 }
9732 if (ready)
9733 {
9734 vect_schedule_slp_node (vinfo, node: entry, instance);
9735 scc_info.get (k: entry)->on_stack = false;
9736 stack[idx] = NULL;
9737 todo--;
9738 if (phi)
9739 phis_to_fixup.safe_push (obj: entry);
9740 }
9741 }
9742 }
9743 while (todo != 0);
9744
9745 /* Pop the SCC. */
9746 stack.truncate (size: last_idx);
9747 }
9748
9749 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9750 slp_tree phi_node;
9751 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9752 {
9753 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9754 edge_iterator ei;
9755 edge e;
9756 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9757 {
9758 unsigned dest_idx = e->dest_idx;
9759 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9760 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9761 continue;
9762 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9763 /* Simply fill all args. */
9764 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9765 != vect_first_order_recurrence)
9766 for (unsigned i = 0; i < n; ++i)
9767 {
9768 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9769 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9770 add_phi_arg (phi, vect_get_slp_vect_def (slp_node: child, i),
9771 e, gimple_phi_arg_location (phi, i: dest_idx));
9772 }
9773 else
9774 {
9775 /* Unless it is a first order recurrence which needs
9776 args filled in for both the PHI node and the permutes. */
9777 gimple *perm
9778 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9779 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9780 add_phi_arg (as_a <gphi *> (p: rphi),
9781 vect_get_slp_vect_def (slp_node: child, i: n - 1),
9782 e, gimple_phi_arg_location (phi, i: dest_idx));
9783 for (unsigned i = 0; i < n; ++i)
9784 {
9785 gimple *perm
9786 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9787 if (i > 0)
9788 gimple_assign_set_rhs1 (gs: perm,
9789 rhs: vect_get_slp_vect_def (slp_node: child, i: i - 1));
9790 gimple_assign_set_rhs2 (gs: perm,
9791 rhs: vect_get_slp_vect_def (slp_node: child, i));
9792 update_stmt (s: perm);
9793 }
9794 }
9795 }
9796 }
9797}
9798
9799/* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9800
9801void
9802vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9803{
9804 slp_instance instance;
9805 unsigned int i;
9806
9807 hash_map<slp_tree, slp_scc_info> scc_info;
9808 int maxdfs = 0;
9809 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9810 {
9811 slp_tree node = SLP_INSTANCE_TREE (instance);
9812 if (dump_enabled_p ())
9813 {
9814 dump_printf_loc (MSG_NOTE, vect_location,
9815 "Vectorizing SLP tree:\n");
9816 /* ??? Dump all? */
9817 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9818 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9819 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9820 vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location,
9821 SLP_INSTANCE_TREE (instance));
9822 }
9823 /* Schedule the tree of INSTANCE, scheduling SCCs in such a way
9824 that a PHI is the node breaking the cycle. */
9825 auto_vec<slp_tree> stack;
9826 if (!scc_info.get (k: node))
9827 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9828
9829 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9830 vectorize_slp_instance_root_stmt (node, instance);
9831
9832 if (dump_enabled_p ())
9833 dump_printf_loc (MSG_NOTE, vect_location,
9834 "vectorizing stmts using SLP.\n");
9835 }
9836
9837 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9838 {
9839 slp_tree root = SLP_INSTANCE_TREE (instance);
9840 stmt_vec_info store_info;
9841 unsigned int j;
9842
9843 /* Remove scalar call stmts. Do not do this for basic-block
9844 vectorization as not all uses may be vectorized.
9845 ??? Why should this be necessary? DCE should be able to
9846 remove the stmts itself.
9847 ??? For BB vectorization we can as well remove scalar
9848 stmts starting from the SLP tree root if they have no
9849 uses. */
9850 if (is_a <loop_vec_info> (p: vinfo))
9851 vect_remove_slp_scalar_calls (vinfo, node: root);
9852
9853 /* Remove the original scalar stmts of vectorized stores. */
9854 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (ix: j, ptr: &store_info); j++)
9855 {
9856 if (!STMT_VINFO_DATA_REF (store_info)
9857 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9858 break;
9859
9860 store_info = vect_orig_stmt (stmt_info: store_info);
9861 /* Free the attached stmt_vec_info and remove the stmt. */
9862 vinfo->remove_stmt (store_info);
9863
9864 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9865 to not crash in vect_free_slp_tree later. */
9866 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9867 SLP_TREE_REPRESENTATIVE (root) = NULL;
9868 }
9869 }
9870}
9871
