/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2024 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#define INCLUDE_ALGORITHM
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"
#include "sreal.h"
#include "predict.h"

static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
                                            load_permutation_t &,
                                            const vec<tree> &,
                                            gimple_stmt_iterator *,
                                            poly_uint64, bool, bool,
                                            unsigned *,
                                            unsigned * = nullptr,
                                            bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
                                           slp_tree, lane_permutation_t &,
                                           vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
                                          slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);

static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}


/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  SLP_TREE_SIMD_CLONE_INFO (this).release ();
  if (this->failed)
    free (failed);
}

/* Push the single SSA definition in DEF to the vector of vector defs.  */

void
_slp_tree::push_vec_def (gimple *def)
{
  if (gphi *phi = dyn_cast <gphi *> (def))
    vec_defs.quick_push (gimple_phi_result (phi));
  else
    {
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
    }
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}

/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}


/* Create an SLP node with room for NOPS children and operation code CODE.  */

slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
                          vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement: its vector def-type, its type,
     the operand itself if it is constant, an indication whether it is a
     pattern stmt, and gather/scatter info.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
  bool first_gs_p;
  gather_scatter_info first_gs_info;
} *slp_oprnd_info;
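
/* Illustrative example: for the two-lane group { c0 = a0 + b0, c1 = a1 + b1 }
   two slp_oprnd_info instances are built, the first collecting the def stmts
   and operands of { a0, a1 } and the second those of { b0, b1 }.  */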


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnd_info->first_gs_p = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Return the execution frequency of NODE (so that a higher value indicates
   a "more important" node when optimizing for speed).  */

static sreal
vect_slp_node_weight (slp_tree node)
{
  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
  basic_block bb = gimple_bb (stmt_info->stmt);
  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
}
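
/* For instance (assuming profile counts are available), a node whose
   representative statement sits in a block executed on average four times
   per function entry gets weight 4.0, one in a half-taken branch 0.5.  */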

/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
              || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
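
/* Illustrative example (assuming a group accessing a[0], a[2] and a[3]):
   DR_GROUP_GAP of the second and third elements is 2 and 1 respectively,
   so the function returns 0, 2 and 3 for the three statements.  */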

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */
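
/* A minimal worked example (illustrative values): on a target with 128-bit
   vectors, COUNT == 4 and a 16-bit ELT_TYPE give ELT_BYTES == 8, so the
   first iteration tries to fuse each group of four elements into one
   64-bit integer and build a single V2DI vector; if the interleaving
   permutes this requires are not supported, ELT_BYTES is halved and two
   V4SI vectors are tried instead, doubling NVECTORS on each iteration.  */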

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
          /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type
            = get_vectype_for_scalar_type (vinfo, int_type, count);
          poly_int64 half_nelts;
          if (vector_type
              && VECTOR_MODE_P (TYPE_MODE (vector_type))
              && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
                           GET_MODE_SIZE (base_vector_mode))
              && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
                             2, &half_nelts))
            {
              /* Try fusing consecutive sequences of COUNT / NVECTORS elements
                 together into elements of type INT_TYPE and using the result
                 to build NVECTORS vectors.  */
              poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);

              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              machine_mode vmode = TYPE_MODE (vector_type);
              if (can_vec_perm_const_p (vmode, vmode, indices1)
                  && can_vec_perm_const_p (vmode, vmode, indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
          || ((dta == vect_external_def || dta == vect_constant_def)
              && (dtb == vect_external_def || dtb == vect_constant_def)));
}

static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int arg0_map[] = { 1, 0 };
static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
static const int off_map[] = { 1, -3 };
static const int off_op0_map[] = { 2, -3, 0 };
static const int off_arg2_map[] = { 2, -3, 2 };
static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
static const int mask_call_maps[6][7] = {
  { 1, 1, },
  { 2, 1, 2, },
  { 3, 1, 2, 3, },
  { 4, 1, 2, 3, 4, },
  { 5, 1, 2, 3, 4, 5, },
  { 6, 1, 2, 3, 4, 5, 6 },
};

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.
     The special index -3 is the offset of a gather as analyzed by
     vect_check_gather_scatter.

   SWAP is as for vect_get_and_check_slp_defs.  */
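
/* For example (reading the tables above), arg3_arg2_map == { 2, 3, 2 }
   describes IFN_MASK_STORE: two child nodes, where child 0 corresponds to
   call argument 3 (the stored value) and child 1 to argument 2 (the
   mask).  */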

static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
                      unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
        return cond_expr_maps[swap];
      if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
          && swap)
        return op1_op0_map;
      if (gather_scatter_p)
        return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
                ? off_op0_map : off_map);
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
        switch (gimple_call_internal_fn (call))
          {
          case IFN_MASK_LOAD:
            return gather_scatter_p ? off_arg2_map : arg2_map;

          case IFN_GATHER_LOAD:
            return arg1_map;

          case IFN_MASK_GATHER_LOAD:
          case IFN_MASK_LEN_GATHER_LOAD:
            return arg1_arg4_map;

          case IFN_MASK_STORE:
            return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

          case IFN_MASK_CALL:
            {
              unsigned nargs = gimple_call_num_args (call);
              if (nargs >= 2 && nargs <= 7)
                return mask_call_maps[nargs-2];
              else
                return nullptr;
            }

          case IFN_CLZ:
          case IFN_CTZ:
            return arg0_map;

          default:
            break;
          }
    }
  return nullptr;
}

/* Return the SLP node child index for operand OP of STMT.  */

int
vect_slp_child_index_for_operand (const gimple *stmt, int op,
                                  bool gather_scatter_p)
{
  const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
  if (!opmap)
    return op;
  for (int i = 1; i < 1 + opmap[0]; ++i)
    if (opmap[i] == op)
      return i - 1;
  gcc_unreachable ();
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates a swap is required for COND_EXPR stmts.  Specifically, SWAP is 1
   if STMT is a COND_EXPR and the operands of its comparison need to be
   swapped; SWAP is 2 if STMT is a COND_EXPR and the comparison code needs
   to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the father node of this one, return 1; if everything
   is ok return 0.  */
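
/* For example, if lane 0 computes A < B ? X : Y and lane 1 computes
   D > C ? X : Y, the caller passes SWAP == 1 for lane 1 so that the
   comparison operands are swapped while collecting defs, making both
   lanes use the same comparison code.  */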
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
                             bool *skip_args,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  gather_scatter_info gs_info;
  unsigned int gs_op = -1u;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map
    = vect_get_operand_map (stmt_info->stmt,
                            STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
        commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      oprnd_info = (*oprnds_info)[i];
      int opno = map ? map[i] : int (i);
      if (opno == -3)
        {
          gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
          if (!is_a <loop_vec_info> (vinfo)
              || !vect_check_gather_scatter (stmt_info,
                                             as_a <loop_vec_info> (vinfo),
                                             first ? &oprnd_info->first_gs_info
                                             : &gs_info))
            return -1;

          if (first)
            {
              oprnd_info->first_gs_p = true;
              oprnd = oprnd_info->first_gs_info.offset;
            }
          else
            {
              gs_op = i;
              oprnd = gs_info.offset;
            }
        }
      else if (opno < 0)
        oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
        {
          oprnd = gimple_arg (stmt_info->stmt, opno);
          if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
            {
              edge e = gimple_phi_arg_edge (stmt, opno);
              backedge = (is_a <bb_vec_info> (vinfo)
                          ? e->flags & EDGE_DFS_BACK
                          : dominated_by_p (CDI_DOMINATORS, e->src,
                                            gimple_bb (stmt_info->stmt)));
            }
        }
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (skip_args[i])
        {
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (NULL_TREE);
          oprnd_info->first_dt = vect_uninitialized_def;
          continue;
        }

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
          && is_pattern_stmt_p (def_stmt_info))
        {
          if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
              != def_stmt_info)
            oprnd_info->any_pattern = true;
          else
            /* If we promote this to external use the original stmt def.  */
            oprnd_info->ops.last ()
              = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
        }

      /* If there's an extern def on a backedge make sure we can
         code-generate at the region start.
         ??? This is another case that could be fixed by adjusting
         how we split the function but at the moment we'd have conflicting
         goals there.  */
      if (backedge
          && dts[i] == vect_external_def
          && is_a <bb_vec_info> (vinfo)
          && TREE_CODE (oprnd) == SSA_NAME
          && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
          && !dominated_by_p (CDI_DOMINATORS,
                              as_a <bb_vec_info> (vinfo)->bbs[0],
                              gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: extern def %T only defined "
                             "on backedge\n", oprnd);
          return -1;
        }

      if (first)
        {
          tree type = TREE_TYPE (oprnd);
          dt = dts[i];

          /* For the swapping logic below force vect_reduction_def
             for the reduction op in an SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dts[i] = dt = vect_reduction_def;

          /* Check the types of the definition.  */
          switch (dt)
            {
            case vect_external_def:
            case vect_constant_def:
            case vect_internal_def:
            case vect_reduction_def:
            case vect_induction_def:
            case vect_nested_cycle:
            case vect_first_order_recurrence:
              break;

            default:
              /* FORNOW: Not supported.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: illegal type of def %T\n",
                                 oprnd);
              return -1;
            }

          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = type;
        }
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
        {
          ++i;
          continue;
        }

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: different operand types\n");
          return 1;
        }

      if ((gs_op == i) != oprnd_info->first_gs_p)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: mixed gather and non-gather\n");
          return 1;
        }
      else if (gs_op == i)
        {
          if (!operand_equal_p (oprnd_info->first_gs_info.base,
                                gs_info.base))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different gather base\n");
              return 1;
            }
          if (oprnd_info->first_gs_info.scale != gs_info.scale)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different gather scale\n");
              return 1;
            }
        }

      /* Not first stmt of the group, check that the def-stmt/s match
         the def-stmt/s of the first stmt.  Allow different definition
         types for reduction chains: the first stmt must be a
         vect_reduction_def (a phi node), and the rest
         end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
           && !(oprnd_info->first_dt == vect_reduction_def
                && !STMT_VINFO_DATA_REF (stmt_info)
                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                && def_stmt_info
                && !STMT_VINFO_DATA_REF (def_stmt_info)
                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
          || (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && ((!def_stmt_info
                   || STMT_VINFO_DATA_REF (def_stmt_info)
                   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                  != (oprnd_info->first_dt != vect_reduction_def))))
        {
          /* Try swapping operands if we got a mismatch.  For BB
             vectorization only in case it will clearly improve things.  */
          if (i == commutative_op && !swapped
              && (!is_a <bb_vec_info> (vinfo)
                  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
                                             dts[i+1])
                      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
                          || vect_def_types_match
                               ((*oprnds_info)[i+1]->first_dt, dts[i])))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "trying swapped operands\n");
              std::swap (dts[i], dts[i+1]);
              std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
                         (*oprnds_info)[i+1]->def_stmts[stmt_num]);
              std::swap ((*oprnds_info)[i]->ops[stmt_num],
                         (*oprnds_info)[i+1]->ops[stmt_num]);
              /* After swapping some operands we lose track of whether an
                 operand has any pattern defs, so be conservative here.  */
              if ((*oprnds_info)[i]->any_pattern
                  || (*oprnds_info)[i+1]->any_pattern)
                (*oprnds_info)[i]->any_pattern
                  = (*oprnds_info)[i+1]->any_pattern = true;
              swapped = true;
              continue;
            }

          if (is_a <bb_vec_info> (vinfo)
              && !oprnd_info->any_pattern)
            {
              /* Now for commutative ops we should see whether we can
                 make the other operand match.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "treating operand as external\n");
              oprnd_info->first_dt = dt = vect_external_def;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");
              return 1;
            }
        }

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
        oprnd_info->first_dt = vect_external_def;
      /* For an SLP reduction chain we want to duplicate the reduction to
         each of the chain members.  That gets us a sane SLP graph (still
         the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
                || dt == vect_reduction_def)
               && oprnd_info->first_dt == vect_reduction_def
               && !STMT_VINFO_DATA_REF (stmt_info)
               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
               && !STMT_VINFO_DATA_REF (def_stmt_info)
               && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
        {
          oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
          oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
        }

      ++i;
    }

  /* Report any operand swapping we did.  */
  if (swapped)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
        if (mapi < nkept && map[mapi] == int (i))
          mapi += 1;
        else if (!operand_equal_p (gimple_call_arg (call1, i),
                                   gimple_call_arg (call2, i)))
          return false;
    }

  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
                        unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Check whether the scalar stmts STMTS are isomorphic, whether they require
   data permutation, and whether they use supported operations.  Return true
   if they can be combined into an SLP node, otherwise return false and
   indicate in *MATCHES which stmts are not isomorphic to the first one.  If
   MATCHES[0] is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code of
   the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper rhs_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, ldst_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
         or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
          || stmt_can_throw_internal (cfun, stmt)
          || gimple_has_volatile_ops (stmt))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* ??? For BB vectorization we want to commute operands in a way
             to shuffle all unvectorizable defs into one operand and have
             the other still vectorized.  The following doesn't reliably
             achieve that, but it's the easiest we can do here.  */
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE
          && (!call_stmt
              || !gimple_call_internal_p (stmt)
              || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
                                           &nunits_vectype, group_size))
        {
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }
      /* Record nunits required but continue analysis, producing matches[]
         as if nunits was not an issue.  This allows splitting of groups
         to happen.  */
      if (nunits_vectype
          && !vect_record_max_nunits (vinfo, stmt_info, group_size,
                                      nunits_vectype, max_nunits))
        {
          gcc_assert (is_a <bb_vec_info> (vinfo));
          maybe_soft_fail = true;
          soft_fail_nunits_vectype = nunits_vectype;
        }

      gcc_assert (vectype);

      if (call_stmt)
        {
          combined_fn cfn = gimple_call_combined_fn (call_stmt);
          if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
            rhs_code = cfn;
          else
            rhs_code = CALL_EXPR;

          if (cfn == CFN_MASK_LOAD
              || cfn == CFN_GATHER_LOAD
              || cfn == CFN_MASK_GATHER_LOAD
              || cfn == CFN_MASK_LEN_GATHER_LOAD)
            ldst_p = true;
          else if (cfn == CFN_MASK_STORE)
            {
              ldst_p = true;
              rhs_code = CFN_MASK_STORE;
            }
          else if ((cfn != CFN_LAST
                    && cfn != CFN_MASK_CALL
                    && internal_fn_p (cfn)
                    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 (gimple *) call_stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else if (gimple_code (stmt) == GIMPLE_PHI)
        {
          rhs_code = ERROR_MARK;
          phi_p = true;
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
        }

      /* Check the operation.  */
      if (i == 0)
        {
          *node_vectype = vectype;
          first_stmt_code = rhs_code;
          first_stmt_ldst_p = ldst_p;
          first_stmt_phi_p = phi_p;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              /* First see if we have a vector/vector shift.  */
              if (!directly_supported_p (rhs_code, vectype, optab_vector))
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      if (is_a <bb_vec_info> (vinfo) && i != 0)
                        continue;
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  need_same_oprnds = true;
                  first_op1 = gimple_assign_rhs2 (stmt);
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
          else if (!ldst_p
                   && rhs_code == BIT_FIELD_REF)
            {
              tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
              if (!is_a <bb_vec_info> (vinfo)
                  || TREE_CODE (vec) != SSA_NAME
                  /* When the element types are not compatible we pun the
                     source to the target vectype which requires equal size.  */
                  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
                       || !types_compatible_p (TREE_TYPE (vectype),
                                               TREE_TYPE (TREE_TYPE (vec))))
                      && !operand_equal_p (TYPE_SIZE (vectype),
                                           TYPE_SIZE (TREE_TYPE (vec)))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: "
                                     "BIT_FIELD_REF not supported\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }
            }
          else if (rhs_code == CFN_DIV_POW2)
            {
              need_same_oprnds = true;
              first_op1 = gimple_call_arg (call_stmt, 1);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if ((first_stmt_code != rhs_code
               && (first_stmt_code != IMAGPART_EXPR
                   || rhs_code != REALPART_EXPR)
               && (first_stmt_code != REALPART_EXPR
                   || rhs_code != IMAGPART_EXPR)
               /* Handle mismatches in plus/minus by computing both
                  and merging the results.  */
               && !((first_stmt_code == PLUS_EXPR
                     || first_stmt_code == MINUS_EXPR)
                    && (alt_stmt_code == PLUS_EXPR
                        || alt_stmt_code == MINUS_EXPR)
                    && rhs_code == alt_stmt_code)
               && !(first_stmt_code.is_tree_code ()
                    && rhs_code.is_tree_code ()
                    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
                        == tcc_comparison)
                    && (swap_tree_comparison (tree_code (first_stmt_code))
                        == tree_code (rhs_code)))
               && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                    && (first_stmt_code == ARRAY_REF
                        || first_stmt_code == BIT_FIELD_REF
                        || first_stmt_code == INDIRECT_REF
                        || first_stmt_code == COMPONENT_REF
                        || first_stmt_code == MEM_REF)
                    && (rhs_code == ARRAY_REF
                        || rhs_code == BIT_FIELD_REF
                        || rhs_code == INDIRECT_REF
                        || rhs_code == COMPONENT_REF
                        || rhs_code == MEM_REF)))
              || (ldst_p
                  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
              || (ldst_p
                  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
                      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
              || first_stmt_ldst_p != ldst_p
              || first_stmt_phi_p != phi_p)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (!ldst_p
              && first_stmt_code == BIT_FIELD_REF
              && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
                  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BIT_FIELD_REF "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (call_stmt
              && first_stmt_code != CFN_MASK_LOAD
              && first_stmt_code != CFN_MASK_STORE)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       call_stmt))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
              && (gimple_bb (first_stmt_info->stmt)
                  != gimple_bb (stmt_info->stmt)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BB for PHI "
                                 "or possibly trapping operation in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds)
            {
              tree other_op1 = gimple_arg (stmt, 1);
              if (!operand_equal_p (first_op1, other_op1, 0))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different shift "
                                     "arguments in %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (!types_compatible_p (vectype, *node_vectype))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different vector type "
                                 "in %G", stmt);
              /* Mismatch.  */
              continue;
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          gcc_assert (ldst_p);
          if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
            {
              /* Store.  */
              gcc_assert (rhs_code == CFN_MASK_STORE
                          || REFERENCE_CLASS_P (lhs)
                          || DECL_P (lhs));
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        }
      /* Non-grouped store or load.  */
      else if (ldst_p)
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
              && rhs_code != CFN_GATHER_LOAD
              && rhs_code != CFN_MASK_GATHER_LOAD
              && rhs_code != CFN_MASK_LEN_GATHER_LOAD
              && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
              /* Non-grouped loads are handled as externals for BB
                 vectorization.  For loop vectorization we can handle
                 splats the same way we handle single element
                 interleaving.  */
              && (is_a <bb_vec_info> (vinfo)
                  || stmt_info != first_stmt_info))
            {
              /* Non-grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              if (i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      /* Not a memory operation.  */
      else
        {
          if (!phi_p
              && rhs_code.is_tree_code ()
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR
              && rhs_code != BIT_FIELD_REF)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (rhs_code.is_tree_code ()
              && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
              && (swap_tree_comparison ((tree_code)first_stmt_code)
                  == (tree_code)rhs_code))
            swap[i] = 1;
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node, record that so the blending
     permute is accounted for when the node is vectorized.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
          || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
              && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
             (soft_fail_nunits_vectype).is_constant (&const_nunits)
          || const_nunits > group_size)
        matches[0] = false;
      else
        {
          /* With a constant number of vector elements, simulate a mismatch
             at the point where we need to split.  */
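          /* Illustrative example: with GROUP_SIZE == 7 and CONST_NUNITS == 4
             the tail is 7 & 3 == 3, so matches[4..6] are cleared and the
             caller splits the group at the vector boundary.  (This relies
             on CONST_NUNITS being a power of two.)  */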
          unsigned tail = group_size & (const_nunits - 1);
          memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
        }
      return false;
    }

  return true;
}
1552 | |
1553 | /* Traits for the hash_set to record failed SLP builds for a stmt set. |
1554 | Note we never remove apart from at destruction time so we do not |
1555 | need a special value for deleted that differs from empty. */ |
1556 | struct bst_traits |
1557 | { |
1558 | typedef vec <stmt_vec_info> value_type; |
1559 | typedef vec <stmt_vec_info> compare_type; |
1560 | static inline hashval_t hash (value_type); |
1561 | static inline bool equal (value_type existing, value_type candidate); |
1562 | static inline bool is_empty (value_type x) { return !x.exists (); } |
1563 | static inline bool is_deleted (value_type x) { return !x.exists (); } |
1564 | static const bool empty_zero_p = true; |
1565 | static inline void mark_empty (value_type &x) { x.release (); } |
1566 | static inline void mark_deleted (value_type &x) { x.release (); } |
1567 | static inline void remove (value_type &x) { x.release (); } |
1568 | }; |
1569 | inline hashval_t |
1570 | bst_traits::hash (value_type x) |
1571 | { |
1572 | inchash::hash h; |
1573 | for (unsigned i = 0; i < x.length (); ++i) |
1574 | h.add_int (v: gimple_uid (g: x[i]->stmt)); |
1575 | return h.end (); |
1576 | } |
1577 | inline bool |
1578 | bst_traits::equal (value_type existing, value_type candidate) |
1579 | { |
1580 | if (existing.length () != candidate.length ()) |
1581 | return false; |
1582 | for (unsigned i = 0; i < existing.length (); ++i) |
1583 | if (existing[i] != candidate[i]) |
1584 | return false; |
1585 | return true; |
1586 | } |
1587 | |
1588 | /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree> |
1589 | but then vec::insert does memmove and that's not compatible with |
1590 | std::pair. */ |
1591 | struct chain_op_t |
1592 | { |
1593 | chain_op_t (tree_code code_, vect_def_type dt_, tree op_) |
1594 | : code (code_), dt (dt_), op (op_) {} |
1595 | tree_code code; |
1596 | vect_def_type dt; |
1597 | tree op; |
1598 | }; |
1599 | |
1600 | /* Comparator for sorting associatable chains. */ |
1601 | |
1602 | static int |
1603 | dt_sort_cmp (const void *op1_, const void *op2_, void *) |
1604 | { |
1605 | auto *op1 = (const chain_op_t *) op1_; |
1606 | auto *op2 = (const chain_op_t *) op2_; |
1607 | if (op1->dt != op2->dt) |
1608 | return (int)op1->dt - (int)op2->dt; |
1609 | return (int)op1->code - (int)op2->code; |
1610 | } |
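
/* For example (a sketch, assuming the usual vect_def_type enum order
   where vect_constant_def < vect_external_def < vect_internal_def),
   a chain { +x_internal, +3_constant, -y_external } sorts to
   { +3_constant, -y_external, +x_internal }, grouping invariant
   operands in front of internal defs.  */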
1611 | |
1612 | /* Linearize the associatable expression chain at START with the |
1613 | associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR), |
1614 | filling CHAIN with the result and using WORKLIST as intermediate storage. |
1615 | CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE |
1616 | or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation |
1617 | stmts, starting with START. */ |
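
/* For example, linearizing the lane computation
     t = a - b;  x = t + c;
   with CODE == PLUS_EXPR yields the chain { +a, -b, +c } (modulo
   visiting order), with CODE_STMT set to the PLUS_EXPR stmt and
   ALT_CODE_STMT to the MINUS_EXPR stmt.  */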
1618 | |
1619 | static void |
1620 | vect_slp_linearize_chain (vec_info *vinfo, |
1621 | vec<std::pair<tree_code, gimple *> > &worklist, |
1622 | vec<chain_op_t> &chain, |
1623 | enum tree_code code, gimple *start, |
1624 | gimple *&code_stmt, gimple *&alt_code_stmt, |
1625 | vec<gimple *> *chain_stmts) |
1626 | { |
1627 | /* For each lane linearize the addition/subtraction (or other |
1628 | uniform associatable operation) expression tree. */ |
worklist.safe_push (std::make_pair (code, start));
while (!worklist.is_empty ())
{
auto entry = worklist.pop ();
gassign *stmt = as_a <gassign *> (entry.second);
enum tree_code in_code = entry.first;
enum tree_code this_code = gimple_assign_rhs_code (stmt);
/* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
if (!code_stmt
    && gimple_assign_rhs_code (stmt) == code)
code_stmt = stmt;
else if (!alt_code_stmt
	 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
alt_code_stmt = stmt;
if (chain_stmts)
chain_stmts->safe_push (stmt);
for (unsigned opnum = 1; opnum <= 2; ++opnum)
{
tree op = gimple_op (stmt, opnum);
vect_def_type dt;
stmt_vec_info def_stmt_info;
bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
gcc_assert (res);
if (dt == vect_internal_def
    && is_pattern_stmt_p (def_stmt_info))
op = gimple_get_lhs (def_stmt_info->stmt);
gimple *use_stmt;
use_operand_p use_p;
if (dt == vect_internal_def
    && single_imm_use (op, &use_p, &use_stmt)
    && is_gimple_assign (def_stmt_info->stmt)
    && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
	|| (code == PLUS_EXPR
	    && (gimple_assign_rhs_code (def_stmt_info->stmt)
		== MINUS_EXPR))))
{
tree_code op_def_code = this_code;
if (op_def_code == MINUS_EXPR && opnum == 1)
op_def_code = PLUS_EXPR;
if (in_code == MINUS_EXPR)
op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
worklist.safe_push (std::make_pair (op_def_code,
				    def_stmt_info->stmt));
}
else
{
tree_code op_def_code = this_code;
if (op_def_code == MINUS_EXPR && opnum == 1)
op_def_code = PLUS_EXPR;
if (in_code == MINUS_EXPR)
op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
chain.safe_push (chain_op_t (op_def_code, dt, op));
}
}
}
1684 | } |
1685 | |
1686 | typedef hash_map <vec <stmt_vec_info>, slp_tree, |
1687 | simple_hashmap_traits <bst_traits, slp_tree> > |
1688 | scalar_stmts_to_slp_tree_map_t; |
1689 | |
1690 | static slp_tree |
1691 | vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, |
1692 | vec<stmt_vec_info> stmts, unsigned int group_size, |
1693 | poly_uint64 *max_nunits, |
1694 | bool *matches, unsigned *limit, unsigned *tree_size, |
1695 | scalar_stmts_to_slp_tree_map_t *bst_map); |
1696 | |
1697 | static slp_tree |
1698 | vect_build_slp_tree (vec_info *vinfo, |
1699 | vec<stmt_vec_info> stmts, unsigned int group_size, |
1700 | poly_uint64 *max_nunits, |
1701 | bool *matches, unsigned *limit, unsigned *tree_size, |
1702 | scalar_stmts_to_slp_tree_map_t *bst_map) |
1703 | { |
if (slp_tree *leader = bst_map->get (stmts))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
		 !(*leader)->failed ? "" : "failed ",
		 (void *) *leader);
if (!(*leader)->failed)
{
SLP_TREE_REF_COUNT (*leader)++;
vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
stmts.release ();
return *leader;
}
memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
return NULL;
}

/* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
   so we can pick up backedge destinations during discovery.  */
slp_tree res = new _slp_tree;
SLP_TREE_DEF_TYPE (res) = vect_internal_def;
SLP_TREE_SCALAR_STMTS (res) = stmts;
bst_map->put (stmts.copy (), res);
1727 | |
1728 | if (*limit == 0) |
1729 | { |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "SLP discovery limit exceeded\n");
/* Mark the node invalid so we can detect those when still in use
   as backedge destinations.  */
SLP_TREE_SCALAR_STMTS (res) = vNULL;
SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
res->failed = XNEWVEC (bool, group_size);
memset (res->failed, 0, sizeof (bool) * group_size);
memset (matches, 0, sizeof (bool) * group_size);
return NULL;
}
--*limit;

if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "starting SLP discovery for node %p\n", (void *) res);
1747 | |
1748 | poly_uint64 this_max_nunits = 1; |
slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
				       &this_max_nunits,
				       matches, limit, tree_size, bst_map);
if (!res_)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "SLP discovery for node %p failed\n", (void *) res);
/* Mark the node invalid so we can detect those when still in use
   as backedge destinations.  */
SLP_TREE_SCALAR_STMTS (res) = vNULL;
SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
res->failed = XNEWVEC (bool, group_size);
if (flag_checking)
{
unsigned i;
for (i = 0; i < group_size; ++i)
if (!matches[i])
break;
gcc_assert (i < group_size);
}
memcpy (res->failed, matches, sizeof (bool) * group_size);
}
else
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "SLP discovery for node %p succeeded\n",
		 (void *) res);
gcc_assert (res_ == res);
res->max_nunits = this_max_nunits;
vect_update_max_nunits (max_nunits, this_max_nunits);
/* Keep a reference for the bst_map use.  */
SLP_TREE_REF_COUNT (res)++;
}
return res_;
1785 | } |
1786 | |
1787 | /* Helper for building an associated SLP node chain. */ |
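/* For example (a sketch), for the lane codes { PLUS_EXPR, MINUS_EXPR }
   this builds one full-width node computing OP0 + OP1 and one computing
   OP0 - OP1, and makes PERM a VEC_PERM_EXPR node selecting lane 0 from
   the first child and lane 1 from the second via
   LPERM = { (0, 0), (1, 1) }.  */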
1788 | |
1789 | static void |
1790 | vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype, |
1791 | slp_tree op0, slp_tree op1, |
1792 | stmt_vec_info oper1, stmt_vec_info oper2, |
1793 | vec<std::pair<unsigned, unsigned> > lperm) |
1794 | { |
unsigned group_size = SLP_TREE_LANES (op1);

slp_tree child1 = new _slp_tree;
SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
SLP_TREE_VECTYPE (child1) = vectype;
SLP_TREE_LANES (child1) = group_size;
SLP_TREE_CHILDREN (child1).create (2);
SLP_TREE_CHILDREN (child1).quick_push (op0);
SLP_TREE_CHILDREN (child1).quick_push (op1);
SLP_TREE_REPRESENTATIVE (child1) = oper1;

slp_tree child2 = new _slp_tree;
SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
SLP_TREE_VECTYPE (child2) = vectype;
SLP_TREE_LANES (child2) = group_size;
SLP_TREE_CHILDREN (child2).create (2);
SLP_TREE_CHILDREN (child2).quick_push (op0);
SLP_TREE_REF_COUNT (op0)++;
SLP_TREE_CHILDREN (child2).quick_push (op1);
SLP_TREE_REF_COUNT (op1)++;
SLP_TREE_REPRESENTATIVE (child2) = oper2;

SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
SLP_TREE_VECTYPE (perm) = vectype;
SLP_TREE_LANES (perm) = group_size;
/* ??? We should set this NULL but that's not expected.  */
SLP_TREE_REPRESENTATIVE (perm) = oper1;
SLP_TREE_LANE_PERMUTATION (perm) = lperm;
SLP_TREE_CHILDREN (perm).quick_push (child1);
SLP_TREE_CHILDREN (perm).quick_push (child2);
1826 | } |
1827 | |
/* Recursively build an SLP tree for the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require
   a data permutation or are of unsupported types of operation; in
   that case MATCHES records which lanes of the group did match up.
   On success return the built SLP node.  */
1834 | |
1835 | static slp_tree |
1836 | vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, |
1837 | vec<stmt_vec_info> stmts, unsigned int group_size, |
1838 | poly_uint64 *max_nunits, |
1839 | bool *matches, unsigned *limit, unsigned *tree_size, |
1840 | scalar_stmts_to_slp_tree_map_t *bst_map) |
1841 | { |
1842 | unsigned nops, i, this_tree_size = 0; |
1843 | poly_uint64 this_max_nunits = *max_nunits; |
1844 | |
1845 | matches[0] = false; |
1846 | |
1847 | stmt_vec_info stmt_info = stmts[0]; |
if (!is_a<gcall *> (stmt_info->stmt)
    && !is_a<gassign *> (stmt_info->stmt)
    && !is_a<gphi *> (stmt_info->stmt))
return NULL;

nops = gimple_num_args (stmt_info->stmt);
if (const int *map = vect_get_operand_map (stmt_info->stmt,
					   STMT_VINFO_GATHER_SCATTER_P
					     (stmt_info)))
nops = map[0];

/* If the SLP node is a PHI (induction or reduction), terminate
   the recursion.  */
bool *skip_args = XALLOCAVEC (bool, nops);
memset (skip_args, 0, sizeof (bool) * nops);
if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1865 | { |
1866 | tree scalar_type = TREE_TYPE (PHI_RESULT (stmt)); |
1867 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, |
1868 | group_size); |
1869 | if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype, |
1870 | max_nunits)) |
1871 | return NULL; |
1872 | |
1873 | vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info); |
1874 | if (def_type == vect_induction_def) |
1875 | { |
/* Induction PHIs are not cycles but walk the initial
   value.  Only for inner loops though; for outer loops
   we need to pick up the value from the actual PHIs
   to more easily support peeling and epilogue vectorization.  */
1880 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1881 | if (!nested_in_vect_loop_p (loop, stmt_info)) |
1882 | skip_args[loop_preheader_edge (loop)->dest_idx] = true; |
1883 | else |
1884 | loop = loop->inner; |
1885 | skip_args[loop_latch_edge (loop)->dest_idx] = true; |
1886 | } |
1887 | else if (def_type == vect_reduction_def |
1888 | || def_type == vect_double_reduction_def |
1889 | || def_type == vect_nested_cycle |
1890 | || def_type == vect_first_order_recurrence) |
1891 | { |
1892 | /* Else def types have to match. */ |
1893 | stmt_vec_info other_info; |
1894 | bool all_same = true; |
1895 | FOR_EACH_VEC_ELT (stmts, i, other_info) |
1896 | { |
1897 | if (STMT_VINFO_DEF_TYPE (other_info) != def_type) |
1898 | return NULL; |
1899 | if (other_info != stmt_info) |
1900 | all_same = false; |
1901 | } |
1902 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
/* Reduction initial values are not explicitly represented.  */
1904 | if (def_type != vect_first_order_recurrence |
1905 | && !nested_in_vect_loop_p (loop, stmt_info)) |
1906 | skip_args[loop_preheader_edge (loop)->dest_idx] = true; |
1907 | /* Reduction chain backedge defs are filled manually. |
1908 | ??? Need a better way to identify a SLP reduction chain PHI. |
1909 | Or a better overall way to SLP match those. */ |
1910 | if (all_same && def_type == vect_reduction_def) |
1911 | skip_args[loop_latch_edge (loop)->dest_idx] = true; |
1912 | } |
1913 | else if (def_type != vect_internal_def) |
1914 | return NULL; |
1915 | } |
1916 | |
1917 | |
1918 | bool two_operators = false; |
1919 | unsigned char *swap = XALLOCAVEC (unsigned char, group_size); |
1920 | tree vectype = NULL_TREE; |
if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			    &this_max_nunits, matches, &two_operators,
			    &vectype))
1924 | return NULL; |
1925 | |
1926 | /* If the SLP node is a load, terminate the recursion unless masked. */ |
1927 | if (STMT_VINFO_DATA_REF (stmt_info) |
1928 | && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
1929 | { |
1930 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) |
1931 | gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))); |
1932 | else |
1933 | { |
1934 | *max_nunits = this_max_nunits; |
1935 | (*tree_size)++; |
node = vect_create_new_slp_node (node, stmts, 0);
1937 | SLP_TREE_VECTYPE (node) = vectype; |
1938 | /* And compute the load permutation. Whether it is actually |
1939 | a permutation depends on the unrolling factor which is |
1940 | decided later. */ |
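/* For example (a sketch), for a group of loads
     ... = a[i+1];  ... = a[i];
   from an interleaving chain starting at a[i] this records
   the load permutation { 1, 0 }.  */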
1941 | vec<unsigned> load_permutation; |
1942 | int j; |
1943 | stmt_vec_info load_info; |
load_permutation.create (group_size);
1945 | stmt_vec_info first_stmt_info |
1946 | = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); |
1947 | bool any_permute = false; |
1948 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
1949 | { |
1950 | int load_place; |
1951 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
load_place = vect_get_place_in_interleaving_chain
	       (load_info, first_stmt_info);
1954 | else |
1955 | load_place = 0; |
1956 | gcc_assert (load_place != -1); |
1957 | any_permute |= load_place != j; |
load_permutation.quick_push (load_place);
1959 | } |
1960 | |
if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1962 | { |
1963 | gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) |
1964 | || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) |
1965 | || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) |
1966 | || gimple_call_internal_p (stmt, |
1967 | IFN_MASK_LEN_GATHER_LOAD)); |
1968 | load_permutation.release (); |
1969 | /* We cannot handle permuted masked loads, see PR114375. */ |
1970 | if (any_permute |
1971 | || (STMT_VINFO_GROUPED_ACCESS (stmt_info) |
1972 | && DR_GROUP_SIZE (first_stmt_info) != group_size) |
1973 | || STMT_VINFO_STRIDED_P (stmt_info)) |
1974 | { |
1975 | matches[0] = false; |
1976 | return NULL; |
1977 | } |
1978 | } |
1979 | else |
1980 | { |
1981 | SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; |
1982 | return node; |
1983 | } |
1984 | } |
1985 | } |
else if (gimple_assign_single_p (stmt_info->stmt)
	 && !gimple_vuse (stmt_info->stmt)
	 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1989 | { |
/* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
   the same SSA name vector of a type compatible with VECTYPE.  */
1992 | vec<std::pair<unsigned, unsigned> > lperm = vNULL; |
1993 | tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0); |
1994 | stmt_vec_info estmt_info; |
1995 | FOR_EACH_VEC_ELT (stmts, i, estmt_info) |
1996 | { |
gassign *estmt = as_a <gassign *> (estmt_info->stmt);
tree bfref = gimple_assign_rhs1 (estmt);
HOST_WIDE_INT lane;
if (!known_eq (bit_field_size (bfref),
	       tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
    || !constant_multiple_p (bit_field_offset (bfref),
			     bit_field_size (bfref), &lane))
{
lperm.release ();
matches[0] = false;
return NULL;
}
lperm.safe_push (std::make_pair (0, (unsigned)lane));
2010 | } |
slp_tree vnode = vect_create_new_slp_node (vNULL);
2012 | if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec)))) |
2013 | /* ??? We record vectype here but we hide eventually necessary |
2014 | punning and instead rely on code generation to materialize |
2015 | VIEW_CONVERT_EXPRs as necessary. We instead should make |
2016 | this explicit somehow. */ |
2017 | SLP_TREE_VECTYPE (vnode) = vectype; |
2018 | else |
2019 | { |
2020 | /* For different size but compatible elements we can still |
2021 | use VEC_PERM_EXPR without punning. */ |
2022 | gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec)) |
2023 | && types_compatible_p (TREE_TYPE (vectype), |
2024 | TREE_TYPE (TREE_TYPE (vec)))); |
2025 | SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec); |
2026 | } |
2027 | auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode)); |
2028 | unsigned HOST_WIDE_INT const_nunits; |
if (nunits.is_constant (&const_nunits))
SLP_TREE_LANES (vnode) = const_nunits;
SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
/* We are always building a permutation node even if it is an identity
   permute to shield the rest of the vectorizer from the odd node
   representing an actual vector without any scalar ops.
   ??? We could hide it completely with making the permute node
   external?  */
node = vect_create_new_slp_node (node, stmts, 1);
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
SLP_TREE_LANE_PERMUTATION (node) = lperm;
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CHILDREN (node).quick_push (vnode);
2042 | return node; |
2043 | } |
2044 | /* When discovery reaches an associatable operation see whether we can |
2045 | improve that to match up lanes in a way superior to the operand |
2046 | swapping code which at most looks at two defs. |
2047 | ??? For BB vectorization we cannot do the brute-force search |
2048 | for matching as we can succeed by means of builds from scalars |
2049 | and have no good way to "cost" one build against another. */ |
else if (is_a <loop_vec_info> (vinfo)
	 /* ??? We don't handle !vect_internal_def defs below.  */
	 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
	 && is_gimple_assign (stmt_info->stmt)
	 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
	     || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
	 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
	     || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
		 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2059 | { |
2060 | /* See if we have a chain of (mixed) adds or subtracts or other |
2061 | associatable ops. */ |
enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2063 | if (code == MINUS_EXPR) |
2064 | code = PLUS_EXPR; |
2065 | stmt_vec_info other_op_stmt_info = NULL; |
2066 | stmt_vec_info op_stmt_info = NULL; |
2067 | unsigned chain_len = 0; |
2068 | auto_vec<chain_op_t> chain; |
2069 | auto_vec<std::pair<tree_code, gimple *> > worklist; |
2070 | auto_vec<vec<chain_op_t> > chains (group_size); |
2071 | auto_vec<slp_tree, 4> children; |
2072 | bool hard_fail = true; |
2073 | for (unsigned lane = 0; lane < group_size; ++lane) |
2074 | { |
2075 | /* For each lane linearize the addition/subtraction (or other |
2076 | uniform associatable operation) expression tree. */ |
2077 | gimple *op_stmt = NULL, *other_op_stmt = NULL; |
vect_slp_linearize_chain (vinfo, worklist, chain, code,
			  stmts[lane]->stmt, op_stmt, other_op_stmt,
			  NULL);
2081 | if (!op_stmt_info && op_stmt) |
2082 | op_stmt_info = vinfo->lookup_stmt (op_stmt); |
2083 | if (!other_op_stmt_info && other_op_stmt) |
2084 | other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt); |
2085 | if (chain.length () == 2) |
2086 | { |
2087 | /* In a chain of just two elements resort to the regular |
2088 | operand swapping scheme. If we run into a length |
2089 | mismatch still hard-FAIL. */ |
2090 | if (chain_len == 0) |
2091 | hard_fail = false; |
2092 | else |
2093 | { |
2094 | matches[lane] = false; |
2095 | /* ??? We might want to process the other lanes, but |
2096 | make sure to not give false matching hints to the |
2097 | caller for lanes we did not process. */ |
2098 | if (lane != group_size - 1) |
2099 | matches[0] = false; |
2100 | } |
2101 | break; |
2102 | } |
2103 | else if (chain_len == 0) |
2104 | chain_len = chain.length (); |
2105 | else if (chain.length () != chain_len) |
2106 | { |
2107 | /* ??? Here we could slip in magic to compensate with |
2108 | neutral operands. */ |
2109 | matches[lane] = false; |
2110 | if (lane != group_size - 1) |
2111 | matches[0] = false; |
2112 | break; |
2113 | } |
chains.quick_push (chain.copy ());
chain.truncate (0);
2116 | } |
2117 | if (chains.length () == group_size) |
2118 | { |
2119 | /* We cannot yet use SLP_TREE_CODE to communicate the operation. */ |
2120 | if (!op_stmt_info) |
2121 | { |
2122 | hard_fail = false; |
2123 | goto out; |
2124 | } |
2125 | /* Now we have a set of chains with the same length. */ |
2126 | /* 1. pre-sort according to def_type and operation. */ |
2127 | for (unsigned lane = 0; lane < group_size; ++lane) |
chains[lane].stablesort (dt_sort_cmp, vinfo);
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
		 "pre-sorted chains of %s\n",
		 get_tree_code_name (code));
for (unsigned lane = 0; lane < group_size; ++lane)
{
for (unsigned opnum = 0; opnum < chain_len; ++opnum)
dump_printf (MSG_NOTE, "%s %T ",
	     get_tree_code_name (chains[lane][opnum].code),
	     chains[lane][opnum].op);
dump_printf (MSG_NOTE, "\n");
}
}
2143 | /* 2. try to build children nodes, associating as necessary. */ |
2144 | for (unsigned n = 0; n < chain_len; ++n) |
2145 | { |
2146 | vect_def_type dt = chains[0][n].dt; |
2147 | unsigned lane; |
2148 | for (lane = 0; lane < group_size; ++lane) |
2149 | if (chains[lane][n].dt != dt) |
2150 | { |
2151 | if (dt == vect_constant_def |
2152 | && chains[lane][n].dt == vect_external_def) |
2153 | dt = vect_external_def; |
2154 | else if (dt == vect_external_def |
2155 | && chains[lane][n].dt == vect_constant_def) |
2156 | ; |
2157 | else |
2158 | break; |
2159 | } |
2160 | if (lane != group_size) |
2161 | { |
2162 | if (dump_enabled_p ()) |
dump_printf_loc (MSG_NOTE, vect_location,
		 "giving up on chain due to mismatched "
		 "def types\n");
2166 | matches[lane] = false; |
2167 | if (lane != group_size - 1) |
2168 | matches[0] = false; |
2169 | goto out; |
2170 | } |
2171 | if (dt == vect_constant_def |
2172 | || dt == vect_external_def) |
2173 | { |
2174 | /* Check whether we can build the invariant. If we can't |
2175 | we never will be able to. */ |
2176 | tree type = TREE_TYPE (chains[0][n].op); |
if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
    && (TREE_CODE (type) == BOOLEAN_TYPE
	|| !can_duplicate_and_interleave_p (vinfo, group_size,
					    type)))
2181 | { |
2182 | matches[0] = false; |
2183 | goto out; |
2184 | } |
vec<tree> ops;
ops.create (group_size);
for (lane = 0; lane < group_size; ++lane)
ops.quick_push (chains[lane][n].op);
slp_tree child = vect_create_new_slp_node (ops);
SLP_TREE_DEF_TYPE (child) = dt;
children.safe_push (child);
2192 | } |
2193 | else if (dt != vect_internal_def) |
2194 | { |
/* Not sure; we might need something special here.
   gcc.dg/vect/pr96854.c,
   gfortran.dg/vect/fast-math-pr37021.f90
   and gfortran.dg/vect/pr61171.f trigger.  */
/* Soft-fail for now.  */
hard_fail = false;
goto out;
2202 | } |
2203 | else |
2204 | { |
vec<stmt_vec_info> op_stmts;
op_stmts.create (group_size);
slp_tree child = NULL;
/* Brute-force our way.  We have to consider a lane
   failing after fixing an earlier fail up in the
   SLP discovery recursion.  So track the current
   permute per lane.  */
unsigned *perms = XALLOCAVEC (unsigned, group_size);
memset (perms, 0, sizeof (unsigned) * group_size);
do
{
op_stmts.truncate (0);
for (lane = 0; lane < group_size; ++lane)
op_stmts.quick_push
  (vinfo->lookup_def (chains[lane][n].op));
child = vect_build_slp_tree (vinfo, op_stmts,
			     group_size, &this_max_nunits,
			     matches, limit,
			     &this_tree_size, bst_map);
2224 | /* ??? We're likely getting too many fatal mismatches |
2225 | here so maybe we want to ignore them (but then we |
2226 | have no idea which lanes fatally mismatched). */ |
2227 | if (child || !matches[0]) |
2228 | break; |
2229 | /* Swap another lane we have not yet matched up into |
2230 | lanes that did not match. If we run out of |
2231 | permute possibilities for a lane terminate the |
2232 | search. */ |
2233 | bool term = false; |
2234 | for (lane = 1; lane < group_size; ++lane) |
2235 | if (!matches[lane]) |
2236 | { |
2237 | if (n + perms[lane] + 1 == chain_len) |
2238 | { |
2239 | term = true; |
2240 | break; |
2241 | } |
std::swap (chains[lane][n],
	   chains[lane][n + perms[lane] + 1]);
perms[lane]++;
2245 | } |
2246 | if (term) |
2247 | break; |
2248 | } |
2249 | while (1); |
2250 | if (!child) |
2251 | { |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "failed to match up op %d\n", n);
op_stmts.release ();
if (lane != group_size - 1)
matches[0] = false;
else
matches[lane] = false;
goto out;
}
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
		 "matched up op %d to\n", n);
vect_print_slp_tree (MSG_NOTE, vect_location, child);
}
children.safe_push (child);
2269 | } |
2270 | } |
2271 | /* 3. build SLP nodes to combine the chain. */ |
2272 | for (unsigned lane = 0; lane < group_size; ++lane) |
2273 | if (chains[lane][0].code != code) |
2274 | { |
2275 | /* See if there's any alternate all-PLUS entry. */ |
2276 | unsigned n; |
2277 | for (n = 1; n < chain_len; ++n) |
2278 | { |
2279 | for (lane = 0; lane < group_size; ++lane) |
2280 | if (chains[lane][n].code != code) |
2281 | break; |
2282 | if (lane == group_size) |
2283 | break; |
2284 | } |
2285 | if (n != chain_len) |
2286 | { |
2287 | /* Swap that in at first position. */ |
std::swap (children[0], children[n]);
for (lane = 0; lane < group_size; ++lane)
std::swap (chains[lane][0], chains[lane][n]);
2291 | } |
2292 | else |
2293 | { |
2294 | /* ??? When this triggers and we end up with two |
2295 | vect_constant/external_def up-front things break (ICE) |
2296 | spectacularly finding an insertion place for the |
2297 | all-constant op. We should have a fully |
2298 | vect_internal_def operand though(?) so we can swap |
2299 | that into first place and then prepend the all-zero |
2300 | constant. */ |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "inserting constant zero to compensate "
		 "for (partially) negated first "
		 "operand\n");
chain_len++;
for (lane = 0; lane < group_size; ++lane)
chains[lane].safe_insert
  (0, chain_op_t (code, vect_constant_def, NULL_TREE));
vec<tree> zero_ops;
zero_ops.create (group_size);
zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
for (lane = 1; lane < group_size; ++lane)
zero_ops.quick_push (zero_ops[0]);
slp_tree zero = vect_create_new_slp_node (zero_ops);
SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
children.safe_insert (0, zero);
2318 | } |
2319 | break; |
2320 | } |
2321 | for (unsigned i = 1; i < children.length (); ++i) |
2322 | { |
2323 | slp_tree op0 = children[i - 1]; |
2324 | slp_tree op1 = children[i]; |
2325 | bool this_two_op = false; |
2326 | for (unsigned lane = 0; lane < group_size; ++lane) |
2327 | if (chains[lane][i].code != chains[0][i].code) |
2328 | { |
2329 | this_two_op = true; |
2330 | break; |
2331 | } |
slp_tree child;
if (i == children.length () - 1)
child = vect_create_new_slp_node (node, stmts, 2);
else
child = vect_create_new_slp_node (2, ERROR_MARK);
if (this_two_op)
{
vec<std::pair<unsigned, unsigned> > lperm;
lperm.create (group_size);
for (unsigned lane = 0; lane < group_size; ++lane)
lperm.quick_push (std::make_pair
  (chains[lane][i].code != chains[0][i].code, lane));
vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
				   (chains[0][i].code == code
				    ? op_stmt_info
				    : other_op_stmt_info),
				   (chains[0][i].code == code
				    ? other_op_stmt_info
				    : op_stmt_info),
				   lperm);
}
else
{
SLP_TREE_DEF_TYPE (child) = vect_internal_def;
SLP_TREE_VECTYPE (child) = vectype;
SLP_TREE_LANES (child) = group_size;
SLP_TREE_CHILDREN (child).quick_push (op0);
SLP_TREE_CHILDREN (child).quick_push (op1);
SLP_TREE_REPRESENTATIVE (child)
  = (chains[0][i].code == code
     ? op_stmt_info : other_op_stmt_info);
}
2364 | children[i] = child; |
2365 | } |
2366 | *tree_size += this_tree_size + 1; |
2367 | *max_nunits = this_max_nunits; |
2368 | while (!chains.is_empty ()) |
2369 | chains.pop ().release (); |
2370 | return node; |
2371 | } |
2372 | out: |
2373 | while (!children.is_empty ()) |
vect_free_slp_tree (children.pop ());
2375 | while (!chains.is_empty ()) |
2376 | chains.pop ().release (); |
2377 | /* Hard-fail, otherwise we might run into quadratic processing of the |
2378 | chains starting one stmt into the chain again. */ |
2379 | if (hard_fail) |
2380 | return NULL; |
2381 | /* Fall thru to normal processing. */ |
2382 | } |
2383 | |
2384 | /* Get at the operands, verifying they are compatible. */ |
2385 | vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size); |
2386 | slp_oprnd_info oprnd_info; |
2387 | FOR_EACH_VEC_ELT (stmts, i, stmt_info) |
2388 | { |
int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
				       stmts, i, &oprnds_info);
2391 | if (res != 0) |
2392 | matches[(res == -1) ? 0 : i] = false; |
2393 | if (!matches[0]) |
2394 | break; |
2395 | } |
2396 | for (i = 0; i < group_size; ++i) |
2397 | if (!matches[i]) |
2398 | { |
2399 | vect_free_oprnd_info (oprnds_info); |
2400 | return NULL; |
2401 | } |
2402 | swap = NULL; |
2403 | |
2404 | auto_vec<slp_tree, 4> children; |
2405 | |
2406 | stmt_info = stmts[0]; |
2407 | |
2408 | /* Create SLP_TREE nodes for the definition node/s. */ |
2409 | FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) |
2410 | { |
2411 | slp_tree child = nullptr; |
2412 | unsigned int j; |
2413 | |
2414 | /* We're skipping certain operands from processing, for example |
2415 | outer loop reduction initial defs. */ |
2416 | if (skip_args[i]) |
2417 | { |
2418 | children.safe_push (NULL); |
2419 | continue; |
2420 | } |
2421 | |
2422 | if (oprnd_info->first_dt == vect_uninitialized_def) |
2423 | { |
/* COND_EXPRs end up with one operand too many when the
   condition is an SSA name.  */
2426 | gcc_assert (i == 3 && nops == 4); |
2427 | continue; |
2428 | } |
2429 | |
if (is_a <bb_vec_info> (vinfo)
2431 | && oprnd_info->first_dt == vect_internal_def |
2432 | && !oprnd_info->any_pattern) |
2433 | { |
2434 | /* For BB vectorization, if all defs are the same do not |
2435 | bother to continue the build along the single-lane |
2436 | graph but use a splat of the scalar value. */ |
2437 | stmt_vec_info first_def = oprnd_info->def_stmts[0]; |
2438 | for (j = 1; j < group_size; ++j) |
2439 | if (oprnd_info->def_stmts[j] != first_def) |
2440 | break; |
if (j == group_size
    /* But avoid doing this for loads where we may be
       able to CSE things, unless the stmt is not
       vectorizable.  */
    && (!STMT_VINFO_VECTORIZABLE (first_def)
	|| !gimple_vuse (first_def->stmt)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "Using a splat of the uniform operand %G",
		 first_def->stmt);
oprnd_info->first_dt = vect_external_def;
}
2454 | } |
2455 | |
2456 | if (oprnd_info->first_dt == vect_external_def |
2457 | || oprnd_info->first_dt == vect_constant_def) |
2458 | { |
if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2460 | { |
2461 | tree op0; |
2462 | tree uniform_val = op0 = oprnd_info->ops[0]; |
2463 | for (j = 1; j < oprnd_info->ops.length (); ++j) |
2464 | if (!operand_equal_p (uniform_val, oprnd_info->ops[j])) |
2465 | { |
2466 | uniform_val = NULL_TREE; |
2467 | break; |
2468 | } |
if (!uniform_val
    && !can_duplicate_and_interleave_p (vinfo,
					oprnd_info->ops.length (),
					TREE_TYPE (op0)))
{
matches[j] = false;
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		 "Build SLP failed: invalid type of def "
		 "for variable-length SLP %T\n", op0);
goto fail;
}
}
slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
oprnd_info->ops = vNULL;
children.safe_push (invnode);
continue;
2487 | } |
2488 | |
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
				  group_size, &this_max_nunits,
				  matches, limit,
				  &this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
continue;
}
2498 | |
/* If the SLP build for operand zero failed and operand zero
   and one can be commuted try that for the scalar stmts
   that failed the match.  */
if (i == 0
    /* A first scalar stmt mismatch signals a fatal mismatch.  */
    && matches[0]
    /* ??? For COND_EXPRs we can swap the comparison operands
       as well as the arms under some constraints.  */
    && nops == 2
    && oprnds_info[1]->first_dt == vect_internal_def
    && is_gimple_assign (stmt_info->stmt)
    /* Swapping operands for reductions breaks assumptions later on.  */
    && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
    && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2513 | { |
2514 | /* See whether we can swap the matching or the non-matching |
2515 | stmt operands. */ |
2516 | bool swap_not_matching = true; |
2517 | do |
2518 | { |
2519 | for (j = 0; j < group_size; ++j) |
2520 | { |
2521 | if (matches[j] != !swap_not_matching) |
2522 | continue; |
2523 | stmt_vec_info stmt_info = stmts[j]; |
2524 | /* Verify if we can swap operands of this stmt. */ |
gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
if (!stmt
    || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2528 | { |
2529 | if (!swap_not_matching) |
2530 | goto fail; |
2531 | swap_not_matching = false; |
2532 | break; |
2533 | } |
2534 | } |
2535 | } |
2536 | while (j != group_size); |
2537 | |
2538 | /* Swap mismatched definition stmts. */ |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "Re-trying with swapped operands of stmts ");
for (j = 0; j < group_size; ++j)
if (matches[j] == !swap_not_matching)
{
std::swap (oprnds_info[0]->def_stmts[j],
	   oprnds_info[1]->def_stmts[j]);
std::swap (oprnds_info[0]->ops[j],
	   oprnds_info[1]->ops[j]);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "%d ", j);
}
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "\n");
/* After swapping some operands we lost track of whether an
   operand has any pattern defs so be conservative here.  */
if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
/* And try again with scratch 'matches' ...  */
bool *tem = XALLOCAVEC (bool, group_size);
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
				  group_size, &this_max_nunits,
				  tem, limit,
				  &this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
continue;
}
2569 | } |
2570 | fail: |
2571 | |
2572 | /* If the SLP build failed and we analyze a basic-block |
2573 | simply treat nodes we fail to build as externally defined |
2574 | (and thus build vectors from the scalar defs). |
2575 | The cost model will reject outright expensive cases. |
??? This doesn't treat cases where permutation ultimately
2577 | fails (or we don't try permutation below). Ideally we'd |
2578 | even compute a permutation that will end up with the maximum |
2579 | SLP tree size... */ |
if (is_a <bb_vec_info> (vinfo)
    /* ??? Rejecting patterns this way doesn't work.  We'd have to
       do extra work to cancel the pattern so the uses see the
       scalar version.  */
    && !is_pattern_stmt_p (stmt_info)
    && !oprnd_info->any_pattern)
2586 | { |
2587 | /* But if there's a leading vector sized set of matching stmts |
2588 | fail here so we can split the group. This matches the condition |
2589 | vect_analyze_slp_instance uses. */ |
2590 | /* ??? We might want to split here and combine the results to support |
2591 | multiple vector sizes better. */ |
2592 | for (j = 0; j < group_size; ++j) |
2593 | if (!matches[j]) |
2594 | break; |
2595 | if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))) |
2596 | { |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "Building vector operands from scalars\n");
this_tree_size++;
child = vect_create_new_slp_node (oprnd_info->ops);
children.safe_push (child);
oprnd_info->ops = vNULL;
continue;
2605 | } |
2606 | } |
2607 | |
gcc_assert (child == NULL);
FOR_EACH_VEC_ELT (children, j, child)
if (child)
vect_free_slp_tree (child);
vect_free_oprnd_info (oprnds_info);
return NULL;
2614 | } |
2615 | |
2616 | vect_free_oprnd_info (oprnds_info); |
2617 | |
/* If all children of this node are built up from uniform scalars,
   or if building them requires more than one possibly expensive
   vector construction, throw the node away so it is instead built
   up from scalars.  The exception is the SLP node for a vector
   store.  */
if (is_a <bb_vec_info> (vinfo)
    && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
    /* ??? Rejecting patterns this way doesn't work.  We'd have to
       do extra work to cancel the pattern so the uses see the
       scalar version.  */
    && !is_pattern_stmt_p (stmt_info))
2628 | { |
2629 | slp_tree child; |
2630 | unsigned j; |
2631 | bool all_uniform_p = true; |
2632 | unsigned n_vector_builds = 0; |
2633 | FOR_EACH_VEC_ELT (children, j, child) |
2634 | { |
2635 | if (!child) |
2636 | ; |
2637 | else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
2638 | all_uniform_p = false; |
else if (!vect_slp_tree_uniform_p (child))
2640 | { |
2641 | all_uniform_p = false; |
2642 | if (SLP_TREE_DEF_TYPE (child) == vect_external_def) |
2643 | n_vector_builds++; |
2644 | } |
2645 | } |
2646 | if (all_uniform_p |
2647 | || n_vector_builds > 1 |
2648 | || (n_vector_builds == children.length () |
&& is_a <gphi *> (stmt_info->stmt)))
2650 | { |
2651 | /* Roll back. */ |
2652 | matches[0] = false; |
FOR_EACH_VEC_ELT (children, j, child)
if (child)
vect_free_slp_tree (child);

if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
		 "Building parent vector operands from "
		 "scalars instead\n");
2661 | return NULL; |
2662 | } |
2663 | } |
2664 | |
2665 | *tree_size += this_tree_size + 1; |
2666 | *max_nunits = this_max_nunits; |
2667 | |
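/* A two-operator node mixes exactly two rhs codes across lanes, for
   example (a sketch)
     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];
   Below this is represented as a VEC_PERM_EXPR node with a full-width
   PLUS child and a full-width MINUS child, blended by the lane
   permutation { (0, 0), (1, 1) }.  */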
2668 | if (two_operators) |
2669 | { |
2670 | /* ??? We'd likely want to either cache in bst_map sth like |
2671 | { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or |
2672 | the true { a+b, a+b, a+b, a+b } ... but there we don't have |
2673 | explicit stmts to put in so the keying on 'stmts' doesn't |
2674 | work (but we have the same issue with nodes that use 'ops'). */ |
2675 | slp_tree one = new _slp_tree; |
2676 | slp_tree two = new _slp_tree; |
2677 | SLP_TREE_DEF_TYPE (one) = vect_internal_def; |
2678 | SLP_TREE_DEF_TYPE (two) = vect_internal_def; |
2679 | SLP_TREE_VECTYPE (one) = vectype; |
2680 | SLP_TREE_VECTYPE (two) = vectype; |
SLP_TREE_CHILDREN (one).safe_splice (children);
SLP_TREE_CHILDREN (two).safe_splice (children);
slp_tree child;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
SLP_TREE_REF_COUNT (child)++;

/* Here we record the original defs since this
   node represents the final lane configuration.  */
node = vect_create_new_slp_node (node, stmts, 2);
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
SLP_TREE_CHILDREN (node).quick_push (one);
SLP_TREE_CHILDREN (node).quick_push (two);
gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
enum tree_code code0 = gimple_assign_rhs_code (stmt);
enum tree_code ocode = ERROR_MARK;
stmt_vec_info ostmt_info;
unsigned j = 0;
FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
{
gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
if (gimple_assign_rhs_code (ostmt) != code0)
{
SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
ocode = gimple_assign_rhs_code (ostmt);
j = i;
}
else
SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
}
SLP_TREE_CODE (one) = code0;
SLP_TREE_CODE (two) = ocode;
SLP_TREE_LANES (one) = stmts.length ();
SLP_TREE_LANES (two) = stmts.length ();
SLP_TREE_REPRESENTATIVE (one) = stmts[0];
SLP_TREE_REPRESENTATIVE (two) = stmts[j];
return node;
2718 | } |
2719 | |
node = vect_create_new_slp_node (node, stmts, nops);
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CHILDREN (node).splice (children);
2723 | return node; |
2724 | } |
2725 | |
2726 | /* Dump a single SLP tree NODE. */ |
2727 | |
2728 | static void |
2729 | vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, |
2730 | slp_tree node) |
2731 | { |
2732 | unsigned i, j; |
2733 | slp_tree child; |
2734 | stmt_vec_info stmt_info; |
2735 | tree op; |
2736 | |
2737 | dump_metadata_t metadata (dump_kind, loc.get_impl_location ()); |
2738 | dump_user_location_t user_loc = loc.get_user_location (); |
dump_printf_loc (metadata, user_loc,
		 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
		 ", refcnt=%u)",
		 SLP_TREE_DEF_TYPE (node) == vect_external_def
		 ? " (external)"
		 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		    ? " (constant)"
		    : ""), (void *) node,
		 estimated_poly_value (node->max_nunits),
		 SLP_TREE_REF_COUNT (node));
if (SLP_TREE_VECTYPE (node))
dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
dump_printf (metadata, "\n");
if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
{
if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
else
dump_printf_loc (metadata, user_loc, "op template: %G",
		 SLP_TREE_REPRESENTATIVE (node)->stmt);
}
2760 | if (SLP_TREE_SCALAR_STMTS (node).exists ()) |
2761 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
else
{
dump_printf_loc (metadata, user_loc, "\t{ ");
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
dump_printf (metadata, "%T%s ", op,
	     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
dump_printf (metadata, "}\n");
2770 | } |
2771 | if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
2772 | { |
dump_printf_loc (metadata, user_loc, "\tload permutation {");
FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
dump_printf (dump_kind, " %u", j);
dump_printf (dump_kind, " }\n");
2777 | } |
2778 | if (SLP_TREE_LANE_PERMUTATION (node).exists ()) |
2779 | { |
dump_printf_loc (metadata, user_loc, "\tlane permutation {");
for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
dump_printf (dump_kind, " %u[%u]",
	     SLP_TREE_LANE_PERMUTATION (node)[i].first,
	     SLP_TREE_LANE_PERMUTATION (node)[i].second);
dump_printf (dump_kind, " }\n");
2786 | } |
2787 | if (SLP_TREE_CHILDREN (node).is_empty ()) |
2788 | return; |
dump_printf_loc (metadata, user_loc, "\tchildren");
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
dump_printf (dump_kind, " %p", (void *)child);
dump_printf (dump_kind, "\n");
2793 | } |
2794 | |
2795 | DEBUG_FUNCTION void |
2796 | debug (slp_tree node) |
2797 | { |
2798 | debug_dump_context ctx; |
vect_print_slp_tree (MSG_NOTE,
		     dump_location_t::from_location_t (UNKNOWN_LOCATION),
		     node);
2802 | } |
2803 | |
2804 | /* Recursive helper for the dot producer below. */ |
2805 | |
2806 | static void |
2807 | dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited) |
2808 | { |
if (visited.add (node))
return;

fprintf (f, "\"%p\" [label=\"", (void *)node);
vect_print_slp_tree (MSG_NOTE,
		     dump_location_t::from_location_t (UNKNOWN_LOCATION),
		     node);
fprintf (f, "\"];\n");

for (slp_tree child : SLP_TREE_CHILDREN (node))
fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);

for (slp_tree child : SLP_TREE_CHILDREN (node))
if (child)
dot_slp_tree (f, child, visited);
2825 | } |
2826 | |
2827 | DEBUG_FUNCTION void |
2828 | dot_slp_tree (const char *fname, slp_tree node) |
2829 | { |
FILE *f = fopen (fname, "w");
fprintf (f, "digraph {\n");
fflush (f);
{
debug_dump_context ctx (f);
hash_set<slp_tree> visited;
dot_slp_tree (f, node, visited);
}
fflush (f);
fprintf (f, "}\n");
fclose (f);
2841 | } |
2842 | |
2843 | /* Dump a slp tree NODE using flags specified in DUMP_KIND. */ |
2844 | |
2845 | static void |
2846 | vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc, |
2847 | slp_tree node, hash_set<slp_tree> &visited) |
2848 | { |
2849 | unsigned i; |
2850 | slp_tree child; |
2851 | |
if (visited.add (node))
2853 | return; |
2854 | |
2855 | vect_print_slp_tree (dump_kind, loc, node); |
2856 | |
2857 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2858 | if (child) |
vect_print_slp_graph (dump_kind, loc, child, visited);
2860 | } |
2861 | |
2862 | static void |
2863 | vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc, |
2864 | slp_tree entry) |
2865 | { |
2866 | hash_set<slp_tree> visited; |
vect_print_slp_graph (dump_kind, loc, entry, visited);
2868 | } |
2869 | |
2870 | /* Mark the tree rooted at NODE with PURE_SLP. */ |
2871 | |
2872 | static void |
2873 | vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited) |
2874 | { |
2875 | int i; |
2876 | stmt_vec_info stmt_info; |
2877 | slp_tree child; |
2878 | |
2879 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
2880 | return; |
2881 | |
if (visited.add (node))
2883 | return; |
2884 | |
2885 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
2886 | STMT_SLP_TYPE (stmt_info) = pure_slp; |
2887 | |
2888 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2889 | if (child) |
vect_mark_slp_stmts (child, visited);
2891 | } |
2892 | |
2893 | static void |
2894 | vect_mark_slp_stmts (slp_tree node) |
2895 | { |
2896 | hash_set<slp_tree> visited; |
2897 | vect_mark_slp_stmts (node, visited); |
2898 | } |
2899 | |
2900 | /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */ |
2901 | |
2902 | static void |
2903 | vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited) |
2904 | { |
2905 | int i; |
2906 | stmt_vec_info stmt_info; |
2907 | slp_tree child; |
2908 | |
2909 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
2910 | return; |
2911 | |
if (visited.add (node))
2913 | return; |
2914 | |
2915 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
2916 | { |
2917 | gcc_assert (!STMT_VINFO_RELEVANT (stmt_info) |
2918 | || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope); |
2919 | STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope; |
2920 | } |
2921 | |
2922 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2923 | if (child) |
vect_mark_slp_stmts_relevant (child, visited);
2925 | } |
2926 | |
2927 | static void |
2928 | vect_mark_slp_stmts_relevant (slp_tree node) |
2929 | { |
2930 | hash_set<slp_tree> visited; |
2931 | vect_mark_slp_stmts_relevant (node, visited); |
2932 | } |
2933 | |
2934 | |
/* Gather the load nodes in the SLP graph rooted at NODE into LOADS.  */
2936 | |
2937 | static void |
2938 | vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node, |
2939 | hash_set<slp_tree> &visited) |
2940 | { |
if (!node || visited.add (node))
2942 | return; |
2943 | |
2944 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
2945 | return; |
2946 | |
2947 | if (SLP_TREE_CODE (node) != VEC_PERM_EXPR) |
2948 | { |
2949 | stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); |
2950 | if (STMT_VINFO_DATA_REF (stmt_info) |
2951 | && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
loads.safe_push (node);
2953 | } |
2954 | |
2955 | unsigned i; |
2956 | slp_tree child; |
2957 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
vect_gather_slp_loads (loads, child, visited);
2959 | } |
2960 | |
2961 | |
/* Find the last scalar stmt in NODE.  */
2963 | |
2964 | stmt_vec_info |
2965 | vect_find_last_scalar_stmt_in_slp (slp_tree node) |
2966 | { |
2967 | stmt_vec_info last = NULL; |
2968 | stmt_vec_info stmt_vinfo; |
2969 | |
for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
{
stmt_vinfo = vect_orig_stmt (stmt_vinfo);
last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2974 | } |
2975 | |
2976 | return last; |
2977 | } |
2978 | |
2979 | /* Find the first stmt in NODE. */ |
2980 | |
2981 | stmt_vec_info |
2982 | vect_find_first_scalar_stmt_in_slp (slp_tree node) |
2983 | { |
2984 | stmt_vec_info first = NULL; |
2985 | stmt_vec_info stmt_vinfo; |
2986 | |
for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
{
stmt_vinfo = vect_orig_stmt (stmt_vinfo);
if (!first
    || get_later_stmt (stmt_vinfo, first) == first)
2992 | first = stmt_vinfo; |
2993 | } |
2994 | |
2995 | return first; |
2996 | } |
2997 | |
2998 | /* Splits a group of stores, currently beginning at FIRST_VINFO, into |
2999 | two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE |
3000 | (also containing the first GROUP1_SIZE stmts, since stores are |
3001 | consecutive), the second containing the remainder. |
3002 | Return the first stmt in the second group. */ |
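/* For example (a sketch), splitting a store group { s0, s1, s2, s3, s4 }
   with GROUP1_SIZE == 2 yields the groups { s0, s1 } (still headed by
   FIRST_VINFO) and { s2, s3, s4 }, returning s2.  */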
3003 | |
3004 | static stmt_vec_info |
3005 | vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size) |
3006 | { |
3007 | gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo); |
3008 | gcc_assert (group1_size > 0); |
3009 | int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size; |
3010 | gcc_assert (group2_size > 0); |
3011 | DR_GROUP_SIZE (first_vinfo) = group1_size; |
3012 | |
3013 | stmt_vec_info stmt_info = first_vinfo; |
3014 | for (unsigned i = group1_size; i > 1; i--) |
3015 | { |
3016 | stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info); |
3017 | gcc_assert (DR_GROUP_GAP (stmt_info) == 1); |
3018 | } |
3019 | /* STMT is now the last element of the first group. */ |
3020 | stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info); |
3021 | DR_GROUP_NEXT_ELEMENT (stmt_info) = 0; |
3022 | |
3023 | DR_GROUP_SIZE (group2) = group2_size; |
3024 | for (stmt_info = group2; stmt_info; |
3025 | stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) |
3026 | { |
3027 | DR_GROUP_FIRST_ELEMENT (stmt_info) = group2; |
3028 | gcc_assert (DR_GROUP_GAP (stmt_info) == 1); |
3029 | } |
3030 | |
3031 | /* For the second group, the DR_GROUP_GAP is that before the original group, |
3032 | plus skipping over the first vector. */ |
3033 | DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size; |
3034 | |
3035 | /* DR_GROUP_GAP of the first group now has to skip over the second group too. */ |
3036 | DR_GROUP_GAP (first_vinfo) += group2_size; |
3037 | |
3038 | if (dump_enabled_p ()) |
dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		 group1_size, group2_size);
3041 | |
3042 | return group2; |
3043 | } |
3044 | |
3045 | /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE |
3046 | statements and a vector of NUNITS elements. */ |
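/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the least common
   multiple is 12, so the SLP instance is unrolled by a factor of
   12 / 6 == 2.  */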
3047 | |
3048 | static poly_uint64 |
3049 | calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) |
3050 | { |
return exact_div (common_multiple (nunits, group_size), group_size);
3052 | } |
3053 | |
3054 | /* Helper that checks to see if a node is a load node. */ |
3055 | |
3056 | static inline bool |
3057 | vect_is_slp_load_node (slp_tree root) |
3058 | { |
3059 | return SLP_TREE_DEF_TYPE (root) == vect_internal_def |
3060 | && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root)) |
3061 | && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))); |
3062 | } |
3063 | |
3064 | |
3065 | /* Helper function of optimize_load_redistribution that performs the operation |
3066 | recursively. */ |
3067 | |
3068 | static slp_tree |
3069 | optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map, |
3070 | vec_info *vinfo, unsigned int group_size, |
3071 | hash_map<slp_tree, slp_tree> *load_map, |
3072 | slp_tree root) |
3073 | { |
3074 | if (slp_tree *leader = load_map->get (k: root)) |
3075 | return *leader; |
3076 | |
3077 | slp_tree node; |
3078 | unsigned i; |
3079 | |
3080 | /* For now, we don't know anything about externals so do not do anything. */ |
3081 | if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def) |
3082 | return NULL; |
3083 | else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR) |
3084 | { |
      /* First convert this node into a load node, add it to the leaves
	 list, and flatten the permute from a lane permutation to a load
	 permutation.  If it's unneeded it will be elided later. */
3088 | vec<stmt_vec_info> stmts; |
3089 | stmts.create (SLP_TREE_LANES (root)); |
3090 | lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root); |
3091 | for (unsigned j = 0; j < lane_perm.length (); j++) |
3092 | { |
3093 | std::pair<unsigned, unsigned> perm = lane_perm[j]; |
3094 | node = SLP_TREE_CHILDREN (root)[perm.first]; |
3095 | |
3096 | if (!vect_is_slp_load_node (root: node) |
3097 | || SLP_TREE_CHILDREN (node).exists ()) |
3098 | { |
3099 | stmts.release (); |
3100 | goto next; |
3101 | } |
3102 | |
3103 | stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]); |
3104 | } |
3105 | |
3106 | if (dump_enabled_p ()) |
3107 | dump_printf_loc (MSG_NOTE, vect_location, |
3108 | "converting stmts on permute node %p\n" , |
3109 | (void *) root); |
3110 | |
3111 | bool *matches = XALLOCAVEC (bool, group_size); |
3112 | poly_uint64 max_nunits = 1; |
3113 | unsigned tree_size = 0, limit = 1; |
3114 | node = vect_build_slp_tree (vinfo, stmts, group_size, max_nunits: &max_nunits, |
3115 | matches, limit: &limit, tree_size: &tree_size, bst_map); |
3116 | if (!node) |
3117 | stmts.release (); |
3118 | |
3119 | load_map->put (k: root, v: node); |
3120 | return node; |
3121 | } |
3122 | |
3123 | next: |
3124 | load_map->put (k: root, NULL); |
3125 | |
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
3127 | { |
3128 | slp_tree value |
3129 | = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map, |
3130 | root: node); |
3131 | if (value) |
3132 | { |
3133 | SLP_TREE_REF_COUNT (value)++; |
3134 | SLP_TREE_CHILDREN (root)[i] = value; |
3135 | /* ??? We know the original leafs of the replaced nodes will |
3136 | be referenced by bst_map, only the permutes created by |
3137 | pattern matching are not. */ |
3138 | if (SLP_TREE_REF_COUNT (node) == 1) |
3139 | load_map->remove (k: node); |
3140 | vect_free_slp_tree (node); |
3141 | } |
3142 | } |
3143 | |
3144 | return NULL; |
3145 | } |
3146 | |
3147 | /* Temporary workaround for loads not being CSEd during SLP build. This |
3148 | function will traverse the SLP tree rooted in ROOT for INSTANCE and find |
3149 | VEC_PERM nodes that blend vectors from multiple nodes that all read from the |
3150 | same DR such that the final operation is equal to a permuted load. Such |
3151 | NODES are then directly converted into LOADS themselves. The nodes are |
3152 | CSEd using BST_MAP. */ |
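/* As an illustrative example, assume two load nodes A = { a[0], a[1] } and
   B = { a[2], a[3] } reading from the same DR, combined by a VEC_PERM node
   selecting { A[0], B[1] }.  The permute node can be replaced by a single
   load node with scalar stmts { a[0], a[3] }, i.e. a (permuted) load, which
   SLP discovery can then CSE via BST_MAP. */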
3153 | |
3154 | static void |
3155 | optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map, |
3156 | vec_info *vinfo, unsigned int group_size, |
3157 | hash_map<slp_tree, slp_tree> *load_map, |
3158 | slp_tree root) |
3159 | { |
3160 | slp_tree node; |
3161 | unsigned i; |
3162 | |
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
3164 | { |
3165 | slp_tree value |
3166 | = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map, |
3167 | root: node); |
3168 | if (value) |
3169 | { |
3170 | SLP_TREE_REF_COUNT (value)++; |
3171 | SLP_TREE_CHILDREN (root)[i] = value; |
3172 | /* ??? We know the original leafs of the replaced nodes will |
3173 | be referenced by bst_map, only the permutes created by |
3174 | pattern matching are not. */ |
3175 | if (SLP_TREE_REF_COUNT (node) == 1) |
3176 | load_map->remove (k: node); |
3177 | vect_free_slp_tree (node); |
3178 | } |
3179 | } |
3180 | } |
3181 | |
3182 | /* Helper function of vect_match_slp_patterns. |
3183 | |
3184 | Attempts to match patterns against the slp tree rooted in REF_NODE using |
3185 | VINFO. Patterns are matched in post-order traversal. |
3186 | |
   Return true if any pattern matched.  On success the value in REF_NODE
   is updated in place, otherwise it is left unchanged. */
3189 | |
3190 | static bool |
3191 | vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo, |
3192 | slp_tree_to_load_perm_map_t *perm_cache, |
3193 | slp_compat_nodes_map_t *compat_cache, |
3194 | hash_set<slp_tree> *visited) |
3195 | { |
3196 | unsigned i; |
3197 | slp_tree node = *ref_node; |
3198 | bool found_p = false; |
3199 | if (!node || visited->add (k: node)) |
3200 | return false; |
3201 | |
3202 | slp_tree child; |
3203 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
3204 | found_p |= vect_match_slp_patterns_2 (ref_node: &SLP_TREE_CHILDREN (node)[i], |
3205 | vinfo, perm_cache, compat_cache, |
3206 | visited); |
3207 | |
3208 | for (unsigned x = 0; x < num__slp_patterns; x++) |
3209 | { |
3210 | vect_pattern *pattern |
3211 | = slp_patterns[x] (perm_cache, compat_cache, ref_node); |
3212 | if (pattern) |
3213 | { |
3214 | pattern->build (vinfo); |
3215 | delete pattern; |
3216 | found_p = true; |
3217 | } |
3218 | } |
3219 | |
3220 | return found_p; |
3221 | } |
3222 | |
/* Applies pattern matching to the SLP tree of INSTANCE using vec_info
   VINFO.

   Return true if any pattern matched; the tree is modified in place.
   Patterns are tried in order and multiple patterns may match. */
3228 | |
3229 | static bool |
3230 | vect_match_slp_patterns (slp_instance instance, vec_info *vinfo, |
3231 | hash_set<slp_tree> *visited, |
3232 | slp_tree_to_load_perm_map_t *perm_cache, |
3233 | slp_compat_nodes_map_t *compat_cache) |
3234 | { |
  DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3236 | slp_tree *ref_node = &SLP_INSTANCE_TREE (instance); |
3237 | |
3238 | if (dump_enabled_p ()) |
3239 | dump_printf_loc (MSG_NOTE, vect_location, |
3240 | "Analyzing SLP tree %p for patterns\n" , |
3241 | (void *) SLP_INSTANCE_TREE (instance)); |
3242 | |
3243 | return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache, |
3244 | visited); |
3245 | } |
3246 | |
3247 | /* STMT_INFO is a store group of size GROUP_SIZE that we are considering |
3248 | splitting into two, with the first split group having size NEW_GROUP_SIZE. |
3249 | Return true if we could use IFN_STORE_LANES instead and if that appears |
3250 | to be the better approach. */ |
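/* E.g. for GROUP_SIZE == 3, NEW_GROUP_SIZE == 2 and two-element vectors,
   the two-stmt half operates on whole vectors within one scalar iteration,
   so the split is allowed and we do not prefer IFN_STORE_LANES (the
   function returns false). */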
3251 | |
3252 | static bool |
3253 | vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info, |
3254 | unsigned int group_size, |
3255 | unsigned int new_group_size) |
3256 | { |
3257 | tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); |
3258 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); |
3259 | if (!vectype) |
3260 | return false; |
3261 | /* Allow the split if one of the two new groups would operate on full |
3262 | vectors *within* rather than across one scalar loop iteration. |
3263 | This is purely a heuristic, but it should work well for group |
3264 | sizes of 3 and 4, where the possible splits are: |
3265 | |
3266 | 3->2+1: OK if the vector has exactly two elements |
3267 | 4->2+2: Likewise |
3268 | 4->3+1: Less clear-cut. */ |
3269 | if (multiple_p (a: group_size - new_group_size, b: TYPE_VECTOR_SUBPARTS (node: vectype)) |
3270 | || multiple_p (a: new_group_size, b: TYPE_VECTOR_SUBPARTS (node: vectype))) |
3271 | return false; |
3272 | return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST; |
3273 | } |
3274 | |
3275 | /* Analyze an SLP instance starting from a group of grouped stores. Call |
3276 | vect_build_slp_tree to build a tree of packed stmts if possible. |
3277 | Return FALSE if it's impossible to SLP any stmt in the loop. */ |
3278 | |
3279 | static bool |
3280 | vect_analyze_slp_instance (vec_info *vinfo, |
3281 | scalar_stmts_to_slp_tree_map_t *bst_map, |
3282 | stmt_vec_info stmt_info, slp_instance_kind kind, |
3283 | unsigned max_tree_size, unsigned *limit); |
3284 | |
/* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
   of kind KIND.  Return true if successful. */
3287 | |
3288 | static bool |
3289 | vect_build_slp_instance (vec_info *vinfo, |
3290 | slp_instance_kind kind, |
3291 | vec<stmt_vec_info> &scalar_stmts, |
3292 | vec<stmt_vec_info> &root_stmt_infos, |
3293 | vec<tree> &remain, |
3294 | unsigned max_tree_size, unsigned *limit, |
3295 | scalar_stmts_to_slp_tree_map_t *bst_map, |
3296 | /* ??? We need stmt_info for group splitting. */ |
3297 | stmt_vec_info stmt_info_) |
3298 | { |
3299 | if (kind == slp_inst_kind_ctor) |
3300 | { |
3301 | if (dump_enabled_p ()) |
3302 | dump_printf_loc (MSG_NOTE, vect_location, |
3303 | "Analyzing vectorizable constructor: %G\n" , |
3304 | root_stmt_infos[0]->stmt); |
3305 | } |
3306 | |
3307 | if (dump_enabled_p ()) |
3308 | { |
3309 | dump_printf_loc (MSG_NOTE, vect_location, |
3310 | "Starting SLP discovery for\n" ); |
3311 | for (unsigned i = 0; i < scalar_stmts.length (); ++i) |
3312 | dump_printf_loc (MSG_NOTE, vect_location, |
3313 | " %G" , scalar_stmts[i]->stmt); |
3314 | } |
3315 | |
3316 | /* Build the tree for the SLP instance. */ |
3317 | unsigned int group_size = scalar_stmts.length (); |
3318 | bool *matches = XALLOCAVEC (bool, group_size); |
3319 | poly_uint64 max_nunits = 1; |
3320 | unsigned tree_size = 0; |
3321 | unsigned i; |
3322 | slp_tree node = vect_build_slp_tree (vinfo, stmts: scalar_stmts, group_size, |
3323 | max_nunits: &max_nunits, matches, limit, |
3324 | tree_size: &tree_size, bst_map); |
3325 | if (node != NULL) |
3326 | { |
3327 | /* Calculate the unrolling factor based on the smallest type. */ |
3328 | poly_uint64 unrolling_factor |
3329 | = calculate_unrolling_factor (nunits: max_nunits, group_size); |
3330 | |
3331 | if (maybe_ne (a: unrolling_factor, b: 1U) |
3332 | && is_a <bb_vec_info> (p: vinfo)) |
3333 | { |
3334 | unsigned HOST_WIDE_INT const_max_nunits; |
3335 | if (!max_nunits.is_constant (const_value: &const_max_nunits) |
3336 | || const_max_nunits > group_size) |
3337 | { |
3338 | if (dump_enabled_p ()) |
3339 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3340 | "Build SLP failed: store group " |
3341 | "size not a multiple of the vector size " |
3342 | "in basic block SLP\n" ); |
3343 | vect_free_slp_tree (node); |
3344 | return false; |
3345 | } |
	  /* Fake a fatal mismatch at the last full-vector boundary so
	     the group will be split there below. */
3347 | if (dump_enabled_p ()) |
3348 | dump_printf_loc (MSG_NOTE, vect_location, |
3349 | "SLP discovery succeeded but node needs " |
3350 | "splitting\n" ); |
3351 | memset (s: matches, c: true, n: group_size); |
3352 | matches[group_size / const_max_nunits * const_max_nunits] = false; |
3353 | vect_free_slp_tree (node); |
3354 | } |
3355 | else |
3356 | { |
3357 | /* Create a new SLP instance. */ |
3358 | slp_instance new_instance = XNEW (class _slp_instance); |
3359 | SLP_INSTANCE_TREE (new_instance) = node; |
3360 | SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; |
3361 | SLP_INSTANCE_LOADS (new_instance) = vNULL; |
3362 | SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos; |
3363 | SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain; |
3364 | SLP_INSTANCE_KIND (new_instance) = kind; |
3365 | new_instance->reduc_phis = NULL; |
3366 | new_instance->cost_vec = vNULL; |
3367 | new_instance->subgraph_entries = vNULL; |
3368 | |
3369 | if (dump_enabled_p ()) |
3370 | dump_printf_loc (MSG_NOTE, vect_location, |
3371 | "SLP size %u vs. limit %u.\n" , |
3372 | tree_size, max_tree_size); |
3373 | |
3374 | /* Fixup SLP reduction chains. */ |
3375 | if (kind == slp_inst_kind_reduc_chain) |
3376 | { |
3377 | /* If this is a reduction chain with a conversion in front |
3378 | amend the SLP tree with a node for that. */ |
3379 | gimple *scalar_def |
3380 | = vect_orig_stmt (stmt_info: scalar_stmts[group_size - 1])->stmt; |
3381 | if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def) |
3382 | { |
3383 | /* Get at the conversion stmt - we know it's the single use |
3384 | of the last stmt of the reduction chain. */ |
3385 | use_operand_p use_p; |
3386 | bool r = single_imm_use (var: gimple_assign_lhs (gs: scalar_def), |
3387 | use_p: &use_p, stmt: &scalar_def); |
3388 | gcc_assert (r); |
3389 | stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def); |
3390 | next_info = vect_stmt_to_vectorize (stmt_info: next_info); |
3391 | scalar_stmts = vNULL; |
3392 | scalar_stmts.create (nelems: group_size); |
3393 | for (unsigned i = 0; i < group_size; ++i) |
3394 | scalar_stmts.quick_push (obj: next_info); |
3395 | slp_tree conv = vect_create_new_slp_node (scalar_stmts, nops: 1); |
3396 | SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info); |
3397 | SLP_TREE_CHILDREN (conv).quick_push (obj: node); |
3398 | SLP_INSTANCE_TREE (new_instance) = conv; |
3399 | /* We also have to fake this conversion stmt as SLP reduction |
3400 | group so we don't have to mess with too much code |
3401 | elsewhere. */ |
3402 | REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info; |
3403 | REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL; |
3404 | } |
3405 | /* Fill the backedge child of the PHI SLP node. The |
3406 | general matching code cannot find it because the |
3407 | scalar code does not reflect how we vectorize the |
3408 | reduction. */ |
3409 | use_operand_p use_p; |
3410 | imm_use_iterator imm_iter; |
3411 | class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo)); |
3412 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, |
3413 | gimple_get_lhs (scalar_def)) |
3414 | /* There are exactly two non-debug uses, the reduction |
3415 | PHI and the loop-closed PHI node. */ |
3416 | if (!is_gimple_debug (USE_STMT (use_p)) |
3417 | && gimple_bb (USE_STMT (use_p)) == loop->header) |
3418 | { |
3419 | auto_vec<stmt_vec_info, 64> phis (group_size); |
3420 | stmt_vec_info phi_info |
3421 | = vinfo->lookup_stmt (USE_STMT (use_p)); |
3422 | for (unsigned i = 0; i < group_size; ++i) |
3423 | phis.quick_push (obj: phi_info); |
3424 | slp_tree *phi_node = bst_map->get (k: phis); |
3425 | unsigned dest_idx = loop_latch_edge (loop)->dest_idx; |
3426 | SLP_TREE_CHILDREN (*phi_node)[dest_idx] |
3427 | = SLP_INSTANCE_TREE (new_instance); |
3428 | SLP_INSTANCE_TREE (new_instance)->refcnt++; |
3429 | } |
3430 | } |
3431 | |
3432 | vinfo->slp_instances.safe_push (obj: new_instance); |
3433 | |
3434 | /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with |
3435 | the number of scalar stmts in the root in a few places. |
3436 | Verify that assumption holds. */ |
3437 | gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance)) |
3438 | .length () == group_size); |
3439 | |
3440 | if (dump_enabled_p ()) |
3441 | { |
3442 | dump_printf_loc (MSG_NOTE, vect_location, |
3443 | "Final SLP tree for instance %p:\n" , |
3444 | (void *) new_instance); |
3445 | vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location, |
3446 | SLP_INSTANCE_TREE (new_instance)); |
3447 | } |
3448 | |
3449 | return true; |
3450 | } |
3451 | } |
3452 | else |
3453 | { |
3454 | /* Failed to SLP. */ |
3455 | /* Free the allocated memory. */ |
3456 | scalar_stmts.release (); |
3457 | } |
3458 | |
3459 | stmt_vec_info stmt_info = stmt_info_; |
3460 | /* Try to break the group up into pieces. */ |
3461 | if (kind == slp_inst_kind_store) |
3462 | { |
3463 | /* ??? We could delay all the actual splitting of store-groups |
3464 | until after SLP discovery of the original group completed. |
3465 | Then we can recurse to vect_build_slp_instance directly. */ |
3466 | for (i = 0; i < group_size; i++) |
3467 | if (!matches[i]) |
3468 | break; |
3469 | |
3470 | /* For basic block SLP, try to break the group up into multiples of |
3471 | a vector size. */ |
3472 | if (is_a <bb_vec_info> (p: vinfo) |
3473 | && (i > 1 && i < group_size)) |
3474 | { |
3475 | tree scalar_type |
3476 | = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); |
3477 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, |
3478 | 1 << floor_log2 (x: i)); |
3479 | unsigned HOST_WIDE_INT const_nunits; |
3480 | if (vectype |
3481 | && TYPE_VECTOR_SUBPARTS (node: vectype).is_constant (const_value: &const_nunits)) |
3482 | { |
3483 | /* Split into two groups at the first vector boundary. */ |
3484 | gcc_assert ((const_nunits & (const_nunits - 1)) == 0); |
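	      /* E.g. a first mismatch at I == 5 with 4-element vectors
		 gives GROUP1_SIZE == (5 & ~3) == 4. */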
3485 | unsigned group1_size = i & ~(const_nunits - 1); |
3486 | |
3487 | if (dump_enabled_p ()) |
3488 | dump_printf_loc (MSG_NOTE, vect_location, |
3489 | "Splitting SLP group at stmt %u\n" , i); |
3490 | stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info, |
3491 | group1_size); |
3492 | bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info, |
3493 | kind, max_tree_size, |
3494 | limit); |
3495 | /* Split the rest at the failure point and possibly |
3496 | re-analyze the remaining matching part if it has |
3497 | at least two lanes. */ |
3498 | if (group1_size < i |
3499 | && (i + 1 < group_size |
3500 | || i - group1_size > 1)) |
3501 | { |
3502 | stmt_vec_info rest2 = rest; |
3503 | rest = vect_split_slp_store_group (first_vinfo: rest, group1_size: i - group1_size); |
3504 | if (i - group1_size > 1) |
3505 | res |= vect_analyze_slp_instance (vinfo, bst_map, stmt_info: rest2, |
3506 | kind, max_tree_size, |
3507 | limit); |
3508 | } |
3509 | /* Re-analyze the non-matching tail if it has at least |
3510 | two lanes. */ |
3511 | if (i + 1 < group_size) |
3512 | res |= vect_analyze_slp_instance (vinfo, bst_map, |
3513 | stmt_info: rest, kind, max_tree_size, |
3514 | limit); |
3515 | return res; |
3516 | } |
3517 | } |
3518 | |
    /* For loop vectorization, split into arbitrary pieces of size > 1. */
3520 | if (is_a <loop_vec_info> (p: vinfo) |
3521 | && (i > 1 && i < group_size) |
3522 | && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, new_group_size: i)) |
3523 | { |
3524 | unsigned group1_size = i; |
3525 | |
3526 | if (dump_enabled_p ()) |
3527 | dump_printf_loc (MSG_NOTE, vect_location, |
3528 | "Splitting SLP group at stmt %u\n" , i); |
3529 | |
3530 | stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info, |
3531 | group1_size); |
3532 | /* Loop vectorization cannot handle gaps in stores, make sure |
3533 | the split group appears as strided. */ |
3534 | STMT_VINFO_STRIDED_P (rest) = 1; |
3535 | DR_GROUP_GAP (rest) = 0; |
3536 | STMT_VINFO_STRIDED_P (stmt_info) = 1; |
3537 | DR_GROUP_GAP (stmt_info) = 0; |
3538 | |
3539 | bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info, |
3540 | kind, max_tree_size, limit); |
3541 | if (i + 1 < group_size) |
3542 | res |= vect_analyze_slp_instance (vinfo, bst_map, |
3543 | stmt_info: rest, kind, max_tree_size, limit); |
3544 | |
3545 | return res; |
3546 | } |
3547 | |
3548 | /* Even though the first vector did not all match, we might be able to SLP |
3549 | (some) of the remainder. FORNOW ignore this possibility. */ |
3550 | } |
3551 | |
3552 | /* Failed to SLP. */ |
3553 | if (dump_enabled_p ()) |
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3555 | return false; |
3556 | } |
3557 | |
3558 | |
3559 | /* Analyze an SLP instance starting from a group of grouped stores. Call |
3560 | vect_build_slp_tree to build a tree of packed stmts if possible. |
3561 | Return FALSE if it's impossible to SLP any stmt in the loop. */ |
3562 | |
3563 | static bool |
3564 | vect_analyze_slp_instance (vec_info *vinfo, |
3565 | scalar_stmts_to_slp_tree_map_t *bst_map, |
3566 | stmt_vec_info stmt_info, |
3567 | slp_instance_kind kind, |
3568 | unsigned max_tree_size, unsigned *limit) |
3569 | { |
3570 | unsigned int i; |
3571 | vec<stmt_vec_info> scalar_stmts; |
3572 | |
3573 | if (is_a <bb_vec_info> (p: vinfo)) |
3574 | vect_location = stmt_info->stmt; |
3575 | |
3576 | stmt_vec_info next_info = stmt_info; |
3577 | if (kind == slp_inst_kind_store) |
3578 | { |
3579 | /* Collect the stores and store them in scalar_stmts. */ |
3580 | scalar_stmts.create (DR_GROUP_SIZE (stmt_info)); |
3581 | while (next_info) |
3582 | { |
3583 | scalar_stmts.quick_push (obj: vect_stmt_to_vectorize (stmt_info: next_info)); |
3584 | next_info = DR_GROUP_NEXT_ELEMENT (next_info); |
3585 | } |
3586 | } |
3587 | else if (kind == slp_inst_kind_reduc_chain) |
3588 | { |
3589 | /* Collect the reduction stmts and store them in scalar_stmts. */ |
3590 | scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info)); |
3591 | while (next_info) |
3592 | { |
3593 | scalar_stmts.quick_push (obj: vect_stmt_to_vectorize (stmt_info: next_info)); |
3594 | next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); |
3595 | } |
3596 | /* Mark the first element of the reduction chain as reduction to properly |
3597 | transform the node. In the reduction analysis phase only the last |
3598 | element of the chain is marked as reduction. */ |
3599 | STMT_VINFO_DEF_TYPE (stmt_info) |
3600 | = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); |
3601 | STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) |
3602 | = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); |
3603 | } |
3604 | else if (kind == slp_inst_kind_reduc_group) |
3605 | { |
3606 | /* Collect reduction statements. */ |
3607 | const vec<stmt_vec_info> &reductions |
3608 | = as_a <loop_vec_info> (p: vinfo)->reductions; |
3609 | scalar_stmts.create (nelems: reductions.length ()); |
3610 | for (i = 0; reductions.iterate (ix: i, ptr: &next_info); i++) |
3611 | if ((STMT_VINFO_RELEVANT_P (next_info) |
3612 | || STMT_VINFO_LIVE_P (next_info)) |
3613 | /* ??? Make sure we didn't skip a conversion around a reduction |
3614 | path. In that case we'd have to reverse engineer that conversion |
3615 | stmt following the chain using reduc_idx and from the PHI |
3616 | using reduc_def. */ |
3617 | && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def) |
3618 | scalar_stmts.quick_push (obj: next_info); |
3619 | /* If less than two were relevant/live there's nothing to SLP. */ |
3620 | if (scalar_stmts.length () < 2) |
3621 | return false; |
3622 | } |
3623 | else |
3624 | gcc_unreachable (); |
3625 | |
3626 | vec<stmt_vec_info> roots = vNULL; |
3627 | vec<tree> remain = vNULL; |
3628 | /* Build the tree for the SLP instance. */ |
3629 | bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts, |
3630 | root_stmt_infos&: roots, remain, |
3631 | max_tree_size, limit, bst_map, |
3632 | stmt_info_: kind == slp_inst_kind_store |
3633 | ? stmt_info : NULL); |
3634 | |
3635 | /* ??? If this is slp_inst_kind_store and the above succeeded here's |
3636 | where we should do store group splitting. */ |
3637 | |
3638 | return res; |
3639 | } |
3640 | |
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible. */
3643 | |
3644 | opt_result |
3645 | vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) |
3646 | { |
3647 | unsigned int i; |
3648 | stmt_vec_info first_element; |
3649 | slp_instance instance; |
3650 | |
  DUMP_VECT_SCOPE ("vect_analyze_slp");
3652 | |
3653 | unsigned limit = max_tree_size; |
3654 | |
3655 | scalar_stmts_to_slp_tree_map_t *bst_map |
3656 | = new scalar_stmts_to_slp_tree_map_t (); |
3657 | |
3658 | /* Find SLP sequences starting from groups of grouped stores. */ |
3659 | FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) |
3660 | vect_analyze_slp_instance (vinfo, bst_map, stmt_info: first_element, |
3661 | kind: slp_inst_kind_store, max_tree_size, limit: &limit); |
3662 | |
3663 | if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo)) |
3664 | { |
3665 | for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i) |
3666 | { |
3667 | vect_location = bb_vinfo->roots[i].roots[0]->stmt; |
3668 | /* Apply patterns. */ |
3669 | for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j) |
3670 | bb_vinfo->roots[i].stmts[j] |
3671 | = vect_stmt_to_vectorize (stmt_info: bb_vinfo->roots[i].stmts[j]); |
3672 | if (vect_build_slp_instance (vinfo: bb_vinfo, kind: bb_vinfo->roots[i].kind, |
3673 | scalar_stmts&: bb_vinfo->roots[i].stmts, |
3674 | root_stmt_infos&: bb_vinfo->roots[i].roots, |
3675 | remain&: bb_vinfo->roots[i].remain, |
3676 | max_tree_size, limit: &limit, bst_map, NULL)) |
3677 | { |
3678 | bb_vinfo->roots[i].stmts = vNULL; |
3679 | bb_vinfo->roots[i].roots = vNULL; |
3680 | bb_vinfo->roots[i].remain = vNULL; |
3681 | } |
3682 | } |
3683 | } |
3684 | |
3685 | if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo)) |
3686 | { |
3687 | /* Find SLP sequences starting from reduction chains. */ |
3688 | FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element) |
3689 | if (! STMT_VINFO_RELEVANT_P (first_element) |
3690 | && ! STMT_VINFO_LIVE_P (first_element)) |
3691 | ; |
3692 | else if (! vect_analyze_slp_instance (vinfo, bst_map, stmt_info: first_element, |
3693 | kind: slp_inst_kind_reduc_chain, |
3694 | max_tree_size, limit: &limit)) |
3695 | { |
3696 | /* Dissolve reduction chain group. */ |
3697 | stmt_vec_info vinfo = first_element; |
3698 | stmt_vec_info last = NULL; |
3699 | while (vinfo) |
3700 | { |
3701 | stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo); |
3702 | REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL; |
3703 | REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL; |
3704 | last = vinfo; |
3705 | vinfo = next; |
3706 | } |
3707 | STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def; |
3708 | /* It can be still vectorized as part of an SLP reduction. */ |
3709 | loop_vinfo->reductions.safe_push (obj: last); |
3710 | } |
3711 | |
3712 | /* Find SLP sequences starting from groups of reductions. */ |
3713 | if (loop_vinfo->reductions.length () > 1) |
3714 | vect_analyze_slp_instance (vinfo, bst_map, stmt_info: loop_vinfo->reductions[0], |
3715 | kind: slp_inst_kind_reduc_group, max_tree_size, |
3716 | limit: &limit); |
3717 | } |
3718 | |
3719 | hash_set<slp_tree> visited_patterns; |
3720 | slp_tree_to_load_perm_map_t perm_cache; |
3721 | slp_compat_nodes_map_t compat_cache; |
3722 | |
3723 | /* See if any patterns can be found in the SLP tree. */ |
3724 | bool pattern_found = false; |
3725 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) |
3726 | pattern_found |= vect_match_slp_patterns (instance, vinfo, |
3727 | visited: &visited_patterns, perm_cache: &perm_cache, |
3728 | compat_cache: &compat_cache); |
3729 | |
3730 | /* If any were found optimize permutations of loads. */ |
3731 | if (pattern_found) |
3732 | { |
3733 | hash_map<slp_tree, slp_tree> load_map; |
3734 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) |
3735 | { |
3736 | slp_tree root = SLP_INSTANCE_TREE (instance); |
3737 | optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root), |
3738 | load_map: &load_map, root); |
3739 | } |
3740 | } |
3741 | |
3742 | |
3743 | |
  /* The map keeps a reference on SLP nodes built; release that. */
3745 | for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); |
3746 | it != bst_map->end (); ++it) |
3747 | if ((*it).second) |
3748 | vect_free_slp_tree (node: (*it).second); |
3749 | delete bst_map; |
3750 | |
3751 | if (pattern_found && dump_enabled_p ()) |
3752 | { |
3753 | dump_printf_loc (MSG_NOTE, vect_location, |
3754 | "Pattern matched SLP tree\n" ); |
3755 | hash_set<slp_tree> visited; |
3756 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) |
3757 | vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location, |
3758 | SLP_INSTANCE_TREE (instance), visited); |
3759 | } |
3760 | |
3761 | return opt_result::success (); |
3762 | } |
3763 | |
3764 | /* Estimates the cost of inserting layout changes into the SLP graph. |
3765 | It can also say that the insertion is impossible. */ |
3766 | |
3767 | struct slpg_layout_cost |
3768 | { |
3769 | slpg_layout_cost () = default; |
3770 | slpg_layout_cost (sreal, bool); |
3771 | |
3772 | static slpg_layout_cost impossible () { return { sreal::max (), 0 }; } |
3773 | bool is_possible () const { return depth != sreal::max (); } |
3774 | |
3775 | bool operator== (const slpg_layout_cost &) const; |
3776 | bool operator!= (const slpg_layout_cost &) const; |
3777 | |
3778 | bool is_better_than (const slpg_layout_cost &, bool) const; |
3779 | |
3780 | void add_parallel_cost (const slpg_layout_cost &); |
3781 | void add_serial_cost (const slpg_layout_cost &); |
3782 | void split (unsigned int); |
3783 | |
3784 | /* The longest sequence of layout changes needed during any traversal |
3785 | of the partition dag, weighted by execution frequency. |
3786 | |
3787 | This is the most important metric when optimizing for speed, since |
3788 | it helps to ensure that we keep the number of operations on |
3789 | critical paths to a minimum. */ |
3790 | sreal depth = 0; |
3791 | |
3792 | /* An estimate of the total number of operations needed. It is weighted by |
3793 | execution frequency when optimizing for speed but not when optimizing for |
3794 | size. In order to avoid double-counting, a node with a fanout of N will |
3795 | distribute 1/N of its total cost to each successor. |
3796 | |
3797 | This is the most important metric when optimizing for size, since |
     it helps to keep the total number of operations to a minimum. */
3799 | sreal total = 0; |
3800 | }; |
3801 | |
3802 | /* Construct costs for a node with weight WEIGHT. A higher weight |
3803 | indicates more frequent execution. IS_FOR_SIZE is true if we are |
3804 | optimizing for size rather than speed. */ |
3805 | |
3806 | slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size) |
3807 | : depth (weight), total (is_for_size && weight > 0 ? 1 : weight) |
3808 | { |
3809 | } |
3810 | |
3811 | bool |
3812 | slpg_layout_cost::operator== (const slpg_layout_cost &other) const |
3813 | { |
3814 | return depth == other.depth && total == other.total; |
3815 | } |
3816 | |
3817 | bool |
3818 | slpg_layout_cost::operator!= (const slpg_layout_cost &other) const |
3819 | { |
3820 | return !operator== (other); |
3821 | } |
3822 | |
3823 | /* Return true if these costs are better than OTHER. IS_FOR_SIZE is |
3824 | true if we are optimizing for size rather than speed. */ |
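/* For example, with illustrative numbers: when optimizing for speed,
   { depth 2, total 9 } is better than { depth 3, total 4 } since depth
   is compared first; when optimizing for size the preference flips
   because total is compared first. */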
3825 | |
3826 | bool |
3827 | slpg_layout_cost::is_better_than (const slpg_layout_cost &other, |
3828 | bool is_for_size) const |
3829 | { |
3830 | if (is_for_size) |
3831 | { |
3832 | if (total != other.total) |
3833 | return total < other.total; |
3834 | return depth < other.depth; |
3835 | } |
3836 | else |
3837 | { |
3838 | if (depth != other.depth) |
3839 | return depth < other.depth; |
3840 | return total < other.total; |
3841 | } |
3842 | } |
3843 | |
3844 | /* Increase the costs to account for something with cost INPUT_COST |
3845 | happening in parallel with the current costs. */ |
3846 | |
3847 | void |
3848 | slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost) |
3849 | { |
3850 | depth = std::max (a: depth, b: input_cost.depth); |
3851 | total += input_cost.total; |
3852 | } |
3853 | |
3854 | /* Increase the costs to account for something with cost INPUT_COST |
3855 | happening in series with the current costs. */ |
3856 | |
3857 | void |
3858 | slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other) |
3859 | { |
3860 | depth += other.depth; |
3861 | total += other.total; |
3862 | } |
3863 | |
3864 | /* Split the total cost among TIMES successors or predecessors. */ |
3865 | |
3866 | void |
3867 | slpg_layout_cost::split (unsigned int times) |
3868 | { |
3869 | if (times > 1) |
3870 | total /= times; |
3871 | } |
3872 | |
3873 | /* Information about one node in the SLP graph, for use during |
3874 | vect_optimize_slp_pass. */ |
3875 | |
3876 | struct slpg_vertex |
3877 | { |
3878 | slpg_vertex (slp_tree node_) : node (node_) {} |
3879 | |
3880 | /* The node itself. */ |
3881 | slp_tree node; |
3882 | |
3883 | /* Which partition the node belongs to, or -1 if none. Nodes outside of |
3884 | partitions are flexible; they can have whichever layout consumers |
3885 | want them to have. */ |
3886 | int partition = -1; |
3887 | |
3888 | /* The number of nodes that directly use the result of this one |
3889 | (i.e. the number of nodes that count this one as a child). */ |
3890 | unsigned int out_degree = 0; |
3891 | |
3892 | /* The execution frequency of the node. */ |
3893 | sreal weight = 0; |
3894 | |
3895 | /* The total execution frequency of all nodes that directly use the |
3896 | result of this one. */ |
3897 | sreal out_weight = 0; |
3898 | }; |
3899 | |
3900 | /* Information about one partition of the SLP graph, for use during |
3901 | vect_optimize_slp_pass. */ |
3902 | |
3903 | struct slpg_partition_info |
3904 | { |
3905 | /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END) |
3906 | of m_partitioned_nodes. */ |
3907 | unsigned int node_begin = 0; |
3908 | unsigned int node_end = 0; |
3909 | |
3910 | /* Which layout we've chosen to use for this partition, or -1 if |
3911 | we haven't picked one yet. */ |
3912 | int layout = -1; |
3913 | |
3914 | /* The number of predecessors and successors in the partition dag. |
3915 | The predecessors always have lower partition numbers and the |
3916 | successors always have higher partition numbers. |
3917 | |
3918 | Note that the directions of these edges are not necessarily the |
3919 | same as in the data flow graph. For example, if an SCC has separate |
3920 | partitions for an inner loop and an outer loop, the inner loop's |
3921 | partition will have at least two incoming edges from the outer loop's |
3922 | partition: one for a live-in value and one for a live-out value. |
3923 | In data flow terms, one of these edges would also be from the outer loop |
3924 | to the inner loop, but the other would be in the opposite direction. */ |
3925 | unsigned int in_degree = 0; |
3926 | unsigned int out_degree = 0; |
3927 | }; |
3928 | |
3929 | /* Information about the costs of using a particular layout for a |
3930 | particular partition. It can also say that the combination is |
3931 | impossible. */ |
3932 | |
3933 | struct slpg_partition_layout_costs |
3934 | { |
3935 | bool is_possible () const { return internal_cost.is_possible (); } |
3936 | void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); } |
3937 | |
3938 | /* The costs inherited from predecessor partitions. */ |
3939 | slpg_layout_cost in_cost; |
3940 | |
3941 | /* The inherent cost of the layout within the node itself. For example, |
3942 | this is nonzero for a load if choosing a particular layout would require |
3943 | the load to permute the loaded elements. It is nonzero for a |
3944 | VEC_PERM_EXPR if the permutation cannot be eliminated or converted |
3945 | to full-vector moves. */ |
3946 | slpg_layout_cost internal_cost; |
3947 | |
3948 | /* The costs inherited from successor partitions. */ |
3949 | slpg_layout_cost out_cost; |
3950 | }; |
3951 | |
3952 | /* This class tries to optimize the layout of vectors in order to avoid |
3953 | unnecessary shuffling. At the moment, the set of possible layouts are |
3954 | restricted to bijective permutations. |
3955 | |
3956 | The goal of the pass depends on whether we're optimizing for size or |
3957 | for speed. When optimizing for size, the goal is to reduce the overall |
3958 | number of layout changes (including layout changes implied by things |
3959 | like load permutations). When optimizing for speed, the goal is to |
3960 | reduce the maximum latency attributable to layout changes on any |
3961 | non-cyclical path through the data flow graph. |
3962 | |
3963 | For example, when optimizing a loop nest for speed, we will prefer |
3964 | to make layout changes outside of a loop rather than inside of a loop, |
3965 | and will prefer to make layout changes in parallel rather than serially, |
3966 | even if that increases the overall number of layout changes. |
3967 | |
3968 | The high-level procedure is: |
3969 | |
3970 | (1) Build a graph in which edges go from uses (parents) to definitions |
3971 | (children). |
3972 | |
3973 | (2) Divide the graph into a dag of strongly-connected components (SCCs). |
3974 | |
3975 | (3) When optimizing for speed, partition the nodes in each SCC based |
3976 | on their containing cfg loop. When optimizing for size, treat |
3977 | each SCC as a single partition. |
3978 | |
3979 | This gives us a dag of partitions. The goal is now to assign a |
3980 | layout to each partition. |
3981 | |
3982 | (4) Construct a set of vector layouts that are worth considering. |
3983 | Record which nodes must keep their current layout. |
3984 | |
3985 | (5) Perform a forward walk over the partition dag (from loads to stores) |
3986 | accumulating the "forward" cost of using each layout. When visiting |
3987 | each partition, assign a tentative choice of layout to the partition |
3988 | and use that choice when calculating the cost of using a different |
3989 | layout in successor partitions. |
3990 | |
3991 | (6) Perform a backward walk over the partition dag (from stores to loads), |
3992 | accumulating the "backward" cost of using each layout. When visiting |
3993 | each partition, make a final choice of layout for that partition based |
3994 | on the accumulated forward costs (from (5)) and backward costs |
3995 | (from (6)). |
3996 | |
3997 | (7) Apply the chosen layouts to the SLP graph. |
3998 | |
3999 | For example, consider the SLP statements: |
4000 | |
4001 | S1: a_1 = load |
4002 | loop: |
4003 | S2: a_2 = PHI<a_1, a_3> |
4004 | S3: b_1 = load |
4005 | S4: a_3 = a_2 + b_1 |
4006 | exit: |
4007 | S5: a_4 = PHI<a_3> |
4008 | S6: store a_4 |
4009 | |
4010 | S2 and S4 form an SCC and are part of the same loop. Every other |
4011 | statement is in a singleton SCC. In this example there is a one-to-one |
   mapping between SCCs and partitions, and the partition dag looks like this:
4013 | |
4014 | S1 S3 |
4015 | \ / |
4016 | S2+S4 |
4017 | | |
4018 | S5 |
4019 | | |
4020 | S6 |
4021 | |
4022 | S2, S3 and S4 will have a higher execution frequency than the other |
4023 | statements, so when optimizing for speed, the goal is to avoid any |
4024 | layout changes: |
4025 | |
4026 | - within S3 |
4027 | - within S2+S4 |
4028 | - on the S3->S2+S4 edge |
4029 | |
4030 | For example, if S3 was originally a reversing load, the goal of the |
4031 | pass is to make it an unreversed load and change the layout on the |
4032 | S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout |
4033 | on S1->S2+S4 and S5->S6 would also be acceptable.) |
4034 | |
4035 | The difference between SCCs and partitions becomes important if we |
4036 | add an outer loop: |
4037 | |
4038 | S1: a_1 = ... |
4039 | loop1: |
4040 | S2: a_2 = PHI<a_1, a_6> |
4041 | S3: b_1 = load |
4042 | S4: a_3 = a_2 + b_1 |
4043 | loop2: |
4044 | S5: a_4 = PHI<a_3, a_5> |
4045 | S6: c_1 = load |
4046 | S7: a_5 = a_4 + c_1 |
4047 | exit2: |
4048 | S8: a_6 = PHI<a_5> |
4049 | S9: store a_6 |
4050 | exit1: |
4051 | |
4052 | Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing |
4053 | for speed, we usually do not want restrictions in the outer loop to "infect" |
4054 | the decision for the inner loop. For example, if an outer-loop node |
4055 | in the SCC contains a statement with a fixed layout, that should not |
4056 | prevent the inner loop from using a different layout. Conversely, |
4057 | the inner loop should not dictate a layout to the outer loop: if the |
4058 | outer loop does a lot of computation, then it may not be efficient to |
4059 | do all of that computation in the inner loop's preferred layout. |
4060 | |
4061 | So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer) |
4062 | and S5+S7 (inner). We also try to arrange partitions so that: |
4063 | |
4064 | - the partition for an outer loop comes before the partition for |
4065 | an inner loop |
4066 | |
4067 | - if a sibling loop A dominates a sibling loop B, A's partition |
4068 | comes before B's |
4069 | |
4070 | This gives the following partition dag for the example above: |
4071 | |
4072 | S1 S3 |
4073 | \ / |
4074 | S2+S4+S8 S6 |
4075 | | \\ / |
4076 | | S5+S7 |
4077 | | |
4078 | S9 |
4079 | |
4080 | There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and |
4081 | one for a reversal of the edge S7->S8. |
4082 | |
4083 | The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice |
4084 | for S2+S4+S8 therefore has to balance the cost of using the outer loop's |
4085 | preferred layout against the cost of changing the layout on entry to the |
4086 | inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed). |
4087 | |
4088 | Although this works well when optimizing for speed, it has the downside |
4089 | when optimizing for size that the choice of layout for S5+S7 is completely |
4090 | independent of S9, which lessens the chance of reducing the overall number |
4091 | of permutations. We therefore do not partition SCCs when optimizing |
4092 | for size. |
4093 | |
4094 | To give a concrete example of the difference between optimizing |
4095 | for size and speed, consider: |
4096 | |
4097 | a[0] = (b[1] << c[3]) - d[1]; |
4098 | a[1] = (b[0] << c[2]) - d[0]; |
4099 | a[2] = (b[3] << c[1]) - d[3]; |
4100 | a[3] = (b[2] << c[0]) - d[2]; |
4101 | |
4102 | There are three different layouts here: one for a, one for b and d, |
4103 | and one for c. When optimizing for speed it is better to permute each |
4104 | of b, c and d into the order required by a, since those permutations |
4105 | happen in parallel. But when optimizing for size, it is better to: |
4106 | |
4107 | - permute c into the same order as b |
4108 | - do the arithmetic |
4109 | - permute the result into the order required by a |
4110 | |
4111 | This gives 2 permutations rather than 3. */ |
4112 | |
4113 | class vect_optimize_slp_pass |
4114 | { |
4115 | public: |
4116 | vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {} |
4117 | void run (); |
4118 | |
4119 | private: |
4120 | /* Graph building. */ |
4121 | struct loop *containing_loop (slp_tree); |
4122 | bool is_cfg_latch_edge (graph_edge *); |
4123 | void build_vertices (hash_set<slp_tree> &, slp_tree); |
4124 | void build_vertices (); |
4125 | void build_graph (); |
4126 | |
4127 | /* Partitioning. */ |
4128 | void create_partitions (); |
4129 | template<typename T> void for_each_partition_edge (unsigned int, T); |
4130 | |
4131 | /* Layout selection. */ |
4132 | bool is_compatible_layout (slp_tree, unsigned int); |
4133 | int change_layout_cost (slp_tree, unsigned int, unsigned int); |
4134 | slpg_partition_layout_costs &partition_layout_costs (unsigned int, |
4135 | unsigned int); |
4136 | void change_vec_perm_layout (slp_tree, lane_permutation_t &, |
4137 | int, unsigned int); |
4138 | int internal_node_cost (slp_tree, int, unsigned int); |
4139 | void start_choosing_layouts (); |
4140 | |
4141 | /* Cost propagation. */ |
4142 | slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int, |
4143 | unsigned int, unsigned int); |
4144 | slpg_layout_cost total_in_cost (unsigned int); |
4145 | slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int); |
4146 | slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int); |
4147 | void forward_pass (); |
4148 | void backward_pass (); |
4149 | |
4150 | /* Rematerialization. */ |
4151 | slp_tree get_result_with_layout (slp_tree, unsigned int); |
4152 | void materialize (); |
4153 | |
4154 | /* Clean-up. */ |
4155 | void remove_redundant_permutations (); |
4156 | |
4157 | void dump (); |
4158 | |
4159 | vec_info *m_vinfo; |
4160 | |
4161 | /* True if we should optimize the graph for size, false if we should |
4162 | optimize it for speed. (It wouldn't be easy to make this decision |
4163 | more locally.) */ |
4164 | bool m_optimize_size; |
4165 | |
4166 | /* A graph of all SLP nodes, with edges leading from uses to definitions. |
4167 | In other words, a node's predecessors are its slp_tree parents and |
4168 | a node's successors are its slp_tree children. */ |
4169 | graph *m_slpg = nullptr; |
4170 | |
4171 | /* The vertices of M_SLPG, indexed by slp_tree::vertex. */ |
4172 | auto_vec<slpg_vertex> m_vertices; |
4173 | |
  /* The list of all leaves of M_SLPG, such as external definitions, constants,
4175 | and loads. */ |
4176 | auto_vec<int> m_leafs; |
4177 | |
4178 | /* This array has one entry for every vector layout that we're considering. |
4179 | Element 0 is null and indicates "no change". Other entries describe |
4180 | permutations that are inherent in the current graph and that we would |
4181 | like to reverse if possible. |
4182 | |
4183 | For example, a permutation { 1, 2, 3, 0 } means that something has |
4184 | effectively been permuted in that way, such as a load group |
4185 | { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]). |
4186 | We'd then like to apply the reverse permutation { 3, 0, 1, 2 } |
4187 | in order to put things "back" in order. */ |
4188 | auto_vec<vec<unsigned> > m_perms; |
4189 | |
4190 | /* A partitioning of the nodes for which a layout must be chosen. |
4191 | Each partition represents an <SCC, cfg loop> pair; that is, |
4192 | nodes in different SCCs belong to different partitions, and nodes |
4193 | within an SCC can be further partitioned according to a containing |
4194 | cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if: |
4195 | |
4196 | - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk |
4197 | from leaves (such as loads) to roots (such as stores). |
4198 | |
4199 | - SCC1 == SCC2 and L1's header strictly dominates L2's header. */ |
4200 | auto_vec<slpg_partition_info> m_partitions; |
4201 | |
4202 | /* The list of all nodes for which a layout must be chosen. Nodes for |
4203 | partition P come before the nodes for partition P+1. Nodes within a |
4204 | partition are in reverse postorder. */ |
4205 | auto_vec<unsigned int> m_partitioned_nodes; |
4206 | |
4207 | /* Index P * num-layouts + L contains the cost of using layout L |
4208 | for partition P. */ |
4209 | auto_vec<slpg_partition_layout_costs> m_partition_layout_costs; |
4210 | |
4211 | /* Index N * num-layouts + L, if nonnull, is a node that provides the |
4212 | original output of node N adjusted to have layout L. */ |
4213 | auto_vec<slp_tree> m_node_layouts; |
4214 | }; |
4215 | |
4216 | /* Fill the vertices and leafs vector with all nodes in the SLP graph. |
4217 | Also record whether we should optimize anything for speed rather |
4218 | than size. */ |
4219 | |
4220 | void |
4221 | vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited, |
4222 | slp_tree node) |
4223 | { |
4224 | unsigned i; |
4225 | slp_tree child; |
4226 | |
4227 | if (visited.add (k: node)) |
4228 | return; |
4229 | |
4230 | if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node)) |
4231 | { |
4232 | basic_block bb = gimple_bb (g: vect_orig_stmt (stmt_info: rep)->stmt); |
4233 | if (optimize_bb_for_speed_p (bb)) |
4234 | m_optimize_size = false; |
4235 | } |
4236 | |
4237 | node->vertex = m_vertices.length (); |
4238 | m_vertices.safe_push (obj: slpg_vertex (node)); |
4239 | |
4240 | bool leaf = true; |
4241 | bool force_leaf = false; |
4242 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
4243 | if (child) |
4244 | { |
4245 | leaf = false; |
4246 | build_vertices (visited, node: child); |
4247 | } |
4248 | else |
4249 | force_leaf = true; |
4250 | /* Since SLP discovery works along use-def edges all cycles have an |
4251 | entry - but there's the exception of cycles where we do not handle |
     the entry explicitly (but with a NULL SLP node), like some reductions
     and inductions.  Force those SLP PHIs to act as leaves to make them
4254 | backwards reachable. */ |
4255 | if (leaf || force_leaf) |
4256 | m_leafs.safe_push (obj: node->vertex); |
4257 | } |
4258 | |
4259 | /* Fill the vertices and leafs vector with all nodes in the SLP graph. */ |
4260 | |
4261 | void |
4262 | vect_optimize_slp_pass::build_vertices () |
4263 | { |
4264 | hash_set<slp_tree> visited; |
4265 | unsigned i; |
4266 | slp_instance instance; |
4267 | FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance) |
4268 | build_vertices (visited, SLP_INSTANCE_TREE (instance)); |
4269 | } |
4270 | |
/* Apply (reverse) bijective PERM to VEC. */
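/* For example, with PERM == { 1, 2, 3, 0 } and VEC == { a, b, c, d },
   the forward direction computes vec[i] = saved[perm[i]], yielding
   { b, c, d, a }, while the reverse direction computes
   vec[perm[i]] = saved[i], yielding { d, a, b, c }.  The asserts also
   check that PERM is indeed bijective. */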
4272 | |
4273 | template <class T> |
4274 | static void |
4275 | vect_slp_permute (vec<unsigned> perm, |
4276 | vec<T> &vec, bool reverse) |
4277 | { |
4278 | auto_vec<T, 64> saved; |
4279 | saved.create (vec.length ()); |
4280 | for (unsigned i = 0; i < vec.length (); ++i) |
4281 | saved.quick_push (vec[i]); |
4282 | |
4283 | if (reverse) |
4284 | { |
4285 | for (unsigned i = 0; i < vec.length (); ++i) |
4286 | vec[perm[i]] = saved[i]; |
4287 | for (unsigned i = 0; i < vec.length (); ++i) |
4288 | gcc_assert (vec[perm[i]] == saved[i]); |
4289 | } |
4290 | else |
4291 | { |
4292 | for (unsigned i = 0; i < vec.length (); ++i) |
4293 | vec[i] = saved[perm[i]]; |
4294 | for (unsigned i = 0; i < vec.length (); ++i) |
4295 | gcc_assert (vec[i] == saved[perm[i]]); |
4296 | } |
4297 | } |
4298 | |
4299 | /* Return the cfg loop that contains NODE. */ |
4300 | |
4301 | struct loop * |
4302 | vect_optimize_slp_pass::containing_loop (slp_tree node) |
4303 | { |
4304 | stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node); |
4305 | if (!rep) |
4306 | return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father; |
4307 | return gimple_bb (g: vect_orig_stmt (stmt_info: rep)->stmt)->loop_father; |
4308 | } |
4309 | |
4310 | /* Return true if UD (an edge from a use to a definition) is associated |
4311 | with a loop latch edge in the cfg. */ |
4312 | |
4313 | bool |
4314 | vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud) |
4315 | { |
4316 | slp_tree use = m_vertices[ud->src].node; |
4317 | slp_tree def = m_vertices[ud->dest].node; |
4318 | if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def |
4319 | || SLP_TREE_CODE (use) == VEC_PERM_EXPR) |
4320 | || SLP_TREE_DEF_TYPE (def) != vect_internal_def) |
4321 | return false; |
4322 | |
4323 | stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use)); |
4324 | return (is_a<gphi *> (p: use_rep->stmt) |
4325 | && bb_loop_header_p (gimple_bb (g: use_rep->stmt)) |
4326 | && containing_loop (node: def) == containing_loop (node: use)); |
4327 | } |
4328 | |
4329 | /* Build the graph. Mark edges that correspond to cfg loop latch edges with |
4330 | a nonnull data field. */ |
4331 | |
4332 | void |
4333 | vect_optimize_slp_pass::build_graph () |
4334 | { |
4335 | m_optimize_size = true; |
4336 | build_vertices (); |
4337 | |
4338 | m_slpg = new_graph (m_vertices.length ()); |
4339 | for (slpg_vertex &v : m_vertices) |
4340 | for (slp_tree child : SLP_TREE_CHILDREN (v.node)) |
4341 | if (child) |
4342 | { |
4343 | graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex); |
4344 | if (is_cfg_latch_edge (ud)) |
4345 | ud->data = this; |
4346 | } |
4347 | } |
4348 | |
4349 | /* Return true if E corresponds to a loop latch edge in the cfg. */ |
4350 | |
4351 | static bool |
4352 | skip_cfg_latch_edges (graph_edge *e) |
4353 | { |
4354 | return e->data; |
4355 | } |
4356 | |
4357 | /* Create the node partitions. */ |
4358 | |
4359 | void |
4360 | vect_optimize_slp_pass::create_partitions () |
4361 | { |
4362 | /* Calculate a postorder of the graph, ignoring edges that correspond |
4363 | to natural latch edges in the cfg. Reading the vector from the end |
4364 | to the beginning gives the reverse postorder. */ |
4365 | auto_vec<int> initial_rpo; |
4366 | graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo, |
4367 | false, NULL, skip_cfg_latch_edges); |
4368 | gcc_assert (initial_rpo.length () == m_vertices.length ()); |
4369 | |
4370 | /* Calculate the strongly connected components of the graph. */ |
4371 | auto_vec<int> scc_grouping; |
4372 | unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping); |
4373 | |
4374 | /* Create a new index order in which all nodes from the same SCC are |
4375 | consecutive. Use scc_pos to record the index of the first node in |
4376 | each SCC. */ |
4377 | auto_vec<unsigned int> scc_pos (num_sccs); |
4378 | int last_component = -1; |
4379 | unsigned int node_count = 0; |
4380 | for (unsigned int node_i : scc_grouping) |
4381 | { |
4382 | if (last_component != m_slpg->vertices[node_i].component) |
4383 | { |
4384 | last_component = m_slpg->vertices[node_i].component; |
4385 | gcc_assert (last_component == int (scc_pos.length ())); |
4386 | scc_pos.quick_push (obj: node_count); |
4387 | } |
4388 | node_count += 1; |
4389 | } |
4390 | gcc_assert (node_count == initial_rpo.length () |
4391 | && last_component + 1 == int (num_sccs)); |
4392 | |
4393 | /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes |
4394 | inside each SCC following the RPO we calculated above. The fact that |
4395 | we ignored natural latch edges when calculating the RPO should ensure |
4396 | that, for natural loop nests: |
4397 | |
4398 | - the first node that we encounter in a cfg loop is the loop header phi |
4399 | - the loop header phis are in dominance order |
4400 | |
4401 | Arranging for this is an optimization (see below) rather than a |
4402 | correctness issue. Unnatural loops with a tangled mess of backedges |
4403 | will still work correctly, but might give poorer results. |
4404 | |
4405 | Also update scc_pos so that it gives 1 + the index of the last node |
4406 | in the SCC. */ |
4407 | m_partitioned_nodes.safe_grow (len: node_count); |
4408 | for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;) |
4409 | { |
4410 | unsigned int node_i = initial_rpo[old_i]; |
4411 | unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++; |
4412 | m_partitioned_nodes[new_i] = node_i; |
4413 | } |
4414 | |
4415 | /* When optimizing for speed, partition each SCC based on the containing |
4416 | cfg loop. The order we constructed above should ensure that, for natural |
4417 | cfg loops, we'll create sub-SCC partitions for outer loops before |
4418 | the corresponding sub-SCC partitions for inner loops. Similarly, |
4419 | when one sibling loop A dominates another sibling loop B, we should |
4420 | create a sub-SCC partition for A before a sub-SCC partition for B. |
4421 | |
4422 | As above, nothing depends for correctness on whether this achieves |
4423 | a natural nesting, but we should get better results when it does. */ |
4424 | m_partitions.reserve (nelems: m_vertices.length ()); |
4425 | unsigned int next_partition_i = 0; |
4426 | hash_map<struct loop *, int> loop_partitions; |
4427 | unsigned int rpo_begin = 0; |
4428 | unsigned int num_partitioned_nodes = 0; |
4429 | for (unsigned int rpo_end : scc_pos) |
4430 | { |
4431 | loop_partitions.empty (); |
4432 | unsigned int partition_i = next_partition_i; |
4433 | for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i) |
4434 | { |
4435 | /* Handle externals and constants optimistically throughout. |
4436 | But treat existing vectors as fixed since we do not handle |
4437 | permuting them. */ |
4438 | unsigned int node_i = m_partitioned_nodes[rpo_i]; |
4439 | auto &vertex = m_vertices[node_i]; |
4440 | if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def |
4441 | && !SLP_TREE_VEC_DEFS (vertex.node).exists ()) |
4442 | || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def) |
4443 | vertex.partition = -1; |
4444 | else |
4445 | { |
4446 | bool existed; |
4447 | if (m_optimize_size) |
4448 | existed = next_partition_i > partition_i; |
4449 | else |
4450 | { |
4451 | struct loop *loop = containing_loop (node: vertex.node); |
4452 | auto &entry = loop_partitions.get_or_insert (k: loop, existed: &existed); |
4453 | if (!existed) |
4454 | entry = next_partition_i; |
4455 | partition_i = entry; |
4456 | } |
4457 | if (!existed) |
4458 | { |
4459 | m_partitions.quick_push (obj: slpg_partition_info ()); |
4460 | next_partition_i += 1; |
4461 | } |
4462 | vertex.partition = partition_i; |
4463 | num_partitioned_nodes += 1; |
4464 | m_partitions[partition_i].node_end += 1; |
4465 | } |
4466 | } |
4467 | rpo_begin = rpo_end; |
4468 | } |
4469 | |
4470 | /* Assign ranges of consecutive node indices to each partition, |
4471 | in partition order. Start with node_end being the same as |
4472 | node_begin so that the next loop can use it as a counter. */ |
4473 | unsigned int node_begin = 0; |
4474 | for (auto &partition : m_partitions) |
4475 | { |
4476 | partition.node_begin = node_begin; |
4477 | node_begin += partition.node_end; |
4478 | partition.node_end = partition.node_begin; |
4479 | } |
4480 | gcc_assert (node_begin == num_partitioned_nodes); |
4481 | |
4482 | /* Finally build the list of nodes in partition order. */ |
4483 | m_partitioned_nodes.truncate (size: num_partitioned_nodes); |
4484 | for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i) |
4485 | { |
4486 | int partition_i = m_vertices[node_i].partition; |
4487 | if (partition_i >= 0) |
4488 | { |
4489 | unsigned int order_i = m_partitions[partition_i].node_end++; |
4490 | m_partitioned_nodes[order_i] = node_i; |
4491 | } |
4492 | } |
4493 | } |
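
/* Editorial illustration (not from the original sources): suppose the nodes
   fall into two partitions, P0 with 2 nodes and P1 with 3.  After the
   counting done while assigning vertices, P0.node_end == 2 and
   P1.node_end == 3.  The prefix-sum loop above rewrites these to

     P0: node_begin == 0, node_end == 0
     P1: node_begin == 2, node_end == 2

   and the final loop bumps node_end as it places each node, ending with

     P0: node_begin == 0, node_end == 2
     P1: node_begin == 2, node_end == 5

   so that m_partitioned_nodes lists all of P0's nodes before P1's.  */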
4494 | |
4495 | /* Look for edges from earlier partitions into node NODE_I and edges from |
4496 | node NODE_I into later partitions. Call: |
4497 | |
4498 | FN (ud, other_node_i) |
4499 | |
4500 | for each such use-to-def edge ud, where other_node_i is the node at the |
4501 | other end of the edge. */ |
4502 | |
4503 | template<typename T> |
4504 | void |
4505 | vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn) |
4506 | { |
4507 | int partition_i = m_vertices[node_i].partition; |
4508 | for (graph_edge *pred = m_slpg->vertices[node_i].pred; |
4509 | pred; pred = pred->pred_next) |
4510 | { |
4511 | int src_partition_i = m_vertices[pred->src].partition; |
4512 | if (src_partition_i >= 0 && src_partition_i != partition_i) |
4513 | fn (pred, pred->src); |
4514 | } |
4515 | for (graph_edge *succ = m_slpg->vertices[node_i].succ; |
4516 | succ; succ = succ->succ_next) |
4517 | { |
4518 | int dest_partition_i = m_vertices[succ->dest].partition; |
4519 | if (dest_partition_i >= 0 && dest_partition_i != partition_i) |
4520 | fn (succ, succ->dest); |
4521 | } |
4522 | } |
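
/* Editorial usage sketch (hypothetical caller, not part of the pass):

     unsigned int n_cross = 0;
     for_each_partition_edge (node_i, [&](graph_edge *, unsigned int)
       { n_cross += 1; });

   counts NODE_I's cross-partition edges.  Edges whose other end lies in
   the same partition, or in no materialized partition (partition == -1),
   are never visited.  */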
4523 | |
4524 | /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes |
4525 | that NODE would operate on. This test is independent of NODE's actual |
4526 | operation. */ |
4527 | |
4528 | bool |
4529 | vect_optimize_slp_pass::is_compatible_layout (slp_tree node, |
4530 | unsigned int layout_i) |
4531 | { |
4532 | if (layout_i == 0) |
4533 | return true; |
4534 | |
4535 | if (SLP_TREE_LANES (node) != m_perms[layout_i].length ()) |
4536 | return false; |
4537 | |
4538 | return true; |
4539 | } |
4540 | |
4541 | /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I |
4542 | to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the |
4543 | layouts is incompatible with NODE or if the change is not possible for |
4544 | some other reason. |
4545 | |
4546 | The properties taken from NODE include the number of lanes and the |
4547 | vector type. The actual operation doesn't matter. */ |
4548 | |
4549 | int |
4550 | vect_optimize_slp_pass::change_layout_cost (slp_tree node, |
4551 | unsigned int from_layout_i, |
4552 | unsigned int to_layout_i) |
4553 | { |
4554 | if (!is_compatible_layout (node, from_layout_i) |
4555 | || !is_compatible_layout (node, to_layout_i)) |
4556 | return -1; |
4557 | |
4558 | if (from_layout_i == to_layout_i) |
4559 | return 0; |
4560 | |
4561 | auto_vec<slp_tree, 1> children (1); |
4562 | children.quick_push (node); |
4563 | auto_lane_permutation_t perm (SLP_TREE_LANES (node)); |
4564 | if (from_layout_i > 0) |
4565 | for (unsigned int i : m_perms[from_layout_i]) |
4566 | perm.quick_push ({ 0, i }); |
4567 | else |
4568 | for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i) |
4569 | perm.quick_push ({ 0, i }); |
4570 | if (to_layout_i > 0) |
4571 | vect_slp_permute (m_perms[to_layout_i], perm, true); |
4572 | auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm, |
4573 | children, false); |
4574 | if (count >= 0) |
4575 | return MAX (count, 1); |
4576 | |
4577 | /* ??? In principle we could try changing via layout 0, giving two |
4578 | layout changes rather than 1. Doing that would require |
4579 | corresponding support in get_result_with_layout. */ |
4580 | return -1; |
4581 | } |
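
/* Editorial illustration: assuming vect_slp_permute with REVERSE true
   applies the inverse of the given permutation, and taking the involutions
   m_perms[1] = { 1, 0, 3, 2 } and m_perms[2] = { 2, 3, 0, 1 } on a 4-lane
   node, going from layout 1 to layout 2 first builds

     perm = { {0,1}, {0,0}, {0,3}, {0,2} }

   and then reorders it into

     perm = { {0,3}, {0,2}, {0,1}, {0,0} }

   which is the single VEC_PERM_EXPR that would convert data held in
   layout 1 into layout 2.  */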
4582 | |
4583 | /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */ |
4584 | |
4585 | inline slpg_partition_layout_costs & |
4586 | vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i, |
4587 | unsigned int layout_i) |
4588 | { |
4589 | return m_partition_layout_costs[partition_i * m_perms.length () + layout_i]; |
4590 | } |
4591 | |
4592 | /* Change PERM in one of two ways: |
4593 | |
4594 | - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been |
4595 | chosen for child I of NODE. |
4596 | |
4597 | - if IN_LAYOUT_I >= 0, accept all input operands with that layout. |
4598 | |
4599 | In both cases, arrange for the output to have layout OUT_LAYOUT_I. */ |
4600 | |
4601 | void |
4602 | vect_optimize_slp_pass:: |
4603 | change_vec_perm_layout (slp_tree node, lane_permutation_t &perm, |
4604 | int in_layout_i, unsigned int out_layout_i) |
4605 | { |
4606 | for (auto &entry : perm) |
4607 | { |
4608 | int this_in_layout_i = in_layout_i; |
4609 | if (this_in_layout_i < 0) |
4610 | { |
4611 | slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first]; |
4612 | unsigned int in_partition_i = m_vertices[in_node->vertex].partition; |
4613 | this_in_layout_i = m_partitions[in_partition_i].layout; |
4614 | } |
4615 | if (this_in_layout_i > 0) |
4616 | entry.second = m_perms[this_in_layout_i][entry.second]; |
4617 | } |
4618 | if (out_layout_i > 0) |
4619 | vect_slp_permute (m_perms[out_layout_i], perm, true); |
4620 | } |
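
/* Editorial illustration: with the involution m_perms[1] = { 1, 0, 3, 2 }
   chosen for child 0's partition and IN_LAYOUT_I < 0, an entry { 0, 2 }
   that used to select lane 2 of child 0 is relabelled to { 0, 3 },
   matching where that data now sits; a nonzero OUT_LAYOUT_I then
   additionally reorders PERM as a whole.  */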
4621 | |
4622 | /* Check whether the target allows NODE to be rearranged so that the node's |
4623 | output has layout OUT_LAYOUT_I. Return the cost of the change if so, |
4624 | in the same arbitrary units as for change_layout_cost. Return -1 otherwise. |
4625 | |
4626 | If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether |
4627 | NODE can adapt to the layout changes that have (perhaps provisionally) |
4628 | been chosen for NODE's children, so that no extra permutations are |
4629 | needed on either the input or the output of NODE. |
4630 | |
4631 | If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume |
4632 | that all inputs will be forced into layout IN_LAYOUT_I beforehand. |
4633 | |
4634 | IN_LAYOUT_I has no meaning for other types of node. |
4635 | |
4636 | Keeping the node as-is is always valid. If the target doesn't appear |
4637 | to support the node as-is, but might realistically support other layouts, |
4638 | then layout 0 instead has the cost of a worst-case permutation. On the |
4639 | one hand, this ensures that every node has at least one valid layout, |
4640 | avoiding what would otherwise be an awkward special case. On the other, |
4641 | it still encourages the pass to change an invalid pre-existing layout |
4642 | choice into a valid one. */ |
4643 | |
4644 | int |
4645 | vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i, |
4646 | unsigned int out_layout_i) |
4647 | { |
4648 | const int fallback_cost = 1; |
4649 | |
4650 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
4651 | { |
4652 | auto_lane_permutation_t tmp_perm; |
4653 | tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node)); |
4654 | |
4655 | /* Check that the child nodes support the chosen layout. Checking |
4656 | the first child is enough, since any second child would have the |
4657 | same shape. */ |
4658 | auto first_child = SLP_TREE_CHILDREN (node)[0]; |
4659 | if (in_layout_i > 0 |
4660 | && !is_compatible_layout (first_child, in_layout_i)) |
4661 | return -1; |
4662 | |
4663 | change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i); |
4664 | int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, |
4665 | node, tmp_perm, |
4666 | SLP_TREE_CHILDREN (node), |
4667 | false); |
4668 | if (count < 0) |
4669 | { |
4670 | if (in_layout_i == 0 && out_layout_i == 0) |
4671 | { |
4672 | /* Use the fallback cost if the node could in principle support |
4673 | some nonzero layout for both the inputs and the outputs. |
4674 | Otherwise assume that the node will be rejected later |
4675 | and rebuilt from scalars. */ |
4676 | if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child)) |
4677 | return fallback_cost; |
4678 | return 0; |
4679 | } |
4680 | return -1; |
4681 | } |
4682 | |
4683 | /* We currently have no way of telling whether the new layout is cheaper |
4684 | or more expensive than the old one. But at least in principle, |
4685 | it should be worth making zero permutations (whole-vector shuffles) |
4686 | cheaper than real permutations, in case the pass is able to remove |
4687 | the latter. */ |
4688 | return count == 0 ? 0 : 1; |
4689 | } |
4690 | |
4691 | stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node); |
4692 | if (rep |
4693 | && STMT_VINFO_DATA_REF (rep) |
4694 | && DR_IS_READ (STMT_VINFO_DATA_REF (rep)) |
4695 | && SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
4696 | { |
4697 | auto_load_permutation_t tmp_perm; |
4698 | tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node)); |
4699 | if (out_layout_i > 0) |
4700 | vect_slp_permute (m_perms[out_layout_i], tmp_perm, true); |
4701 | |
4702 | poly_uint64 vf = 1; |
4703 | if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo)) |
4704 | vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
4705 | unsigned int n_perms; |
4706 | if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL, |
4707 | nullptr, vf, true, false, &n_perms)) |
4708 | { |
4709 | auto rep = SLP_TREE_REPRESENTATIVE (node); |
4710 | if (out_layout_i == 0) |
4711 | { |
4712 | /* Use the fallback cost if the load is an N-to-N permutation. |
4713 | Otherwise assume that the node will be rejected later |
4714 | and rebuilt from scalars. */ |
4715 | if (STMT_VINFO_GROUPED_ACCESS (rep) |
4716 | && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep)) |
4717 | == SLP_TREE_LANES (node))) |
4718 | return fallback_cost; |
4719 | return 0; |
4720 | } |
4721 | return -1; |
4722 | } |
4723 | |
4724 | /* See the comment above the corresponding VEC_PERM_EXPR handling. */ |
4725 | return n_perms == 0 ? 0 : 1; |
4726 | } |
4727 | |
4728 | return 0; |
4729 | } |
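
/* Editorial note: the return value acts as a factor on the node's
   execution weight in the cost model (see forward_pass): -1 rejects the
   (input, output) layout pair outright, 0 means the layouts can be
   absorbed for free, and a positive factor (here at most the fallback
   cost of 1) is multiplied by the node's weight when costing it.  */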
4730 | |
4731 | /* Decide which element layouts we should consider using. Calculate the |
4732 | weights associated with inserting layout changes on partition edges. |
4733 | Also mark partitions that cannot change layout, by setting their |
4734 | layout to zero. */ |
4735 | |
4736 | void |
4737 | vect_optimize_slp_pass::start_choosing_layouts () |
4738 | { |
4739 | /* Used to assign unique permutation indices. */ |
4740 | using perm_hash = unbounded_hashmap_traits< |
4741 | vec_free_hash_base<int_hash_base<unsigned>>, |
4742 | int_hash<int, -1, -2> |
4743 | >; |
4744 | hash_map<vec<unsigned>, int, perm_hash> layout_ids; |
4745 | |
4746 | /* Layout 0 is "no change". */ |
4747 | m_perms.safe_push (vNULL); |
4748 | |
4749 | /* Create layouts from existing permutations. */ |
4750 | auto_load_permutation_t tmp_perm; |
4751 | for (unsigned int node_i : m_partitioned_nodes) |
4752 | { |
4753 | /* Leaves also double as entries to the reverse graph. Allow the |
4754 | layout of those to be changed. */ |
4755 | auto &vertex = m_vertices[node_i]; |
4756 | auto &partition = m_partitions[vertex.partition]; |
4757 | if (!m_slpg->vertices[node_i].succ) |
4758 | partition.layout = 0; |
4759 | |
4760 | /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */ |
4761 | slp_tree node = vertex.node; |
4762 | stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node); |
4763 | slp_tree child; |
4764 | unsigned HOST_WIDE_INT imin, imax = 0; |
4765 | bool any_permute = false; |
4766 | tmp_perm.truncate (0); |
4767 | if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
4768 | { |
4769 | /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node |
4770 | unpermuted, record a layout that reverses this permutation. |
4771 | |
4772 | We would need more work to cope with loads that are internally |
4773 | permuted and also have inputs (such as masks for |
4774 | IFN_MASK_LOADs). */ |
4775 | gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ); |
4776 | if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt)) |
4777 | { |
4778 | partition.layout = -1; |
4779 | continue; |
4780 | } |
4781 | dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt); |
4782 | imin = DR_GROUP_SIZE (dr_stmt) + 1; |
4783 | tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node)); |
4784 | } |
4785 | else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR |
4786 | && SLP_TREE_CHILDREN (node).length () == 1 |
4787 | && (child = SLP_TREE_CHILDREN (node)[0]) |
4788 | && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child)) |
4789 | .is_constant (&imin))) |
4790 | { |
4791 | /* If the child has the same vector size as this node, |
4792 | reversing the permutation can make the permutation a no-op. |
4793 | In other cases it can change a true permutation into a |
4794 | full-vector extract. */ |
4795 | tmp_perm.reserve (SLP_TREE_LANES (node)); |
4796 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4797 | tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second); |
4798 | } |
4799 | else |
4800 | continue; |
4801 | |
4802 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4803 | { |
4804 | unsigned idx = tmp_perm[j]; |
4805 | imin = MIN (imin, idx); |
4806 | imax = MAX (imax, idx); |
4807 | if (idx - tmp_perm[0] != j) |
4808 | any_permute = true; |
4809 | } |
4810 | /* If the span doesn't match we'd disrupt VF computation; avoid |
4811 | that for now. */ |
4812 | if (imax - imin + 1 != SLP_TREE_LANES (node)) |
4813 | continue; |
4814 | /* If there's no permute no need to split one out. In this case |
4815 | we can consider turning a load into a permuted load, if that |
4816 | turns out to be cheaper than alternatives. */ |
4817 | if (!any_permute) |
4818 | { |
4819 | partition.layout = -1; |
4820 | continue; |
4821 | } |
4822 | |
4823 | /* For now only handle true permutes, as |
4824 | vect_attempt_slp_rearrange_stmts did. This allows us to be lazy |
4825 | when permuting constants and invariants, keeping the permute |
4826 | bijective. */ |
4827 | auto_sbitmap load_index (SLP_TREE_LANES (node)); |
4828 | bitmap_clear (load_index); |
4829 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4830 | bitmap_set_bit (load_index, tmp_perm[j] - imin); |
4831 | unsigned j; |
4832 | for (j = 0; j < SLP_TREE_LANES (node); ++j) |
4833 | if (!bitmap_bit_p (load_index, j)) |
4834 | break; |
4835 | if (j != SLP_TREE_LANES (node)) |
4836 | continue; |
4837 | |
4838 | vec<unsigned> perm = vNULL; |
4839 | perm.safe_grow (SLP_TREE_LANES (node), true); |
4840 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4841 | perm[j] = tmp_perm[j] - imin; |
4842 | |
4843 | if (int (m_perms.length ()) >= param_vect_max_layout_candidates) |
4844 | { |
4845 | /* Continue to use existing layouts, but don't add any more. */ |
4846 | int *entry = layout_ids.get (perm); |
4847 | partition.layout = entry ? *entry : 0; |
4848 | perm.release (); |
4849 | } |
4850 | else |
4851 | { |
4852 | bool existed; |
4853 | int &layout_i = layout_ids.get_or_insert (perm, &existed); |
4854 | if (existed) |
4855 | perm.release (); |
4856 | else |
4857 | { |
4858 | layout_i = m_perms.length (); |
4859 | m_perms.safe_push (perm); |
4860 | } |
4861 | partition.layout = layout_i; |
4862 | } |
4863 | } |
4864 | |
4865 | /* Initially assume that every layout is possible and has zero cost |
4866 | in every partition. */ |
4867 | m_partition_layout_costs.safe_grow_cleared (m_partitions.length () |
4868 | * m_perms.length ()); |
4869 | |
4870 | /* We have to mark as to-be-materialized the outgoing permutations that |
4871 | face graph entries for non-associating reductions, since such entries |
4872 | are not themselves represented in the graph. slp_inst_kind_bb_reduc |
4873 | currently only covers associatable reductions. */ |
4873 | for (slp_instance instance : m_vinfo->slp_instances) |
4874 | if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor) |
4875 | { |
4876 | unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex; |
4877 | m_partitions[m_vertices[node_i].partition].layout = 0; |
4878 | } |
4879 | else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain) |
4880 | { |
4881 | stmt_vec_info stmt_info |
4882 | = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance)); |
4883 | stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info); |
4884 | if (needs_fold_left_reduction_p (TREE_TYPE |
4885 | (gimple_get_lhs (stmt_info->stmt)), |
4886 | STMT_VINFO_REDUC_CODE (reduc_info))) |
4887 | { |
4888 | unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex; |
4889 | m_partitions[m_vertices[node_i].partition].layout = 0; |
4890 | } |
4891 | } |
4892 | |
4893 | /* Check which layouts each node and partition can handle. Calculate the |
4894 | weights associated with inserting layout changes on edges. */ |
4895 | for (unsigned int node_i : m_partitioned_nodes) |
4896 | { |
4897 | auto &vertex = m_vertices[node_i]; |
4898 | auto &partition = m_partitions[vertex.partition]; |
4899 | slp_tree node = vertex.node; |
4900 | |
4901 | if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node)) |
4902 | { |
4903 | vertex.weight = vect_slp_node_weight (node); |
4904 | |
4905 | /* We do not handle stores with a permutation, so all |
4906 | incoming permutations must have been materialized. |
4907 | |
4908 | We also don't handle masked grouped loads, which lack a |
4909 | permutation vector. In this case the memory locations |
4910 | form an implicit second input to the loads, on top of the |
4911 | explicit mask input, and the memory input's layout cannot |
4912 | be changed. |
4913 | |
4914 | On the other hand, we do support permuting gather loads and |
4915 | masked gather loads, where each scalar load is independent |
4916 | of the others. This can be useful if the address/index input |
4917 | benefits from permutation. */ |
4918 | if (STMT_VINFO_DATA_REF (rep) |
4919 | && STMT_VINFO_GROUPED_ACCESS (rep) |
4920 | && !SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
4921 | partition.layout = 0; |
4922 | |
4923 | /* We cannot change the layout of an operation that does not |
4924 | operate on each lane independently. Note this is an explicit |
4925 | negative list since that's much shorter than the respective |
4926 | positive one, but it's critical to keep maintaining it. */ |
4927 | if (is_gimple_call (STMT_VINFO_STMT (rep))) |
4928 | switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep))) |
4929 | { |
4930 | case CFN_COMPLEX_ADD_ROT90: |
4931 | case CFN_COMPLEX_ADD_ROT270: |
4932 | case CFN_COMPLEX_MUL: |
4933 | case CFN_COMPLEX_MUL_CONJ: |
4934 | case CFN_VEC_ADDSUB: |
4935 | case CFN_VEC_FMADDSUB: |
4936 | case CFN_VEC_FMSUBADD: |
4937 | partition.layout = 0; |
4938 | default:; |
4939 | } |
4940 | } |
4941 | |
4942 | auto process_edge = [&](graph_edge *ud, unsigned int other_node_i) |
4943 | { |
4944 | auto &other_vertex = m_vertices[other_node_i]; |
4945 | |
4946 | /* Count the number of edges from earlier partitions and the number |
4947 | of edges to later partitions. */ |
4948 | if (other_vertex.partition < vertex.partition) |
4949 | partition.in_degree += 1; |
4950 | else |
4951 | partition.out_degree += 1; |
4952 | |
4953 | /* If the current node uses the result of OTHER_NODE_I, accumulate |
4954 | the effects of that. */ |
4955 | if (ud->src == int (node_i)) |
4956 | { |
4957 | other_vertex.out_weight += vertex.weight; |
4958 | other_vertex.out_degree += 1; |
4959 | } |
4960 | }; |
4961 | for_each_partition_edge (node_i, process_edge); |
4962 | } |
4963 | } |
4964 | |
4965 | /* Return the incoming costs for node NODE_I, assuming that each input keeps |
4966 | its current (provisional) choice of layout. The inputs do not necessarily |
4967 | have the same layout as each other. */ |
4968 | |
4969 | slpg_layout_cost |
4970 | vect_optimize_slp_pass::total_in_cost (unsigned int node_i) |
4971 | { |
4972 | auto &vertex = m_vertices[node_i]; |
4973 | slpg_layout_cost cost; |
4974 | auto add_cost = [&](graph_edge *, unsigned int other_node_i) |
4975 | { |
4976 | auto &other_vertex = m_vertices[other_node_i]; |
4977 | if (other_vertex.partition < vertex.partition) |
4978 | { |
4979 | auto &other_partition = m_partitions[other_vertex.partition]; |
4980 | auto &other_costs = partition_layout_costs (other_vertex.partition, |
4981 | other_partition.layout); |
4982 | slpg_layout_cost this_cost = other_costs.in_cost; |
4983 | this_cost.add_serial_cost (other_costs.internal_cost); |
4984 | this_cost.split (other_partition.out_degree); |
4985 | cost.add_parallel_cost (this_cost); |
4986 | } |
4987 | }; |
4988 | for_each_partition_edge (node_i, add_cost); |
4989 | return cost; |
4990 | } |
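
/* Editorial illustration, assuming slpg_layout_cost::split spreads a
   partition's cost evenly over its out_degree consumers: if predecessor
   P1 has in_cost + internal_cost totalling 8 units and out_degree 2,
   this node is charged 4 units for P1; a second predecessor P2
   contributing 6 units over 3 consumers adds 2 more, combined with
   add_parallel_cost rather than serially.  This division is what stops
   one expensive, widely shared definition from being billed in full at
   every one of its uses.  */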
4991 | |
4992 | /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I) |
4993 | and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return |
4994 | slpg_layout_cost::impossible () if the change isn't possible. */ |
4995 | |
4996 | slpg_layout_cost |
4997 | vect_optimize_slp_pass:: |
4998 | edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i, |
4999 | unsigned int layout2_i) |
5000 | { |
5001 | auto &def_vertex = m_vertices[ud->dest]; |
5002 | auto &use_vertex = m_vertices[ud->src]; |
5003 | auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i; |
5004 | auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i; |
5005 | auto factor = change_layout_cost (def_vertex.node, def_layout_i, |
5006 | use_layout_i); |
5007 | if (factor < 0) |
5008 | return slpg_layout_cost::impossible (); |
5009 | |
5010 | /* We have a choice of putting the layout change at the site of the |
5011 | definition or at the site of the use. Prefer the former when |
5012 | optimizing for size or when the execution frequency of the |
5013 | definition is no greater than the combined execution frequencies of |
5014 | the uses. When putting the layout change at the site of the definition, |
5015 | divvy up the cost among all consumers. */ |
5016 | if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight) |
5017 | { |
5018 | slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size }; |
5019 | cost.split (def_vertex.out_degree); |
5020 | return cost; |
5021 | } |
5022 | return { use_vertex.weight * factor, m_optimize_size }; |
5023 | } |
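
/* Editorial illustration: with FACTOR 1, a definition of weight 2.0,
   out_degree 2 and combined consumer weight (out_weight) 12.0, the
   2.0 <= 12.0 test above places the change at the definition, and each
   edge is charged 2.0 / 2 = 1.0.  Materializing at a use would instead
   cost that use's full weight, say 6.0, so the def site wins here.  */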
5024 | |
5025 | /* UD represents a use-def link between FROM_NODE_I and a node in a later |
5026 | partition; FROM_NODE_I could be the definition node or the use node. |
5027 | The node at the other end of the link wants to use layout TO_LAYOUT_I. |
5028 | Return the cost of any necessary fix-ups on edge UD, or return |
5029 | slpg_layout_cost::impossible () if the change isn't possible. |
5030 | |
5031 | At this point, FROM_NODE_I's partition has chosen the cheapest |
5032 | layout based on the information available so far, but this choice |
5033 | is only provisional. */ |
5034 | |
5035 | slpg_layout_cost |
5036 | vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i, |
5037 | unsigned int to_layout_i) |
5038 | { |
5039 | auto &from_vertex = m_vertices[from_node_i]; |
5040 | unsigned int from_partition_i = from_vertex.partition; |
5041 | slpg_partition_info &from_partition = m_partitions[from_partition_i]; |
5042 | gcc_assert (from_partition.layout >= 0); |
5043 | |
5044 | /* First calculate the cost on the assumption that FROM_PARTITION sticks |
5045 | with its current layout preference. */ |
5046 | slpg_layout_cost cost = slpg_layout_cost::impossible (); |
5047 | auto edge_cost = edge_layout_cost (ud, from_node_i, |
5048 | from_partition.layout, to_layout_i); |
5049 | if (edge_cost.is_possible ()) |
5050 | { |
5051 | auto &from_costs = partition_layout_costs (from_partition_i, |
5052 | from_partition.layout); |
5053 | cost = from_costs.in_cost; |
5054 | cost.add_serial_cost (from_costs.internal_cost); |
5055 | cost.split (from_partition.out_degree); |
5056 | cost.add_serial_cost (edge_cost); |
5057 | } |
5058 | else if (from_partition.layout == 0) |
5059 | /* We must allow the source partition to have layout 0 as a fallback, |
5060 | in case all other options turn out to be impossible. */ |
5061 | return cost; |
5062 | |
5063 | /* Take the minimum of that cost and the cost that applies if |
5064 | FROM_PARTITION instead switches to TO_LAYOUT_I. */ |
5065 | auto &direct_layout_costs = partition_layout_costs (from_partition_i, |
5066 | to_layout_i); |
5067 | if (direct_layout_costs.is_possible ()) |
5068 | { |
5069 | slpg_layout_cost direct_cost = direct_layout_costs.in_cost; |
5070 | direct_cost.add_serial_cost (direct_layout_costs.internal_cost); |
5071 | direct_cost.split (from_partition.out_degree); |
5072 | if (!cost.is_possible () |
5073 | || direct_cost.is_better_than (cost, m_optimize_size)) |
5074 | cost = direct_cost; |
5075 | } |
5076 | |
5077 | return cost; |
5078 | } |
5079 | |
5080 | /* UD represents a use-def link between TO_NODE_I and a node in an earlier |
5081 | partition; TO_NODE_I could be the definition node or the use node. |
5082 | The node at the other end of the link wants to use layout FROM_LAYOUT_I; |
5083 | return the cost of any necessary fix-ups on edge UD, or |
5084 | slpg_layout_cost::impossible () if the choice cannot be made. |
5085 | |
5086 | At this point, TO_NODE_I's partition has a fixed choice of layout. */ |
5087 | |
5088 | slpg_layout_cost |
5089 | vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i, |
5090 | unsigned int from_layout_i) |
5091 | { |
5092 | auto &to_vertex = m_vertices[to_node_i]; |
5093 | unsigned int to_partition_i = to_vertex.partition; |
5094 | slpg_partition_info &to_partition = m_partitions[to_partition_i]; |
5095 | gcc_assert (to_partition.layout >= 0); |
5096 | |
5097 | /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be |
5098 | adjusted for this input having layout FROM_LAYOUT_I. Assume that |
5099 | any other inputs keep their current choice of layout. */ |
5100 | auto &to_costs = partition_layout_costs (to_partition_i, |
5101 | to_partition.layout); |
5102 | if (ud->src == int (to_node_i) |
5103 | && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR) |
5104 | { |
5105 | auto &from_partition = m_partitions[m_vertices[ud->dest].partition]; |
5106 | auto old_layout = from_partition.layout; |
5107 | from_partition.layout = from_layout_i; |
5108 | int factor = internal_node_cost (to_vertex.node, -1, |
5109 | to_partition.layout); |
5110 | from_partition.layout = old_layout; |
5111 | if (factor >= 0) |
5112 | { |
5113 | slpg_layout_cost cost = to_costs.out_cost; |
5114 | cost.add_serial_cost ({ to_vertex.weight * factor, |
5115 | m_optimize_size }); |
5116 | cost.split (to_partition.in_degree); |
5117 | return cost; |
5118 | } |
5119 | } |
5120 | |
5121 | /* Compute the cost if we insert any necessary layout change on edge UD. */ |
5122 | auto edge_cost = edge_layout_cost (ud, to_node_i, |
5123 | to_partition.layout, from_layout_i); |
5124 | if (edge_cost.is_possible ()) |
5125 | { |
5126 | slpg_layout_cost cost = to_costs.out_cost; |
5127 | cost.add_serial_cost (to_costs.internal_cost); |
5128 | cost.split (to_partition.in_degree); |
5129 | cost.add_serial_cost (edge_cost); |
5130 | return cost; |
5131 | } |
5132 | |
5133 | return slpg_layout_cost::impossible (); |
5134 | } |
5135 | |
5136 | /* Make a forward pass through the partitions, accumulating input costs. |
5137 | Make a tentative (provisional) choice of layout for each partition, |
5138 | ensuring that this choice still allows later partitions to keep |
5139 | their original layout. */ |
5140 | |
5141 | void |
5142 | vect_optimize_slp_pass::forward_pass () |
5143 | { |
5144 | for (unsigned int partition_i = 0; partition_i < m_partitions.length (); |
5145 | ++partition_i) |
5146 | { |
5147 | auto &partition = m_partitions[partition_i]; |
5148 | |
5149 | /* If the partition consists of a single VEC_PERM_EXPR, precompute |
5150 | the incoming cost that would apply if every predecessor partition |
5151 | keeps its current layout. This is used within the loop below. */ |
5152 | slpg_layout_cost in_cost; |
5153 | slp_tree single_node = nullptr; |
5154 | if (partition.node_end == partition.node_begin + 1) |
5155 | { |
5156 | unsigned int node_i = m_partitioned_nodes[partition.node_begin]; |
5157 | single_node = m_vertices[node_i].node; |
5158 | if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR) |
5159 | in_cost = total_in_cost (node_i); |
5160 | } |
5161 | |
5162 | /* Go through the possible layouts. Decide which ones are valid |
5163 | for this partition and record which of the valid layouts has |
5164 | the lowest cost. */ |
5165 | unsigned int min_layout_i = 0; |
5166 | slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible (); |
5167 | for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i) |
5168 | { |
5169 | auto &layout_costs = partition_layout_costs (partition_i, layout_i); |
5170 | if (!layout_costs.is_possible ()) |
5171 | continue; |
5172 | |
5173 | /* If the recorded layout is already 0 then the layout cannot |
5174 | change. */ |
5175 | if (partition.layout == 0 && layout_i != 0) |
5176 | { |
5177 | layout_costs.mark_impossible (); |
5178 | continue; |
5179 | } |
5180 | |
5181 | bool is_possible = true; |
5182 | for (unsigned int order_i = partition.node_begin; |
5183 | order_i < partition.node_end; ++order_i) |
5184 | { |
5185 | unsigned int node_i = m_partitioned_nodes[order_i]; |
5186 | auto &vertex = m_vertices[node_i]; |
5187 | |
5188 | /* Reject the layout if it is individually incompatible |
5189 | with any node in the partition. */ |
5190 | if (!is_compatible_layout (vertex.node, layout_i)) |
5191 | { |
5192 | is_possible = false; |
5193 | break; |
5194 | } |
5195 | |
5196 | auto add_cost = [&](graph_edge *ud, unsigned int other_node_i) |
5197 | { |
5198 | auto &other_vertex = m_vertices[other_node_i]; |
5199 | if (other_vertex.partition < vertex.partition) |
5200 | { |
5201 | /* Accumulate the incoming costs from earlier |
5202 | partitions, plus the cost of any layout changes |
5203 | on UD itself. */ |
5204 | auto cost = forward_cost (ud, other_node_i, layout_i); |
5205 | if (!cost.is_possible ()) |
5206 | is_possible = false; |
5207 | else |
5208 | layout_costs.in_cost.add_parallel_cost (cost); |
5209 | } |
5210 | else |
5211 | /* Reject the layout if it would make layout 0 impossible |
5212 | for later partitions. This amounts to testing that the |
5213 | target supports reversing the layout change on edges |
5214 | to later partitions. |
5215 | |
5216 | In principle, it might be possible to push a layout |
5217 | change all the way down a graph, so that it never |
5218 | needs to be reversed and so that the target doesn't |
5219 | need to support the reverse operation. But it would |
5220 | be awkward to bail out if we hit a partition that |
5221 | does not support the new layout, especially since |
5222 | we are not dealing with a lattice. */ |
5223 | is_possible &= edge_layout_cost (ud, other_node_i, 0, |
5224 | layout_i).is_possible (); |
5225 | }; |
5226 | for_each_partition_edge (node_i, add_cost); |
5227 | |
5228 | /* Accumulate the cost of using LAYOUT_I within NODE, |
5229 | both for the inputs and the outputs. */ |
5230 | int factor = internal_node_cost (vertex.node, layout_i, |
5231 | layout_i); |
5232 | if (factor < 0) |
5233 | { |
5234 | is_possible = false; |
5235 | break; |
5236 | } |
5237 | else if (factor) |
5238 | layout_costs.internal_cost.add_serial_cost |
5239 | ({ vertex.weight * factor, m_optimize_size }); |
5240 | } |
5241 | if (!is_possible) |
5242 | { |
5243 | layout_costs.mark_impossible (); |
5244 | continue; |
5245 | } |
5246 | |
5247 | /* Combine the incoming and partition-internal costs. */ |
5248 | slpg_layout_cost combined_cost = layout_costs.in_cost; |
5249 | combined_cost.add_serial_cost (layout_costs.internal_cost); |
5250 | |
5251 | /* If this partition consists of a single VEC_PERM_EXPR, see |
5252 | if the VEC_PERM_EXPR can be changed to support output layout |
5253 | LAYOUT_I while keeping all the provisional choices of input |
5254 | layout. */ |
5255 | if (single_node |
5256 | && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR) |
5257 | { |
5258 | int factor = internal_node_cost (single_node, -1, layout_i); |
5259 | if (factor >= 0) |
5260 | { |
5261 | auto weight = m_vertices[single_node->vertex].weight; |
5262 | slpg_layout_cost internal_cost |
5263 | = { weight * factor, m_optimize_size }; |
5264 | |
5265 | slpg_layout_cost alt_cost = in_cost; |
5266 | alt_cost.add_serial_cost (internal_cost); |
5267 | if (alt_cost.is_better_than (combined_cost, m_optimize_size)) |
5268 | { |
5269 | combined_cost = alt_cost; |
5270 | layout_costs.in_cost = in_cost; |
5271 | layout_costs.internal_cost = internal_cost; |
5272 | } |
5273 | } |
5274 | } |
5275 | |
5276 | /* Record the layout with the lowest cost. Prefer layout 0 in |
5277 | the event of a tie between it and another layout. */ |
5278 | if (!min_layout_cost.is_possible () |
5279 | || combined_cost.is_better_than (min_layout_cost, |
5280 | m_optimize_size)) |
5281 | { |
5282 | min_layout_i = layout_i; |
5283 | min_layout_cost = combined_cost; |
5284 | } |
5285 | } |
5286 | |
5287 | /* This loop's handling of earlier partitions should ensure that |
5288 | choosing the original layout for the current partition is no |
5289 | less valid than it was in the original graph, even with the |
5290 | provisional layout choices for those earlier partitions. */ |
5291 | gcc_assert (min_layout_cost.is_possible ()); |
5292 | partition.layout = min_layout_i; |
5293 | } |
5294 | } |
5295 | |
5296 | /* Make a backward pass through the partitions, accumulating output costs. |
5297 | Make a final choice of layout for each partition. */ |
5298 | |
5299 | void |
5300 | vect_optimize_slp_pass::backward_pass () |
5301 | { |
5302 | for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;) |
5303 | { |
5304 | auto &partition = m_partitions[partition_i]; |
5305 | |
5306 | unsigned int min_layout_i = 0; |
5307 | slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible (); |
5308 | for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i) |
5309 | { |
5310 | auto &layout_costs = partition_layout_costs (partition_i, layout_i); |
5311 | if (!layout_costs.is_possible ()) |
5312 | continue; |
5313 | |
5314 | /* Accumulate the costs from successor partitions. */ |
5315 | bool is_possible = true; |
5316 | for (unsigned int order_i = partition.node_begin; |
5317 | order_i < partition.node_end; ++order_i) |
5318 | { |
5319 | unsigned int node_i = m_partitioned_nodes[order_i]; |
5320 | auto &vertex = m_vertices[node_i]; |
5321 | auto add_cost = [&](graph_edge *ud, unsigned int other_node_i) |
5322 | { |
5323 | auto &other_vertex = m_vertices[other_node_i]; |
5324 | auto &other_partition = m_partitions[other_vertex.partition]; |
5325 | if (other_vertex.partition > vertex.partition) |
5326 | { |
5327 | /* Accumulate the incoming costs from later |
5328 | partitions, plus the cost of any layout changes |
5329 | on UD itself. */ |
5330 | auto cost = backward_cost (ud, other_node_i, layout_i); |
5331 | if (!cost.is_possible ()) |
5332 | is_possible = false; |
5333 | else |
5334 | layout_costs.out_cost.add_parallel_cost (cost); |
5335 | } |
5336 | else |
5337 | /* Make sure that earlier partitions can (if necessary |
5338 | or beneficial) keep the layout that they chose in |
5339 | the forward pass. This ensures that there is at |
5340 | least one valid choice of layout. */ |
5341 | is_possible &= edge_layout_cost (ud, other_node_i, |
5342 | other_partition.layout, |
5343 | layout_i).is_possible (); |
5344 | }; |
5345 | for_each_partition_edge (node_i, add_cost); |
5346 | } |
5347 | if (!is_possible) |
5348 | { |
5349 | layout_costs.mark_impossible (); |
5350 | continue; |
5351 | } |
5352 | |
5353 | /* Locally combine the costs from the forward and backward passes. |
5354 | (This combined cost is not passed on, since that would lead |
5355 | to double counting.) */ |
5356 | slpg_layout_cost combined_cost = layout_costs.in_cost; |
5357 | combined_cost.add_serial_cost (layout_costs.internal_cost); |
5358 | combined_cost.add_serial_cost (layout_costs.out_cost); |
5359 | |
5360 | /* Record the layout with the lowest cost. Prefer layout 0 in |
5361 | the event of a tie between it and another layout. */ |
5362 | if (!min_layout_cost.is_possible () |
5363 | || combined_cost.is_better_than (min_layout_cost, |
5364 | m_optimize_size)) |
5365 | { |
5366 | min_layout_i = layout_i; |
5367 | min_layout_cost = combined_cost; |
5368 | } |
5369 | } |
5370 | |
5371 | gcc_assert (min_layout_cost.is_possible ()); |
5372 | partition.layout = min_layout_i; |
5373 | } |
5374 | } |
5375 | |
5376 | /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE. |
5377 | NODE already has the layout that was selected for its partition. */ |
5378 | |
5379 | slp_tree |
5380 | vect_optimize_slp_pass::get_result_with_layout (slp_tree node, |
5381 | unsigned int to_layout_i) |
5382 | { |
5383 | unsigned int result_i = node->vertex * m_perms.length () + to_layout_i; |
5384 | slp_tree result = m_node_layouts[result_i]; |
5385 | if (result) |
5386 | return result; |
5387 | |
5388 | if (SLP_TREE_DEF_TYPE (node) == vect_constant_def |
5389 | || (SLP_TREE_DEF_TYPE (node) == vect_external_def |
5390 | /* We can't permute vector defs in place. */ |
5391 | && SLP_TREE_VEC_DEFS (node).is_empty ())) |
5392 | { |
5393 | /* If the vector is uniform or unchanged, there's nothing to do. */ |
5394 | if (to_layout_i == 0 || vect_slp_tree_uniform_p (node)) |
5395 | result = node; |
5396 | else |
5397 | { |
5398 | auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy (); |
5399 | result = vect_create_new_slp_node (scalar_ops); |
5400 | vect_slp_permute (m_perms[to_layout_i], scalar_ops, true); |
5401 | } |
5402 | } |
5403 | else |
5404 | { |
5405 | unsigned int partition_i = m_vertices[node->vertex].partition; |
5406 | unsigned int from_layout_i = m_partitions[partition_i].layout; |
5407 | if (from_layout_i == to_layout_i) |
5408 | return node; |
5409 | |
5410 | /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel |
5411 | permutation instead of a serial one. Leave the new permutation |
5412 | in TMP_PERM on success. */ |
5413 | auto_lane_permutation_t tmp_perm; |
5414 | unsigned int num_inputs = 1; |
5415 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
5416 | { |
5417 | tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node)); |
5418 | if (from_layout_i != 0) |
5419 | vect_slp_permute (m_perms[from_layout_i], tmp_perm, false); |
5420 | if (to_layout_i != 0) |
5421 | vect_slp_permute (m_perms[to_layout_i], tmp_perm, true); |
5422 | if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, |
5423 | tmp_perm, |
5424 | SLP_TREE_CHILDREN (node), |
5425 | false) >= 0) |
5426 | num_inputs = SLP_TREE_CHILDREN (node).length (); |
5427 | else |
5428 | tmp_perm.truncate (0); |
5429 | } |
5430 | |
5431 | if (dump_enabled_p ()) |
5432 | { |
5433 | if (tmp_perm.length () > 0) |
5434 | dump_printf_loc (MSG_NOTE, vect_location, |
5435 | "duplicating permutation node %p with" |
5436 | " layout %d\n", |
5437 | (void *) node, to_layout_i); |
5438 | else |
5439 | dump_printf_loc (MSG_NOTE, vect_location, |
5440 | "inserting permutation node in place of %p\n", |
5441 | (void *) node); |
5442 | } |
5443 | |
5444 | unsigned int num_lanes = SLP_TREE_LANES (node); |
5445 | result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR); |
5446 | if (SLP_TREE_SCALAR_STMTS (node).length ()) |
5447 | { |
5448 | auto &stmts = SLP_TREE_SCALAR_STMTS (result); |
5449 | stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node)); |
5450 | if (from_layout_i != 0) |
5451 | vect_slp_permute (m_perms[from_layout_i], stmts, false); |
5452 | if (to_layout_i != 0) |
5453 | vect_slp_permute (m_perms[to_layout_i], stmts, true); |
5454 | } |
5455 | SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node); |
5456 | SLP_TREE_LANES (result) = num_lanes; |
5457 | SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node); |
5458 | result->vertex = -1; |
5459 | |
5460 | auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result); |
5461 | if (tmp_perm.length ()) |
5462 | { |
5463 | lane_perm.safe_splice (tmp_perm); |
5464 | SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node)); |
5465 | } |
5466 | else |
5467 | { |
5468 | lane_perm.create (num_lanes); |
5469 | for (unsigned j = 0; j < num_lanes; ++j) |
5470 | lane_perm.quick_push ({ 0, j }); |
5471 | if (from_layout_i != 0) |
5472 | vect_slp_permute (m_perms[from_layout_i], lane_perm, false); |
5473 | if (to_layout_i != 0) |
5474 | vect_slp_permute (m_perms[to_layout_i], lane_perm, true); |
5475 | SLP_TREE_CHILDREN (result).safe_push (node); |
5476 | } |
5477 | for (slp_tree child : SLP_TREE_CHILDREN (result)) |
5478 | child->refcnt++; |
5479 | } |
5480 | m_node_layouts[result_i] = result; |
5481 | return result; |
5482 | } |
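
/* Editorial illustration: for a 4-lane node that is not a VEC_PERM_EXPR,
   whose partition chose layout m_perms[1] = { 1, 0, 3, 2 } (an involution)
   and with TO_LAYOUT_I == 0, the wrapper VEC_PERM_EXPR created above
   starts from the identity lane permutation
   { {0,0}, {0,1}, {0,2}, {0,3} } and becomes
   { {0,1}, {0,0}, {0,3}, {0,2} }, restoring the original element order
   on NODE's output without touching NODE itself.  */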
5483 | |
5484 | /* Apply the chosen vector layouts to the SLP graph. */ |
5485 | |
5486 | void |
5487 | vect_optimize_slp_pass::materialize () |
5488 | { |
5489 | /* We no longer need the costs, so avoid having two O(N * P) arrays |
5490 | live at the same time. */ |
5491 | m_partition_layout_costs.release (); |
5492 | m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ()); |
5493 | |
5494 | auto_sbitmap fully_folded (m_vertices.length ()); |
5495 | bitmap_clear (fully_folded); |
5496 | for (unsigned int node_i : m_partitioned_nodes) |
5497 | { |
5498 | auto &vertex = m_vertices[node_i]; |
5499 | slp_tree node = vertex.node; |
5500 | int layout_i = m_partitions[vertex.partition].layout; |
5501 | gcc_assert (layout_i >= 0); |
5502 | |
5503 | /* Rearrange the scalar statements to match the chosen layout. */ |
5504 | if (layout_i > 0) |
5505 | vect_slp_permute (m_perms[layout_i], |
5506 | SLP_TREE_SCALAR_STMTS (node), true); |
5507 | |
5508 | /* Update load and lane permutations. */ |
5509 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
5510 | { |
5511 | /* First try to absorb the input vector layouts. If that fails, |
5512 | force the inputs to have layout LAYOUT_I too. We checked that |
5513 | that was possible before deciding to use nonzero output layouts. |
5514 | (Note that at this stage we don't really have any guarantee that |
5515 | the target supports the original VEC_PERM_EXPR.) */ |
5516 | auto &perm = SLP_TREE_LANE_PERMUTATION (node); |
5517 | auto_lane_permutation_t tmp_perm; |
5518 | tmp_perm.safe_splice (perm); |
5519 | change_vec_perm_layout (node, tmp_perm, -1, layout_i); |
5520 | if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, |
5521 | tmp_perm, |
5522 | SLP_TREE_CHILDREN (node), |
5523 | false) >= 0) |
5524 | { |
5525 | if (dump_enabled_p () |
5526 | && !std::equal (tmp_perm.begin (), tmp_perm.end (), |
5527 | perm.begin ())) |
5528 | dump_printf_loc (MSG_NOTE, vect_location, |
5529 | "absorbing input layouts into %p\n", |
5530 | (void *) node); |
5531 | std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ()); |
5532 | bitmap_set_bit (fully_folded, node_i); |
5533 | } |
5534 | else |
5535 | { |
5536 | /* Not MSG_MISSED because it would make no sense to users. */ |
5537 | if (dump_enabled_p ()) |
5538 | dump_printf_loc (MSG_NOTE, vect_location, |
5539 | "failed to absorb input layouts into %p\n", |
5540 | (void *) node); |
5541 | change_vec_perm_layout (nullptr, perm, layout_i, layout_i); |
5542 | } |
5543 | } |
5544 | else |
5545 | { |
5546 | gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ()); |
5547 | auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node); |
5548 | if (layout_i > 0) |
5549 | /* ??? When we handle non-bijective permutes the idea |
5550 | is that we can force the load-permutation to be |
5551 | { min, min + 1, min + 2, ... max }. But then the |
5552 | scalar defs might no longer match the lane content |
5553 | which means wrong-code with live lane vectorization. |
5554 | So we possibly have to have NULL entries for those. */ |
5555 | vect_slp_permute (m_perms[layout_i], load_perm, true); |
5556 | } |
5557 | } |
5558 | |
5559 | /* Do this before any nodes disappear, since it involves a walk |
5560 | over the leaves. */ |
5561 | remove_redundant_permutations (); |
5562 | |
5563 | /* Replace each child with a correctly laid-out version. */ |
5564 | for (unsigned int node_i : m_partitioned_nodes) |
5565 | { |
5566 | /* Skip nodes that have already been handled above. */ |
5567 | if (bitmap_bit_p (fully_folded, node_i)) |
5568 | continue; |
5569 | |
5570 | auto &vertex = m_vertices[node_i]; |
5571 | int in_layout_i = m_partitions[vertex.partition].layout; |
5572 | gcc_assert (in_layout_i >= 0); |
5573 | |
5574 | unsigned j; |
5575 | slp_tree child; |
5576 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child) |
5577 | { |
5578 | if (!child) |
5579 | continue; |
5580 | |
5581 | slp_tree new_child = get_result_with_layout (child, in_layout_i); |
5582 | if (new_child != child) |
5583 | { |
5584 | vect_free_slp_tree (child); |
5585 | SLP_TREE_CHILDREN (vertex.node)[j] = new_child; |
5586 | new_child->refcnt += 1; |
5587 | } |
5588 | } |
5589 | } |
5590 | } |
5591 | |
5592 | /* Elide load permutations that are not necessary. Such permutations might |
5593 | be pre-existing, rather than created by the layout optimizations. */ |
5594 | |
5595 | void |
5596 | vect_optimize_slp_pass::remove_redundant_permutations () |
5597 | { |
5598 | for (unsigned int node_i : m_leafs) |
5599 | { |
5600 | slp_tree node = m_vertices[node_i].node; |
5601 | if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
5602 | continue; |
5603 | |
5604 | /* In basic block vectorization we allow any subchain of an interleaving |
5605 | chain. |
5606 | FORNOW: not in loop SLP because of realignment complications. */ |
5607 | if (is_a <bb_vec_info> (m_vinfo)) |
5608 | { |
5609 | bool subchain_p = true; |
5610 | stmt_vec_info next_load_info = NULL; |
5611 | stmt_vec_info load_info; |
5612 | unsigned j; |
5613 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
5614 | { |
5615 | if (j != 0 |
5616 | && (next_load_info != load_info |
5617 | || DR_GROUP_GAP (load_info) != 1)) |
5618 | { |
5619 | subchain_p = false; |
5620 | break; |
5621 | } |
5622 | next_load_info = DR_GROUP_NEXT_ELEMENT (load_info); |
5623 | } |
5624 | if (subchain_p) |
5625 | { |
5626 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
5627 | continue; |
5628 | } |
5629 | } |
5630 | else |
5631 | { |
5632 | loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo); |
5633 | stmt_vec_info load_info; |
5634 | bool this_load_permuted = false; |
5635 | unsigned j; |
5636 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
5637 | if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j) |
5638 | { |
5639 | this_load_permuted = true; |
5640 | break; |
5641 | } |
5642 | /* When this isn't a grouped access we know it's single element |
5643 | and contiguous. */ |
5644 | if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0])) |
5645 | { |
5646 | if (!this_load_permuted |
5647 | && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U) |
5648 | || SLP_TREE_LANES (node) == 1)) |
5649 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
5650 | continue; |
5651 | } |
5652 | stmt_vec_info first_stmt_info |
5653 | = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); |
5654 | if (!this_load_permuted |
5655 | /* The load requires permutation when unrolling exposes |
5656 | a gap either because the group is larger than the SLP |
5657 | group-size or because there is a gap between the groups. */ |
5658 | && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U) |
5659 | || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info)) |
5660 | && DR_GROUP_GAP (first_stmt_info) == 0))) |
5661 | { |
5662 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
5663 | continue; |
5664 | } |
5665 | } |
5666 | } |
5667 | } |
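
/* Editorial illustration: in a basic block, an SLP node that loads a[1]
   and a[2] from an interleaving group over a[0..3] is a subchain in
   group order, so its load permutation (here { 1, 2 }) carries no
   information beyond the scalar stmts themselves and is released
   above.  */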
5668 | |
5669 | /* Print the partition graph and layout information to the dump file. */ |
5670 | |
5671 | void |
5672 | vect_optimize_slp_pass::dump () |
5673 | { |
5674 | dump_printf_loc (MSG_NOTE, vect_location, |
5675 | "SLP optimize permutations:\n"); |
5676 | for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i) |
5677 | { |
5678 | dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i); |
5679 | const char *sep = ""; |
5680 | for (unsigned int idx : m_perms[layout_i]) |
5681 | { |
5682 | dump_printf (MSG_NOTE, "%s%d", sep, idx); |
5683 | sep = ", "; |
5684 | } |
5685 | dump_printf (MSG_NOTE, " }\n"); |
5686 | } |
5687 | dump_printf_loc (MSG_NOTE, vect_location, |
5688 | "SLP optimize partitions:\n"); |
5689 | for (unsigned int partition_i = 0; partition_i < m_partitions.length (); |
5690 | ++partition_i) |
5691 | { |
5692 | auto &partition = m_partitions[partition_i]; |
5693 | dump_printf_loc (MSG_NOTE, vect_location, " -------------\n"); |
5694 | dump_printf_loc (MSG_NOTE, vect_location, |
5695 | " partition %d (layout %d):\n", |
5696 | partition_i, partition.layout); |
5697 | dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n"); |
5698 | for (unsigned int order_i = partition.node_begin; |
5699 | order_i < partition.node_end; ++order_i) |
5700 | { |
5701 | auto &vertex = m_vertices[m_partitioned_nodes[order_i]]; |
5702 | dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n", |
5703 | (void *) vertex.node); |
5704 | dump_printf_loc (MSG_NOTE, vect_location, |
5705 | " weight: %f\n", |
5706 | vertex.weight.to_double ()); |
5707 | if (vertex.out_degree) |
5708 | dump_printf_loc (MSG_NOTE, vect_location, |
5709 | " out weight: %f (degree %d)\n", |
5710 | vertex.out_weight.to_double (), |
5711 | vertex.out_degree); |
5712 | if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR) |
5713 | dump_printf_loc (MSG_NOTE, vect_location, |
5714 | " op: VEC_PERM_EXPR\n"); |
5715 | else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node)) |
5716 | dump_printf_loc (MSG_NOTE, vect_location, |
5717 | " op template: %G", rep->stmt); |
5718 | } |
5719 | dump_printf_loc (MSG_NOTE, vect_location, " edges:\n"); |
5720 | for (unsigned int order_i = partition.node_begin; |
5721 | order_i < partition.node_end; ++order_i) |
5722 | { |
5723 | unsigned int node_i = m_partitioned_nodes[order_i]; |
5724 | auto &vertex = m_vertices[node_i]; |
5725 | auto print_edge = [&](graph_edge *, unsigned int other_node_i) |
5726 | { |
5727 | auto &other_vertex = m_vertices[other_node_i]; |
5728 | if (other_vertex.partition < vertex.partition) |
5729 | dump_printf_loc (MSG_NOTE, vect_location, |
5730 | " - %p [%d] --> %p\n", |
5731 | (void *) other_vertex.node, |
5732 | other_vertex.partition, |
5733 | (void *) vertex.node); |
5734 | else |
5735 | dump_printf_loc (MSG_NOTE, vect_location, |
5736 | " - %p --> [%d] %p\n", |
5737 | (void *) vertex.node, |
5738 | other_vertex.partition, |
5739 | (void *) other_vertex.node); |
5740 | }; |
5741 | for_each_partition_edge (node_i, print_edge); |
5742 | } |
5743 | |
5744 | for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i) |
5745 | { |
5746 | auto &layout_costs = partition_layout_costs (partition_i, layout_i); |
5747 | if (layout_costs.is_possible ()) |
5748 | { |
5749 | dump_printf_loc (MSG_NOTE, vect_location, |
5750 | " layout %d:%s\n", layout_i, |
5751 | partition.layout == int (layout_i) |
5752 | ? " (*)" : ""); |
5753 | slpg_layout_cost combined_cost = layout_costs.in_cost; |
5754 | combined_cost.add_serial_cost (layout_costs.internal_cost); |
5755 | combined_cost.add_serial_cost (layout_costs.out_cost); |
5756 | #define TEMPLATE "{depth: %f, total: %f}" |
5757 | dump_printf_loc (MSG_NOTE, vect_location, |
5758 | " " TEMPLATE "\n", |
5759 | layout_costs.in_cost.depth.to_double (), |
5760 | layout_costs.in_cost.total.to_double ()); |
5761 | dump_printf_loc (MSG_NOTE, vect_location, |
5762 | " + " TEMPLATE "\n", |
5763 | layout_costs.internal_cost.depth.to_double (), |
5764 | layout_costs.internal_cost.total.to_double ()); |
5765 | dump_printf_loc (MSG_NOTE, vect_location, |
5766 | " + " TEMPLATE "\n", |
5767 | layout_costs.out_cost.depth.to_double (), |
5768 | layout_costs.out_cost.total.to_double ()); |
5769 | dump_printf_loc (MSG_NOTE, vect_location, |
5770 | " = " TEMPLATE "\n", |
5771 | combined_cost.depth.to_double (), |
5772 | combined_cost.total.to_double ()); |
5773 | #undef TEMPLATE |
5774 | } |
5775 | else |
5776 | dump_printf_loc (MSG_NOTE, vect_location, |
5777 | " layout %d: rejected\n", layout_i); |
5778 | } |
5779 | } |
5780 | } |
5781 | |
5782 | /* Main entry point for the SLP graph optimization pass. */ |
5783 | |
5784 | void |
5785 | vect_optimize_slp_pass::run () |
5786 | { |
5787 | build_graph (); |
5788 | create_partitions (); |
5789 | start_choosing_layouts (); |
5790 | if (m_perms.length () > 1) |
5791 | { |
5792 | forward_pass (); |
5793 | backward_pass (); |
5794 | if (dump_enabled_p ()) |
5795 | dump (); |
5796 | materialize (); |
5797 | while (!m_perms.is_empty ()) |
5798 | m_perms.pop ().release (); |
5799 | } |
5800 | else |
5801 | remove_redundant_permutations (); |
5802 | free_graph (m_slpg); |
5803 | } |
5804 | |
5805 | /* Optimize the SLP graph of VINFO. */ |
5806 | |
5807 | void |
5808 | vect_optimize_slp (vec_info *vinfo) |
5809 | { |
5810 | if (vinfo->slp_instances.is_empty ()) |
5811 | return; |
5812 | vect_optimize_slp_pass (vinfo).run (); |
5813 | } |
5814 | |
5815 | /* Gather loads reachable from the individual SLP graph entries. */ |
5816 | |
5817 | void |
5818 | vect_gather_slp_loads (vec_info *vinfo) |
5819 | { |
5820 | unsigned i; |
5821 | slp_instance instance; |
5822 | FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) |
5823 | { |
5824 | hash_set<slp_tree> visited; |
5825 | vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance), |
5826 | SLP_INSTANCE_TREE (instance), visited); |
5827 | } |
5828 | } |
5829 | |
5830 | |
5831 | /* For each possible SLP instance decide whether to SLP it and calculate overall |
5832 | unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at |
5833 | least one instance. */ |
5834 | |
5835 | bool |
5836 | vect_make_slp_decision (loop_vec_info loop_vinfo) |
5837 | { |
5838 | unsigned int i; |
5839 | poly_uint64 unrolling_factor = 1; |
5840 | const vec<slp_instance> &slp_instances |
5841 | = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); |
5842 | slp_instance instance; |
5843 | int decided_to_slp = 0; |
5844 | |
5845 | DUMP_VECT_SCOPE ("vect_make_slp_decision"); |
5846 | |
5847 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
5848 | { |
5849 | /* FORNOW: SLP if you can. */ |
5850 | /* All unroll factors have the form: |
5851 | |
5852 | GET_MODE_SIZE (vinfo->vector_mode) * X |
5853 | |
5854 | for some rational X, so they must have a common multiple. */ |
5855 | unrolling_factor |
5856 | = force_common_multiple (unrolling_factor, |
5857 | SLP_INSTANCE_UNROLLING_FACTOR (instance)); |
5858 | |
5859 | /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we |
5860 | call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and |
5861 | loop-based vectorization. Such stmts will be marked as HYBRID. */ |
5862 | vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); |
5863 | decided_to_slp++; |
5864 | } |
5865 | |
5866 | LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor; |
5867 | |
5868 | if (decided_to_slp && dump_enabled_p ()) |
5869 | { |
5870 | dump_printf_loc (MSG_NOTE, vect_location, |
5871 | "Decided to SLP %d instances. Unrolling factor ", |
5872 | decided_to_slp); |
5873 | dump_dec (MSG_NOTE, unrolling_factor); |
5874 | dump_printf (MSG_NOTE, "\n"); |
5875 | } |
5876 | |
5877 | return (decided_to_slp > 0); |
5878 | } |
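
/* Editorial illustration: if one instance requires unrolling factor 2
   and another requires 3, force_common_multiple yields 6 -- the least
   factor that lets both instances be vectorized in the same unrolled
   loop body.  */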
5879 | |
5880 | /* Private data for vect_detect_hybrid_slp. */ |
5881 | struct vdhs_data |
5882 | { |
5883 | loop_vec_info loop_vinfo; |
5884 | vec<stmt_vec_info> *worklist; |
5885 | }; |
5886 | |
5887 | /* Walker for walk_gimple_op. */ |
5888 | |
5889 | static tree |
5890 | vect_detect_hybrid_slp (tree *tp, int *, void *data) |
5891 | { |
5892 | walk_stmt_info *wi = (walk_stmt_info *)data; |
5893 | vdhs_data *dat = (vdhs_data *)wi->info; |
5894 | |
5895 | if (wi->is_lhs) |
5896 | return NULL_TREE; |
5897 | |
5898 | stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp); |
5899 | if (!def_stmt_info) |
5900 | return NULL_TREE; |
  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5902 | if (PURE_SLP_STMT (def_stmt_info)) |
5903 | { |
5904 | if (dump_enabled_p ()) |
5905 | dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G" , |
5906 | def_stmt_info->stmt); |
5907 | STMT_SLP_TYPE (def_stmt_info) = hybrid; |
      dat->worklist->safe_push (def_stmt_info);
5909 | } |
5910 | |
5911 | return NULL_TREE; |
5912 | } |
5913 | |
/* Check whether STMT_INFO is consumed by SLP indirectly and mark it
   pure_slp if so; otherwise push it to WORKLIST.  */
5916 | |
5917 | static void |
5918 | maybe_push_to_hybrid_worklist (vec_info *vinfo, |
5919 | vec<stmt_vec_info> &worklist, |
5920 | stmt_vec_info stmt_info) |
5921 | { |
5922 | if (dump_enabled_p ()) |
5923 | dump_printf_loc (MSG_NOTE, vect_location, |
5924 | "Processing hybrid candidate : %G" , stmt_info->stmt); |
5925 | stmt_vec_info orig_info = vect_orig_stmt (stmt_info); |
5926 | imm_use_iterator iter2; |
5927 | ssa_op_iter iter1; |
5928 | use_operand_p use_p; |
5929 | def_operand_p def_p; |
5930 | bool any_def = false; |
5931 | FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF) |
5932 | { |
5933 | any_def = true; |
5934 | FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p)) |
5935 | { |
5936 | if (is_gimple_debug (USE_STMT (use_p))) |
5937 | continue; |
5938 | stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p)); |
5939 | /* An out-of loop use means this is a loop_vect sink. */ |
5940 | if (!use_info) |
5941 | { |
5942 | if (dump_enabled_p ()) |
5943 | dump_printf_loc (MSG_NOTE, vect_location, |
5944 | "Found loop_vect sink: %G" , stmt_info->stmt); |
5945 | worklist.safe_push (obj: stmt_info); |
5946 | return; |
5947 | } |
5948 | else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info))) |
5949 | { |
5950 | if (dump_enabled_p ()) |
5951 | dump_printf_loc (MSG_NOTE, vect_location, |
5952 | "Found loop_vect use: %G" , use_info->stmt); |
5953 | worklist.safe_push (obj: stmt_info); |
5954 | return; |
5955 | } |
5956 | } |
5957 | } |
  /* No def means this is a loop_vect sink.  */
5959 | if (!any_def) |
5960 | { |
5961 | if (dump_enabled_p ()) |
5962 | dump_printf_loc (MSG_NOTE, vect_location, |
5963 | "Found loop_vect sink: %G" , stmt_info->stmt); |
5964 | worklist.safe_push (obj: stmt_info); |
5965 | return; |
5966 | } |
5967 | if (dump_enabled_p ()) |
5968 | dump_printf_loc (MSG_NOTE, vect_location, |
5969 | "Marked SLP consumed stmt pure: %G" , stmt_info->stmt); |
5970 | STMT_SLP_TYPE (stmt_info) = pure_slp; |
5971 | } |
5972 | |
5973 | /* Find stmts that must be both vectorized and SLPed. */ |
5974 | |
5975 | void |
5976 | vect_detect_hybrid_slp (loop_vec_info loop_vinfo) |
5977 | { |
5978 | DUMP_VECT_SCOPE ("vect_detect_hybrid_slp" ); |
5979 | |
5980 | /* All stmts participating in SLP are marked pure_slp, all other |
5981 | stmts are loop_vect. |
5982 | First collect all loop_vect stmts into a worklist. |
5983 | SLP patterns cause not all original scalar stmts to appear in |
5984 | SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp. |
5985 | Rectify this here and do a backward walk over the IL only considering |
5986 | stmts as loop_vect when they are used by a loop_vect stmt and otherwise |
5987 | mark them as pure_slp. */ |
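  /* As an illustrative sketch (not taken from real IL): if x_1 below is
     computed by a pure_slp stmt feeding an SLP store group but also
     feeds a loop-vectorized reduction,

       x_1 = ...;
       a[i] = x_1;	<- pure_slp
       s_2 += x_1;	<- loop_vect use

     then walking from the loop_vect worklist reaches the definition of
     x_1 and marks it hybrid: it needs both an SLP and a loop-based
     vector form.  */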
5988 | auto_vec<stmt_vec_info> worklist; |
5989 | for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i) |
5990 | { |
5991 | basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; |
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
5994 | { |
5995 | gphi *phi = gsi.phi (); |
5996 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi); |
5997 | if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info)) |
	    maybe_push_to_hybrid_worklist (loop_vinfo,
5999 | worklist, stmt_info); |
6000 | } |
      for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
	   gsi_prev (&gsi))
6003 | { |
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
6006 | continue; |
6007 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt); |
6008 | if (STMT_VINFO_IN_PATTERN_P (stmt_info)) |
6009 | { |
6010 | for (gimple_stmt_iterator gsi2 |
6011 | = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); |
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
6013 | { |
6014 | stmt_vec_info patt_info |
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
6016 | if (!STMT_SLP_TYPE (patt_info) |
6017 | && STMT_VINFO_RELEVANT (patt_info)) |
		    maybe_push_to_hybrid_worklist (loop_vinfo,
						   worklist, patt_info);
6020 | } |
6021 | stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); |
6022 | } |
6023 | if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info)) |
	    maybe_push_to_hybrid_worklist (loop_vinfo,
6025 | worklist, stmt_info); |
6026 | } |
6027 | } |
6028 | |
6029 | /* Now we have a worklist of non-SLP stmts, follow use->def chains and |
6030 | mark any SLP vectorized stmt as hybrid. |
6031 | ??? We're visiting def stmts N times (once for each non-SLP and |
6032 | once for each hybrid-SLP use). */ |
6033 | walk_stmt_info wi; |
6034 | vdhs_data dat; |
6035 | dat.worklist = &worklist; |
6036 | dat.loop_vinfo = loop_vinfo; |
  memset (&wi, 0, sizeof (wi));
6038 | wi.info = (void *)&dat; |
6039 | while (!worklist.is_empty ()) |
6040 | { |
6041 | stmt_vec_info stmt_info = worklist.pop (); |
6042 | /* Since SSA operands are not set up for pattern stmts we need |
6043 | to use walk_gimple_op. */ |
6044 | wi.is_lhs = 0; |
6045 | walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi); |
6046 | /* For gather/scatter make sure to walk the offset operand, that |
6047 | can be a scaling and conversion away. */ |
6048 | gather_scatter_info gs_info; |
6049 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) |
6050 | && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info)) |
6051 | { |
6052 | int dummy; |
	  vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6054 | } |
6055 | } |
6056 | } |
6057 | |
6058 | |
6059 | /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */ |
6060 | |
6061 | _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared) |
6062 | : vec_info (vec_info::bb, shared), |
6063 | bbs (_bbs), |
6064 | roots (vNULL) |
6065 | { |
6066 | for (unsigned i = 0; i < bbs.length (); ++i) |
6067 | { |
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, 0);
	    add_stmt (phi);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	}
6085 | } |
6086 | } |
6087 | |
6088 | |
6089 | /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the |
6090 | stmts in the basic block. */ |
6091 | |
6092 | _bb_vec_info::~_bb_vec_info () |
6093 | { |
6094 | /* Reset region marker. */ |
6095 | for (unsigned i = 0; i < bbs.length (); ++i) |
6096 | { |
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, -1);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
	}
6110 | } |
6111 | |
6112 | for (unsigned i = 0; i < roots.length (); ++i) |
6113 | { |
6114 | roots[i].stmts.release (); |
6115 | roots[i].roots.release (); |
6116 | roots[i].remain.release (); |
6117 | } |
6118 | roots.release (); |
6119 | } |
6120 | |
6121 | /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE, |
   given that child nodes have already been processed, and that
6123 | their def types currently match their SLP node's def type. */ |
6124 | |
6125 | static bool |
6126 | vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, |
6127 | slp_instance node_instance, |
6128 | stmt_vector_for_cost *cost_vec) |
6129 | { |
6130 | stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); |
6131 | |
6132 | /* Calculate the number of vector statements to be created for the |
6133 | scalar stmts in this node. For SLP reductions it is equal to the |
6134 | number of vector statements in the children (which has already been |
6135 | calculated by the recursive call). Otherwise it is the number of |
6136 | scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by |
6137 | VF divided by the number of elements in a vector. */ |
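  /* For example (an illustrative sketch, not tied to a particular
     target): with VF = 4, SLP_TREE_LANES = 2 and V8HI vectors this is
     4 * 2 / 8 = 1 vector stmt; with V4SI vectors it would be
     4 * 2 / 4 = 2.  */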
6138 | if (SLP_TREE_CODE (node) != VEC_PERM_EXPR |
6139 | && !STMT_VINFO_DATA_REF (stmt_info) |
6140 | && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
6141 | { |
6142 | for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i) |
6143 | if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def) |
6144 | { |
6145 | SLP_TREE_NUMBER_OF_VEC_STMTS (node) |
6146 | = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]); |
6147 | break; |
6148 | } |
6149 | } |
6150 | else |
6151 | { |
6152 | poly_uint64 vf; |
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6154 | vf = loop_vinfo->vectorization_factor; |
6155 | else |
6156 | vf = 1; |
6157 | unsigned int group_size = SLP_TREE_LANES (node); |
6158 | tree vectype = SLP_TREE_VECTYPE (node); |
6159 | SLP_TREE_NUMBER_OF_VEC_STMTS (node) |
	= vect_get_num_vectors (vf * group_size, vectype);
6161 | } |
6162 | |
6163 | /* Handle purely internal nodes. */ |
6164 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
6165 | { |
6166 | if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec)) |
6167 | return false; |
6168 | |
6169 | stmt_vec_info slp_stmt_info; |
6170 | unsigned int i; |
6171 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info) |
6172 | { |
6173 | if (STMT_VINFO_LIVE_P (slp_stmt_info) |
6174 | && !vectorizable_live_operation (vinfo, slp_stmt_info, node, |
6175 | node_instance, i, |
6176 | false, cost_vec)) |
6177 | return false; |
6178 | } |
6179 | return true; |
6180 | } |
6181 | |
6182 | bool dummy; |
6183 | return vect_analyze_stmt (vinfo, stmt_info, &dummy, |
6184 | node, node_instance, cost_vec); |
6185 | } |
6186 | |
6187 | /* Try to build NODE from scalars, returning true on success. |
6188 | NODE_INSTANCE is the SLP instance that contains NODE. */ |
6189 | |
6190 | static bool |
6191 | vect_slp_convert_to_external (vec_info *vinfo, slp_tree node, |
6192 | slp_instance node_instance) |
6193 | { |
6194 | stmt_vec_info stmt_info; |
6195 | unsigned int i; |
6196 | |
  if (!is_a <bb_vec_info> (vinfo)
6198 | || node == SLP_INSTANCE_TREE (node_instance) |
6199 | || !SLP_TREE_SCALAR_STMTS (node).exists () |
6200 | || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)) |
6201 | /* Force the mask use to be built from scalars instead. */ |
6202 | || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))) |
6203 | return false; |
6204 | |
6205 | if (dump_enabled_p ()) |
6206 | dump_printf_loc (MSG_NOTE, vect_location, |
6207 | "Building vector operands of %p from scalars instead\n" , |
6208 | (void *) node); |
6209 | |
6210 | /* Don't remove and free the child nodes here, since they could be |
6211 | referenced by other structures. The analysis and scheduling phases |
6212 | (need to) ignore child nodes of anything that isn't vect_internal_def. */ |
6213 | unsigned int group_size = SLP_TREE_LANES (node); |
6214 | SLP_TREE_DEF_TYPE (node) = vect_external_def; |
6215 | /* Invariants get their vector type from the uses. */ |
6216 | SLP_TREE_VECTYPE (node) = NULL_TREE; |
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6218 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
6219 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
6220 | { |
6221 | tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt); |
6222 | SLP_TREE_SCALAR_OPS (node)[i] = lhs; |
6223 | } |
6224 | return true; |
6225 | } |
6226 | |
6227 | /* Return true if all elements of the slice are the same. */ |
6228 | bool |
6229 | vect_scalar_ops_slice::all_same_p () const |
6230 | { |
6231 | for (unsigned int i = 1; i < length; ++i) |
    if (!operand_equal_p (op (0), op (i)))
6233 | return false; |
6234 | return true; |
6235 | } |
6236 | |
6237 | hashval_t |
6238 | vect_scalar_ops_slice_hash::hash (const value_type &s) |
6239 | { |
6240 | hashval_t hash = 0; |
6241 | for (unsigned i = 0; i < s.length; ++i) |
    hash = iterative_hash_expr (s.op (i), hash);
6243 | return hash; |
6244 | } |
6245 | |
6246 | bool |
6247 | vect_scalar_ops_slice_hash::equal (const value_type &s1, |
6248 | const compare_type &s2) |
6249 | { |
6250 | if (s1.length != s2.length) |
6251 | return false; |
6252 | for (unsigned i = 0; i < s1.length; ++i) |
6253 | if (!operand_equal_p (s1.op (i), s2.op (i))) |
6254 | return false; |
6255 | return true; |
6256 | } |
6257 | |
6258 | /* Compute the prologue cost for invariant or constant operands represented |
6259 | by NODE. */ |
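
/* For example (illustrative): a two-lane node with scalar ops {x, x}
   built into V4SI vectors produces {x, x, x, x} and is costed as a
   single scalar_to_vec splat; {x, y} repeated as {x, y, x, y} is costed
   once as vec_construct since every generated vector is the same, and
   all-constant nodes are costed as a vector_load from the constant
   pool.  */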
6260 | |
6261 | static void |
6262 | vect_prologue_cost_for_slp (slp_tree node, |
6263 | stmt_vector_for_cost *cost_vec) |
6264 | { |
6265 | /* There's a special case of an existing vector, that costs nothing. */ |
6266 | if (SLP_TREE_SCALAR_OPS (node).length () == 0 |
6267 | && !SLP_TREE_VEC_DEFS (node).is_empty ()) |
6268 | return; |
6269 | /* Without looking at the actual initializer a vector of |
6270 | constants can be implemented as load from the constant pool. |
6271 | When all elements are the same we can use a splat. */ |
6272 | tree vectype = SLP_TREE_VECTYPE (node); |
6273 | unsigned group_size = SLP_TREE_SCALAR_OPS (node).length (); |
6274 | unsigned HOST_WIDE_INT const_nunits; |
6275 | unsigned nelt_limit; |
6276 | auto ops = &SLP_TREE_SCALAR_OPS (node); |
6277 | auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); |
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      nelt_limit = const_nunits;
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
	if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
	  starts.quick_push (i * const_nunits);
6286 | } |
6287 | else |
6288 | { |
6289 | /* If either the vector has variable length or the vectors |
6290 | are composed of repeated whole groups we only need to |
6291 | cost construction once. All vectors will be the same. */ |
6292 | nelt_limit = group_size; |
      starts.quick_push (0);
6294 | } |
6295 | /* ??? We're just tracking whether vectors in a single node are the same. |
6296 | Ideally we'd do something more global. */ |
6297 | bool passed = false; |
6298 | for (unsigned int start : starts) |
6299 | { |
6300 | vect_cost_for_stmt kind; |
6301 | if (SLP_TREE_DEF_TYPE (node) == vect_constant_def) |
6302 | kind = vector_load; |
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6304 | kind = scalar_to_vec; |
6305 | else |
6306 | kind = vec_construct; |
6307 | /* The target cost hook has no idea which part of the SLP node |
6308 | we are costing so avoid passing it down more than once. Pass |
6309 | it to the first vec_construct or scalar_to_vec part since for those |
6310 | the x86 backend tries to account for GPR to XMM register moves. */ |
6311 | record_stmt_cost (cost_vec, 1, kind, |
6312 | (kind != vector_load && !passed) ? node : nullptr, |
6313 | vectype, 0, vect_prologue); |
6314 | if (kind != vector_load) |
6315 | passed = true; |
6316 | } |
6317 | } |
6318 | |
6319 | /* Analyze statements contained in SLP tree NODE after recursively analyzing |
6320 | the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE. |
6321 | |
6322 | Return true if the operations are supported. */ |
6323 | |
6324 | static bool |
6325 | vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, |
6326 | slp_instance node_instance, |
6327 | hash_set<slp_tree> &visited_set, |
6328 | vec<slp_tree> &visited_vec, |
6329 | stmt_vector_for_cost *cost_vec) |
6330 | { |
6331 | int i, j; |
6332 | slp_tree child; |
6333 | |
6334 | /* Assume we can code-generate all invariants. */ |
6335 | if (!node |
6336 | || SLP_TREE_DEF_TYPE (node) == vect_constant_def |
6337 | || SLP_TREE_DEF_TYPE (node) == vect_external_def) |
6338 | return true; |
6339 | |
6340 | if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def) |
6341 | { |
6342 | if (dump_enabled_p ()) |
6343 | dump_printf_loc (MSG_NOTE, vect_location, |
6344 | "Failed cyclic SLP reference in %p\n" , (void *) node); |
6345 | return false; |
6346 | } |
6347 | gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def); |
6348 | |
6349 | /* If we already analyzed the exact same set of scalar stmts we're done. |
6350 | We share the generated vector stmts for those. */ |
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);
6354 | |
6355 | bool res = true; |
6356 | unsigned visited_rec_start = visited_vec.length (); |
6357 | unsigned cost_vec_rec_start = cost_vec->length (); |
6358 | bool seen_non_constant_child = false; |
6359 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6360 | { |
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6362 | visited_set, visited_vec, |
6363 | cost_vec); |
6364 | if (!res) |
6365 | break; |
6366 | if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def) |
6367 | seen_non_constant_child = true; |
6368 | } |
6369 | /* We're having difficulties scheduling nodes with just constant |
6370 | operands and no scalar stmts since we then cannot compute a stmt |
6371 | insertion place. */ |
6372 | if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ()) |
6373 | { |
6374 | if (dump_enabled_p ()) |
6375 | dump_printf_loc (MSG_NOTE, vect_location, |
6376 | "Cannot vectorize all-constant op node %p\n" , |
6377 | (void *) node); |
6378 | res = false; |
6379 | } |
6380 | |
6381 | if (res) |
6382 | res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance, |
6383 | cost_vec); |
6384 | /* If analysis failed we have to pop all recursive visited nodes |
6385 | plus ourselves. */ |
6386 | if (!res) |
6387 | { |
6388 | while (visited_vec.length () >= visited_rec_start) |
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
6391 | } |
6392 | |
  /* When the node can be vectorized, cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
6395 | vectorizable_* calls to nail down the invariant nodes vector type |
6396 | and possibly unshare it if it needs a different vector type than |
6397 | other referrers. */ |
6398 | if (res) |
6399 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) |
6400 | if (child |
6401 | && (SLP_TREE_DEF_TYPE (child) == vect_constant_def |
6402 | || SLP_TREE_DEF_TYPE (child) == vect_external_def) |
6403 | /* Perform usual caching, note code-generation still |
6404 | code-gens these nodes multiple times but we expect |
6405 | to CSE them later. */ |
	    && !visited_set.add (child))
	  {
	    visited_vec.safe_push (child);
6409 | /* ??? After auditing more code paths make a "default" |
6410 | and push the vector type from NODE to all children |
6411 | if it is not already set. */ |
6412 | /* Compute the number of vectors to be generated. */ |
6413 | tree vector_type = SLP_TREE_VECTYPE (child); |
6414 | if (!vector_type) |
6415 | { |
6416 | /* For shifts with a scalar argument we don't need |
6417 | to cost or code-generate anything. |
	       ??? Represent this more explicitly.  */
6419 | gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) |
6420 | == shift_vec_info_type) |
6421 | && j == 1); |
6422 | continue; |
6423 | } |
6424 | unsigned group_size = SLP_TREE_LANES (child); |
6425 | poly_uint64 vf = 1; |
	    if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6427 | vf = loop_vinfo->vectorization_factor; |
6428 | SLP_TREE_NUMBER_OF_VEC_STMTS (child) |
	      = vect_get_num_vectors (vf * group_size, vector_type);
6430 | /* And cost them. */ |
	    vect_prologue_cost_for_slp (child, cost_vec);
6432 | } |
6433 | |
6434 | /* If this node or any of its children can't be vectorized, try pruning |
6435 | the tree here rather than felling the whole thing. */ |
6436 | if (!res && vect_slp_convert_to_external (vinfo, node, node_instance)) |
6437 | { |
6438 | /* We'll need to revisit this for invariant costing and number |
6439 | of vectorized stmt setting. */ |
6440 | res = true; |
6441 | } |
6442 | |
6443 | return res; |
6444 | } |
6445 | |
6446 | /* Given a definition DEF, analyze if it will have any live scalar use after |
6447 | performing SLP vectorization whose information is represented by BB_VINFO, |
6448 | and record result into hash map SCALAR_USE_MAP as cache for later fast |
6449 | check. If recursion DEPTH exceeds a limit, stop analysis and make a |
6450 | conservative assumption. Return 0 if no scalar use, 1 if there is, -1 |
6451 | means recursion is limited. */ |
6452 | |
6453 | static int |
6454 | vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def, |
6455 | hash_map<tree, int> &scalar_use_map, |
6456 | int depth = 0) |
6457 | { |
6458 | const int depth_limit = 2; |
6459 | imm_use_iterator use_iter; |
6460 | gimple *use_stmt; |
6461 | |
  if (int *res = scalar_use_map.get (def))
6463 | return *res; |
6464 | |
6465 | int scalar_use = 1; |
6466 | |
6467 | FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def) |
6468 | { |
      if (is_gimple_debug (use_stmt))
6470 | continue; |
6471 | |
6472 | stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt); |
6473 | |
6474 | if (!use_stmt_info) |
6475 | break; |
6476 | |
6477 | if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))) |
6478 | continue; |
6479 | |
      /* Do not step forward when encountering a PHI statement, since it
	 may involve a cyclic reference and cause infinite recursive
	 invocation.  */
      if (gimple_code (use_stmt) == GIMPLE_PHI)
	break;
6484 | |
6485 | /* When pattern recognition is involved, a statement whose definition is |
	 consumed in some pattern may not be included in the final replacement
6487 | pattern statements, so would be skipped when building SLP graph. |
6488 | |
6489 | * Original |
6490 | char a_c = *(char *) a; |
6491 | char b_c = *(char *) b; |
6492 | unsigned short a_s = (unsigned short) a_c; |
6493 | int a_i = (int) a_s; |
6494 | int b_i = (int) b_c; |
6495 | int r_i = a_i - b_i; |
6496 | |
6497 | * After pattern replacement |
6498 | a_s = (unsigned short) a_c; |
6499 | a_i = (int) a_s; |
6500 | |
6501 | patt_b_s = (unsigned short) b_c; // b_i = (int) b_c |
6502 | patt_b_i = (int) patt_b_s; // b_i = (int) b_c |
6503 | |
6504 | patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i |
6505 | patt_r_i = (int) patt_r_s; // r_i = a_i - b_i |
6506 | |
	 The definitions of a_i (original statement) and b_i (pattern
	 statement) are related to, but actually not part of, the
	 widen_minus pattern.  Vectorizing the pattern does not cause
	 these definition statements to be marked as PURE_SLP.  For this
	 case, we need to recursively check whether their uses are all
	 absorbed into vectorized code.  But there is an exception: some
	 use may participate in a vectorized operation via an external
	 SLP node containing that use as an element.  The parameter
	 "scalar_use_map" tags such SSA names as having a scalar use in
	 advance.  */
6516 | tree lhs = gimple_get_lhs (use_stmt); |
6517 | |
6518 | if (!lhs || TREE_CODE (lhs) != SSA_NAME) |
6519 | break; |
6520 | |
6521 | if (depth_limit && depth >= depth_limit) |
6522 | return -1; |
6523 | |
      if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
						depth + 1)))
6526 | break; |
6527 | } |
6528 | |
  if (end_imm_use_stmt_p (&use_iter))
6530 | scalar_use = 0; |
6531 | |
6532 | /* If recursion is limited, do not cache result for non-root defs. */ |
6533 | if (!depth || scalar_use >= 0) |
6534 | { |
      bool added = scalar_use_map.put (def, scalar_use);
6536 | gcc_assert (!added); |
6537 | } |
6538 | |
6539 | return scalar_use; |
6540 | } |
6541 | |
6542 | /* Mark lanes of NODE that are live outside of the basic-block vectorized |
6543 | region and that can be vectorized using vectorizable_live_operation |
   with STMT_VINFO_LIVE_P.  Live operations that are not handled will
   cause the scalar code computing them to be retained.  */
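
/* For example (illustrative): if lane 1 of a vectorized node defines a
   value that is also returned from the function, that use is outside the
   vectorized region; the lane is marked STMT_VINFO_LIVE_P and a lane
   extract is emitted by vectorizable_live_operation instead of keeping
   the scalar computation.  */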
6546 | |
6547 | static void |
6548 | vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node, |
6549 | slp_instance instance, |
6550 | stmt_vector_for_cost *cost_vec, |
6551 | hash_map<tree, int> &scalar_use_map, |
6552 | hash_set<stmt_vec_info> &svisited, |
6553 | hash_set<slp_tree> &visited) |
6554 | { |
  if (visited.add (node))
6556 | return; |
6557 | |
6558 | unsigned i; |
6559 | stmt_vec_info stmt_info; |
6560 | stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node); |
6561 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
6562 | { |
      if (svisited.contains (stmt_info))
6564 | continue; |
6565 | stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
6566 | if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info) |
6567 | && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info) |
6568 | /* Only the pattern root stmt computes the original scalar value. */ |
6569 | continue; |
6570 | bool mark_visited = true; |
6571 | gimple *orig_stmt = orig_stmt_info->stmt; |
6572 | ssa_op_iter op_iter; |
6573 | def_operand_p def_p; |
6574 | FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF) |
6575 | { |
6576 | if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p), |
6577 | scalar_use_map)) |
6578 | { |
6579 | STMT_VINFO_LIVE_P (stmt_info) = true; |
6580 | if (vectorizable_live_operation (bb_vinfo, stmt_info, node, |
6581 | instance, i, false, cost_vec)) |
6582 | /* ??? So we know we can vectorize the live stmt from one SLP |
6583 | node. If we cannot do so from all or none consistently |
6584 | we'd have to record which SLP node (and lane) we want to |
6585 | use for the live operation. So make sure we can |
6586 | code-generate from all nodes. */ |
6587 | mark_visited = false; |
6588 | else |
6589 | STMT_VINFO_LIVE_P (stmt_info) = false; |
6590 | } |
6591 | |
6592 | /* We have to verify whether we can insert the lane extract |
6593 | before all uses. The following is a conservative approximation. |
6594 | We cannot put this into vectorizable_live_operation because |
6595 | iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT |
6596 | doesn't work. |
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem, leaves we construct
	     from scalars are vectorized after the last scalar def.
6600 | ??? If we'd actually compute the insert location during |
6601 | analysis we could use sth less conservative than the last |
6602 | scalar stmt in the node for the dominance check. */ |
6603 | /* ??? What remains is "live" uses in vector CTORs in the same |
6604 | SLP graph which is where those uses can end up code-generated |
6605 | right after their definition instead of close to their original |
6606 | use. But that would restrict us to code-generate lane-extracts |
6607 | from the latest stmt in a node. So we compensate for this |
6608 | during code-generation, simply not replacing uses for those |
6609 | hopefully rare cases. */ |
6610 | imm_use_iterator use_iter; |
6611 | gimple *use_stmt; |
6612 | stmt_vec_info use_stmt_info; |
6613 | |
6614 | if (STMT_VINFO_LIVE_P (stmt_info)) |
6615 | FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) |
	  if (!is_gimple_debug (use_stmt)
6617 | && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt)) |
6618 | || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))) |
6619 | && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt)) |
6620 | { |
6621 | if (dump_enabled_p ()) |
6622 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6623 | "Cannot determine insertion place for " |
6624 | "lane extract\n" ); |
6625 | STMT_VINFO_LIVE_P (stmt_info) = false; |
6626 | mark_visited = true; |
6627 | } |
6628 | } |
6629 | if (mark_visited) |
	svisited.add (stmt_info);
6631 | } |
6632 | |
6633 | slp_tree child; |
6634 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6635 | if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6637 | scalar_use_map, svisited, visited); |
6638 | } |
6639 | |
6640 | /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that |
6641 | are live outside of the basic-block vectorized region and that can be |
6642 | vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */ |
6643 | |
6644 | static void |
6645 | vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo) |
6646 | { |
6647 | if (bb_vinfo->slp_instances.is_empty ()) |
6648 | return; |
6649 | |
6650 | hash_set<stmt_vec_info> svisited; |
6651 | hash_set<slp_tree> visited; |
6652 | hash_map<tree, int> scalar_use_map; |
6653 | auto_vec<slp_tree> worklist; |
6654 | |
6655 | for (slp_instance instance : bb_vinfo->slp_instances) |
6656 | { |
6657 | if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc) |
6658 | for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance)) |
6659 | if (TREE_CODE (op) == SSA_NAME) |
	    scalar_use_map.put (op, 1);
6661 | if (!visited.add (SLP_INSTANCE_TREE (instance))) |
6662 | worklist.safe_push (SLP_INSTANCE_TREE (instance)); |
6663 | } |
6664 | |
6665 | do |
6666 | { |
6667 | slp_tree node = worklist.pop (); |
6668 | |
6669 | if (SLP_TREE_DEF_TYPE (node) == vect_external_def) |
6670 | { |
6671 | for (tree op : SLP_TREE_SCALAR_OPS (node)) |
6672 | if (TREE_CODE (op) == SSA_NAME) |
	      scalar_use_map.put (op, 1);
6674 | } |
6675 | else |
6676 | { |
6677 | for (slp_tree child : SLP_TREE_CHILDREN (node)) |
	    if (child && !visited.add (child))
	      worklist.safe_push (child);
6680 | } |
6681 | } |
6682 | while (!worklist.is_empty ()); |
6683 | |
6684 | visited.empty (); |
6685 | |
6686 | for (slp_instance instance : bb_vinfo->slp_instances) |
6687 | { |
6688 | vect_location = instance->location (); |
6689 | vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance), |
				   instance, &instance->cost_vec,
6691 | scalar_use_map, svisited, visited); |
6692 | } |
6693 | } |
6694 | |
6695 | /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */ |
6696 | |
6697 | static bool |
6698 | vectorizable_bb_reduc_epilogue (slp_instance instance, |
6699 | stmt_vector_for_cost *cost_vec) |
6700 | { |
  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6703 | if (reduc_code == MINUS_EXPR) |
6704 | reduc_code = PLUS_EXPR; |
6705 | internal_fn reduc_fn; |
6706 | tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance)); |
6707 | if (!vectype |
6708 | || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn) |
6709 | || reduc_fn == IFN_LAST |
6710 | || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH) |
6711 | || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)), |
6712 | TREE_TYPE (vectype))) |
6713 | { |
6714 | if (dump_enabled_p ()) |
6715 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6716 | "not vectorized: basic block reduction epilogue " |
6717 | "operation unsupported.\n" ); |
6718 | return false; |
6719 | } |
6720 | |
6721 | /* There's no way to cost a horizontal vector reduction via REDUC_FN so |
6722 | cost log2 vector operations plus shuffles and one extraction. */ |
  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6724 | record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0], |
6725 | vectype, 0, vect_body); |
6726 | record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0], |
6727 | vectype, 0, vect_body); |
6728 | record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0], |
6729 | vectype, 0, vect_body); |
6730 | |
6731 | /* Since we replace all stmts of a possibly longer scalar reduction |
6732 | chain account for the extra scalar stmts for that. */ |
  record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
		    instance->root_stmts[0], 0, vect_body);
6735 | return true; |
6736 | } |
6737 | |
6738 | /* Prune from ROOTS all stmts that are computed as part of lanes of NODE |
6739 | and recurse to children. */ |
6740 | |
6741 | static void |
6742 | vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots, |
6743 | hash_set<slp_tree> &visited) |
6744 | { |
6745 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def |
      || visited.add (node))
6747 | return; |
6748 | |
6749 | stmt_vec_info stmt; |
6750 | unsigned i; |
6751 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt) |
    roots.remove (vect_orig_stmt (stmt));
6753 | |
6754 | slp_tree child; |
6755 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6756 | if (child) |
      vect_slp_prune_covered_roots (child, roots, visited);
6758 | } |
6759 | |
6760 | /* Analyze statements in SLP instances of VINFO. Return true if the |
6761 | operations are supported. */ |
6762 | |
6763 | bool |
6764 | vect_slp_analyze_operations (vec_info *vinfo) |
6765 | { |
6766 | slp_instance instance; |
6767 | int i; |
6768 | |
6769 | DUMP_VECT_SCOPE ("vect_slp_analyze_operations" ); |
6770 | |
6771 | hash_set<slp_tree> visited; |
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6773 | { |
6774 | auto_vec<slp_tree> visited_vec; |
6775 | stmt_vector_for_cost cost_vec; |
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
6783 | /* CTOR instances require vectorized defs for the SLP tree root. */ |
6784 | || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor |
6785 | && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) |
6786 | != vect_internal_def |
6787 | /* Make sure we vectorized with the expected type. */ |
6788 | || !useless_type_conversion_p |
6789 | (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1 |
6790 | (instance->root_stmts[0]->stmt))), |
6791 | TREE_TYPE (SLP_TREE_VECTYPE |
6792 | (SLP_INSTANCE_TREE (instance)))))) |
6793 | /* Check we can vectorize the reduction. */ |
6794 | || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc |
6795 | && !vectorizable_bb_reduc_epilogue (instance, cost_vec: &cost_vec))) |
6796 | { |
6797 | slp_tree node = SLP_INSTANCE_TREE (instance); |
6798 | stmt_vec_info stmt_info; |
6799 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
6800 | stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0]; |
6801 | else |
6802 | stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
6803 | if (dump_enabled_p ()) |
6804 | dump_printf_loc (MSG_NOTE, vect_location, |
6805 | "removing SLP instance operations starting from: %G" , |
6806 | stmt_info->stmt); |
6807 | vect_free_slp_instance (instance); |
	  vinfo->slp_instances.ordered_remove (i);
6809 | cost_vec.release (); |
6810 | while (!visited_vec.is_empty ()) |
	    visited.remove (visited_vec.pop ());
6812 | } |
6813 | else |
6814 | { |
6815 | i++; |
	  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
	    {
	      add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6819 | cost_vec.release (); |
6820 | } |
6821 | else |
6822 | /* For BB vectorization remember the SLP graph entry |
6823 | cost for later. */ |
6824 | instance->cost_vec = cost_vec; |
6825 | } |
6826 | } |
6827 | |
6828 | /* Now look for SLP instances with a root that are covered by other |
6829 | instances and remove them. */ |
6830 | hash_set<stmt_vec_info> roots; |
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6832 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
6833 | roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]); |
6834 | if (!roots.is_empty ()) |
6835 | { |
6836 | visited.empty (); |
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6838 | vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots, |
6839 | visited); |
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6841 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty () |
6842 | && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0])) |
6843 | { |
6844 | stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0]; |
6845 | if (dump_enabled_p ()) |
6846 | dump_printf_loc (MSG_NOTE, vect_location, |
6847 | "removing SLP instance operations starting " |
6848 | "from: %G" , root->stmt); |
6849 | vect_free_slp_instance (instance); |
	    vinfo->slp_instances.ordered_remove (i);
6851 | } |
6852 | else |
6853 | ++i; |
6854 | } |
6855 | |
6856 | /* Compute vectorizable live stmts. */ |
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6858 | vect_bb_slp_mark_live_stmts (bb_vinfo); |
6859 | |
6860 | return !vinfo->slp_instances.is_empty (); |
6861 | } |
6862 | |
/* Get the SLP instance leader from INSTANCE_LEADER, transitively
   following and thereby compressing any chain of leaders.  */
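
/* For instance (illustrative): with INSTANCE_LEADER containing A -> B,
   B -> C and C -> C, get_ultimate_leader (A) returns C and rewrites the
   entries for A and B to point directly at C, so later lookups are
   constant time: the usual union-find path compression.  */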
6865 | |
6866 | static slp_instance |
6867 | get_ultimate_leader (slp_instance instance, |
6868 | hash_map<slp_instance, slp_instance> &instance_leader) |
6869 | { |
6870 | auto_vec<slp_instance *, 8> chain; |
6871 | slp_instance *tem; |
  while (*(tem = instance_leader.get (instance)) != instance)
    {
      chain.safe_push (tem);
6875 | instance = *tem; |
6876 | } |
6877 | while (!chain.is_empty ()) |
6878 | *chain.pop () = instance; |
6879 | return instance; |
6880 | } |
6881 | |
6882 | namespace { |
6883 | /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in |
6884 | KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping |
6885 | for KEY. Return true if KEY was already in KEY_TO_INSTANCE. |
6886 | |
6887 | INSTANCE_LEADER is as for get_ultimate_leader. */ |
6888 | |
6889 | template<typename T> |
6890 | bool |
6891 | vect_map_to_instance (slp_instance instance, T key, |
6892 | hash_map<T, slp_instance> &key_to_instance, |
6893 | hash_map<slp_instance, slp_instance> &instance_leader) |
6894 | { |
6895 | bool existed_p; |
6896 | slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p); |
6897 | if (!existed_p) |
6898 | ; |
6899 | else if (key_instance != instance) |
6900 | { |
6901 | /* If we're running into a previously marked key make us the |
6902 | leader of the current ultimate leader. This keeps the |
6903 | leader chain acyclic and works even when the current instance |
6904 | connects two previously independent graph parts. */ |
6905 | slp_instance key_leader |
	= get_ultimate_leader (key_instance, instance_leader);
      if (key_leader != instance)
	instance_leader.put (key_leader, instance);
6909 | } |
6910 | key_instance = instance; |
6911 | return existed_p; |
6912 | } |
6913 | } |
6914 | |
6915 | /* Worker of vect_bb_partition_graph, recurse on NODE. */ |
6916 | |
6917 | static void |
6918 | vect_bb_partition_graph_r (bb_vec_info bb_vinfo, |
6919 | slp_instance instance, slp_tree node, |
6920 | hash_map<stmt_vec_info, slp_instance> &stmt_to_instance, |
6921 | hash_map<slp_tree, slp_instance> &node_to_instance, |
6922 | hash_map<slp_instance, slp_instance> &instance_leader) |
6923 | { |
6924 | stmt_vec_info stmt_info; |
6925 | unsigned i; |
6926 | |
6927 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
    vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6929 | instance_leader); |
6930 | |
  if (vect_map_to_instance (instance, node, node_to_instance,
6932 | instance_leader)) |
6933 | return; |
6934 | |
6935 | slp_tree child; |
6936 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6937 | if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6939 | node_to_instance, instance_leader); |
6940 | } |
6941 | |
6942 | /* Partition the SLP graph into pieces that can be costed independently. */ |
6943 | |
6944 | static void |
6945 | vect_bb_partition_graph (bb_vec_info bb_vinfo) |
6946 | { |
6947 | DUMP_VECT_SCOPE ("vect_bb_partition_graph" ); |
6948 | |
6949 | /* First walk the SLP graph assigning each involved scalar stmt a |
6950 | corresponding SLP graph entry and upon visiting a previously |
     marked stmt, make the stmt's leader the current SLP graph entry.  */
6952 | hash_map<stmt_vec_info, slp_instance> stmt_to_instance; |
6953 | hash_map<slp_tree, slp_instance> node_to_instance; |
6954 | hash_map<slp_instance, slp_instance> instance_leader; |
6955 | slp_instance instance; |
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      instance_leader.put (instance, instance);
6959 | vect_bb_partition_graph_r (bb_vinfo, |
6960 | instance, SLP_INSTANCE_TREE (instance), |
6961 | stmt_to_instance, node_to_instance, |
6962 | instance_leader); |
6963 | } |
6964 | |
6965 | /* Then collect entries to each independent subgraph. */ |
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6967 | { |
6968 | slp_instance leader = get_ultimate_leader (instance, instance_leader); |
      leader->subgraph_entries.safe_push (instance);
6970 | if (dump_enabled_p () |
6971 | && leader != instance) |
6972 | dump_printf_loc (MSG_NOTE, vect_location, |
6973 | "instance %p is leader of %p\n" , |
6974 | (void *) leader, (void *) instance); |
6975 | } |
6976 | } |
6977 | |
6978 | /* Compute the set of scalar stmts participating in internal and external |
6979 | nodes. */ |
6980 | |
6981 | static void |
6982 | vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node, |
6983 | hash_set<slp_tree> &visited, |
6984 | hash_set<stmt_vec_info> &vstmts, |
6985 | hash_set<stmt_vec_info> &estmts) |
6986 | { |
6987 | int i; |
6988 | stmt_vec_info stmt_info; |
6989 | slp_tree child; |
6990 | |
  if (visited.add (node))
6992 | return; |
6993 | |
6994 | if (SLP_TREE_DEF_TYPE (node) == vect_internal_def) |
6995 | { |
6996 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
	vstmts.add (stmt_info);
6998 | |
6999 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
7000 | if (child) |
	  vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
7002 | vstmts, estmts); |
7003 | } |
7004 | else |
7005 | for (tree def : SLP_TREE_SCALAR_OPS (node)) |
7006 | { |
7007 | stmt_vec_info def_stmt = vinfo->lookup_def (def); |
7008 | if (def_stmt) |
	  estmts.add (def_stmt);
7010 | } |
7011 | } |
7012 | |
7013 | |
/* Compute the scalar cost of the SLP node NODE and its children,
   recording it in COST_VEC.  Do not account defs that are marked in
   LIFE, and update LIFE according to uses of NODE.  */
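
/* A sketch of the LIFE protocol (illustrative): LIFE[i] set on entry
   means lane i's scalar stmt is kept live anyway, so it is not costed;
   when a lane's def turns out to have a non-vectorized use, LIFE[i] is
   set here so that child lanes feeding only that lane are not costed
   either.  */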
7017 | |
7018 | static void |
7019 | vect_bb_slp_scalar_cost (vec_info *vinfo, |
7020 | slp_tree node, vec<bool, va_heap> *life, |
7021 | stmt_vector_for_cost *cost_vec, |
7022 | hash_set<stmt_vec_info> &vectorized_scalar_stmts, |
7023 | hash_set<slp_tree> &visited) |
7024 | { |
7025 | unsigned i; |
7026 | stmt_vec_info stmt_info; |
7027 | slp_tree child; |
7028 | |
  if (visited.add (node))
7030 | return; |
7031 | |
7032 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
7033 | { |
7034 | ssa_op_iter op_iter; |
7035 | def_operand_p def_p; |
7036 | |
7037 | if ((*life)[i]) |
7038 | continue; |
7039 | |
7040 | stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
7041 | gimple *orig_stmt = orig_stmt_info->stmt; |
7042 | |
7043 | /* If there is a non-vectorized use of the defs then the scalar |
7044 | stmt is kept live in which case we do not account it or any |
7045 | required defs in the SLP children in the scalar cost. This |
7046 | way we make the vectorization more costly when compared to |
7047 | the scalar cost. */ |
7048 | if (!STMT_VINFO_LIVE_P (stmt_info)) |
7049 | { |
7050 | auto_vec<gimple *, 8> worklist; |
7051 | hash_set<gimple *> *worklist_visited = NULL; |
	  worklist.quick_push (orig_stmt);
7053 | do |
7054 | { |
7055 | gimple *work_stmt = worklist.pop (); |
7056 | FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF) |
7057 | { |
7058 | imm_use_iterator use_iter; |
7059 | gimple *use_stmt; |
7060 | FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, |
7061 | DEF_FROM_PTR (def_p)) |
		    if (!is_gimple_debug (use_stmt))
7063 | { |
7064 | stmt_vec_info use_stmt_info |
7065 | = vinfo->lookup_stmt (use_stmt); |
7066 | if (!use_stmt_info |
			    || !vectorized_scalar_stmts.contains (use_stmt_info))
7068 | { |
7069 | if (use_stmt_info |
7070 | && STMT_VINFO_IN_PATTERN_P (use_stmt_info)) |
7071 | { |
7072 | /* For stmts participating in patterns we have |
7073 | to check its uses recursively. */ |
7074 | if (!worklist_visited) |
7075 | worklist_visited = new hash_set<gimple *> (); |
			      if (!worklist_visited->add (use_stmt))
				worklist.safe_push (use_stmt);
7078 | continue; |
7079 | } |
7080 | (*life)[i] = true; |
7081 | goto next_lane; |
7082 | } |
7083 | } |
7084 | } |
7085 | } |
7086 | while (!worklist.is_empty ()); |
7087 | next_lane: |
7088 | if (worklist_visited) |
7089 | delete worklist_visited; |
7090 | if ((*life)[i]) |
7091 | continue; |
7092 | } |
7093 | |
7094 | /* Count scalar stmts only once. */ |
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);
7098 | |
7099 | vect_cost_for_stmt kind; |
7100 | if (STMT_VINFO_DATA_REF (orig_stmt_info)) |
7101 | { |
7102 | if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info))) |
7103 | kind = scalar_load; |
7104 | else |
7105 | kind = scalar_store; |
7106 | } |
7107 | else if (vect_nop_conversion_p (orig_stmt_info)) |
7108 | continue; |
7109 | /* For single-argument PHIs assume coalescing which means zero cost |
7110 | for the scalar and the vector PHIs. This avoids artificially |
7111 | favoring the vector path (but may pessimize it in some cases). */ |
      else if (is_a <gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7115 | continue; |
7116 | else |
7117 | kind = scalar_stmt; |
7118 | record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, |
7119 | SLP_TREE_VECTYPE (node), 0, vect_body); |
7120 | } |
7121 | |
7122 | auto_vec<bool, 20> subtree_life; |
7123 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
7124 | { |
7125 | if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
7126 | { |
7127 | /* Do not directly pass LIFE to the recursive call, copy it to |
7128 | confine changes in the callee to the current child/subtree. */ |
7129 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
7130 | { |
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7132 | for (unsigned j = 0; |
7133 | j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j) |
7134 | { |
7135 | auto perm = SLP_TREE_LANE_PERMUTATION (node)[j]; |
7136 | if (perm.first == i) |
7137 | subtree_life[perm.second] = (*life)[j]; |
7138 | } |
7139 | } |
7140 | else |
7141 | { |
7142 | gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child)); |
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   vectorized_scalar_stmts, visited);
	  subtree_life.truncate (0);
7148 | } |
7149 | } |
7150 | } |
7151 | |
7152 | /* Comparator for the loop-index sorted cost vectors. */ |
7153 | |
7154 | static int |
7155 | li_cost_vec_cmp (const void *a_, const void *b_) |
7156 | { |
7157 | auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_; |
7158 | auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_; |
7159 | if (a->first < b->first) |
7160 | return -1; |
7161 | else if (a->first == b->first) |
7162 | return 0; |
7163 | return 1; |
7164 | } |
7165 | |
7166 | /* Check if vectorization of the basic block is profitable for the |
7167 | subgraph denoted by SLP_INSTANCES. */ |
7168 | |
7169 | static bool |
7170 | vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, |
7171 | vec<slp_instance> slp_instances, |
7172 | loop_p orig_loop) |
7173 | { |
7174 | slp_instance instance; |
7175 | int i; |
7176 | unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0; |
7177 | unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0; |
7178 | |
7179 | if (dump_enabled_p ()) |
7180 | { |
7181 | dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n" ); |
7182 | hash_set<slp_tree> visited; |
7183 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
	vect_print_slp_graph (MSG_NOTE, vect_location,
7185 | SLP_INSTANCE_TREE (instance), visited); |
7186 | } |
7187 | |
7188 | /* Compute the set of scalar stmts we know will go away 'locally' when |
7189 | vectorizing. This used to be tracked with just PURE_SLP_STMT but that's |
7190 | not accurate for nodes promoted extern late or for scalar stmts that |
7191 | are used both in extern defs and in vectorized defs. */ |
7192 | hash_set<stmt_vec_info> vectorized_scalar_stmts; |
7193 | hash_set<stmt_vec_info> scalar_stmts_in_externs; |
7194 | hash_set<slp_tree> visited; |
7195 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
7196 | { |
      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
					       SLP_INSTANCE_TREE (instance),
					       visited,
					       vectorized_scalar_stmts,
					       scalar_stmts_in_externs);
      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
	vectorized_scalar_stmts.add (rstmt);
7204 | } |
  /* Scalar stmts used as defs in external nodes need to be preserved, so
7206 | remove them from vectorized_scalar_stmts. */ |
7207 | for (stmt_vec_info stmt : scalar_stmts_in_externs) |
    vectorized_scalar_stmts.remove (stmt);
7209 | |
7210 | /* Calculate scalar cost and sum the cost for the vector stmts |
7211 | previously collected. */ |
7212 | stmt_vector_for_cost scalar_costs = vNULL; |
7213 | stmt_vector_for_cost vector_costs = vNULL; |
7214 | visited.empty (); |
7215 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
7216 | { |
7217 | auto_vec<bool, 20> life; |
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
			      true);
7220 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  scalar_stmt,
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, vectorized_scalar_stmts,
			       visited);
      vector_costs.safe_splice (instance->cost_vec);
7230 | instance->cost_vec.release (); |
7231 | } |
7232 | |
7233 | if (dump_enabled_p ()) |
7234 | dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n" ); |
7235 | |
  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may not be entered or may execute an arbitrary
     number of iterations (??? static information can provide more
     precise info here) which means we can simply cost each containing
     loop's stmts separately.  */
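  /* Illustrative example: for a subgraph spanning loops 1 and 2 the
     vector cost must not exceed the scalar cost for the loop-1 stmts and
     for the loop-2 stmts independently; a saving of 2 in loop 1 cannot
     pay for a loss of 1 in loop 2 since the loops' trip counts are
     unknown.  */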
7242 | |
7243 | /* First produce cost vectors sorted by loop index. */ |
7244 | auto_vec<std::pair<unsigned, stmt_info_for_cost *> > |
7245 | li_scalar_costs (scalar_costs.length ()); |
7246 | auto_vec<std::pair<unsigned, stmt_info_for_cost *> > |
7247 | li_vector_costs (vector_costs.length ()); |
7248 | stmt_info_for_cost *cost; |
7249 | FOR_EACH_VEC_ELT (scalar_costs, i, cost) |
7250 | { |
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
7253 | } |
  /* Use an arbitrary loop from the scalar costs as a fallback in case the
     first vector_costs entry does not have a stmt_info associated with
     it.  */
7256 | unsigned l = li_scalar_costs[0].first; |
7257 | FOR_EACH_VEC_ELT (vector_costs, i, cost) |
7258 | { |
      /* We inherit the loop index from the previous COST; invariants,
	 externals and extracts immediately follow the cost for the
	 related stmt.  */
7261 | if (cost->stmt_info) |
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
7264 | } |
7265 | li_scalar_costs.qsort (li_cost_vec_cmp); |
7266 | li_vector_costs.qsort (li_cost_vec_cmp); |
7267 | |
7268 | /* Now cost the portions individually. */ |
7269 | unsigned vi = 0; |
7270 | unsigned si = 0; |
7271 | bool profitable = true; |
7272 | while (si < li_scalar_costs.length () |
7273 | && vi < li_vector_costs.length ()) |
7274 | { |
7275 | unsigned sl = li_scalar_costs[si].first; |
7276 | unsigned vl = li_vector_costs[vi].first; |
7277 | if (sl != vl) |
7278 | { |
7279 | if (dump_enabled_p ()) |
7280 | dump_printf_loc (MSG_NOTE, vect_location, |
7281 | "Scalar %d and vector %d loop part do not " |
7282 | "match up, skipping scalar part\n" , sl, vl); |
7283 | /* Skip the scalar part, assuming zero cost on the vector side. */ |
7284 | do |
7285 | { |
7286 | si++; |
7287 | } |
7288 | while (si < li_scalar_costs.length () |
7289 | && li_scalar_costs[si].first == sl); |
7290 | continue; |
7291 | } |
7292 | |
      class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7294 | do |
7295 | { |
	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7297 | si++; |
7298 | } |
7299 | while (si < li_scalar_costs.length () |
7300 | && li_scalar_costs[si].first == sl); |
7301 | unsigned dummy; |
      finish_cost (scalar_target_cost_data, nullptr,
		   &dummy, &scalar_cost, &dummy);
7304 | |
7305 | /* Complete the target-specific vector cost calculation. */ |
      class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7307 | do |
7308 | { |
	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7310 | vi++; |
7311 | } |
7312 | while (vi < li_vector_costs.length () |
7313 | && li_vector_costs[vi].first == vl); |
      finish_cost (vect_target_cost_data, scalar_target_cost_data,
		   &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7316 | delete scalar_target_cost_data; |
7317 | delete vect_target_cost_data; |
7318 | |
7319 | vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; |
7320 | |
7321 | if (dump_enabled_p ()) |
7322 | { |
7323 | dump_printf_loc (MSG_NOTE, vect_location, |
7324 | "Cost model analysis for part in loop %d:\n" , sl); |
7325 | dump_printf (MSG_NOTE, " Vector cost: %d\n" , |
7326 | vec_inside_cost + vec_outside_cost); |
7327 | dump_printf (MSG_NOTE, " Scalar cost: %d\n" , scalar_cost); |
7328 | } |
7329 | |
7330 | /* Vectorization is profitable if its cost is less than the cost of the
7331 | scalar version.  Note that we err on the vector side for equal cost
7332 | because the cost estimate is otherwise quite pessimistic (constant
7333 | uses are free on the scalar side but cost a load on the vector side,
7334 | for example).  */
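/* For instance, vec_inside_cost 6 plus vec_outside_cost 2 against a
scalar_cost of 8 keeps the subgraph (the tie errs on the vector side),
while a scalar_cost of 7 would reject it.  */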
7335 | if (vec_outside_cost + vec_inside_cost > scalar_cost) |
7336 | { |
7337 | profitable = false; |
7338 | break; |
7339 | } |
7340 | } |
7341 | if (profitable && vi < li_vector_costs.length ()) |
7342 | { |
7343 | if (dump_enabled_p ()) |
7344 | dump_printf_loc (MSG_NOTE, vect_location, |
7345 | "Excess vector cost for part in loop %d:\n" , |
7346 | li_vector_costs[vi].first); |
7347 | profitable = false; |
7348 | } |
7349 | |
7350 | /* Unset the visited flag.  This is delayed when the subgraph is profitable
7351 | and we process the loop for remaining unvectorized if-converted code. */ |
7352 | if (!orig_loop || !profitable) |
7353 | FOR_EACH_VEC_ELT (scalar_costs, i, cost) |
7354 | gimple_set_visited (stmt: cost->stmt_info->stmt, visited_p: false); |
7355 | |
7356 | scalar_costs.release (); |
7357 | vector_costs.release (); |
7358 | |
7359 | return profitable; |
7360 | } |
7361 | |
7362 | /* qsort comparator for lane defs. */ |
7363 | |
7364 | static int |
7365 | vld_cmp (const void *a_, const void *b_) |
7366 | { |
7367 | auto *a = (const std::pair<unsigned, tree> *)a_; |
7368 | auto *b = (const std::pair<unsigned, tree> *)b_; |
7369 | return a->first - b->first; |
7370 | } |
7371 | |
7372 | /* Return true if USE_STMT is a vector lane insert into VEC and set |
7373 | *THIS_LANE to the number of the inserted lane.  */
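/* E.g., assuming 32-bit vector elements, the following insert of
scalar_3 into lane 2 of veca_1 is matched, the bit position (rhs3)
being a constant multiple of the element size:
vecb_2 = BIT_INSERT_EXPR <veca_1, scalar_3, 64>;  */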
7374 | |
7375 | static bool |
7376 | vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane) |
7377 | { |
7378 | gassign *use_ass = dyn_cast <gassign *> (p: use_stmt); |
7379 | if (!use_ass |
7380 | || gimple_assign_rhs_code (gs: use_ass) != BIT_INSERT_EXPR |
7381 | || (vec |
7382 | ? gimple_assign_rhs1 (gs: use_ass) != vec |
7383 | : ((vec = gimple_assign_rhs1 (gs: use_ass)), false)) |
7384 | || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)), |
7385 | TREE_TYPE (gimple_assign_rhs2 (use_ass))) |
7386 | || !constant_multiple_p |
7387 | (a: tree_to_poly_uint64 (gimple_assign_rhs3 (gs: use_ass)), |
7388 | b: tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))), |
7389 | multiple: this_lane)) |
7390 | return false; |
7391 | return true; |
7392 | } |
7393 | |
7394 | /* Find any vectorizable constructors, lane-insert chains and reduction
7395 | chains in the region and record them as SLP instance roots.  */
7396 | |
7397 | static void |
7398 | vect_slp_check_for_roots (bb_vec_info bb_vinfo) |
7399 | { |
7400 | for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i) |
7401 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bb_vinfo->bbs[i]); |
7402 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
7403 | { |
7404 | gassign *assign = dyn_cast<gassign *> (p: gsi_stmt (i: gsi)); |
7405 | if (!assign) |
7406 | continue; |
7407 | |
7408 | tree rhs = gimple_assign_rhs1 (gs: assign); |
7409 | enum tree_code code = gimple_assign_rhs_code (gs: assign); |
7410 | use_operand_p use_p; |
7411 | gimple *use_stmt; |
7412 | if (code == CONSTRUCTOR) |
7413 | { |
7414 | if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) |
7415 | || maybe_ne (a: TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)), |
7416 | CONSTRUCTOR_NELTS (rhs)) |
7417 | || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value)) |
7418 | || uniform_vector_p (rhs)) |
7419 | continue; |
7420 | |
7421 | unsigned j; |
7422 | tree val; |
7423 | FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val) |
7424 | if (TREE_CODE (val) != SSA_NAME |
7425 | || !bb_vinfo->lookup_def (val)) |
7426 | break; |
7427 | if (j != CONSTRUCTOR_NELTS (rhs)) |
7428 | continue; |
7429 | |
7430 | vec<stmt_vec_info> roots = vNULL; |
7431 | roots.safe_push (obj: bb_vinfo->lookup_stmt (assign)); |
7432 | vec<stmt_vec_info> stmts; |
7433 | stmts.create (CONSTRUCTOR_NELTS (rhs)); |
7434 | FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val) |
7435 | stmts.quick_push |
7436 | (obj: vect_stmt_to_vectorize (stmt_info: bb_vinfo->lookup_def (val))); |
7437 | bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_ctor, |
7438 | stmts, roots)); |
7439 | } |
7440 | else if (code == BIT_INSERT_EXPR |
7441 | && VECTOR_TYPE_P (TREE_TYPE (rhs)) |
7442 | && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant () |
7443 | && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1 |
7444 | && integer_zerop (gimple_assign_rhs3 (gs: assign)) |
7445 | && useless_type_conversion_p |
7446 | (TREE_TYPE (TREE_TYPE (rhs)), |
7447 | TREE_TYPE (gimple_assign_rhs2 (assign))) |
7448 | && bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: assign))) |
7449 | { |
7450 | /* We start matching at an insert to lane zero, but since the
7451 | inserts need not be ordered we have to search both
7452 | the def and the use chains.  */
7453 | tree vectype = TREE_TYPE (rhs); |
7454 | unsigned nlanes = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant (); |
7455 | auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes); |
7456 | auto_sbitmap lanes (nlanes); |
7457 | bitmap_clear (lanes); |
7458 | bitmap_set_bit (map: lanes, bitno: 0); |
7459 | tree def = gimple_assign_lhs (gs: assign); |
7460 | lane_defs.quick_push |
7461 | (obj: std::make_pair (x: 0, y: gimple_assign_rhs2 (gs: assign))); |
7462 | unsigned lanes_found = 1; |
7463 | /* Start with the use chains; the last stmt will be the root.  */
7464 | stmt_vec_info last = bb_vinfo->lookup_stmt (assign); |
7465 | vec<stmt_vec_info> roots = vNULL; |
7466 | roots.safe_push (obj: last); |
7467 | do |
7468 | { |
7469 | use_operand_p use_p; |
7470 | gimple *use_stmt; |
7471 | if (!single_imm_use (var: def, use_p: &use_p, stmt: &use_stmt)) |
7472 | break; |
7473 | unsigned this_lane; |
7474 | if (!bb_vinfo->lookup_stmt (use_stmt) |
7475 | || !vect_slp_is_lane_insert (use_stmt, vec: def, this_lane: &this_lane) |
7476 | || !bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: use_stmt))) |
7477 | break; |
7478 | if (bitmap_bit_p (map: lanes, bitno: this_lane)) |
7479 | break; |
7480 | lanes_found++; |
7481 | bitmap_set_bit (map: lanes, bitno: this_lane); |
7482 | gassign *use_ass = as_a <gassign *> (p: use_stmt); |
7483 | lane_defs.quick_push (obj: std::make_pair |
7484 | (x&: this_lane, y: gimple_assign_rhs2 (gs: use_ass))); |
7485 | last = bb_vinfo->lookup_stmt (use_ass); |
7486 | roots.safe_push (obj: last); |
7487 | def = gimple_assign_lhs (gs: use_ass); |
7488 | } |
7489 | while (lanes_found < nlanes); |
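/* The instance root, i.e. the last lane insert found above, is
expected first, so move it to the front.  */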
7490 | if (roots.length () > 1) |
7491 | std::swap(a&: roots[0], b&: roots[roots.length () - 1]); |
7492 | if (lanes_found < nlanes) |
7493 | { |
7494 | /* Now search the def chain. */ |
7495 | def = gimple_assign_rhs1 (gs: assign); |
7496 | do |
7497 | { |
7498 | if (TREE_CODE (def) != SSA_NAME |
7499 | || !has_single_use (var: def)) |
7500 | break; |
7501 | gimple *def_stmt = SSA_NAME_DEF_STMT (def); |
7502 | unsigned this_lane; |
7503 | if (!bb_vinfo->lookup_stmt (def_stmt) |
7504 | || !vect_slp_is_lane_insert (use_stmt: def_stmt, |
7505 | NULL_TREE, this_lane: &this_lane) |
7506 | || !bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: def_stmt))) |
7507 | break; |
7508 | if (bitmap_bit_p (map: lanes, bitno: this_lane)) |
7509 | break; |
7510 | lanes_found++; |
7511 | bitmap_set_bit (map: lanes, bitno: this_lane); |
7512 | lane_defs.quick_push (obj: std::make_pair |
7513 | (x&: this_lane, |
7514 | y: gimple_assign_rhs2 (gs: def_stmt))); |
7515 | roots.safe_push (obj: bb_vinfo->lookup_stmt (def_stmt)); |
7516 | def = gimple_assign_rhs1 (gs: def_stmt); |
7517 | } |
7518 | while (lanes_found < nlanes); |
7519 | } |
7520 | if (lanes_found == nlanes) |
7521 | { |
7522 | /* Sort lane_defs by the lane index and register the root.  */
7523 | lane_defs.qsort (vld_cmp); |
7524 | vec<stmt_vec_info> stmts; |
7525 | stmts.create (nelems: nlanes); |
7526 | for (unsigned i = 0; i < nlanes; ++i) |
7527 | stmts.quick_push (obj: bb_vinfo->lookup_def (lane_defs[i].second)); |
7528 | bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_ctor, |
7529 | stmts, roots)); |
7530 | } |
7531 | else |
7532 | roots.release (); |
7533 | } |
7534 | else if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) |
7535 | && (associative_tree_code (code) || code == MINUS_EXPR) |
7536 | /* ??? This pessimizes a two-element reduction. PR54400. |
7537 | ??? In-order reduction could be handled if we only |
7538 | traverse one operand chain in vect_slp_linearize_chain. */ |
7539 | && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code) |
7540 | /* Ops with constants at the tail can be stripped here. */ |
7541 | && TREE_CODE (rhs) == SSA_NAME |
7542 | && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME |
7543 | /* Should be the chain end. */ |
7544 | && (!single_imm_use (var: gimple_assign_lhs (gs: assign), |
7545 | use_p: &use_p, stmt: &use_stmt) |
7546 | || !is_gimple_assign (gs: use_stmt) |
7547 | || (gimple_assign_rhs_code (gs: use_stmt) != code |
7548 | && ((code != PLUS_EXPR && code != MINUS_EXPR) |
7549 | || (gimple_assign_rhs_code (gs: use_stmt) |
7550 | != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR)))))) |
7551 | { |
7552 | /* We start the match at the end of a possible association |
7553 | chain. */ |
7554 | auto_vec<chain_op_t> chain; |
7555 | auto_vec<std::pair<tree_code, gimple *> > worklist; |
7556 | auto_vec<gimple *> chain_stmts; |
7557 | gimple *code_stmt = NULL, *alt_code_stmt = NULL; |
7558 | if (code == MINUS_EXPR) |
7559 | code = PLUS_EXPR; |
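/* Canonicalize to PLUS_EXPR: there is no minus reduction ifn and the
linearization below walks plus/minus chains together.  Chain entries
that end up with a MINUS_EXPR code are still rejected below.  */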
7560 | internal_fn reduc_fn; |
7561 | if (!reduction_fn_for_scalar_code (code, &reduc_fn) |
7562 | || reduc_fn == IFN_LAST) |
7563 | continue; |
7564 | vect_slp_linearize_chain (vinfo: bb_vinfo, worklist, chain, code, start: assign, |
7565 | /* ??? */ |
7566 | code_stmt, alt_code_stmt, chain_stmts: &chain_stmts); |
7567 | if (chain.length () > 1) |
7568 | { |
7569 | /* Sort the chain according to def_type and operation. */ |
7570 | chain.sort (cmp: dt_sort_cmp, data: bb_vinfo); |
7571 | /* ??? Now we'd want to strip externals and constants |
7572 | but record those to be handled in the epilogue. */ |
7573 | /* ??? For now do not allow mixing ops or externs/constants. */ |
7574 | bool invalid = false; |
7575 | unsigned remain_cnt = 0; |
7576 | unsigned last_idx = 0; |
7577 | for (unsigned i = 0; i < chain.length (); ++i) |
7578 | { |
7579 | if (chain[i].code != code) |
7580 | { |
7581 | invalid = true; |
7582 | break; |
7583 | } |
7584 | if (chain[i].dt != vect_internal_def |
7585 | /* Avoid stmts where the def is not the LHS, like |
7586 | ASMs. */ |
7587 | || (gimple_get_lhs (bb_vinfo->lookup_def |
7588 | (chain[i].op)->stmt) |
7589 | != chain[i].op)) |
7590 | remain_cnt++; |
7591 | else |
7592 | last_idx = i; |
7593 | } |
7594 | /* Make sure to have an even number of lanes as we later do |
7595 | all-or-nothing discovery, not trying to split further. */ |
7596 | if ((chain.length () - remain_cnt) & 1) |
7597 | remain_cnt++; |
7598 | if (!invalid && chain.length () - remain_cnt > 1) |
7599 | { |
7600 | vec<stmt_vec_info> stmts; |
7601 | vec<tree> remain = vNULL; |
7602 | stmts.create (nelems: chain.length ()); |
7603 | if (remain_cnt > 0) |
7604 | remain.create (nelems: remain_cnt); |
7605 | for (unsigned i = 0; i < chain.length (); ++i) |
7606 | { |
7607 | stmt_vec_info stmt_info; |
7608 | if (chain[i].dt == vect_internal_def |
7609 | && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)), |
7610 | gimple_get_lhs (stmt_info->stmt) == chain[i].op) |
7611 | && (i != last_idx |
7612 | || (stmts.length () & 1))) |
7613 | stmts.quick_push (obj: stmt_info); |
7614 | else |
7615 | remain.quick_push (obj: chain[i].op); |
7616 | } |
7617 | vec<stmt_vec_info> roots; |
7618 | roots.create (nelems: chain_stmts.length ()); |
7619 | for (unsigned i = 0; i < chain_stmts.length (); ++i) |
7620 | roots.quick_push (obj: bb_vinfo->lookup_stmt (chain_stmts[i])); |
7621 | bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_bb_reduc, |
7622 | stmts, roots, remain)); |
7623 | } |
7624 | } |
7625 | } |
7626 | } |
7627 | } |
7628 | |
7629 | /* Walk the grouped store chains and replace entries with their |
7630 | pattern variant if any. */ |
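/* E.g. when the group leader was replaced by a pattern stmt, that stmt
becomes the new DR_GROUP_FIRST_ELEMENT of all group members and
inherits the group size and gap of the original leader.  */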
7631 | |
7632 | static void |
7633 | vect_fixup_store_groups_with_patterns (vec_info *vinfo) |
7634 | { |
7635 | stmt_vec_info first_element; |
7636 | unsigned i; |
7637 | |
7638 | FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) |
7639 | { |
7640 | /* We also have CTORs in this array. */ |
7641 | if (!STMT_VINFO_GROUPED_ACCESS (first_element)) |
7642 | continue; |
7643 | if (STMT_VINFO_IN_PATTERN_P (first_element)) |
7644 | { |
7645 | stmt_vec_info orig = first_element; |
7646 | first_element = STMT_VINFO_RELATED_STMT (first_element); |
7647 | DR_GROUP_FIRST_ELEMENT (first_element) = first_element; |
7648 | DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig); |
7649 | DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig); |
7650 | DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig); |
7651 | vinfo->grouped_stores[i] = first_element; |
7652 | } |
7653 | stmt_vec_info prev = first_element; |
7654 | while (DR_GROUP_NEXT_ELEMENT (prev)) |
7655 | { |
7656 | stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev); |
7657 | if (STMT_VINFO_IN_PATTERN_P (elt)) |
7658 | { |
7659 | stmt_vec_info orig = elt; |
7660 | elt = STMT_VINFO_RELATED_STMT (elt); |
7661 | DR_GROUP_NEXT_ELEMENT (prev) = elt; |
7662 | DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig); |
7663 | DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig); |
7664 | } |
7665 | DR_GROUP_FIRST_ELEMENT (elt) = first_element; |
7666 | prev = elt; |
7667 | } |
7668 | } |
7669 | } |
7670 | |
7671 | /* Check if the region described by BB_VINFO can be vectorized, returning |
7672 | true if so. When returning false, set FATAL to true if the same failure |
7673 | would prevent vectorization at other vector sizes, false if it is still |
7674 | worth trying other sizes. N_STMTS is the number of statements in the |
7675 | region. */ |
7676 | |
7677 | static bool |
7678 | vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, |
7679 | vec<int> *dataref_groups) |
7680 | { |
7681 | DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7682 | |
7683 | slp_instance instance; |
7684 | int i; |
7685 | poly_uint64 min_vf = 2; |
7686 | |
7687 | /* The first group of checks is independent of the vector size. */ |
7688 | fatal = true; |
7689 | |
7690 | /* Analyze the data references. */ |
7691 | |
7692 | if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL)) |
7693 | { |
7694 | if (dump_enabled_p ()) |
7695 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7696 | "not vectorized: unhandled data-ref in basic " |
7697 | "block.\n" ); |
7698 | return false; |
7699 | } |
7700 | |
7701 | if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups)) |
7702 | { |
7703 | if (dump_enabled_p ()) |
7704 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7705 | "not vectorized: unhandled data access in " |
7706 | "basic block.\n" ); |
7707 | return false; |
7708 | } |
7709 | |
7710 | vect_slp_check_for_roots (bb_vinfo); |
7711 | |
7712 | /* If there are no grouped stores and no constructors in the region |
7713 | there is no need to continue with pattern recog as vect_analyze_slp |
7714 | will fail anyway. */ |
7715 | if (bb_vinfo->grouped_stores.is_empty () |
7716 | && bb_vinfo->roots.is_empty ()) |
7717 | { |
7718 | if (dump_enabled_p ()) |
7719 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7720 | "not vectorized: no grouped stores in " |
7721 | "basic block.\n" ); |
7722 | return false; |
7723 | } |
7724 | |
7725 | /* In contrast, the rest of the analysis below depends on the vector size in some way.  */
7726 | fatal = false; |
7727 | |
7728 | vect_pattern_recog (bb_vinfo); |
7729 | |
7730 | /* Update store groups from pattern processing. */ |
7731 | vect_fixup_store_groups_with_patterns (vinfo: bb_vinfo); |
7732 | |
7733 | /* Check the SLP opportunities in the basic block, analyze and build SLP |
7734 | trees. */ |
7735 | if (!vect_analyze_slp (vinfo: bb_vinfo, max_tree_size: n_stmts)) |
7736 | { |
7737 | if (dump_enabled_p ()) |
7738 | { |
7739 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7740 | "Failed to SLP the basic block.\n" ); |
7741 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7742 | "not vectorized: failed to find SLP opportunities " |
7743 | "in basic block.\n" ); |
7744 | } |
7745 | return false; |
7746 | } |
7747 | |
7748 | /* Optimize permutations. */ |
7749 | vect_optimize_slp (vinfo: bb_vinfo); |
7750 | |
7751 | /* Gather the loads reachable from the SLP graph entries. */ |
7752 | vect_gather_slp_loads (vinfo: bb_vinfo); |
7753 | |
7754 | vect_record_base_alignments (bb_vinfo); |
7755 | |
7756 | /* Analyze and verify the alignment of data references and the |
7757 | dependence in the SLP instances. */ |
7758 | for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (ix: i, ptr: &instance); ) |
7759 | { |
7760 | vect_location = instance->location (); |
7761 | if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance) |
7762 | || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance)) |
7763 | { |
7764 | slp_tree node = SLP_INSTANCE_TREE (instance); |
7765 | stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
7766 | if (dump_enabled_p ()) |
7767 | dump_printf_loc (MSG_NOTE, vect_location, |
7768 | "removing SLP instance operations starting from: %G" , |
7769 | stmt_info->stmt); |
7770 | vect_free_slp_instance (instance); |
7771 | BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (ix: i); |
7772 | continue; |
7773 | } |
7774 | |
7775 | /* Mark all the statements that we want to vectorize as pure SLP and |
7776 | relevant. */ |
7777 | vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); |
7778 | vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); |
7779 | unsigned j; |
7780 | stmt_vec_info root; |
7781 | /* Likewise consider instance root stmts as vectorized. */ |
7782 | FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root) |
7783 | STMT_SLP_TYPE (root) = pure_slp; |
7784 | |
7785 | i++; |
7786 | } |
7787 | if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) |
7788 | return false; |
7789 | |
7790 | if (!vect_slp_analyze_operations (vinfo: bb_vinfo)) |
7791 | { |
7792 | if (dump_enabled_p ()) |
7793 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7794 | "not vectorized: bad operation in basic block.\n" ); |
7795 | return false; |
7796 | } |
7797 | |
7798 | vect_bb_partition_graph (bb_vinfo); |
7799 | |
7800 | return true; |
7801 | } |
7802 | |
7803 | /* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
7804 | basic blocks in BBS, returning true on success. |
7805 | The region has N_STMTS statements and has the datarefs given by DATAREFS. */ |
7806 | |
7807 | static bool |
7808 | vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, |
7809 | vec<int> *dataref_groups, unsigned int n_stmts, |
7810 | loop_p orig_loop) |
7811 | { |
7812 | bb_vec_info bb_vinfo; |
7813 | auto_vector_modes vector_modes; |
7814 | |
7815 | /* Autodetect the first vector mode to try.  */
7816 | machine_mode next_vector_mode = VOIDmode; |
7817 | targetm.vectorize.autovectorize_vector_modes (&vector_modes, false); |
7818 | unsigned int mode_i = 0; |
7819 | |
7820 | vec_info_shared shared; |
7821 | |
7822 | machine_mode autodetected_vector_mode = VOIDmode; |
7823 | while (1) |
7824 | { |
7825 | bool vectorized = false; |
7826 | bool fatal = false; |
7827 | bb_vinfo = new _bb_vec_info (bbs, &shared); |
7828 | |
7829 | bool first_time_p = shared.datarefs.is_empty (); |
7830 | BB_VINFO_DATAREFS (bb_vinfo) = datarefs; |
7831 | if (first_time_p) |
7832 | bb_vinfo->shared->save_datarefs (); |
7833 | else |
7834 | bb_vinfo->shared->check_datarefs (); |
7835 | bb_vinfo->vector_mode = next_vector_mode; |
7836 | |
7837 | if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)) |
7838 | { |
7839 | if (dump_enabled_p ()) |
7840 | { |
7841 | dump_printf_loc (MSG_NOTE, vect_location, |
7842 | "***** Analysis succeeded with vector mode" |
7843 | " %s\n" , GET_MODE_NAME (bb_vinfo->vector_mode)); |
7844 | dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n" ); |
7845 | } |
7846 | |
7847 | bb_vinfo->shared->check_datarefs (); |
7848 | |
7849 | bool force_clear = false; |
7850 | auto_vec<slp_instance> profitable_subgraphs; |
7851 | for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo)) |
7852 | { |
7853 | if (instance->subgraph_entries.is_empty ()) |
7854 | continue; |
7855 | |
7856 | dump_user_location_t saved_vect_location = vect_location; |
7857 | vect_location = instance->location (); |
7858 | if (!unlimited_cost_model (NULL) |
7859 | && !vect_bb_vectorization_profitable_p |
7860 | (bb_vinfo, slp_instances: instance->subgraph_entries, orig_loop)) |
7861 | { |
7862 | if (dump_enabled_p ()) |
7863 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7864 | "not vectorized: vectorization is not " |
7865 | "profitable.\n" ); |
7866 | vect_location = saved_vect_location; |
7867 | continue; |
7868 | } |
7869 | |
7870 | vect_location = saved_vect_location; |
7871 | if (!dbg_cnt (index: vect_slp)) |
7872 | { |
7873 | force_clear = true; |
7874 | continue; |
7875 | } |
7876 | |
7877 | profitable_subgraphs.safe_push (obj: instance); |
7878 | } |
7879 | |
7880 | /* When we're vectorizing an if-converted loop body make sure |
7881 | we vectorized all if-converted code. */ |
7882 | if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop) |
7883 | { |
7884 | gcc_assert (bb_vinfo->bbs.length () == 1); |
7885 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bb_vinfo->bbs[0]); |
7886 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
7887 | { |
7888 | /* On profitable subgraphs the costing above left the now DCEable
7889 | vectorized scalar stmts with their visited flag set.  Do the
7890 | delayed clearing of the flag here.  */
7891 | if (gimple_visited_p (stmt: gsi_stmt (i: gsi))) |
7892 | { |
7893 | gimple_set_visited (stmt: gsi_stmt (i: gsi), visited_p: false); |
7894 | continue; |
7895 | } |
7896 | if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED) |
7897 | continue; |
7898 | |
7899 | if (gassign *ass = dyn_cast <gassign *> (p: gsi_stmt (i: gsi))) |
7900 | if (gimple_assign_rhs_code (gs: ass) == COND_EXPR) |
7901 | { |
7902 | if (!profitable_subgraphs.is_empty () |
7903 | && dump_enabled_p ()) |
7904 | dump_printf_loc (MSG_NOTE, vect_location, |
7905 | "not profitable because of " |
7906 | "unprofitable if-converted scalar " |
7907 | "code\n" ); |
7908 | profitable_subgraphs.truncate (size: 0); |
7909 | } |
7910 | } |
7911 | } |
7912 | |
7913 | /* Finally schedule the profitable subgraphs. */ |
7914 | for (slp_instance instance : profitable_subgraphs) |
7915 | { |
7916 | if (!vectorized && dump_enabled_p ()) |
7917 | dump_printf_loc (MSG_NOTE, vect_location, |
7918 | "Basic block will be vectorized " |
7919 | "using SLP\n" ); |
7920 | vectorized = true; |
7921 | |
7922 | /* Dump before scheduling, as store vectorization will remove
7923 | the original stores and mess with the instance tree,
7924 | so querying its location would eventually ICE.  */
7925 | if (flag_checking) |
7926 | for (slp_instance sub : instance->subgraph_entries) |
7927 | gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub))); |
7928 | unsigned HOST_WIDE_INT bytes; |
7929 | if (dump_enabled_p ()) |
7930 | for (slp_instance sub : instance->subgraph_entries) |
7931 | { |
7932 | tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)); |
7933 | if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (const_value: &bytes)) |
7934 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, |
7935 | sub->location (), |
7936 | "basic block part vectorized using %wu " |
7937 | "byte vectors\n" , bytes); |
7938 | else |
7939 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, |
7940 | sub->location (), |
7941 | "basic block part vectorized using " |
7942 | "variable length vectors\n" ); |
7943 | } |
7944 | |
7945 | dump_user_location_t saved_vect_location = vect_location; |
7946 | vect_location = instance->location (); |
7947 | |
7948 | vect_schedule_slp (bb_vinfo, instance->subgraph_entries); |
7949 | |
7950 | vect_location = saved_vect_location; |
7951 | } |
7952 | } |
7953 | else |
7954 | { |
7955 | if (dump_enabled_p ()) |
7956 | dump_printf_loc (MSG_NOTE, vect_location, |
7957 | "***** Analysis failed with vector mode %s\n" , |
7958 | GET_MODE_NAME (bb_vinfo->vector_mode)); |
7959 | } |
7960 | |
7961 | if (mode_i == 0) |
7962 | autodetected_vector_mode = bb_vinfo->vector_mode; |
7963 | |
7964 | if (!fatal) |
7965 | while (mode_i < vector_modes.length () |
7966 | && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i])) |
7967 | { |
7968 | if (dump_enabled_p ()) |
7969 | dump_printf_loc (MSG_NOTE, vect_location, |
7970 | "***** The result for vector mode %s would" |
7971 | " be the same\n" , |
7972 | GET_MODE_NAME (vector_modes[mode_i])); |
7973 | mode_i += 1; |
7974 | } |
7975 | |
7976 | delete bb_vinfo; |
7977 | |
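/* When the next mode and the autodetected mode are each other's
related_vector_mode the analysis would just repeat with the same
set of vector types, so skip the mode.  */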
7978 | if (mode_i < vector_modes.length () |
7979 | && VECTOR_MODE_P (autodetected_vector_mode) |
7980 | && (related_vector_mode (vector_modes[mode_i], |
7981 | GET_MODE_INNER (autodetected_vector_mode)) |
7982 | == autodetected_vector_mode) |
7983 | && (related_vector_mode (autodetected_vector_mode, |
7984 | GET_MODE_INNER (vector_modes[mode_i])) |
7985 | == vector_modes[mode_i])) |
7986 | { |
7987 | if (dump_enabled_p ()) |
7988 | dump_printf_loc (MSG_NOTE, vect_location, |
7989 | "***** Skipping vector mode %s, which would" |
7990 | " repeat the analysis for %s\n" , |
7991 | GET_MODE_NAME (vector_modes[mode_i]), |
7992 | GET_MODE_NAME (autodetected_vector_mode)); |
7993 | mode_i += 1; |
7994 | } |
7995 | |
7996 | if (vectorized |
7997 | || mode_i == vector_modes.length () |
7998 | || autodetected_vector_mode == VOIDmode |
7999 | /* If vect_slp_analyze_bb_1 signaled that analysis for all |
8000 | vector sizes will fail do not bother iterating. */ |
8001 | || fatal) |
8002 | return vectorized; |
8003 | |
8004 | /* Try the next vector mode in the list.  */
8005 | next_vector_mode = vector_modes[mode_i++]; |
8006 | if (dump_enabled_p ()) |
8007 | dump_printf_loc (MSG_NOTE, vect_location, |
8008 | "***** Re-trying analysis with vector mode %s\n" , |
8009 | GET_MODE_NAME (next_vector_mode)); |
8010 | } |
8011 | } |
8012 | |
8013 | |
8014 | /* Main entry for the BB vectorizer.  Analyze and transform BBS, returning
8015 | true if anything in the region was vectorized.  */
8016 | |
8017 | static bool |
8018 | vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop) |
8019 | { |
8020 | vec<data_reference_p> datarefs = vNULL; |
8021 | auto_vec<int> dataref_groups; |
8022 | int insns = 0; |
8023 | int current_group = 0; |
8024 | |
8025 | for (unsigned i = 0; i < bbs.length (); i++) |
8026 | { |
8027 | basic_block bb = bbs[i]; |
8028 | for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (i: gsi); |
8029 | gsi_next (i: &gsi)) |
8030 | { |
8031 | gimple *stmt = gsi_stmt (i: gsi); |
8032 | if (is_gimple_debug (gs: stmt)) |
8033 | continue; |
8034 | |
8035 | insns++; |
8036 | |
8037 | if (gimple_location (g: stmt) != UNKNOWN_LOCATION) |
8038 | vect_location = stmt; |
8039 | |
8040 | if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs, |
8041 | &dataref_groups, current_group)) |
8042 | ++current_group; |
8043 | } |
8044 | /* New BBs always start a new DR group. */ |
8045 | ++current_group; |
8046 | } |
8047 | |
8048 | return vect_slp_region (bbs, datarefs, dataref_groups: &dataref_groups, n_stmts: insns, orig_loop); |
8049 | } |
8050 | |
8051 | /* Special entry for the BB vectorizer. Analyze and transform a single |
8052 | if-converted BB with ORIG_LOOP's body being the non-if-converted
8053 | representation. Returns true if anything in the basic-block was |
8054 | vectorized. */ |
8055 | |
8056 | bool |
8057 | vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop) |
8058 | { |
8059 | auto_vec<basic_block> bbs; |
8060 | bbs.safe_push (obj: bb); |
8061 | return vect_slp_bbs (bbs, orig_loop); |
8062 | } |
8063 | |
8064 | /* Main entry for the BB vectorizer.  Analyze and transform the blocks of
8065 | FUN, returning true if anything was vectorized.  */
8066 | |
8067 | bool |
8068 | vect_slp_function (function *fun) |
8069 | { |
8070 | bool r = false; |
8071 | int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun)); |
8072 | auto_bitmap exit_bbs; |
8073 | bitmap_set_bit (exit_bbs, EXIT_BLOCK); |
8074 | edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun)); |
8075 | unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs, |
8076 | true, rpo, NULL); |
8077 | |
8078 | /* For the moment split the function into pieces to avoid making |
8079 | the iteration on the vector mode moot. Split at points we know |
8080 | to not handle well which is CFG merges (SLP discovery doesn't |
8081 | handle non-loop-header PHIs) and loop exits. Since pattern |
8082 | recog requires reverse iteration to visit uses before defs |
8083 | simply chop RPO into pieces. */ |
8084 | auto_vec<basic_block> bbs; |
8085 | for (unsigned i = 0; i < n; i++) |
8086 | { |
8087 | basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]); |
8088 | bool split = false; |
8089 | |
8090 | /* Split when a BB is not dominated by the first block. */ |
8091 | if (!bbs.is_empty () |
8092 | && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0])) |
8093 | { |
8094 | if (dump_enabled_p ()) |
8095 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8096 | "splitting region at dominance boundary bb%d\n" , |
8097 | bb->index); |
8098 | split = true; |
8099 | } |
8100 | /* Split when the loop determined by the first block |
8101 | is exited. This is because we eventually insert |
8102 | invariants at the start of the region.  */
8103 | else if (!bbs.is_empty () |
8104 | && bbs[0]->loop_father != bb->loop_father |
8105 | && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father)) |
8106 | { |
8107 | if (dump_enabled_p ()) |
8108 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8109 | "splitting region at loop %d exit at bb%d\n" , |
8110 | bbs[0]->loop_father->num, bb->index); |
8111 | split = true; |
8112 | } |
8113 | else if (!bbs.is_empty () |
8114 | && bb->loop_father->header == bb |
8115 | && bb->loop_father->dont_vectorize) |
8116 | { |
8117 | if (dump_enabled_p ()) |
8118 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8119 | "splitting region at dont-vectorize loop %d " |
8120 | "entry at bb%d\n" , |
8121 | bb->loop_father->num, bb->index); |
8122 | split = true; |
8123 | } |
8124 | |
8125 | if (split && !bbs.is_empty ()) |
8126 | { |
8127 | r |= vect_slp_bbs (bbs, NULL); |
8128 | bbs.truncate (size: 0); |
8129 | } |
8130 | |
8131 | if (bbs.is_empty ()) |
8132 | { |
8133 | /* We need to be able to insert at the head of the region, which
8134 | we cannot do for a region starting with a returns-twice call.  */
8135 | if (gcall *first = safe_dyn_cast <gcall *> (p: first_stmt (bb))) |
8136 | if (gimple_call_flags (first) & ECF_RETURNS_TWICE) |
8137 | { |
8138 | if (dump_enabled_p ()) |
8139 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8140 | "skipping bb%d as start of region as it " |
8141 | "starts with returns-twice call\n" , |
8142 | bb->index); |
8143 | continue; |
8144 | } |
8145 | /* If the loop this BB belongs to is marked as not to be vectorized |
8146 | honor that also for BB vectorization. */ |
8147 | if (bb->loop_father->dont_vectorize) |
8148 | continue; |
8149 | } |
8150 | |
8151 | bbs.safe_push (obj: bb); |
8152 | |
8153 | /* When a stmt ends this block and defines a value, inserting
8154 | a vector containing its definition after it would require
8155 | inserting on edges.  Avoid this for now.  */
8156 | if (gimple *last = *gsi_last_bb (bb)) |
8157 | if (gimple_get_lhs (last) |
8158 | && is_ctrl_altering_stmt (last)) |
8159 | { |
8160 | if (dump_enabled_p ()) |
8161 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8162 | "splitting region at control altering " |
8163 | "definition %G" , last); |
8164 | r |= vect_slp_bbs (bbs, NULL); |
8165 | bbs.truncate (size: 0); |
8166 | } |
8167 | } |
8168 | |
8169 | if (!bbs.is_empty ()) |
8170 | r |= vect_slp_bbs (bbs, NULL); |
8171 | |
8172 | free (ptr: rpo); |
8173 | |
8174 | return r; |
8175 | } |
8176 | |
8177 | /* Build a variable-length vector in which the elements in ELTS are repeated |
8178 | to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
8179 | RESULTS and add any new instructions to SEQ. |
8180 | |
8181 | The approach we use is: |
8182 | |
8183 | (1) Find a vector mode VM with integer elements of mode IM. |
8184 | |
8185 | (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of |
8186 | ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs |
8187 | from small vectors to IM. |
8188 | |
8189 | (3) Duplicate each ELTS'[I] into a vector of mode VM. |
8190 | |
8191 | (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the |
8192 | correct byte contents. |
8193 | |
8194 | (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type. |
8195 | |
8196 | We try to find the largest IM for which this sequence works, in order |
8197 | to cut down on the number of interleaves. */ |
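/* A sketch with target-dependent choices: for 32-bit ELTS { a, b, c, d }
and a VNx4SI VECTOR_TYPE we may pick IM = DImode and VM = VNx2DI.
Steps (2) and (3) build the duplicates { ab, ab, ... } and
{ cd, cd, ... }, a single interleaving VEC_PERM_EXPR in step (4)
yields { ab, cd, ab, cd, ... } and step (5) reinterprets that as
{ a, b, c, d, a, b, c, d, ... }.  */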
8198 | |
8199 | void |
8200 | duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type, |
8201 | const vec<tree> &elts, unsigned int nresults, |
8202 | vec<tree> &results) |
8203 | { |
8204 | unsigned int nelts = elts.length (); |
8205 | tree element_type = TREE_TYPE (vector_type); |
8206 | |
8207 | /* (1) Find a vector mode VM with integer elements of mode IM. */ |
8208 | unsigned int nvectors = 1; |
8209 | tree new_vector_type; |
8210 | tree permutes[2]; |
8211 | if (!can_duplicate_and_interleave_p (vinfo, count: nelts, elt_type: element_type, |
8212 | nvectors_out: &nvectors, vector_type_out: &new_vector_type, |
8213 | permutes)) |
8214 | gcc_unreachable (); |
8215 | |
8216 | /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */ |
8217 | unsigned int partial_nelts = nelts / nvectors; |
8218 | tree partial_vector_type = build_vector_type (element_type, partial_nelts); |
8219 | |
8220 | tree_vector_builder partial_elts; |
8221 | auto_vec<tree, 32> pieces (nvectors * 2); |
8222 | pieces.quick_grow_cleared (len: nvectors * 2); |
8223 | for (unsigned int i = 0; i < nvectors; ++i) |
8224 | { |
8225 | /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of |
8226 | ELTS' has mode IM. */ |
8227 | partial_elts.new_vector (type: partial_vector_type, npatterns: partial_nelts, nelts_per_pattern: 1); |
8228 | for (unsigned int j = 0; j < partial_nelts; ++j) |
8229 | partial_elts.quick_push (obj: elts[i * partial_nelts + j]); |
8230 | tree t = gimple_build_vector (seq, builder: &partial_elts); |
8231 | t = gimple_build (seq, code: VIEW_CONVERT_EXPR, |
8232 | TREE_TYPE (new_vector_type), ops: t); |
8233 | |
8234 | /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */ |
8235 | pieces[i] = gimple_build_vector_from_val (seq, type: new_vector_type, op: t); |
8236 | } |
8237 | |
8238 | /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the |
8239 | correct byte contents. |
8240 | |
8241 | Conceptually, we need to repeat the following operation log2(nvectors) |
8242 | times, where hi_start = nvectors / 2: |
8243 | |
8244 | out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute); |
8245 | out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute); |
8246 | |
8247 | However, if each input repeats every N elements and the VF is |
8248 | a multiple of N * 2, the HI result is the same as the LO result. |
8249 | This will be true for the first N1 iterations of the outer loop, |
8250 | followed by N2 iterations for which both the LO and HI results |
8251 | are needed. I.e.: |
8252 | |
8253 | N1 + N2 = log2(nvectors) |
8254 | |
8255 | Each "N1 iteration" doubles the number of redundant vectors and the |
8256 | effect of the process as a whole is to have a sequence of nvectors/2**N1 |
8257 | vectors that repeats 2**N1 times. Rather than generate these redundant |
8258 | vectors, we halve the number of vectors for each N1 iteration. */ |
8259 | unsigned int in_start = 0; |
8260 | unsigned int out_start = nvectors; |
8261 | unsigned int new_nvectors = nvectors; |
8262 | for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2) |
8263 | { |
8264 | unsigned int hi_start = new_nvectors / 2; |
8265 | unsigned int out_i = 0; |
8266 | for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i) |
8267 | { |
8268 | if ((in_i & 1) != 0 |
8269 | && multiple_p (a: TYPE_VECTOR_SUBPARTS (node: new_vector_type), |
8270 | b: 2 * in_repeat)) |
8271 | continue; |
8272 | |
8273 | tree output = make_ssa_name (var: new_vector_type); |
8274 | tree input1 = pieces[in_start + (in_i / 2)]; |
8275 | tree input2 = pieces[in_start + (in_i / 2) + hi_start]; |
8276 | gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR, |
8277 | input1, input2, |
8278 | permutes[in_i & 1]); |
8279 | gimple_seq_add_stmt (seq, stmt); |
8280 | pieces[out_start + out_i] = output; |
8281 | out_i += 1; |
8282 | } |
8283 | std::swap (a&: in_start, b&: out_start); |
8284 | new_nvectors = out_i; |
8285 | } |
8286 | |
8287 | /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */ |
8288 | results.reserve (nelems: nresults); |
8289 | for (unsigned int i = 0; i < nresults; ++i) |
8290 | if (i < new_nvectors) |
8291 | results.quick_push (obj: gimple_build (seq, code: VIEW_CONVERT_EXPR, type: vector_type, |
8292 | ops: pieces[in_start + i])); |
8293 | else |
8294 | results.quick_push (obj: results[i - new_nvectors]); |
8295 | } |
8296 | |
8297 | |
8298 | /* For constant and loop invariant defs in OP_NODE this function creates |
8299 | vector defs that will be used in the vectorized stmts and stores them |
8300 | to SLP_TREE_VEC_DEFS of OP_NODE. */ |
8301 | |
8302 | static void |
8303 | vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node) |
8304 | { |
8305 | unsigned HOST_WIDE_INT nunits; |
8306 | tree vec_cst; |
8307 | unsigned j, number_of_places_left_in_vector; |
8308 | tree vector_type; |
8309 | tree vop; |
8310 | int group_size = op_node->ops.length (); |
8311 | unsigned int vec_num, i; |
8312 | unsigned number_of_copies = 1; |
8313 | bool constant_p; |
8314 | gimple_seq ctor_seq = NULL; |
8315 | auto_vec<tree, 16> permute_results; |
8316 | |
8317 | /* We always want SLP_TREE_VECTYPE (op_node) to be set correctly here.  */
8318 | vector_type = SLP_TREE_VECTYPE (op_node); |
8319 | |
8320 | unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node); |
8321 | SLP_TREE_VEC_DEFS (op_node).create (nelems: number_of_vectors); |
8322 | auto_vec<tree> voprnds (number_of_vectors); |
8323 | |
8324 | /* NUMBER_OF_COPIES is the number of times we need to use the same values in |
8325 | created vectors. It is greater than 1 if unrolling is performed. |
8326 | |
8327 | For example, we have two scalar operands, s1 and s2 (e.g., group of |
8328 | strided accesses of size two), while NUNITS is four (i.e., four scalars |
8329 | of this type can be packed in a vector). The output vector will contain |
8330 | two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES |
8331 | will be 2). |
8332 | |
8333 | If GROUP_SIZE > NUNITS, the scalars will be split into several vectors |
8334 | containing the operands. |
8335 | |
8336 | For example, NUNITS is four as before, and the group size is 8 |
8337 | (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and |
8338 | {s5, s6, s7, s8}. */ |
8339 | |
8340 | /* When using duplicate_and_interleave, we just need one element for |
8341 | each scalar statement. */ |
8342 | if (!TYPE_VECTOR_SUBPARTS (node: vector_type).is_constant (const_value: &nunits)) |
8343 | nunits = group_size; |
8344 | |
8345 | number_of_copies = nunits * number_of_vectors / group_size; |
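/* E.g. for the group { s1, s2 } and NUNITS 4 from above this computes
4 * 1 / 2 == 2 copies for a single output vector { s1, s2, s1, s2 }.  */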
8346 | |
8347 | number_of_places_left_in_vector = nunits; |
8348 | constant_p = true; |
8349 | tree uniform_elt = NULL_TREE; |
8350 | tree_vector_builder elts (vector_type, nunits, 1); |
8351 | elts.quick_grow (len: nunits); |
8352 | stmt_vec_info insert_after = NULL; |
8353 | for (j = 0; j < number_of_copies; j++) |
8354 | { |
8355 | tree op; |
8356 | for (i = group_size - 1; op_node->ops.iterate (ix: i, ptr: &op); i--) |
8357 | { |
8358 | /* Create 'vect_ = {op0,op1,...,opn}'. */ |
8359 | tree orig_op = op; |
8360 | if (number_of_places_left_in_vector == nunits) |
8361 | uniform_elt = op; |
8362 | else if (uniform_elt && operand_equal_p (uniform_elt, op)) |
8363 | op = elts[number_of_places_left_in_vector]; |
8364 | else |
8365 | uniform_elt = NULL_TREE; |
8366 | number_of_places_left_in_vector--; |
8367 | if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op))) |
8368 | { |
8369 | if (CONSTANT_CLASS_P (op)) |
8370 | { |
8371 | if (VECTOR_BOOLEAN_TYPE_P (vector_type)) |
8372 | { |
8373 | /* Can't use VIEW_CONVERT_EXPR for booleans because |
8374 | of possibly different sizes of scalar value and |
8375 | vector element. */ |
8376 | if (integer_zerop (op)) |
8377 | op = build_int_cst (TREE_TYPE (vector_type), 0); |
8378 | else if (integer_onep (op)) |
8379 | op = build_all_ones_cst (TREE_TYPE (vector_type)); |
8380 | else |
8381 | gcc_unreachable (); |
8382 | } |
8383 | else |
8384 | op = fold_unary (VIEW_CONVERT_EXPR, |
8385 | TREE_TYPE (vector_type), op); |
8386 | gcc_assert (op && CONSTANT_CLASS_P (op)); |
8387 | } |
8388 | else |
8389 | { |
8390 | tree new_temp = make_ssa_name (TREE_TYPE (vector_type)); |
8391 | gimple *init_stmt; |
8392 | if (VECTOR_BOOLEAN_TYPE_P (vector_type)) |
8393 | { |
8394 | tree true_val |
8395 | = build_all_ones_cst (TREE_TYPE (vector_type)); |
8396 | tree false_val |
8397 | = build_zero_cst (TREE_TYPE (vector_type)); |
8398 | gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op))); |
8399 | init_stmt = gimple_build_assign (new_temp, COND_EXPR, |
8400 | op, true_val, |
8401 | false_val); |
8402 | } |
8403 | else |
8404 | { |
8405 | op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), |
8406 | op); |
8407 | init_stmt |
8408 | = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, |
8409 | op); |
8410 | } |
8411 | gimple_seq_add_stmt (&ctor_seq, init_stmt); |
8412 | op = new_temp; |
8413 | } |
8414 | } |
8415 | elts[number_of_places_left_in_vector] = op; |
8416 | if (!CONSTANT_CLASS_P (op)) |
8417 | constant_p = false; |
8418 | /* For BB vectorization we have to compute an insert location |
8419 | when a def is inside the analyzed region since we cannot |
8420 | simply insert at the BB start in this case. */ |
8421 | stmt_vec_info opdef; |
8422 | if (TREE_CODE (orig_op) == SSA_NAME |
8423 | && !SSA_NAME_IS_DEFAULT_DEF (orig_op) |
8424 | && is_a <bb_vec_info> (p: vinfo) |
8425 | && (opdef = vinfo->lookup_def (orig_op))) |
8426 | { |
8427 | if (!insert_after) |
8428 | insert_after = opdef; |
8429 | else |
8430 | insert_after = get_later_stmt (stmt1_info: insert_after, stmt2_info: opdef); |
8431 | } |
8432 | |
8433 | if (number_of_places_left_in_vector == 0) |
8434 | { |
8435 | auto type_nunits = TYPE_VECTOR_SUBPARTS (node: vector_type); |
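/* Materialize the vector: splat it when all elements turned out equal,
build it directly from the elements when a whole number of them fills
a vector, and use duplicate_and_interleave otherwise, e.g. for
variable-length vectors.  */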
8436 | if (uniform_elt) |
8437 | vec_cst = gimple_build_vector_from_val (seq: &ctor_seq, type: vector_type, |
8438 | op: elts[0]); |
8439 | else if (constant_p |
8440 | ? multiple_p (a: type_nunits, b: nunits) |
8441 | : known_eq (type_nunits, nunits)) |
8442 | vec_cst = gimple_build_vector (seq: &ctor_seq, builder: &elts); |
8443 | else |
8444 | { |
8445 | if (permute_results.is_empty ()) |
8446 | duplicate_and_interleave (vinfo, seq: &ctor_seq, vector_type, |
8447 | elts, nresults: number_of_vectors, |
8448 | results&: permute_results); |
8449 | vec_cst = permute_results[number_of_vectors - j - 1]; |
8450 | } |
8451 | if (!gimple_seq_empty_p (s: ctor_seq)) |
8452 | { |
8453 | if (insert_after) |
8454 | { |
8455 | gimple_stmt_iterator gsi; |
8456 | if (gimple_code (g: insert_after->stmt) == GIMPLE_PHI) |
8457 | { |
8458 | gsi = gsi_after_labels (bb: gimple_bb (g: insert_after->stmt)); |
8459 | gsi_insert_seq_before (&gsi, ctor_seq, |
8460 | GSI_CONTINUE_LINKING); |
8461 | } |
8462 | else if (!stmt_ends_bb_p (insert_after->stmt)) |
8463 | { |
8464 | gsi = gsi_for_stmt (insert_after->stmt); |
8465 | gsi_insert_seq_after (&gsi, ctor_seq, |
8466 | GSI_CONTINUE_LINKING); |
8467 | } |
8468 | else |
8469 | { |
8470 | /* When we want to insert after a def where the |
8471 | defining stmt throws then insert on the fallthru |
8472 | edge. */ |
8473 | edge e = find_fallthru_edge |
8474 | (edges: gimple_bb (g: insert_after->stmt)->succs); |
8475 | basic_block new_bb |
8476 | = gsi_insert_seq_on_edge_immediate (e, ctor_seq); |
8477 | gcc_assert (!new_bb); |
8478 | } |
8479 | } |
8480 | else |
8481 | vinfo->insert_seq_on_entry (NULL, ctor_seq); |
8482 | ctor_seq = NULL; |
8483 | } |
8484 | voprnds.quick_push (obj: vec_cst); |
8485 | insert_after = NULL; |
8486 | number_of_places_left_in_vector = nunits; |
8487 | constant_p = true; |
8488 | elts.new_vector (type: vector_type, npatterns: nunits, nelts_per_pattern: 1); |
8489 | elts.quick_grow (len: nunits); |
8490 | } |
8491 | } |
8492 | } |
8493 | |
8494 | /* Since the vectors were created in reverse order, we have to reverse
8495 | them here.  */
8496 | vec_num = voprnds.length (); |
8497 | for (j = vec_num; j != 0; j--) |
8498 | { |
8499 | vop = voprnds[j - 1]; |
8500 | SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop); |
8501 | } |
8502 | |
8503 | /* If VF is greater than the unrolling factor needed for the SLP
8504 | group of stmts, NUMBER_OF_VECTORS to be created is greater than |
8505 | NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have |
8506 | to replicate the vectors. */ |
8507 | while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ()) |
8508 | for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (ix: i, ptr: &vop) && i < vec_num; |
8509 | i++) |
8510 | SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop); |
8511 | } |
8512 | |
8513 | /* Get the Ith vectorized definition from SLP_NODE. */ |
8514 | |
8515 | tree |
8516 | vect_get_slp_vect_def (slp_tree slp_node, unsigned i) |
8517 | { |
8518 | return SLP_TREE_VEC_DEFS (slp_node)[i]; |
8519 | } |
8520 | |
8521 | /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */ |
8522 | |
8523 | void |
8524 | vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs) |
8525 | { |
8526 | vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)); |
8527 | vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node)); |
8528 | } |
8529 | |
8530 | /* Get N vectorized definitions for SLP_NODE. */ |
8531 | |
8532 | void |
8533 | vect_get_slp_defs (vec_info *, |
8534 | slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n) |
8535 | { |
8536 | if (n == -1U) |
8537 | n = SLP_TREE_CHILDREN (slp_node).length (); |
8538 | |
8539 | for (unsigned i = 0; i < n; ++i) |
8540 | { |
8541 | slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; |
8542 | vec<tree> vec_defs = vNULL; |
8543 | vect_get_slp_defs (slp_node: child, vec_defs: &vec_defs); |
8544 | vec_oprnds->quick_push (obj: vec_defs); |
8545 | } |
8546 | } |
8547 | |
8548 | /* A subroutine of vect_transform_slp_perm_load with two extra arguments: |
8549 | - PERM gives the permutation that the caller wants to use for NODE, |
8550 | which might be different from SLP_LOAD_PERMUTATION. |
8551 | - DUMP_P controls whether the function dumps information. */ |
8552 | |
8553 | static bool |
8554 | vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node, |
8555 | load_permutation_t &perm, |
8556 | const vec<tree> &dr_chain, |
8557 | gimple_stmt_iterator *gsi, poly_uint64 vf, |
8558 | bool analyze_only, bool dump_p, |
8559 | unsigned *n_perms, unsigned int *n_loads, |
8560 | bool dce_chain) |
8561 | { |
8562 | stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
8563 | int vec_index = 0; |
8564 | tree vectype = SLP_TREE_VECTYPE (node); |
8565 | unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length (); |
8566 | unsigned int mask_element; |
8567 | unsigned dr_group_size; |
8568 | machine_mode mode; |
8569 | |
8570 | if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
8571 | dr_group_size = 1; |
8572 | else |
8573 | { |
8574 | stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); |
8575 | dr_group_size = DR_GROUP_SIZE (stmt_info); |
8576 | } |
8577 | |
8578 | mode = TYPE_MODE (vectype); |
8579 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype); |
8580 | unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
8581 | |
8582 | /* Initialize the vect stmts of NODE to properly insert the generated |
8583 | stmts later. */ |
8584 | if (! analyze_only) |
8585 | for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++) |
8586 | SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE); |
8587 | |
8588 | /* Generate permutation masks for every NODE. Number of masks for each NODE |
8589 | is equal to GROUP_SIZE. |
8590 | E.g., we have a group of three nodes with three loads from the same |
8591 | location in each node, and the vector size is 4.  I.e., we have an
8592 | a0b0c0a1b1c1... sequence and we need to create the following vectors: |
8593 | for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 |
8594 | for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 |
8595 | ... |
8596 | |
8597 | The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}. |
8598 | The last mask is illegal since we assume two operands for the permute
8599 | operation, and the mask element values can't be outside that range. |
8600 | Hence, the last mask must be converted into {2,5,5,5}. |
8601 | For the first two permutations we need the first and the second input |
8602 | vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation |
8603 | we need the second and the third vectors: {b1,c1,a2,b2} and |
8604 | {c2,a3,b3,c3}. */ |
8605 | |
8606 | int vect_stmts_counter = 0; |
8607 | unsigned int index = 0; |
8608 | int first_vec_index = -1; |
8609 | int second_vec_index = -1; |
8610 | bool noop_p = true; |
8611 | *n_perms = 0; |
8612 | |
8613 | vec_perm_builder mask; |
8614 | unsigned int nelts_to_build; |
8615 | unsigned int nvectors_per_build; |
8616 | unsigned int in_nlanes; |
8617 | bool repeating_p = (group_size == dr_group_size |
8618 | && multiple_p (a: nunits, b: group_size)); |
8619 | if (repeating_p) |
8620 | { |
8621 | /* A single vector contains a whole number of copies of the node, so: |
8622 | (a) all permutes can use the same mask; and |
8623 | (b) the permutes only need a single vector input. */ |
8624 | mask.new_vector (full_nelts: nunits, npatterns: group_size, nelts_per_pattern: 3); |
8625 | nelts_to_build = mask.encoded_nelts (); |
8626 | /* It's possible to obtain zero nstmts during analyze_only, so make |
8627 | it at least one to ensure the later computation for n_perms |
8628 | proceeds.  */
8629 | nvectors_per_build = nstmts > 0 ? nstmts : 1; |
8630 | in_nlanes = dr_group_size * 3; |
8631 | } |
8632 | else |
8633 | { |
8634 | /* We need to construct a separate mask for each vector statement. */ |
8635 | unsigned HOST_WIDE_INT const_nunits, const_vf; |
8636 | if (!nunits.is_constant (const_value: &const_nunits) |
8637 | || !vf.is_constant (const_value: &const_vf)) |
8638 | return false; |
8639 | mask.new_vector (full_nelts: const_nunits, npatterns: const_nunits, nelts_per_pattern: 1); |
8640 | nelts_to_build = const_vf * group_size; |
8641 | nvectors_per_build = 1; |
8642 | in_nlanes = const_vf * dr_group_size; |
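/* E.g. for const_vf 2 and group_size == dr_group_size == 3 this sets
up six mask elements to be built over six input lanes.  */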
8643 | } |
8644 | auto_sbitmap used_in_lanes (in_nlanes); |
8645 | bitmap_clear (used_in_lanes); |
8646 | auto_bitmap used_defs; |
8647 | |
8648 | unsigned int count = mask.encoded_nelts (); |
8649 | mask.quick_grow (len: count); |
8650 | vec_perm_indices indices; |
8651 | |
8652 | for (unsigned int j = 0; j < nelts_to_build; j++) |
8653 | { |
8654 | unsigned int iter_num = j / group_size; |
8655 | unsigned int stmt_num = j % group_size; |
8656 | unsigned int i = (iter_num * dr_group_size + perm[stmt_num]); |
8657 | bitmap_set_bit (map: used_in_lanes, bitno: i); |
8658 | if (repeating_p) |
8659 | { |
8660 | first_vec_index = 0; |
8661 | mask_element = i; |
8662 | } |
8663 | else |
8664 | { |
8665 | /* Enforced before the loop when !repeating_p. */ |
8666 | unsigned int const_nunits = nunits.to_constant (); |
8667 | vec_index = i / const_nunits; |
8668 | mask_element = i % const_nunits; |
8669 | if (vec_index == first_vec_index |
8670 | || first_vec_index == -1) |
8671 | { |
8672 | first_vec_index = vec_index; |
8673 | } |
8674 | else if (vec_index == second_vec_index |
8675 | || second_vec_index == -1) |
8676 | { |
8677 | second_vec_index = vec_index; |
8678 | mask_element += const_nunits; |
8679 | } |
8680 | else |
8681 | { |
8682 | if (dump_p) |
8683 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8684 | "permutation requires at " |
8685 | "least three vectors %G" , |
8686 | stmt_info->stmt); |
8687 | gcc_assert (analyze_only); |
8688 | return false; |
8689 | } |
8690 | |
8691 | gcc_assert (mask_element < 2 * const_nunits); |
8692 | } |
8693 | |
8694 | if (mask_element != index) |
8695 | noop_p = false; |
8696 | mask[index++] = mask_element; |
8697 | |
8698 | if (index == count) |
8699 | { |
8700 | if (!noop_p) |
8701 | { |
8702 | indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits); |
8703 | if (!can_vec_perm_const_p (mode, mode, indices)) |
8704 | { |
8705 | if (dump_p) |
8706 | { |
8707 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8708 | "unsupported vect permute { " ); |
8709 | for (i = 0; i < count; ++i) |
8710 | { |
8711 | dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); |
8712 | dump_printf (MSG_MISSED_OPTIMIZATION, " " ); |
8713 | } |
8714 | dump_printf (MSG_MISSED_OPTIMIZATION, "}\n" ); |
8715 | } |
8716 | gcc_assert (analyze_only); |
8717 | return false; |
8718 | } |
8719 | |
8720 | tree mask_vec = NULL_TREE; |
8721 | if (!analyze_only) |
8722 | mask_vec = vect_gen_perm_mask_checked (vectype, indices); |
8723 | |
8724 | if (second_vec_index == -1) |
8725 | second_vec_index = first_vec_index; |
8726 | |
8727 | for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) |
8728 | { |
8729 | ++*n_perms; |
8730 | if (analyze_only) |
8731 | continue; |
8732 | /* Generate the permute statement if necessary. */ |
8733 | tree first_vec = dr_chain[first_vec_index + ri]; |
8734 | tree second_vec = dr_chain[second_vec_index + ri]; |
8735 | gassign *stmt = as_a<gassign *> (p: stmt_info->stmt); |
8736 | tree perm_dest |
8737 | = vect_create_destination_var (gimple_assign_lhs (gs: stmt), |
8738 | vectype); |
8739 | perm_dest = make_ssa_name (var: perm_dest); |
8740 | gimple *perm_stmt |
8741 | = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec, |
8742 | second_vec, mask_vec); |
8743 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, |
8744 | gsi); |
8745 | if (dce_chain) |
8746 | { |
8747 | bitmap_set_bit (used_defs, first_vec_index + ri); |
8748 | bitmap_set_bit (used_defs, second_vec_index + ri); |
8749 | } |
8750 | |
8751 | /* Store the vector statement in NODE. */ |
8752 | SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest; |
8753 | } |
8754 | } |
8755 | else if (!analyze_only) |
8756 | { |
8757 | for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) |
8758 | { |
8759 | tree first_vec = dr_chain[first_vec_index + ri]; |
8760 | /* If mask was NULL_TREE generate the requested |
8761 | identity transform. */ |
8762 | if (dce_chain) |
8763 | bitmap_set_bit (used_defs, first_vec_index + ri); |
8764 | |
8765 | /* Store the vector statement in NODE. */ |
8766 | SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec; |
8767 | } |
8768 | } |
8769 | |
8770 | index = 0; |
8771 | first_vec_index = -1; |
8772 | second_vec_index = -1; |
8773 | noop_p = true; |
8774 | } |
8775 | } |
8776 | |
8777 | if (n_loads) |
8778 | { |
8779 | if (repeating_p) |
8780 | *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
8781 | else |
8782 | { |
8783 | /* Enforced above when !repeating_p. */ |
8784 | unsigned int const_nunits = nunits.to_constant (); |
8785 | *n_loads = 0; |
8786 | bool load_seen = false; |
8787 | for (unsigned i = 0; i < in_nlanes; ++i) |
8788 | { |
8789 | if (i % const_nunits == 0) |
8790 | { |
8791 | if (load_seen) |
8792 | *n_loads += 1; |
8793 | load_seen = false; |
8794 | } |
8795 | if (bitmap_bit_p (map: used_in_lanes, bitno: i)) |
8796 | load_seen = true; |
8797 | } |
8798 | if (load_seen) |
8799 | *n_loads += 1; |
8800 | } |
8801 | } |
8802 | |
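/* If requested, release the defs in DR_CHAIN that no permute used,
removing also the single-use VIEW_CONVERT_EXPR/CONSTRUCTOR chains
that computed them.  */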
8803 | if (dce_chain) |
8804 | for (unsigned i = 0; i < dr_chain.length (); ++i) |
8805 | if (!bitmap_bit_p (used_defs, i)) |
8806 | { |
8807 | tree def = dr_chain[i]; |
8808 | do |
8809 | { |
8810 | gimple *stmt = SSA_NAME_DEF_STMT (def); |
8811 | if (is_gimple_assign (stmt) |
8812 | && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR |
8813 | || gimple_assign_rhs_code (stmt) == CONSTRUCTOR)) |
8814 | def = single_ssa_tree_operand (stmt, SSA_OP_USE); |
8815 | else |
8816 | def = NULL; |
8817 | gimple_stmt_iterator rgsi = gsi_for_stmt (stmt); |
8818 | gsi_remove (&rgsi, true); |
8819 | release_defs (stmt); |
8820 | } |
8821 | while (def); |
8822 | } |
8823 | |
8824 | return true; |
8825 | } |
8826 | |
8827 | /* Generate vector permute statements from a list of loads in DR_CHAIN. |
8828 | If ANALYZE_ONLY is TRUE, only check that it is possible to create valid |
8829 | permute statements for the SLP node NODE. Store the number of vector |
8830 | permute instructions in *N_PERMS and the number of vector load |
8831 | instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions |
8832 | that were not needed. */ |
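| /* As an illustration (a hypothetical example, not taken from the sources): |
| for a group of four int loads with load permutation { 1, 0, 3, 2 } and |
| V4SI vectors, the mask built is { 1, 0, 3, 2 } applied to a single input |
| vector, so *N_PERMS == 1 and *N_LOADS == 1.  */ |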
8833 | |
8834 | bool |
8835 | vect_transform_slp_perm_load (vec_info *vinfo, |
8836 | slp_tree node, const vec<tree> &dr_chain, |
8837 | gimple_stmt_iterator *gsi, poly_uint64 vf, |
8838 | bool analyze_only, unsigned *n_perms, |
8839 | unsigned int *n_loads, bool dce_chain) |
8840 | { |
8841 | return vect_transform_slp_perm_load_1 (vinfo, node, |
8842 | SLP_TREE_LOAD_PERMUTATION (node), |
8843 | dr_chain, gsi, vf, analyze_only, |
8844 | dump_enabled_p (), n_perms, n_loads, |
8845 | dce_chain); |
8846 | } |
8847 | |
8848 | /* Produce the next vector result for SLP permutation NODE by adding a vector |
8849 | statement at GSI. If MASK_VEC is nonnull, add: |
8850 | |
8851 | <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC> |
8852 | |
8853 | otherwise add: |
8854 | |
8855 | <new SSA name> = FIRST_DEF. */ |
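| /* A hypothetical illustration of the non-trivial identity cases handled |
| below: extracting a V2SI result from a V4SI FIRST_DEF at IDENTITY_OFFSET 2 |
| becomes BIT_FIELD_REF <first_def, 64, 64>, while building a V8SI result |
| from two V4SI inputs becomes a two-element CONSTRUCTOR of FIRST_DEF and |
| SECOND_DEF.  */ |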
8856 | |
8857 | static void |
8858 | vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi, |
8859 | slp_tree node, tree first_def, tree second_def, |
8860 | tree mask_vec, poly_uint64 identity_offset) |
8861 | { |
8862 | tree vectype = SLP_TREE_VECTYPE (node); |
8863 | |
8864 | /* ??? We SLP match existing vector element extracts but |
8865 | allow punning which we need to re-instantiate at uses |
8866 | but have no good way of explicitly representing. */ |
8867 | if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype)) |
8868 | && !types_compatible_p (TREE_TYPE (first_def), vectype)) |
8869 | { |
8870 | gassign *conv_stmt |
8871 | = gimple_build_assign (make_ssa_name (vectype), |
8872 | build1 (VIEW_CONVERT_EXPR, vectype, first_def)); |
8873 | vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi); |
8874 | first_def = gimple_assign_lhs (conv_stmt); |
8875 | } |
8876 | gassign *perm_stmt; |
8877 | tree perm_dest = make_ssa_name (vectype); |
8878 | if (mask_vec) |
8879 | { |
8880 | if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), |
8881 | TYPE_SIZE (vectype)) |
8882 | && !types_compatible_p (TREE_TYPE (second_def), vectype)) |
8883 | { |
8884 | gassign *conv_stmt |
8885 | = gimple_build_assign (make_ssa_name (vectype), |
8886 | build1 (VIEW_CONVERT_EXPR, |
8887 | vectype, second_def)); |
8888 | vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi); |
8889 | second_def = gimple_assign_lhs (conv_stmt); |
8890 | } |
8891 | perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR, |
8892 | first_def, second_def, |
8893 | mask_vec); |
8894 | } |
8895 | else if (!types_compatible_p (TREE_TYPE (first_def), vectype)) |
8896 | { |
8897 | /* For identity permutes we still need to handle the case |
8898 | of offsetted extracts or concats. */ |
8899 | unsigned HOST_WIDE_INT c; |
8900 | auto first_def_nunits |
8901 | = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def)); |
8902 | if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits)) |
8903 | { |
8904 | unsigned HOST_WIDE_INT elsz |
8905 | = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def)))); |
8906 | tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def, |
8907 | TYPE_SIZE (vectype), |
8908 | bitsize_int (identity_offset * elsz)); |
8909 | perm_stmt = gimple_build_assign (perm_dest, lowpart); |
8910 | } |
8911 | else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), |
8912 | first_def_nunits, &c) && c == 2) |
8913 | { |
8914 | tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def, |
8915 | NULL_TREE, second_def); |
8916 | perm_stmt = gimple_build_assign (perm_dest, ctor); |
8917 | } |
8918 | else |
8919 | gcc_unreachable (); |
8920 | } |
8921 | else |
8922 | { |
8923 | /* We need a copy here in case the def was external. */ |
8924 | perm_stmt = gimple_build_assign (perm_dest, first_def); |
8925 | } |
8926 | vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi); |
8927 | /* Store the vector statement in NODE. */ |
8928 | node->push_vec_def (perm_stmt); |
8929 | } |
8930 | |
8931 | /* Subroutine of vectorizable_slp_permutation. Check whether the target |
8932 | can perform permutation PERM on the (1 or 2) input nodes in CHILDREN. |
8933 | If GSI is nonnull, emit the permutation there. |
8934 | |
8935 | When GSI is null, the only purpose of NODE is to give properties |
8936 | of the result, such as the vector type and number of SLP lanes. |
8937 | The node does not need to be a VEC_PERM_EXPR. |
8938 | |
8939 | If the target supports the operation, return the number of individual |
8940 | VEC_PERM_EXPRs needed, otherwise return -1. Print information to the |
8941 | dump file if DUMP_P is true. */ |
8942 | |
8943 | static int |
8944 | vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, |
8945 | slp_tree node, lane_permutation_t &perm, |
8946 | vec<slp_tree> &children, bool dump_p) |
8947 | { |
8948 | tree vectype = SLP_TREE_VECTYPE (node); |
8949 | |
8950 | /* ??? We currently only support all same vector input types |
8951 | while the SLP IL should really do a concat + select and thus accept |
8952 | arbitrary mismatches. */ |
8953 | slp_tree child; |
8954 | unsigned i; |
8955 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); |
8956 | bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node)); |
8957 | tree op_vectype = NULL_TREE; |
8958 | FOR_EACH_VEC_ELT (children, i, child) |
8959 | if (SLP_TREE_VECTYPE (child)) |
8960 | { |
8961 | op_vectype = SLP_TREE_VECTYPE (child); |
8962 | break; |
8963 | } |
8964 | if (!op_vectype) |
8965 | op_vectype = vectype; |
8966 | FOR_EACH_VEC_ELT (children, i, child) |
8967 | { |
8968 | if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def |
8969 | && !vect_maybe_update_slp_op_vectype (child, op_vectype)) |
8970 | || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype) |
8971 | || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype))) |
8972 | { |
8973 | if (dump_p) |
8974 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8975 | "Unsupported vector types in lane permutation\n" ); |
8976 | return -1; |
8977 | } |
8978 | if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node)) |
8979 | repeating_p = false; |
8980 | } |
8981 | |
8982 | gcc_assert (perm.length () == SLP_TREE_LANES (node)); |
8983 | if (dump_p) |
8984 | { |
8985 | dump_printf_loc (MSG_NOTE, vect_location, |
8986 | "vectorizing permutation" ); |
8987 | for (unsigned i = 0; i < perm.length (); ++i) |
8988 | dump_printf (MSG_NOTE, " op%u[%u]" , perm[i].first, perm[i].second); |
8989 | if (repeating_p) |
8990 | dump_printf (MSG_NOTE, " (repeat %d)\n" , SLP_TREE_LANES (node)); |
8991 | dump_printf (MSG_NOTE, "\n" ); |
8992 | } |
8993 | |
8994 | /* REPEATING_P is true if every output vector is guaranteed to use the |
8995 | same permute vector. We can handle that case for both variable-length |
8996 | and constant-length vectors, but we only handle other cases for |
8997 | constant-length vectors. |
8998 | |
8999 | Set: |
9000 | |
9001 | - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute |
9002 | mask vector that we want to build. |
9003 | |
9004 | - NCOPIES to the number of copies of PERM that we need in order |
9005 | to build the necessary permute mask vectors. |
9006 | |
9007 | - NOUTPUTS_PER_MASK to the number of output vectors we want to create |
9008 | for each permute mask vector. This is only relevant when GSI is |
9009 | nonnull. */ |
9010 | uint64_t npatterns; |
9011 | unsigned nelts_per_pattern; |
9012 | uint64_t ncopies; |
9013 | unsigned noutputs_per_mask; |
9014 | if (repeating_p) |
9015 | { |
9016 | /* We need a single permute mask vector that has the form: |
9017 | |
9018 | { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... } |
9019 | |
9020 | In other words, the original n-element permute in PERM is |
9021 | "unrolled" to fill a full vector. The stepped vector encoding |
9022 | that we use for permutes requires 3n elements. */ |
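| /* For instance (illustrative values only), a two-lane reverse |
| { X1, X2 } = { 1, 0 } unrolls to { 1, 0, 3, 2, 5, 4, ... }, encoded |
| with NPATTERNS == 2 and NELTS_PER_PATTERN == 3, independently of the |
| (possibly variable) number of vector elements.  */ |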
9023 | npatterns = SLP_TREE_LANES (node); |
9024 | nelts_per_pattern = ncopies = 3; |
9025 | noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
9026 | } |
9027 | else |
9028 | { |
9029 | /* Calculate every element of every permute mask vector explicitly, |
9030 | instead of relying on the pattern described above. */ |
9031 | if (!nunits.is_constant (&npatterns) |
9032 | || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ()) |
9033 | return -1; |
9034 | nelts_per_pattern = ncopies = 1; |
9035 | if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo)) |
9036 | if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies)) |
9037 | return -1; |
9038 | noutputs_per_mask = 1; |
9039 | } |
9040 | unsigned olanes = ncopies * SLP_TREE_LANES (node); |
9041 | gcc_assert (repeating_p || multiple_p (olanes, nunits)); |
9042 | |
9043 | /* Compute the { { SLP operand, vector index}, lane } permutation sequence |
9044 | from the { SLP operand, scalar lane } permutation as recorded in the |
9045 | SLP node as intermediate step. This part should already work |
9046 | with SLP children with arbitrary number of lanes. */ |
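| /* A made-up example: a four-lane node with perm |
| { op0[0], op0[1], op1[0], op1[1] }, two-lane V2SI children and |
| ncopies == 1 yields the sequence |
| { { 0, 0 }, 0 } { { 0, 0 }, 1 } { { 1, 0 }, 0 } { { 1, 0 }, 1 }, |
| i.e. a concat of vector 0 of child 0 and vector 0 of child 1.  */ |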
9047 | auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm; |
9048 | auto_vec<unsigned> active_lane; |
9049 | vperm.create (olanes); |
9050 | active_lane.safe_grow_cleared (children.length (), true); |
9051 | for (unsigned i = 0; i < ncopies; ++i) |
9052 | { |
9053 | for (unsigned pi = 0; pi < perm.length (); ++pi) |
9054 | { |
9055 | std::pair<unsigned, unsigned> p = perm[pi]; |
9056 | tree vtype = SLP_TREE_VECTYPE (children[p.first]); |
9057 | if (repeating_p) |
9058 | vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]}); |
9059 | else |
9060 | { |
9061 | /* We checked above that the vectors are constant-length. */ |
9062 | unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant (); |
9063 | unsigned vi = (active_lane[p.first] + p.second) / vnunits; |
9064 | unsigned vl = (active_lane[p.first] + p.second) % vnunits; |
9065 | vperm.quick_push ({{p.first, vi}, vl}); |
9066 | } |
9067 | } |
9068 | /* Advance to the next group. */ |
9069 | for (unsigned j = 0; j < children.length (); ++j) |
9070 | active_lane[j] += SLP_TREE_LANES (children[j]); |
9071 | } |
9072 | |
9073 | if (dump_p) |
9074 | { |
9075 | dump_printf_loc (MSG_NOTE, vect_location, |
9076 | "vectorizing permutation" ); |
9077 | for (unsigned i = 0; i < perm.length (); ++i) |
9078 | dump_printf (MSG_NOTE, " op%u[%u]" , perm[i].first, perm[i].second); |
9079 | if (repeating_p) |
9080 | dump_printf (MSG_NOTE, " (repeat %d)\n" , SLP_TREE_LANES (node)); |
9081 | dump_printf (MSG_NOTE, "\n" ); |
9082 | dump_printf_loc (MSG_NOTE, vect_location, "as" ); |
9083 | for (unsigned i = 0; i < vperm.length (); ++i) |
9084 | { |
9085 | if (i != 0 |
9086 | && (repeating_p |
9087 | ? multiple_p (a: i, b: npatterns) |
9088 | : multiple_p (a: i, b: TYPE_VECTOR_SUBPARTS (node: vectype)))) |
9089 | dump_printf (MSG_NOTE, "," ); |
9090 | dump_printf (MSG_NOTE, " vops%u[%u][%u]" , |
9091 | vperm[i].first.first, vperm[i].first.second, |
9092 | vperm[i].second); |
9093 | } |
9094 | dump_printf (MSG_NOTE, "\n" ); |
9095 | } |
9096 | |
9097 | /* We can only handle two-vector permutes, everything else should |
9098 | be lowered on the SLP level. The following is closely inspired |
9099 | by vect_transform_slp_perm_load and is supposed to eventually |
9100 | replace it. |
9101 | ??? As intermediate step do code-gen in the SLP tree representation |
9102 | somehow? */ |
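| /* E.g. an output vector that wants lanes from vops0[0], vops0[1] and |
| vops1[0] draws from three distinct input vectors and is rejected |
| below (hypothetical case for illustration).  */ |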
9103 | std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U); |
9104 | std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U); |
9105 | unsigned int index = 0; |
9106 | poly_uint64 mask_element; |
9107 | vec_perm_builder mask; |
9108 | mask.new_vector (nunits, npatterns, nelts_per_pattern); |
9109 | unsigned int count = mask.encoded_nelts (); |
9110 | mask.quick_grow (count); |
9111 | vec_perm_indices indices; |
9112 | unsigned nperms = 0; |
9113 | for (unsigned i = 0; i < vperm.length (); ++i) |
9114 | { |
9115 | mask_element = vperm[i].second; |
9116 | if (first_vec.first == -1U |
9117 | || first_vec == vperm[i].first) |
9118 | first_vec = vperm[i].first; |
9119 | else if (second_vec.first == -1U |
9120 | || second_vec == vperm[i].first) |
9121 | { |
9122 | second_vec = vperm[i].first; |
9123 | mask_element += nunits; |
9124 | } |
9125 | else |
9126 | { |
9127 | if (dump_p) |
9128 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9129 | "permutation requires at " |
9130 | "least three vectors\n" ); |
9131 | gcc_assert (!gsi); |
9132 | return -1; |
9133 | } |
9134 | |
9135 | mask[index++] = mask_element; |
9136 | |
9137 | if (index == count) |
9138 | { |
9139 | indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, |
9140 | TYPE_VECTOR_SUBPARTS (op_vectype)); |
9141 | bool identity_p = (indices.series_p (0, 1, mask[0], 1) |
9142 | && constant_multiple_p (mask[0], nunits)); |
9143 | machine_mode vmode = TYPE_MODE (vectype); |
9144 | machine_mode op_vmode = TYPE_MODE (op_vectype); |
9145 | unsigned HOST_WIDE_INT c; |
9146 | if ((!identity_p |
9147 | && !can_vec_perm_const_p (vmode, op_vmode, indices)) |
9148 | || (identity_p |
9149 | && !known_le (nunits, |
9150 | TYPE_VECTOR_SUBPARTS (op_vectype)) |
9151 | && (!constant_multiple_p (nunits, |
9152 | TYPE_VECTOR_SUBPARTS (op_vectype), |
9153 | &c) || c != 2))) |
9154 | { |
9155 | if (dump_p) |
9156 | { |
9157 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, |
9158 | vect_location, |
9159 | "unsupported vect permute { " ); |
9160 | for (i = 0; i < count; ++i) |
9161 | { |
9162 | dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); |
9163 | dump_printf (MSG_MISSED_OPTIMIZATION, " " ); |
9164 | } |
9165 | dump_printf (MSG_MISSED_OPTIMIZATION, "}\n" ); |
9166 | } |
9167 | gcc_assert (!gsi); |
9168 | return -1; |
9169 | } |
9170 | |
9171 | if (!identity_p) |
9172 | nperms++; |
9173 | if (gsi) |
9174 | { |
9175 | if (second_vec.first == -1U) |
9176 | second_vec = first_vec; |
9177 | |
9178 | slp_tree |
9179 | first_node = children[first_vec.first], |
9180 | second_node = children[second_vec.first]; |
9181 | |
9182 | tree mask_vec = NULL_TREE; |
9183 | if (!identity_p) |
9184 | mask_vec = vect_gen_perm_mask_checked (vectype, indices); |
9185 | |
9186 | for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi) |
9187 | { |
9188 | tree first_def |
9189 | = vect_get_slp_vect_def (first_node, |
9190 | first_vec.second + vi); |
9191 | tree second_def |
9192 | = vect_get_slp_vect_def (second_node, |
9193 | second_vec.second + vi); |
9194 | vect_add_slp_permutation (vinfo, gsi, node, first_def, |
9195 | second_def, mask_vec, mask[0]); |
9196 | } |
9197 | } |
9198 | |
9199 | index = 0; |
9200 | first_vec = std::make_pair (-1U, -1U); |
9201 | second_vec = std::make_pair (-1U, -1U); |
9202 | } |
9203 | } |
9204 | |
9205 | return nperms; |
9206 | } |
9207 | |
9208 | /* Vectorize the SLP permutations in NODE as specified |
9209 | in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP |
9210 | child number and lane number. |
9211 | Interleaving of two two-lane two-child SLP subtrees (not supported): |
9212 | [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ] |
9213 | A blend of two four-lane two-child SLP subtrees: |
9214 | [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ] |
9215 | Highpart of a four-lane one-child SLP subtree (not supported): |
9216 | [ { 0, 2 }, { 0, 3 } ] |
9217 | Currently only a subset of these is supported by the code generated below. */ |
9218 | |
9219 | static bool |
9220 | vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi, |
9221 | slp_tree node, stmt_vector_for_cost *cost_vec) |
9222 | { |
9223 | tree vectype = SLP_TREE_VECTYPE (node); |
9224 | lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node); |
9225 | int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm, |
9226 | SLP_TREE_CHILDREN (node), |
9227 | dump_enabled_p ()); |
9228 | if (nperms < 0) |
9229 | return false; |
9230 | |
9231 | if (!gsi) |
9232 | record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body); |
9233 | |
9234 | return true; |
9235 | } |
9236 | |
9237 | /* Vectorize SLP NODE. */ |
9238 | |
9239 | static void |
9240 | vect_schedule_slp_node (vec_info *vinfo, |
9241 | slp_tree node, slp_instance instance) |
9242 | { |
9243 | gimple_stmt_iterator si; |
9244 | int i; |
9245 | slp_tree child; |
9246 | |
9247 | /* Vectorize externals and constants. */ |
9248 | if (SLP_TREE_DEF_TYPE (node) == vect_constant_def |
9249 | || SLP_TREE_DEF_TYPE (node) == vect_external_def) |
9250 | { |
9251 | /* ??? vectorizable_shift can end up using a scalar operand which is |
9252 | currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the |
9253 | node in this case. */ |
9254 | if (!SLP_TREE_VECTYPE (node)) |
9255 | return; |
9256 | |
9257 | /* There are two reasons vector defs might already exist. The first |
9258 | is that we are vectorizing an existing vector def. The second is |
9259 | when performing BB vectorization shared constant/external nodes |
9260 | are not split apart during partitioning so during the code-gen |
9261 | DFS walk we can end up visiting them twice. */ |
9262 | if (! SLP_TREE_VEC_DEFS (node).exists ()) |
9263 | vect_create_constant_vectors (vinfo, node); |
9264 | return; |
9265 | } |
9266 | |
9267 | gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ()); |
9268 | |
9269 | stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); |
9270 | |
9271 | gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0); |
9272 | SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); |
9273 | |
9274 | if (dump_enabled_p ()) |
9275 | dump_printf_loc (MSG_NOTE, vect_location, |
9276 | "------>vectorizing SLP node starting from: %G" , |
9277 | stmt_info->stmt); |
9278 | |
9279 | if (STMT_VINFO_DATA_REF (stmt_info) |
9280 | && SLP_TREE_CODE (node) != VEC_PERM_EXPR) |
9281 | { |
9282 | /* Vectorized loads go before the first scalar load to make it |
9283 | ready early, vectorized stores go before the last scalar |
9284 | stmt which is where all uses are ready. */ |
9285 | stmt_vec_info last_stmt_info = NULL; |
9286 | if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
9287 | last_stmt_info = vect_find_first_scalar_stmt_in_slp (node); |
9288 | else /* DR_IS_WRITE */ |
9289 | last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); |
9290 | si = gsi_for_stmt (last_stmt_info->stmt); |
9291 | } |
9292 | else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type |
9293 | || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type |
9294 | || STMT_VINFO_TYPE (stmt_info) == phi_info_type) |
9295 | && SLP_TREE_CODE (node) != VEC_PERM_EXPR) |
9296 | { |
9297 | /* For PHI node vectorization we do not use the insertion iterator. */ |
9298 | si = gsi_none (); |
9299 | } |
9300 | else |
9301 | { |
9302 | /* Emit other stmts after the children's vectorized defs, which is |
9303 | the earliest position possible. */ |
9304 | gimple *last_stmt = NULL; |
9305 | if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) |
9306 | if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
9307 | || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) |
9308 | { |
9309 | /* But avoid scheduling internal defs outside of the loop when |
9310 | we might have only implicitly tracked loop mask/len defs. */ |
9311 | gimple_stmt_iterator si |
9312 | = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header); |
9313 | last_stmt = *si; |
9314 | } |
9315 | bool seen_vector_def = false; |
9316 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
9317 | if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
9318 | { |
9319 | /* For fold-left reductions we are retaining the scalar |
9320 | reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS |
9321 | set so the representation isn't perfect. Resort to the |
9322 | last scalar def here. */ |
9323 | if (SLP_TREE_VEC_DEFS (child).is_empty ()) |
9324 | { |
9325 | gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child)) |
9326 | == cycle_phi_info_type); |
9327 | gphi *phi = as_a <gphi *> |
9328 | (vect_find_last_scalar_stmt_in_slp (child)->stmt); |
9329 | if (!last_stmt |
9330 | || vect_stmt_dominates_stmt_p (last_stmt, phi)) |
9331 | last_stmt = phi; |
9332 | } |
9333 | /* We are emitting all vectorized stmts in the same place and |
9334 | the last one is the last. |
9335 | ??? Unless we have a load permutation applied and that |
9336 | figures to re-use an earlier generated load. */ |
9337 | unsigned j; |
9338 | tree vdef; |
9339 | FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef) |
9340 | { |
9341 | gimple *vstmt = SSA_NAME_DEF_STMT (vdef); |
9342 | if (!last_stmt |
9343 | || vect_stmt_dominates_stmt_p (last_stmt, vstmt)) |
9344 | last_stmt = vstmt; |
9345 | } |
9346 | } |
9347 | else if (!SLP_TREE_VECTYPE (child)) |
9348 | { |
9349 | /* For externals that are used unvectorized look at all their scalar defs. */ |
9350 | unsigned j; |
9351 | tree def; |
9352 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def) |
9353 | if (TREE_CODE (def) == SSA_NAME |
9354 | && !SSA_NAME_IS_DEFAULT_DEF (def)) |
9355 | { |
9356 | gimple *stmt = SSA_NAME_DEF_STMT (def); |
9357 | if (!last_stmt |
9358 | || vect_stmt_dominates_stmt_p (last_stmt, stmt)) |
9359 | last_stmt = stmt; |
9360 | } |
9361 | } |
9362 | else |
9363 | { |
9364 | /* For externals we have to look at all defs since their |
9365 | insertion place is decided per vector. But beware |
9366 | of pre-existing vectors where we need to make sure |
9367 | we do not insert before the region boundary. */ |
9368 | if (SLP_TREE_SCALAR_OPS (child).is_empty () |
9369 | && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0])) |
9370 | seen_vector_def = true; |
9371 | else |
9372 | { |
9373 | unsigned j; |
9374 | tree vdef; |
9375 | FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef) |
9376 | if (TREE_CODE (vdef) == SSA_NAME |
9377 | && !SSA_NAME_IS_DEFAULT_DEF (vdef)) |
9378 | { |
9379 | gimple *vstmt = SSA_NAME_DEF_STMT (vdef); |
9380 | if (!last_stmt |
9381 | || vect_stmt_dominates_stmt_p (last_stmt, vstmt)) |
9382 | last_stmt = vstmt; |
9383 | } |
9384 | } |
9385 | } |
9386 | /* This can happen when all children are pre-existing vectors or |
9387 | constants. */ |
9388 | if (!last_stmt) |
9389 | last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt; |
9390 | if (!last_stmt) |
9391 | { |
9392 | gcc_assert (seen_vector_def); |
9393 | si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]); |
9394 | } |
9395 | else if (is_ctrl_altering_stmt (last_stmt)) |
9396 | { |
9397 | /* We split regions to vectorize at control altering stmts |
9398 | with a definition so this must be an external which |
9399 | we can insert at the start of the region. */ |
9400 | si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]); |
9401 | } |
9402 | else if (is_a <bb_vec_info> (vinfo) |
9403 | && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt) |
9404 | && gimple_could_trap_p (stmt_info->stmt)) |
9405 | { |
9406 | /* We've constrained possibly trapping operations to all come |
9407 | from the same basic-block; if vectorized defs would allow earlier |
9408 | scheduling, still force the vectorized stmts to the original block. |
9409 | This is only necessary for BB vectorization since for loop vect |
9410 | all operations are in a single BB and scalar stmt based |
9411 | placement doesn't play well with epilogue vectorization. */ |
9412 | gcc_assert (dominated_by_p (CDI_DOMINATORS, |
9413 | gimple_bb (stmt_info->stmt), |
9414 | gimple_bb (last_stmt))); |
9415 | si = gsi_after_labels (gimple_bb (stmt_info->stmt)); |
9416 | } |
9417 | else if (is_a <gphi *> (last_stmt)) |
9418 | si = gsi_after_labels (gimple_bb (last_stmt)); |
9419 | else |
9420 | { |
9421 | si = gsi_for_stmt (last_stmt); |
9422 | gsi_next (&si); |
9423 | } |
9424 | } |
9425 | |
9426 | /* Handle purely internal nodes. */ |
9427 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
9428 | { |
9429 | /* ??? the transform kind is stored to STMT_VINFO_TYPE which might |
9430 | be shared with different SLP nodes (but usually it's the same |
9431 | operation apart from the case the stmt is only there for denoting |
9432 | the actual scalar lane defs ...). So do not call vect_transform_stmt |
9433 | but open-code it here (partly). */ |
9434 | bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL); |
9435 | gcc_assert (done); |
9436 | stmt_vec_info slp_stmt_info; |
9437 | unsigned int i; |
9438 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info) |
9439 | if (STMT_VINFO_LIVE_P (slp_stmt_info)) |
9440 | { |
9441 | done = vectorizable_live_operation (vinfo, slp_stmt_info, node, |
9442 | instance, i, true, NULL); |
9443 | gcc_assert (done); |
9444 | } |
9445 | } |
9446 | else |
9447 | vect_transform_stmt (vinfo, stmt_info, &si, node, instance); |
9448 | } |
9449 | |
9450 | /* Replace scalar calls from SLP node NODE with setting of their lhs to zero. |
9451 | For loop vectorization this is done in vectorizable_call, but for SLP |
9452 | it needs to be deferred until end of vect_schedule_slp, because multiple |
9453 | SLP instances may refer to the same scalar stmt. */ |
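| /* Illustrative example: a pure-SLP scalar call  x_1 = foo (y_2);  is |
| rewritten to  x_1 = 0;  below, leaving it to DCE to remove the dead |
| assignment (hypothetical names).  */ |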
9454 | |
9455 | static void |
9456 | vect_remove_slp_scalar_calls (vec_info *vinfo, |
9457 | slp_tree node, hash_set<slp_tree> &visited) |
9458 | { |
9459 | gimple *new_stmt; |
9460 | gimple_stmt_iterator gsi; |
9461 | int i; |
9462 | slp_tree child; |
9463 | tree lhs; |
9464 | stmt_vec_info stmt_info; |
9465 | |
9466 | if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
9467 | return; |
9468 | |
9469 | if (visited.add (node)) |
9470 | return; |
9471 | |
9472 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
9473 | vect_remove_slp_scalar_calls (vinfo, child, visited); |
9474 | |
9475 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
9476 | { |
9477 | gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt); |
9478 | if (!stmt || gimple_bb (stmt) == NULL) |
9479 | continue; |
9480 | if (is_pattern_stmt_p (stmt_info) |
9481 | || !PURE_SLP_STMT (stmt_info)) |
9482 | continue; |
9483 | lhs = gimple_call_lhs (stmt); |
9484 | if (lhs) |
9485 | new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs))); |
9486 | else |
9487 | { |
9488 | new_stmt = gimple_build_nop (); |
9489 | unlink_stmt_vdef (stmt_info->stmt); |
9490 | } |
9491 | gsi = gsi_for_stmt (stmt); |
9492 | vinfo->replace_stmt (&gsi, stmt_info, new_stmt); |
9493 | if (lhs) |
9494 | SSA_NAME_DEF_STMT (lhs) = new_stmt; |
9495 | } |
9496 | } |
9497 | |
9498 | static void |
9499 | vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node) |
9500 | { |
9501 | hash_set<slp_tree> visited; |
9502 | vect_remove_slp_scalar_calls (vinfo, node, visited); |
9503 | } |
9504 | |
9505 | /* Vectorize the instance root. */ |
9506 | |
9507 | void |
9508 | vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance) |
9509 | { |
9510 | gassign *rstmt = NULL; |
9511 | |
9512 | if (instance->kind == slp_inst_kind_ctor) |
9513 | { |
9514 | if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1) |
9515 | { |
9516 | tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0]; |
9517 | tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt); |
9518 | if (!useless_type_conversion_p (TREE_TYPE (root_lhs), |
9519 | TREE_TYPE (vect_lhs))) |
9520 | vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs), |
9521 | vect_lhs); |
9522 | rstmt = gimple_build_assign (root_lhs, vect_lhs); |
9523 | } |
9524 | else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1) |
9525 | { |
9526 | int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
9527 | tree child_def; |
9528 | int j; |
9529 | vec<constructor_elt, va_gc> *v; |
9530 | vec_alloc (v, nelts); |
9531 | |
9532 | /* A CTOR can handle V16HI composition from VNx8HI so we |
9533 | do not need to convert vector elements if the types |
9534 | do not match. */ |
9535 | FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def) |
9536 | CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def); |
9537 | tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt); |
9538 | tree rtype |
9539 | = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt)); |
9540 | tree r_constructor = build_constructor (rtype, v); |
9541 | rstmt = gimple_build_assign (lhs, r_constructor); |
9542 | } |
9543 | } |
9544 | else if (instance->kind == slp_inst_kind_bb_reduc) |
9545 | { |
9546 | /* Largely inspired by reduction chain epilogue handling in |
9547 | vect_create_epilog_for_reduction. */ |
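| /* A rough sketch with hypothetical SSA names: for two vector defs v_1 |
| and v_2 of a signed PLUS reduction the code below builds |
| u_1 = VIEW_CONVERT_EXPR <unsigned vectype> (v_1); |
| u_2 = VIEW_CONVERT_EXPR <unsigned vectype> (v_2); |
| u_3 = u_1 + u_2; |
| s_4 = .REDUC_PLUS (u_3); |
| and finally converts s_4 back to the scalar element type.  */ |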
9548 | vec<tree> vec_defs = vNULL; |
9549 | vect_get_slp_defs (node, &vec_defs); |
9550 | enum tree_code reduc_code |
9551 | = gimple_assign_rhs_code (instance->root_stmts[0]->stmt); |
9552 | /* ??? We actually have to reflect signs somewhere. */ |
9553 | if (reduc_code == MINUS_EXPR) |
9554 | reduc_code = PLUS_EXPR; |
9555 | gimple_seq epilogue = NULL; |
9556 | /* We may end up with more than one vector result, reduce them |
9557 | to one vector. */ |
9558 | tree vec_def = vec_defs[0]; |
9559 | tree vectype = TREE_TYPE (vec_def); |
9560 | tree compute_vectype = vectype; |
9561 | bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype) |
9562 | && TYPE_OVERFLOW_UNDEFINED (vectype) |
9563 | && operation_can_overflow (reduc_code)); |
9564 | if (pun_for_overflow_p) |
9565 | { |
9566 | compute_vectype = unsigned_type_for (vectype); |
9567 | vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR, |
9568 | compute_vectype, vec_def); |
9569 | } |
9570 | for (unsigned i = 1; i < vec_defs.length (); ++i) |
9571 | { |
9572 | tree def = vec_defs[i]; |
9573 | if (pun_for_overflow_p) |
9574 | def = gimple_build (&epilogue, VIEW_CONVERT_EXPR, |
9575 | compute_vectype, def); |
9576 | vec_def = gimple_build (&epilogue, reduc_code, compute_vectype, |
9577 | vec_def, def); |
9578 | } |
9579 | vec_defs.release (); |
9580 | /* ??? Support other schemes than direct internal fn. */ |
9581 | internal_fn reduc_fn; |
9582 | if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn) |
9583 | || reduc_fn == IFN_LAST) |
9584 | gcc_unreachable (); |
9585 | tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn), |
9586 | TREE_TYPE (compute_vectype), vec_def); |
9587 | if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ()) |
9588 | { |
9589 | tree rem_def = NULL_TREE; |
9590 | for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance)) |
9591 | { |
9592 | def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def); |
9593 | if (!rem_def) |
9594 | rem_def = def; |
9595 | else |
9596 | rem_def = gimple_build (&epilogue, reduc_code, |
9597 | TREE_TYPE (scalar_def), |
9598 | rem_def, def); |
9599 | } |
9600 | scalar_def = gimple_build (&epilogue, reduc_code, |
9601 | TREE_TYPE (scalar_def), |
9602 | scalar_def, rem_def); |
9603 | } |
9604 | scalar_def = gimple_convert (&epilogue, |
9605 | TREE_TYPE (vectype), scalar_def); |
9606 | gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt); |
9607 | gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT); |
9608 | gimple_assign_set_rhs_from_tree (&rgsi, scalar_def); |
9609 | update_stmt (gsi_stmt (rgsi)); |
9610 | return; |
9611 | } |
9612 | else |
9613 | gcc_unreachable (); |
9614 | |
9615 | gcc_assert (rstmt); |
9616 | |
9617 | gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt); |
9618 | gsi_replace (&rgsi, rstmt, true); |
9619 | } |
9620 | |
9621 | struct slp_scc_info |
9622 | { |
9623 | bool on_stack; |
9624 | int dfs; |
9625 | int lowlink; |
9626 | }; |
9627 | |
9628 | /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */ |
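| /* The walk below is Tarjan's SCC algorithm: DFS and LOWLINK are the usual |
| discovery index and minimum reachable index, and a node whose LOWLINK |
| equals its own DFS index closes an SCC which is popped from STACK.  */ |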
9629 | |
9630 | static void |
9631 | vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance, |
9632 | hash_map<slp_tree, slp_scc_info> &scc_info, |
9633 | int &maxdfs, vec<slp_tree> &stack) |
9634 | { |
9635 | bool existed_p; |
9636 | slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p); |
9637 | gcc_assert (!existed_p); |
9638 | info->dfs = maxdfs; |
9639 | info->lowlink = maxdfs; |
9640 | maxdfs++; |
9641 | |
9642 | /* Leaf. */ |
9643 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
9644 | { |
9645 | info->on_stack = false; |
9646 | vect_schedule_slp_node (vinfo, node, instance); |
9647 | return; |
9648 | } |
9649 | |
9650 | info->on_stack = true; |
9651 | stack.safe_push (node); |
9652 | |
9653 | unsigned i; |
9654 | slp_tree child; |
9655 | /* DFS recurse. */ |
9656 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
9657 | { |
9658 | if (!child) |
9659 | continue; |
9660 | slp_scc_info *child_info = scc_info.get (child); |
9661 | if (!child_info) |
9662 | { |
9663 | vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack); |
9664 | /* Recursion might have re-allocated the node. */ |
9665 | info = scc_info.get (node); |
9666 | child_info = scc_info.get (child); |
9667 | info->lowlink = MIN (info->lowlink, child_info->lowlink); |
9668 | } |
9669 | else if (child_info->on_stack) |
9670 | info->lowlink = MIN (info->lowlink, child_info->dfs); |
9671 | } |
9672 | if (info->lowlink != info->dfs) |
9673 | return; |
9674 | |
9675 | auto_vec<slp_tree, 4> phis_to_fixup; |
9676 | |
9677 | /* Singleton. */ |
9678 | if (stack.last () == node) |
9679 | { |
9680 | stack.pop (); |
9681 | info->on_stack = false; |
9682 | vect_schedule_slp_node (vinfo, node, instance); |
9683 | if (SLP_TREE_CODE (node) != VEC_PERM_EXPR |
9684 | && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt)) |
9685 | phis_to_fixup.quick_push (node); |
9686 | } |
9687 | else |
9688 | { |
9689 | /* SCC. */ |
9690 | int last_idx = stack.length () - 1; |
9691 | while (stack[last_idx] != node) |
9692 | last_idx--; |
9693 | /* We can break the cycle at PHIs who have at least one child |
9694 | code generated. Then we could re-start the DFS walk until |
9695 | all nodes in the SCC are covered (we might have new entries |
9696 | for only back-reachable nodes). But it's simpler to just |
9697 | iterate and schedule those that are ready. */ |
9698 | unsigned todo = stack.length () - last_idx; |
9699 | do |
9700 | { |
9701 | for (int idx = stack.length () - 1; idx >= last_idx; --idx) |
9702 | { |
9703 | slp_tree entry = stack[idx]; |
9704 | if (!entry) |
9705 | continue; |
9706 | bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR |
9707 | && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt)); |
9708 | bool ready = !phi; |
9709 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child) |
9710 | if (!child) |
9711 | { |
9712 | gcc_assert (phi); |
9713 | ready = true; |
9714 | break; |
9715 | } |
9716 | else if (scc_info.get (child)->on_stack) |
9717 | { |
9718 | if (!phi) |
9719 | { |
9720 | ready = false; |
9721 | break; |
9722 | } |
9723 | } |
9724 | else |
9725 | { |
9726 | if (phi) |
9727 | { |
9728 | ready = true; |
9729 | break; |
9730 | } |
9731 | } |
9732 | if (ready) |
9733 | { |
9734 | vect_schedule_slp_node (vinfo, entry, instance); |
9735 | scc_info.get (entry)->on_stack = false; |
9736 | stack[idx] = NULL; |
9737 | todo--; |
9738 | if (phi) |
9739 | phis_to_fixup.safe_push (entry); |
9740 | } |
9741 | } |
9742 | } |
9743 | while (todo != 0); |
9744 | |
9745 | /* Pop the SCC. */ |
9746 | stack.truncate (last_idx); |
9747 | } |
9748 | |
9749 | /* Now fixup the backedge def of the vectorized PHIs in this SCC. */ |
9750 | slp_tree phi_node; |
9751 | FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node) |
9752 | { |
9753 | gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt); |
9754 | edge_iterator ei; |
9755 | edge e; |
9756 | FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds) |
9757 | { |
9758 | unsigned dest_idx = e->dest_idx; |
9759 | child = SLP_TREE_CHILDREN (phi_node)[dest_idx]; |
9760 | if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def) |
9761 | continue; |
9762 | unsigned n = SLP_TREE_VEC_DEFS (phi_node).length (); |
9763 | /* Simply fill all args. */ |
9764 | if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node)) |
9765 | != vect_first_order_recurrence) |
9766 | for (unsigned i = 0; i < n; ++i) |
9767 | { |
9768 | tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i]; |
9769 | gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef)); |
9770 | add_phi_arg (phi, vect_get_slp_vect_def (child, i), |
9771 | e, gimple_phi_arg_location (phi, dest_idx)); |
9772 | } |
9773 | else |
9774 | { |
9775 | /* Unless it is a first order recurrence which needs |
9776 | args filled in for both the PHI node and the permutes. */ |
9777 | gimple *perm |
9778 | = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]); |
9779 | gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm)); |
9780 | add_phi_arg (as_a <gphi *> (rphi), |
9781 | vect_get_slp_vect_def (child, n - 1), |
9782 | e, gimple_phi_arg_location (phi, dest_idx)); |
9783 | for (unsigned i = 0; i < n; ++i) |
9784 | { |
9785 | gimple *perm |
9786 | = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]); |
9787 | if (i > 0) |
9788 | gimple_assign_set_rhs1 (perm, |
9789 | vect_get_slp_vect_def (child, i - 1)); |
9790 | gimple_assign_set_rhs2 (perm, |
9791 | vect_get_slp_vect_def (child, i)); |
9792 | update_stmt (perm); |
9793 | } |
9794 | } |
9795 | } |
9796 | } |
9797 | } |
9798 | |
9799 | /* Generate vector code for SLP_INSTANCES in the loop/basic block. */ |
9800 | |
9801 | void |
9802 | vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances) |
9803 | { |
9804 | slp_instance instance; |
9805 | unsigned int i; |
9806 | |
9807 | hash_map<slp_tree, slp_scc_info> scc_info; |
9808 | int maxdfs = 0; |
9809 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
9810 | { |
9811 | slp_tree node = SLP_INSTANCE_TREE (instance); |
9812 | if (dump_enabled_p ()) |
9813 | { |
9814 | dump_printf_loc (MSG_NOTE, vect_location, |
9815 | "Vectorizing SLP tree:\n" ); |
9816 | /* ??? Dump all? */ |
9817 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
9818 | dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G" , |
9819 | SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt); |
9820 | vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location, |
9821 | SLP_INSTANCE_TREE (instance)); |
9822 | } |
9823 | /* Schedule the tree of INSTANCE, scheduling SCCs in a way to |
9824 | have a PHI be the node breaking the cycle. */ |
9825 | auto_vec<slp_tree> stack; |
9826 | if (!scc_info.get (node)) |
9827 | vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack); |
9828 | |
9829 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
9830 | vectorize_slp_instance_root_stmt (node, instance); |
9831 | |
9832 | if (dump_enabled_p ()) |
9833 | dump_printf_loc (MSG_NOTE, vect_location, |
9834 | "vectorizing stmts using SLP.\n" ); |
9835 | } |
9836 | |
9837 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
9838 | { |
9839 | slp_tree root = SLP_INSTANCE_TREE (instance); |
9840 | stmt_vec_info store_info; |
9841 | unsigned int j; |
9842 | |
9843 | /* Remove scalar call stmts. Do not do this for basic-block |
9844 | vectorization as not all uses may be vectorized. |
9845 | ??? Why should this be necessary? DCE should be able to |
9846 | remove the stmts itself. |
9847 | ??? For BB vectorization we can as well remove scalar |
9848 | stmts starting from the SLP tree root if they have no |
9849 | uses. */ |
9850 | if (is_a <loop_vec_info> (vinfo)) |
9851 | vect_remove_slp_scalar_calls (vinfo, root); |
9852 | |
9853 | /* Remove the vectorized stores' original scalar stmts. */ |
9854 | for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++) |
9855 | { |
9856 | if (!STMT_VINFO_DATA_REF (store_info) |
9857 | || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info))) |
9858 | break; |
9859 | |
9860 | store_info = vect_orig_stmt (store_info); |
9861 | /* Free the attached stmt_vec_info and remove the stmt. */ |
9862 | vinfo->remove_stmt (store_info); |
9863 | |
9864 | /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it |
9865 | to not crash in vect_free_slp_tree later. */ |
9866 | if (SLP_TREE_REPRESENTATIVE (root) == store_info) |
9867 | SLP_TREE_REPRESENTATIVE (root) = NULL; |
9868 | } |
9869 | } |
9870 | } |
9871 | |