/* Copyright (C) 1988-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "cfgbuild.h"
#include "alias.h"
#include "fold-const.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "explow.h"
#include "expr.h"
#include "cfgrtl.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
#include "tree-pass.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tree-iterator.h"
#include "dbgcnt.h"
#include "case-cfn-macros.h"
#include "dojump.h"
#include "fold-const-call.h"
#include "tree-vrp.h"
#include "tree-ssanames.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
#include "symbol-summary.h"
#include "sreal.h"
#include "ipa-cp.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "debug.h"
#include "dwarf2out.h"
#include "i386-builtins.h"
#include "i386-features.h"
#include "i386-expand.h"

const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
  "savms64",
  "resms64",
  "resms64x",
  "savms64f",
  "resms64f",
  "resms64fx"
};

const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    s_instances:    0        1               2               3
    Offset:                                  realigned or    aligned + 8
    Register        aligned  aligned + 8    aligned w/HFP    w/HFP   */
    XMM15_REG,	/*  0x10     0x18            0x10             0x18  */
    XMM14_REG,	/*  0x20     0x28            0x20             0x28  */
    XMM13_REG,	/*  0x30     0x38            0x30             0x38  */
    XMM12_REG,	/*  0x40     0x48            0x40             0x48  */
    XMM11_REG,	/*  0x50     0x58            0x50             0x58  */
    XMM10_REG,	/*  0x60     0x68            0x60             0x68  */
    XMM9_REG,	/*  0x70     0x78            0x70             0x78  */
    XMM8_REG,	/*  0x80     0x88            0x80             0x88  */
    XMM7_REG,	/*  0x90     0x98            0x90             0x98  */
    XMM6_REG,	/*  0xa0     0xa8            0xa0             0xa8  */
    SI_REG,	/*  0xa8     0xb0            0xa8             0xb0  */
    DI_REG,	/*  0xb0     0xb8            0xb0             0xb8  */
    BX_REG,	/*  0xb8     0xc0            0xb8             0xc0  */
    BP_REG,	/*  0xc0     0xc8            N/A              N/A   */
    R12_REG,	/*  0xc8     0xd0            0xc0             0xc8  */
    R13_REG,	/*  0xd0     0xd8            0xc8             0xd0  */
    R14_REG,	/*  0xd8     0xe0            0xd0             0xd8  */
    R15_REG,	/*  0xe0     0xe8            0xd8             0xe0  */
};

/* Instantiate static const values.  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;

/* Initialize xlogue_layout::s_stub_names to zero.  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				[STUB_NAME_MAX_LEN];

/* Instantiates all xlogue_layout instances.  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),
  xlogue_layout (8, false),
  xlogue_layout (0, true),
  xlogue_layout (8, true)
};

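/* Note (illustrative): the four instances above correspond, in order, to the
   stub sets selected by get_instance below: aligned, aligned plus 8 bytes of
   padding, and the same two layouts when a hard frame pointer is used.  */
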
/* Return an appropriate const instance of xlogue_layout based upon values
   in cfun->machine and crtl.  */
const class xlogue_layout &
xlogue_layout::get_instance ()
{
  enum xlogue_stub_sets stub_set;
  bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;

  if (stack_realign_fp)
    stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else if (frame_pointer_needed)
    stub_set = aligned_plus_8
	       ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
	       : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else
    stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;

  return s_instances[stub_set];
}

/* Determine how many clobbered registers can be saved by the stub.
   Returns the count of registers the stub will save and restore.  */
unsigned
xlogue_layout::count_stub_managed_regs ()
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i, count;
  unsigned regno;

  for (count = i = MIN_REGS; i < MAX_REGS; ++i)
    {
      regno = REG_ORDER[i];
      if (regno == BP_REG && hfp)
	continue;
      if (!ix86_save_reg (regno, false, false))
	break;
      ++count;
    }
  return count;
}

/* Determine if register REGNO is a stub managed register given the
   total COUNT of stub managed registers.  */
bool
xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i;

  for (i = 0; i < count; ++i)
    {
      gcc_assert (i < MAX_REGS);
      if (REG_ORDER[i] == BP_REG && hfp)
	++count;
      else if (REG_ORDER[i] == regno)
	return true;
    }
  return false;
}

/* Constructor for xlogue_layout.  */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
  : m_hfp (hfp), m_nregs (hfp ? 17 : 18),
    m_stack_align_off_in (stack_align_off_in)
{
  HOST_WIDE_INT offset = stack_align_off_in;
  unsigned i, j;

  for (i = j = 0; i < MAX_REGS; ++i)
    {
      unsigned regno = REG_ORDER[i];

      if (regno == BP_REG && hfp)
	continue;
      if (SSE_REGNO_P (regno))
	{
	  offset += 16;
	  /* Verify that SSE regs are always aligned.  */
	  gcc_assert (!((stack_align_off_in + offset) & 15));
	}
      else
	offset += 8;

      m_regs[j].regno = regno;
      m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
    }
  gcc_assert (j == m_nregs);
}

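/* Return the name of stub STUB when MIN_REGS + N_EXTRA_REGS registers are
   managed, building it lazily into s_stub_names on first use.  As an
   illustration, with AVX enabled the save stub names have the form
   "__avx_savms64_<nregs>", otherwise "__sse_savms64_<nregs>".  */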
const char *
xlogue_layout::get_stub_name (enum xlogue_stub stub,
			      unsigned n_extra_regs)
{
  const int have_avx = TARGET_AVX;
  char *name = s_stub_names[!!have_avx][stub][n_extra_regs];

  /* Lazy init */
  if (!*name)
    {
      int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
			  (have_avx ? "avx" : "sse"),
			  STUB_BASE_NAMES[stub],
			  MIN_REGS + n_extra_regs);
      gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
    }

  return name;
}

/* Return rtx of a symbol ref for the entry point (based upon
   cfun->machine->call_ms2sysv_extra_regs) of the specified stub.  */
rtx
xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
{
  const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
  gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
  gcc_assert (stub < XLOGUE_STUB_COUNT);
  gcc_assert (crtl->stack_realign_finalized);

  return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
}

unsigned scalar_chain::max_id = 0;

namespace {

/* Initialize new chain.  */

scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
{
  smode = smode_;
  vmode = vmode_;

  chain_id = ++max_id;

  if (dump_file)
    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);

  bitmap_obstack_initialize (NULL);
  insns = BITMAP_ALLOC (NULL);
  defs = BITMAP_ALLOC (NULL);
  defs_conv = BITMAP_ALLOC (NULL);
  insns_conv = BITMAP_ALLOC (NULL);
  queue = NULL;

  n_sse_to_integer = 0;
  n_integer_to_sse = 0;

  max_visits = x86_stv_max_visits;
}

/* Free chain's data.  */

scalar_chain::~scalar_chain ()
{
  BITMAP_FREE (insns);
  BITMAP_FREE (defs);
  BITMAP_FREE (defs_conv);
  BITMAP_FREE (insns_conv);
  bitmap_obstack_release (NULL);
}

/* Add instruction into chains' queue.  */

void
scalar_chain::add_to_queue (unsigned insn_uid)
{
  if (!bitmap_set_bit (queue, insn_uid))
    return;

  if (dump_file)
    fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
	     insn_uid, chain_id);
}

/* For DImode conversion, mark register defined by DEF as requiring
   conversion.  */

void
scalar_chain::mark_dual_mode_def (df_ref def)
{
  gcc_assert (DF_REF_REG_DEF_P (def));

  /* Record the def/insn pair so we can later efficiently iterate over
     the defs to convert on insns not in the chain.  */
  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    {
      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
	  && !reg_new)
	return;
      n_integer_to_sse++;
    }
  else
    {
      if (!reg_new)
	return;
      n_sse_to_integer++;
    }

  if (dump_file)
    fprintf (dump_file,
	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}

/* Check REF's chain to add new insns into a queue
   and find registers requiring conversion.  Return true if OK, false
   if the analysis was aborted.  */

bool
scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
				      bitmap disallowed)
{
  df_link *chain;
  bool mark_def = false;

  gcc_checking_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)));

  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
    {
      unsigned uid = DF_REF_INSN_UID (chain->ref);

      if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
	continue;

      if (--max_visits == 0)
	return false;

      if (!DF_REF_REG_MEM_P (chain->ref))
	{
	  if (bitmap_bit_p (insns, uid))
	    continue;

	  if (bitmap_bit_p (candidates, uid))
	    {
	      add_to_queue (uid);
	      continue;
	    }

	  /* If we run into parts of an aborted chain discovery, abort.  */
	  if (bitmap_bit_p (disallowed, uid))
	    return false;
	}

      if (DF_REF_REG_DEF_P (chain->ref))
	{
	  if (dump_file)
	    fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (chain->ref);
	}
      else
	{
	  if (dump_file)
	    fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_def = true;
	}
    }

  if (mark_def)
    mark_dual_mode_def (ref);

  return true;
}

/* Add instruction into a chain.  Return true if OK, false if the search
   was aborted.  */

bool
scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
			bitmap disallowed)
{
  if (!bitmap_set_bit (insns, insn_uid))
    return true;

  if (dump_file)
    fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);

  rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
  rtx def_set = single_set (insn);
  if (def_set && REG_P (SET_DEST (def_set))
      && !HARD_REGISTER_P (SET_DEST (def_set)))
    bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));

  /* ??? The following is quadratic since analyze_register_chain
     iterates over all refs to look for dual-mode regs.  Instead this
     should be done separately for all regs mentioned in the chain once.  */
  df_ref ref;
  for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!HARD_REGISTER_P (DF_REF_REG (ref)))
      if (!analyze_register_chain (candidates, ref, disallowed))
	return false;

  /* The operand(s) of VEC_SELECT don't need to be converted/convertible.  */
  if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
    return true;

  for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (!analyze_register_chain (candidates, ref, disallowed))
	return false;

  return true;
}

/* Build new chain starting from insn INSN_UID recursively
   adding all dependent uses and definitions.  Return true if OK, false
   if the chain discovery was aborted.  */

bool
scalar_chain::build (bitmap candidates, unsigned insn_uid, bitmap disallowed)
{
  queue = BITMAP_ALLOC (NULL);
  bitmap_set_bit (queue, insn_uid);

  if (dump_file)
    fprintf (dump_file, "Building chain #%d...\n", chain_id);

  while (!bitmap_empty_p (queue))
    {
      insn_uid = bitmap_first_set_bit (queue);
      bitmap_clear_bit (queue, insn_uid);
      bitmap_clear_bit (candidates, insn_uid);
      if (!add_insn (candidates, insn_uid, disallowed))
	{
	  /* If we aborted the search, put the insns found so far on the
	     set of disallowed insns so that further searches reaching them
	     also abort and thus we abort the whole as-yet-undiscovered
	     chain.  */
	  bitmap_ior_into (disallowed, insns);
	  if (dump_file)
	    fprintf (dump_file, "Aborted chain #%d discovery\n", chain_id);
	  BITMAP_FREE (queue);
	  return false;
	}
    }

  if (dump_file)
    {
      fprintf (dump_file, "Collected chain #%d...\n", chain_id);
      fprintf (dump_file, "  insns: ");
      dump_bitmap (dump_file, insns);
      if (!bitmap_empty_p (defs_conv))
	{
	  bitmap_iterator bi;
	  unsigned id;
	  const char *comma = "";
	  fprintf (dump_file, "  defs to convert: ");
	  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
	    {
	      fprintf (dump_file, "%sr%d", comma, id);
	      comma = ", ";
	    }
	  fprintf (dump_file, "\n");
	}
    }

  BITMAP_FREE (queue);

  return true;
}

/* Return the cost of building a vector constant
   instead of using a scalar one.  */

int
general_scalar_chain::vector_const_cost (rtx exp)
{
  gcc_assert (CONST_INT_P (exp));

  if (standard_sse_constant_p (exp, vmode))
    return ix86_cost->sse_op;
  /* We have separate costs for SImode and DImode, use SImode costs
     for smaller modes.  */
  return ix86_cost->sse_load[smode == DImode ? 1 : 0];
}

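/* Note (illustrative): the per-insn gains computed below mix two kinds of
   units, entries from the ix86_cost tables when optimizing for speed and
   COSTS_N_BYTES size estimates when optimizing for size.  */
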
/* Compute a gain for chain conversion.  */

int
general_scalar_chain::compute_convert_gain ()
{
  bitmap_iterator bi;
  unsigned insn_uid;
  int gain = 0;
  int cost = 0;

  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);

545 /* SSE costs distinguish between SImode and DImode loads/stores, for
546 int costs factor in the number of GPRs involved. When supporting
547 smaller modes than SImode the int load/store costs need to be
548 adjusted as well. */
549 unsigned sse_cost_idx = smode == DImode ? 1 : 0;
550 unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
551
552 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
553 {
554 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
555 rtx def_set = single_set (insn);
556 rtx src = SET_SRC (def_set);
557 rtx dst = SET_DEST (def_set);
558 int igain = 0;
559
560 if (REG_P (src) && REG_P (dst))
561 igain += 2 * m - ix86_cost->xmm_move;
562 else if (REG_P (src) && MEM_P (dst))
563 igain
564 += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
565 else if (MEM_P (src) && REG_P (dst))
566 igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
567 else
568 {
569 /* For operations on memory operands, include the overhead
570 of explicit load and store instructions. */
571 if (MEM_P (dst))
572 igain += optimize_insn_for_size_p ()
573 ? -COSTS_N_BYTES (8)
574 : (m * (ix86_cost->int_load[2]
575 + ix86_cost->int_store[2])
576 - (ix86_cost->sse_load[sse_cost_idx] +
577 ix86_cost->sse_store[sse_cost_idx]));
578
579 switch (GET_CODE (src))
580 {
581 case ASHIFT:
582 case ASHIFTRT:
583 case LSHIFTRT:
584 if (m == 2)
585 {
586 if (INTVAL (XEXP (src, 1)) >= 32)
587 igain += ix86_cost->add;
588 /* Gain for extend highpart case. */
589 else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
590 igain += ix86_cost->shift_const - ix86_cost->sse_op;
591 else
592 igain += ix86_cost->shift_const;
593 }
594
595 igain += ix86_cost->shift_const - ix86_cost->sse_op;
596
597 if (CONST_INT_P (XEXP (src, 0)))
598 igain -= vector_const_cost (XEXP (src, 0));
599 break;
600
601 case ROTATE:
602 case ROTATERT:
603 igain += m * ix86_cost->shift_const;
604 if (TARGET_AVX512VL)
605 igain -= ix86_cost->sse_op;
606 else if (smode == DImode)
607 {
608 int bits = INTVAL (XEXP (src, 1));
609 if ((bits & 0x0f) == 0)
610 igain -= ix86_cost->sse_op;
611 else if ((bits & 0x07) == 0)
612 igain -= 2 * ix86_cost->sse_op;
613 else
614 igain -= 3 * ix86_cost->sse_op;
615 }
616 else if (INTVAL (XEXP (src, 1)) == 16)
617 igain -= ix86_cost->sse_op;
618 else
619 igain -= 2 * ix86_cost->sse_op;
620 break;
621
622 case AND:
623 case IOR:
624 case XOR:
625 case PLUS:
626 case MINUS:
627 igain += m * ix86_cost->add - ix86_cost->sse_op;
628 /* Additional gain for andnot for targets without BMI. */
629 if (GET_CODE (XEXP (src, 0)) == NOT
630 && !TARGET_BMI)
631 igain += m * ix86_cost->add;
632
633 if (CONST_INT_P (XEXP (src, 0)))
634 igain -= vector_const_cost (XEXP (src, 0));
635 if (CONST_INT_P (XEXP (src, 1)))
636 igain -= vector_const_cost (XEXP (src, 1));
637 if (MEM_P (XEXP (src, 1)))
638 {
639 if (optimize_insn_for_size_p ())
640 igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
641 else
642 igain += m * ix86_cost->int_load[2]
643 - ix86_cost->sse_load[sse_cost_idx];
644 }
645 break;
646
647 case NEG:
648 case NOT:
649 igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
650
651 if (GET_CODE (XEXP (src, 0)) != ABS)
652 {
653 igain += m * ix86_cost->add;
654 break;
655 }
656 /* FALLTHRU */
657
658 case ABS:
659 case SMAX:
660 case SMIN:
661 case UMAX:
662 case UMIN:
663 /* We do not have any conditional move cost, estimate it as a
664 reg-reg move. Comparisons are costed as adds. */
665 igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
666 /* Integer SSE ops are all costed the same. */
667 igain -= ix86_cost->sse_op;
668 break;
669
670 case COMPARE:
671 if (XEXP (src, 1) != const0_rtx)
672 {
673 /* cmp vs. pxor;pshufd;ptest. */
674 igain += COSTS_N_INSNS (m - 3);
675 }
676 else if (GET_CODE (XEXP (src, 0)) != AND)
677 {
678 /* test vs. pshufd;ptest. */
679 igain += COSTS_N_INSNS (m - 2);
680 }
681 else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
682 {
683 /* and;test vs. pshufd;ptest. */
684 igain += COSTS_N_INSNS (2 * m - 2);
685 }
686 else if (TARGET_BMI)
687 {
688 /* andn;test vs. pandn;pshufd;ptest. */
689 igain += COSTS_N_INSNS (2 * m - 3);
690 }
691 else
692 {
693 /* not;and;test vs. pandn;pshufd;ptest. */
694 igain += COSTS_N_INSNS (3 * m - 3);
695 }
696 break;
697
698 case CONST_INT:
699 if (REG_P (dst))
700 {
701 if (optimize_insn_for_size_p ())
702 {
703 /* xor (2 bytes) vs. xorps (3 bytes). */
704 if (src == const0_rtx)
705 igain -= COSTS_N_BYTES (1);
706 /* movdi_internal vs. movv2di_internal. */
707 /* => mov (5 bytes) vs. movaps (7 bytes). */
708 else if (x86_64_immediate_operand (src, SImode))
709 igain -= COSTS_N_BYTES (2);
710 else
711 /* ??? Larger immediate constants are placed in the
712 constant pool, where the size benefit/impact of
713 STV conversion is affected by whether and how
714 often each constant pool entry is shared/reused.
715 The value below is empirically derived from the
716 CSiBE benchmark (and the optimal value may drift
717 over time). */
718 igain += COSTS_N_BYTES (0);
719 }
720 else
721 {
722 /* DImode can be immediate for TARGET_64BIT
723 and SImode always. */
724 igain += m * COSTS_N_INSNS (1);
                      igain -= vector_const_cost (src);
726 }
727 }
728 else if (MEM_P (dst))
729 {
730 igain += (m * ix86_cost->int_store[2]
731 - ix86_cost->sse_store[sse_cost_idx]);
                  igain -= vector_const_cost (src);
733 }
734 break;
735
736 case VEC_SELECT:
737 if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
738 {
739 // movd (4 bytes) replaced with movdqa (4 bytes).
740 if (!optimize_insn_for_size_p ())
741 igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
742 }
743 else
744 {
745 // pshufd; movd replaced with pshufd.
746 if (optimize_insn_for_size_p ())
747 igain += COSTS_N_BYTES (4);
748 else
749 igain += ix86_cost->sse_to_integer;
750 }
751 break;
752
753 default:
754 gcc_unreachable ();
755 }
756 }
757
      if (igain != 0 && dump_file)
	{
	  fprintf (dump_file, "  Instruction gain %d for ", igain);
	  dump_insn_slim (dump_file, insn);
	}
      gain += igain;
    }

  if (dump_file)
    fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);
768
769 /* Cost the integer to sse and sse to integer moves. */
770 if (!optimize_function_for_size_p (cfun))
771 {
772 cost += n_sse_to_integer * ix86_cost->sse_to_integer;
773 /* ??? integer_to_sse but we only have that in the RA cost table.
774 Assume sse_to_integer/integer_to_sse are the same which they
775 are at the moment. */
776 cost += n_integer_to_sse * ix86_cost->sse_to_integer;
777 }
778 else if (TARGET_64BIT || smode == SImode)
779 {
780 cost += n_sse_to_integer * COSTS_N_BYTES (4);
781 cost += n_integer_to_sse * COSTS_N_BYTES (4);
782 }
783 else if (TARGET_SSE4_1)
784 {
785 /* vmovd (4 bytes) + vpextrd (6 bytes). */
786 cost += n_sse_to_integer * COSTS_N_BYTES (10);
787 /* vmovd (4 bytes) + vpinsrd (6 bytes). */
788 cost += n_integer_to_sse * COSTS_N_BYTES (10);
789 }
790 else
791 {
792 /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */
793 cost += n_sse_to_integer * COSTS_N_BYTES (13);
794 /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */
795 cost += n_integer_to_sse * COSTS_N_BYTES (12);
796 }
797
  if (dump_file)
    fprintf (dump_file, "  Registers conversion cost: %d\n", cost);

  gain -= cost;

  if (dump_file)
    fprintf (dump_file, "  Total gain: %d\n", gain);

  return gain;
}
808
/* Insert generated conversion instruction sequence INSNS
   after instruction AFTER.  New BB may be required in case
   instruction has EH region attached.  */

void
scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
{
  if (!control_flow_insn_p (after))
    {
      emit_insn_after (insns, after);
      return;
    }

  basic_block bb = BLOCK_FOR_INSN (after);
  edge e = find_fallthru_edge (bb->succs);
  gcc_assert (e);

  basic_block new_bb = split_edge (e);
  emit_insn_after (insns, BB_HEAD (new_bb));
}

} // anon namespace

/* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
   zeroing the upper parts.  */

static rtx
gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
{
  switch (GET_MODE_NUNITS (vmode))
    {
    case 1:
      return gen_rtx_SUBREG (vmode, gpr, 0);
    case 2:
      return gen_rtx_VEC_CONCAT (vmode, gpr,
				 CONST0_RTX (GET_MODE_INNER (vmode)));
    default:
      return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
				CONST0_RTX (vmode),
				GEN_INT (HOST_WIDE_INT_1U));
    }
}
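
/* For example (illustrative only), for vmode == V2DImode the above yields
   (vec_concat:V2DI (reg:DI gpr) (const_int 0)), while for V4SImode it yields
   a VEC_MERGE of a VEC_DUPLICATE with the zero vector; both forms clear the
   elements above the scalar value.  */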

/* Make vector copies for all register REGNO definitions
   and replace its uses in a chain.  */

void
scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
{
  rtx vreg = *defs_map.get (reg);

  start_sequence ();
860 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
861 {
862 rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
863 if (smode == DImode && !TARGET_64BIT)
864 {
865 emit_move_insn (adjust_address (tmp, SImode, 0),
866 gen_rtx_SUBREG (SImode, reg, 0));
867 emit_move_insn (adjust_address (tmp, SImode, 4),
868 gen_rtx_SUBREG (SImode, reg, 4));
869 }
870 else
871 emit_move_insn (copy_rtx (tmp), reg);
872 emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
873 gen_gpr_to_xmm_move_src (vmode, tmp)));
874 }
875 else if (!TARGET_64BIT && smode == DImode)
876 {
877 if (TARGET_SSE4_1)
878 {
879 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
880 CONST0_RTX (V4SImode),
881 gen_rtx_SUBREG (SImode, reg, 0)));
882 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
883 gen_rtx_SUBREG (V4SImode, vreg, 0),
884 gen_rtx_SUBREG (SImode, reg, 4),
885 GEN_INT (2)));
886 }
887 else
888 {
889 rtx tmp = gen_reg_rtx (DImode);
890 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
891 CONST0_RTX (V4SImode),
892 gen_rtx_SUBREG (SImode, reg, 0)));
893 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
894 CONST0_RTX (V4SImode),
895 gen_rtx_SUBREG (SImode, reg, 4)));
896 emit_insn (gen_vec_interleave_lowv4si
897 (gen_rtx_SUBREG (V4SImode, vreg, 0),
898 gen_rtx_SUBREG (V4SImode, vreg, 0),
899 gen_rtx_SUBREG (V4SImode, tmp, 0)));
900 }
901 }
902 else
903 emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
904 gen_gpr_to_xmm_move_src (vmode, reg)));
  rtx_insn *seq = get_insns ();
  end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     "  Copied r%d to a vector register r%d for insn %d\n",
	     REGNO (reg), REGNO (vreg), INSN_UID (insn));
}
914
915/* Copy the definition SRC of INSN inside the chain to DST for
916 scalar uses outside of the chain. */
917
918void
919scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
920{
921 start_sequence ();
922 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
923 {
924 rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
925 emit_move_insn (tmp, src);
926 if (!TARGET_64BIT && smode == DImode)
927 {
928 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
929 adjust_address (tmp, SImode, 0));
930 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
931 adjust_address (tmp, SImode, 4));
932 }
933 else
934 emit_move_insn (dst, copy_rtx (tmp));
935 }
936 else if (!TARGET_64BIT && smode == DImode)
937 {
938 if (TARGET_SSE4_1)
939 {
940 rtx tmp = gen_rtx_PARALLEL (VOIDmode,
941 gen_rtvec (1, const0_rtx));
942 emit_insn
943 (gen_rtx_SET
944 (gen_rtx_SUBREG (SImode, dst, 0),
945 gen_rtx_VEC_SELECT (SImode,
946 gen_rtx_SUBREG (V4SImode, src, 0),
947 tmp)));
948
949 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
950 emit_insn
951 (gen_rtx_SET
952 (gen_rtx_SUBREG (SImode, dst, 4),
953 gen_rtx_VEC_SELECT (SImode,
954 gen_rtx_SUBREG (V4SImode, src, 0),
955 tmp)));
956 }
957 else
958 {
959 rtx vcopy = gen_reg_rtx (V2DImode);
960 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
961 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
962 gen_rtx_SUBREG (SImode, vcopy, 0));
963 emit_move_insn (vcopy,
964 gen_rtx_LSHIFTRT (V2DImode,
965 vcopy, GEN_INT (32)));
966 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
967 gen_rtx_SUBREG (SImode, vcopy, 0));
968 }
969 }
970 else
971 emit_move_insn (dst, src);
972
  rtx_insn *seq = get_insns ();
  end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     "  Copied r%d to a scalar register r%d for insn %d\n",
	     REGNO (src), REGNO (dst), INSN_UID (insn));
}

/* Helper function to convert immediate constant X to vmode.  */
static rtx
smode_convert_cst (rtx x, enum machine_mode vmode)
{
  /* Prefer all ones vector in case of -1.  */
  if (constm1_operand (x, GET_MODE (x)))
    return CONSTM1_RTX (vmode);

  unsigned n = GET_MODE_NUNITS (vmode);
  rtx *v = XALLOCAVEC (rtx, n);
  v[0] = x;
  for (unsigned i = 1; i < n; ++i)
    v[i] = const0_rtx;
  return gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
}

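/* As an illustration, smode_convert_cst (GEN_INT (5), V4SImode) builds
   (const_vector:V4SI [5 0 0 0]), while a constant -1 is canonicalized to
   CONSTM1_RTX (V4SImode), i.e. an all-ones vector.  */
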
999/* Convert operand OP in INSN. We should handle
1000 memory operands and uninitialized registers.
1001 All other register uses are converted during
1002 registers conversion. */
1003
1004void
1005scalar_chain::convert_op (rtx *op, rtx_insn *insn)
1006{
1007 rtx tmp;
1008
1009 if (GET_MODE (*op) == V1TImode)
1010 return;
1011
1012 *op = copy_rtx_if_shared (*op);
1013
  if (GET_CODE (*op) == NOT
      || GET_CODE (*op) == ASHIFT)
    {
      convert_op (&XEXP (*op, 0), insn);
      PUT_MODE (*op, vmode);
    }
1020 else if (MEM_P (*op))
1021 {
1022 rtx_insn *movabs = NULL;
1023
1024 /* Emit MOVABS to load from a 64-bit absolute address to a GPR. */
1025 if (!memory_operand (*op, GET_MODE (*op)))
1026 {
1027 tmp = gen_reg_rtx (GET_MODE (*op));
1028 movabs = emit_insn_before (gen_rtx_SET (tmp, *op), insn);
1029
1030 *op = tmp;
1031 }
1032
1033 tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (GET_MODE (*op)), 0);
1034
1035 rtx_insn *eh_insn
1036 = emit_insn_before (gen_rtx_SET (copy_rtx (tmp),
1037 gen_gpr_to_xmm_move_src (vmode, *op)),
1038 insn);
1039
1040 if (cfun->can_throw_non_call_exceptions)
1041 {
1042 /* Handle REG_EH_REGION note. */
1043 rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
1044 if (note)
1045 {
1046 if (movabs)
1047 eh_insn = movabs;
              control_flow_insns.safe_push (eh_insn);
1049 add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
1050 }
1051 }
1052
1053 *op = tmp;
1054
      if (dump_file)
	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
		 INSN_UID (insn), REGNO (tmp));
1058 }
1059 else if (REG_P (*op))
1060 *op = gen_rtx_SUBREG (vmode, *op, 0);
1061 else if (CONST_SCALAR_INT_P (*op))
1062 {
      rtx vec_cst = smode_convert_cst (*op, vmode);
1064
1065 if (!standard_sse_constant_p (vec_cst, vmode))
1066 {
1067 start_sequence ();
1068 vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
1069 rtx_insn *seq = get_insns ();
1070 end_sequence ();
1071 emit_insn_before (seq, insn);
1072 }
1073
1074 tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);
1075
1076 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
1077 *op = tmp;
1078 }
1079 else
1080 {
1081 gcc_assert (SUBREG_P (*op));
1082 gcc_assert (GET_MODE (*op) == vmode);
1083 }
1084}
1085
1086/* Convert CCZmode COMPARE to vector mode. */
1087
1088rtx
1089scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
1090{
1091 rtx src, tmp;
1092
1093 /* Handle any REG_EQUAL notes. */
1094 tmp = find_reg_equal_equiv_note (insn);
1095 if (tmp)
1096 {
1097 if (GET_CODE (XEXP (tmp, 0)) == COMPARE
1098 && GET_MODE (XEXP (tmp, 0)) == CCZmode
1099 && REG_P (XEXP (XEXP (tmp, 0), 0)))
1100 {
1101 rtx *op = &XEXP (XEXP (tmp, 0), 1);
1102 if (CONST_SCALAR_INT_P (*op))
1103 {
1104 if (constm1_operand (*op, GET_MODE (*op)))
1105 *op = CONSTM1_RTX (vmode);
1106 else
1107 {
1108 unsigned n = GET_MODE_NUNITS (vmode);
1109 rtx *v = XALLOCAVEC (rtx, n);
1110 v[0] = *op;
1111 for (unsigned i = 1; i < n; ++i)
1112 v[i] = const0_rtx;
1113 *op = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1114 }
1115 tmp = NULL_RTX;
1116 }
1117 else if (REG_P (*op))
1118 tmp = NULL_RTX;
1119 }
1120
1121 if (tmp)
1122 remove_note (insn, tmp);
1123 }
1124
1125 /* Comparison against anything other than zero, requires an XOR. */
1126 if (op2 != const0_rtx)
1127 {
      convert_op (&op1, insn);
      convert_op (&op2, insn);
1130 /* If both operands are MEMs, explicitly load the OP1 into TMP. */
1131 if (MEM_P (op1) && MEM_P (op2))
1132 {
1133 tmp = gen_reg_rtx (vmode);
1134 emit_insn_before (gen_rtx_SET (tmp, op1), insn);
1135 src = tmp;
1136 }
1137 else
1138 src = op1;
1139 src = gen_rtx_XOR (vmode, src, op2);
1140 }
1141 else if (GET_CODE (op1) == AND
1142 && GET_CODE (XEXP (op1, 0)) == NOT)
1143 {
1144 rtx op11 = XEXP (XEXP (op1, 0), 0);
1145 rtx op12 = XEXP (op1, 1);
      convert_op (&op11, insn);
      convert_op (&op12, insn);
1148 if (!REG_P (op11))
1149 {
1150 tmp = gen_reg_rtx (vmode);
1151 emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1152 op11 = tmp;
1153 }
1154 src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
1155 }
1156 else if (GET_CODE (op1) == AND)
1157 {
1158 rtx op11 = XEXP (op1, 0);
1159 rtx op12 = XEXP (op1, 1);
      convert_op (&op11, insn);
      convert_op (&op12, insn);
1162 if (!REG_P (op11))
1163 {
1164 tmp = gen_reg_rtx (vmode);
1165 emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1166 op11 = tmp;
1167 }
1168 return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, op11, op12),
1169 UNSPEC_PTEST);
1170 }
1171 else
1172 {
      convert_op (&op1, insn);
1174 src = op1;
1175 }
1176
1177 if (!REG_P (src))
1178 {
1179 tmp = gen_reg_rtx (vmode);
1180 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1181 src = tmp;
1182 }
1183
1184 if (vmode == V2DImode)
1185 {
1186 tmp = gen_reg_rtx (vmode);
1187 emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
1188 src = tmp;
1189 }
1190 else if (vmode == V4SImode)
1191 {
1192 tmp = gen_reg_rtx (vmode);
1193 emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
1194 src = tmp;
1195 }
1196
1197 return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
1198}
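
/* For instance, an equality test (compare:CCZ (reg:DI x) (reg:DI y)) is
   rewritten above into a PTEST (UNSPEC_PTEST) of the XOR of the vectorized
   operands; the interleave/pshufd duplication makes the whole 128-bit
   register zero exactly when the scalar difference is zero.  */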
1199
1200/* Helper function for converting INSN to vector mode. */
1201
1202void
1203scalar_chain::convert_insn_common (rtx_insn *insn)
1204{
1205 /* Generate copies for out-of-chain uses of defs and adjust debug uses. */
1206 for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1207 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
1208 {
1209 df_link *use;
1210 for (use = DF_REF_CHAIN (ref); use; use = use->next)
1211 if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
1212 && (DF_REF_REG_MEM_P (use->ref)
1213 || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
1214 break;
	if (use)
	  convert_reg (insn, DF_REF_REG (ref),
		       *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
1218 else if (MAY_HAVE_DEBUG_BIND_INSNS)
1219 {
1220 /* If we generated a scalar copy we can leave debug-insns
1221 as-is, if not, we have to adjust them. */
	    auto_vec<rtx_insn *, 5> to_reset_debug_insns;
1223 for (use = DF_REF_CHAIN (ref); use; use = use->next)
1224 if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
1225 {
1226 rtx_insn *debug_insn = DF_REF_INSN (use->ref);
1227 /* If there's a reaching definition outside of the
1228 chain we have to reset. */
1229 df_link *def;
1230 for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
1231 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
1232 break;
		  if (def)
		    to_reset_debug_insns.safe_push (debug_insn);
		  else
		    {
		      *DF_REF_REAL_LOC (use->ref)
			= *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
		      df_insn_rescan (debug_insn);
		    }
1240 }
1241 }
1242 /* Have to do the reset outside of the DF_CHAIN walk to not
1243 disrupt it. */
1244 while (!to_reset_debug_insns.is_empty ())
1245 {
1246 rtx_insn *debug_insn = to_reset_debug_insns.pop ();
1247 INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
1248 df_insn_rescan_debug_internal (debug_insn);
1249 }
1250 }
1251 }
1252
1253 /* Replace uses in this insn with the defs we use in the chain. */
1254 for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1255 if (!DF_REF_REG_MEM_P (ref))
      if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
1257 {
1258 /* Also update a corresponding REG_DEAD note. */
1259 rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
1260 if (note)
1261 XEXP (note, 0) = *vreg;
1262 *DF_REF_REAL_LOC (ref) = *vreg;
1263 }
1264}
1265
1266/* Convert INSN which is an SImode or DImode rotation by a constant
1267 to vector mode. CODE is either ROTATE or ROTATERT with operands
1268 OP0 and OP1. Returns the SET_SRC of the last instruction in the
1269 resulting sequence, which is emitted before INSN. */
1270
1271rtx
1272general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1,
1273 rtx_insn *insn)
1274{
1275 int bits = INTVAL (op1);
1276 rtx pat, result;
1277
  convert_op (&op0, insn);
1279 if (bits == 0)
1280 return op0;
1281
1282 if (smode == DImode)
1283 {
1284 if (code == ROTATE)
1285 bits = 64 - bits;
1286 if (bits == 32)
1287 {
1288 rtx tmp1 = gen_reg_rtx (V4SImode);
1289 pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1290 GEN_INT (225));
1291 emit_insn_before (pat, insn);
1292 result = gen_lowpart (V2DImode, tmp1);
1293 }
1294 else if (TARGET_AVX512VL)
1295 result = simplify_gen_binary (code, V2DImode, op0, op1);
1296 else if (bits == 16 || bits == 48)
1297 {
1298 rtx tmp1 = gen_reg_rtx (V8HImode);
1299 pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0),
1300 GEN_INT (bits == 16 ? 57 : 147));
1301 emit_insn_before (pat, insn);
1302 result = gen_lowpart (V2DImode, tmp1);
1303 }
1304 else if ((bits & 0x07) == 0)
1305 {
1306 rtx tmp1 = gen_reg_rtx (V4SImode);
1307 pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1308 GEN_INT (68));
1309 emit_insn_before (pat, insn);
1310 rtx tmp2 = gen_reg_rtx (V1TImode);
1311 pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1),
1312 GEN_INT (bits));
1313 emit_insn_before (pat, insn);
1314 result = gen_lowpart (V2DImode, tmp2);
1315 }
1316 else
1317 {
1318 rtx tmp1 = gen_reg_rtx (V4SImode);
1319 pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1320 GEN_INT (20));
1321 emit_insn_before (pat, insn);
1322 rtx tmp2 = gen_reg_rtx (V2DImode);
1323 pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1324 GEN_INT (bits & 31));
1325 emit_insn_before (pat, insn);
1326 rtx tmp3 = gen_reg_rtx (V4SImode);
1327 pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2),
1328 GEN_INT (bits > 32 ? 34 : 136));
1329 emit_insn_before (pat, insn);
1330 result = gen_lowpart (V2DImode, tmp3);
1331 }
1332 }
1333 else if (bits == 16)
1334 {
1335 rtx tmp1 = gen_reg_rtx (V8HImode);
1336 pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225));
1337 emit_insn_before (pat, insn);
1338 result = gen_lowpart (V4SImode, tmp1);
1339 }
1340 else if (TARGET_AVX512VL)
1341 result = simplify_gen_binary (code, V4SImode, op0, op1);
1342 else
1343 {
1344 if (code == ROTATE)
1345 bits = 32 - bits;
1346
1347 rtx tmp1 = gen_reg_rtx (V4SImode);
1348 emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn);
1349 rtx tmp2 = gen_reg_rtx (V2DImode);
1350 pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1351 GEN_INT (bits));
1352 emit_insn_before (pat, insn);
1353 result = gen_lowpart (V4SImode, tmp2);
1354 }
1355
1356 return result;
1357}
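
/* As a concrete illustration of the above, a DImode rotate by 32 becomes a
   single pshufd with selector 225 (0b11100001), which simply swaps the two
   SImode halves of the value.  */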
1358
1359/* Convert INSN to vector mode. */
1360
1361void
1362general_scalar_chain::convert_insn (rtx_insn *insn)
1363{
1364 rtx def_set = single_set (insn);
1365 rtx src = SET_SRC (def_set);
1366 rtx dst = SET_DEST (def_set);
1367 rtx subreg;
1368
1369 if (MEM_P (dst) && !REG_P (src))
1370 {
1371 /* There are no scalar integer instructions and therefore
1372 temporary register usage is required. */
1373 rtx tmp = gen_reg_rtx (smode);
      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
1375 dst = gen_rtx_SUBREG (vmode, tmp, 0);
1376 }
1377 else if (REG_P (dst) && GET_MODE (dst) == smode)
1378 {
1379 /* Replace the definition with a SUBREG to the definition we
1380 use inside the chain. */
      rtx *vdef = defs_map.get (dst);
1382 if (vdef)
1383 dst = *vdef;
1384 dst = gen_rtx_SUBREG (vmode, dst, 0);
1385 /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
1386 is a non-REG_P. So kill those off. */
1387 rtx note = find_reg_equal_equiv_note (insn);
1388 if (note)
1389 remove_note (insn, note);
1390 }
1391
1392 switch (GET_CODE (src))
1393 {
1394 case PLUS:
1395 case MINUS:
1396 case IOR:
1397 case XOR:
1398 case AND:
1399 case SMAX:
1400 case SMIN:
1401 case UMAX:
1402 case UMIN:
      convert_op (&XEXP (src, 1), insn);
1404 /* FALLTHRU */
1405
1406 case ABS:
1407 case ASHIFT:
1408 case ASHIFTRT:
1409 case LSHIFTRT:
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, vmode);
1412 break;
1413
1414 case ROTATE:
1415 case ROTATERT:
1416 src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1),
1417 insn);
1418 break;
1419
1420 case NEG:
1421 src = XEXP (src, 0);
1422
1423 if (GET_CODE (src) == ABS)
1424 {
1425 src = XEXP (src, 0);
	  convert_op (&src, insn);
1427 subreg = gen_reg_rtx (vmode);
1428 emit_insn_before (gen_rtx_SET (subreg,
1429 gen_rtx_ABS (vmode, src)), insn);
1430 src = subreg;
1431 }
1432 else
	convert_op (&src, insn);
1434
1435 subreg = gen_reg_rtx (vmode);
1436 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
1437 src = gen_rtx_MINUS (vmode, subreg, src);
1438 break;
1439
1440 case NOT:
1441 src = XEXP (src, 0);
      convert_op (&src, insn);
1443 subreg = gen_reg_rtx (vmode);
1444 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
1445 src = gen_rtx_XOR (vmode, src, subreg);
1446 break;
1447
1448 case MEM:
1449 if (!REG_P (dst))
	convert_op (&src, insn);
1451 break;
1452
1453 case REG:
1454 if (!MEM_P (dst))
	convert_op (&src, insn);
1456 break;
1457
1458 case SUBREG:
1459 gcc_assert (GET_MODE (src) == vmode);
1460 break;
1461
1462 case COMPARE:
1463 dst = gen_rtx_REG (CCZmode, FLAGS_REG);
1464 src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
1465 break;
1466
1467 case CONST_INT:
      convert_op (&src, insn);
1469 break;
1470
1471 case VEC_SELECT:
1472 if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
1473 src = XEXP (src, 0);
1474 else if (smode == DImode)
1475 {
1476 rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
1477 dst = gen_lowpart (V1TImode, dst);
1478 src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
1479 }
1480 else
1481 {
1482 rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
1483 rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
1484 rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
1485 src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
1486 }
1487 break;
1488
1489 default:
1490 gcc_unreachable ();
1491 }
1492
1493 SET_SRC (def_set) = src;
1494 SET_DEST (def_set) = dst;
1495
1496 /* Drop possible dead definitions. */
1497 PATTERN (insn) = def_set;
1498
1499 INSN_CODE (insn) = -1;
1500 int patt = recog_memoized (insn);
1501 if (patt == -1)
1502 fatal_insn_not_found (insn);
1503 df_insn_rescan (insn);
1504}
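
/* Note (illustrative): the VEC_SELECT arm above corresponds to the highpart
   extractions admitted by the candidate check; selecting element 0
   degenerates into the vector register itself, while a DImode highpart is
   lowered to a V1TImode logical shift right by 64 bits.  */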
1505
1506/* Compute a gain for chain conversion. */
1507
1508int
1509timode_scalar_chain::compute_convert_gain ()
1510{
1511 /* Assume that if we have to move TImode values between units,
1512 then transforming this chain isn't worth it. */
1513 if (n_sse_to_integer || n_integer_to_sse)
1514 return -1;
1515
1516 bitmap_iterator bi;
1517 unsigned insn_uid;
1518
1519 /* Split ties to prefer V1TImode when not optimizing for size. */
1520 int gain = optimize_size ? 0 : 1;
1521
  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1524
1525 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1526 {
1527 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1528 rtx def_set = single_set (insn);
1529 rtx src = SET_SRC (def_set);
1530 rtx dst = SET_DEST (def_set);
1531 HOST_WIDE_INT op1val;
1532 int scost, vcost;
1533 int igain = 0;
1534
1535 switch (GET_CODE (src))
1536 {
1537 case REG:
1538 if (optimize_insn_for_size_p ())
1539 igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
1540 else
1541 igain = COSTS_N_INSNS (1);
1542 break;
1543
1544 case MEM:
1545 igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (7)
1546 : COSTS_N_INSNS (1);
1547 break;
1548
1549 case CONST_INT:
1550 if (MEM_P (dst)
1551 && standard_sse_constant_p (src, V1TImode))
1552 igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (11) : 1;
1553 break;
1554
1555 case NOT:
1556 if (MEM_P (dst))
1557 igain = -COSTS_N_INSNS (1);
1558 break;
1559
1560 case AND:
1561 case XOR:
1562 case IOR:
1563 if (!MEM_P (dst))
1564 igain = COSTS_N_INSNS (1);
1565 break;
1566
1567 case ASHIFT:
1568 case LSHIFTRT:
1569 /* See ix86_expand_v1ti_shift. */
1570 op1val = INTVAL (XEXP (src, 1));
1571 if (optimize_insn_for_size_p ())
1572 {
1573 if (op1val == 64 || op1val == 65)
1574 scost = COSTS_N_BYTES (5);
1575 else if (op1val >= 66)
1576 scost = COSTS_N_BYTES (6);
1577 else if (op1val == 1)
1578 scost = COSTS_N_BYTES (8);
1579 else
1580 scost = COSTS_N_BYTES (9);
1581
1582 if ((op1val & 7) == 0)
1583 vcost = COSTS_N_BYTES (5);
1584 else if (op1val > 64)
1585 vcost = COSTS_N_BYTES (10);
1586 else
1587 vcost = TARGET_AVX ? COSTS_N_BYTES (19) : COSTS_N_BYTES (23);
1588 }
1589 else
1590 {
1591 scost = COSTS_N_INSNS (2);
1592 if ((op1val & 7) == 0)
1593 vcost = COSTS_N_INSNS (1);
1594 else if (op1val > 64)
1595 vcost = COSTS_N_INSNS (2);
1596 else
1597 vcost = TARGET_AVX ? COSTS_N_INSNS (4) : COSTS_N_INSNS (5);
1598 }
1599 igain = scost - vcost;
1600 break;
1601
1602 case ASHIFTRT:
1603 /* See ix86_expand_v1ti_ashiftrt. */
1604 op1val = INTVAL (XEXP (src, 1));
1605 if (optimize_insn_for_size_p ())
1606 {
1607 if (op1val == 64 || op1val == 127)
1608 scost = COSTS_N_BYTES (7);
1609 else if (op1val == 1)
1610 scost = COSTS_N_BYTES (8);
1611 else if (op1val == 65)
1612 scost = COSTS_N_BYTES (10);
1613 else if (op1val >= 66)
1614 scost = COSTS_N_BYTES (11);
1615 else
1616 scost = COSTS_N_BYTES (9);
1617
1618 if (op1val == 127)
1619 vcost = COSTS_N_BYTES (10);
1620 else if (op1val == 64)
1621 vcost = COSTS_N_BYTES (14);
1622 else if (op1val == 96)
1623 vcost = COSTS_N_BYTES (18);
1624 else if (op1val >= 111)
1625 vcost = COSTS_N_BYTES (15);
1626 else if (TARGET_AVX2 && op1val == 32)
1627 vcost = COSTS_N_BYTES (16);
1628 else if (TARGET_SSE4_1 && op1val == 32)
1629 vcost = COSTS_N_BYTES (20);
1630 else if (op1val >= 96)
1631 vcost = COSTS_N_BYTES (23);
1632 else if ((op1val & 7) == 0)
1633 vcost = COSTS_N_BYTES (28);
1634 else if (TARGET_AVX2 && op1val < 32)
1635 vcost = COSTS_N_BYTES (30);
1636 else if (op1val == 1 || op1val >= 64)
1637 vcost = COSTS_N_BYTES (42);
1638 else
1639 vcost = COSTS_N_BYTES (47);
1640 }
1641 else
1642 {
1643 if (op1val >= 65 && op1val <= 126)
1644 scost = COSTS_N_INSNS (3);
1645 else
1646 scost = COSTS_N_INSNS (2);
1647
1648 if (op1val == 127)
1649 vcost = COSTS_N_INSNS (2);
1650 else if (op1val == 64)
1651 vcost = COSTS_N_INSNS (3);
1652 else if (op1val == 96)
1653 vcost = COSTS_N_INSNS (4);
1654 else if (op1val >= 111)
1655 vcost = COSTS_N_INSNS (3);
1656 else if (TARGET_AVX2 && op1val == 32)
1657 vcost = COSTS_N_INSNS (3);
1658 else if (TARGET_SSE4_1 && op1val == 32)
1659 vcost = COSTS_N_INSNS (4);
1660 else if (op1val >= 96)
1661 vcost = COSTS_N_INSNS (5);
1662 else if ((op1val & 7) == 0)
1663 vcost = COSTS_N_INSNS (6);
1664 else if (TARGET_AVX2 && op1val < 32)
1665 vcost = COSTS_N_INSNS (6);
1666 else if (op1val == 1 || op1val >= 64)
1667 vcost = COSTS_N_INSNS (9);
1668 else
1669 vcost = COSTS_N_INSNS (10);
1670 }
1671 igain = scost - vcost;
1672 break;
1673
1674 case ROTATE:
1675 case ROTATERT:
1676 /* See ix86_expand_v1ti_rotate. */
1677 op1val = INTVAL (XEXP (src, 1));
1678 if (optimize_insn_for_size_p ())
1679 {
1680 scost = COSTS_N_BYTES (13);
1681 if ((op1val & 31) == 0)
1682 vcost = COSTS_N_BYTES (5);
1683 else if ((op1val & 7) == 0)
1684 vcost = TARGET_AVX ? COSTS_N_BYTES (13) : COSTS_N_BYTES (18);
1685 else if (op1val > 32 && op1val < 96)
1686 vcost = COSTS_N_BYTES (24);
1687 else
1688 vcost = COSTS_N_BYTES (19);
1689 }
1690 else
1691 {
1692 scost = COSTS_N_INSNS (3);
1693 if ((op1val & 31) == 0)
1694 vcost = COSTS_N_INSNS (1);
1695 else if ((op1val & 7) == 0)
1696 vcost = TARGET_AVX ? COSTS_N_INSNS (3) : COSTS_N_INSNS (4);
1697 else if (op1val > 32 && op1val < 96)
1698 vcost = COSTS_N_INSNS (5);
1699 else
1700 vcost = COSTS_N_INSNS (1);
1701 }
1702 igain = scost - vcost;
1703 break;
1704
1705 case COMPARE:
1706 if (XEXP (src, 1) == const0_rtx)
1707 {
1708 if (GET_CODE (XEXP (src, 0)) == AND)
1709 /* and;and;or (9 bytes) vs. ptest (5 bytes). */
1710 igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (4)
1711 : COSTS_N_INSNS (2);
1712 /* or (3 bytes) vs. ptest (5 bytes). */
1713 else if (optimize_insn_for_size_p ())
1714 igain = -COSTS_N_BYTES (2);
1715 }
1716 else if (XEXP (src, 1) == const1_rtx)
1717 /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */
1718 igain = optimize_insn_for_size_p() ? -COSTS_N_BYTES (6)
1719 : -COSTS_N_INSNS (1);
1720 break;
1721
1722 default:
1723 break;
1724 }
1725
      if (igain != 0 && dump_file)
	{
	  fprintf (dump_file, "  Instruction gain %d for ", igain);
	  dump_insn_slim (dump_file, insn);
	}
1731 gain += igain;
1732 }
1733
  if (dump_file)
    fprintf (dump_file, "  Total gain: %d\n", gain);
1736
1737 return gain;
1738}
1739
1740/* Fix uses of converted REG in debug insns. */
1741
1742void
1743timode_scalar_chain::fix_debug_reg_uses (rtx reg)
1744{
1745 if (!flag_var_tracking)
1746 return;
1747
1748 df_ref ref, next;
1749 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
1750 {
1751 rtx_insn *insn = DF_REF_INSN (ref);
1752 /* Make sure the next ref is for a different instruction,
1753 so that we're not affected by the rescan. */
1754 next = DF_REF_NEXT_REG (ref);
1755 while (next && DF_REF_INSN (next) == insn)
1756 next = DF_REF_NEXT_REG (next);
1757
1758 if (DEBUG_INSN_P (insn))
1759 {
1760 /* It may be a debug insn with a TImode variable in
1761 register. */
1762 bool changed = false;
1763 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
1764 {
1765 rtx *loc = DF_REF_LOC (ref);
1766 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
1767 {
1768 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
1769 changed = true;
1770 }
1771 }
1772 if (changed)
1773 df_insn_rescan (insn);
1774 }
1775 }
1776}
1777
/* Convert INSN from TImode to V1TImode.  */
1779
1780void
1781timode_scalar_chain::convert_insn (rtx_insn *insn)
1782{
1783 rtx def_set = single_set (insn);
1784 rtx src = SET_SRC (def_set);
1785 rtx dst = SET_DEST (def_set);
1786 rtx tmp;
1787
1788 switch (GET_CODE (dst))
1789 {
1790 case REG:
1791 if (GET_MODE (dst) == TImode)
1792 {
	  PUT_MODE (dst, V1TImode);
	  fix_debug_reg_uses (dst);
1795 }
1796 if (GET_MODE (dst) == V1TImode)
1797 {
1798 /* It might potentially be helpful to convert REG_EQUAL notes,
1799 but for now we just remove them. */
1800 rtx note = find_reg_equal_equiv_note (insn);
1801 if (note)
1802 remove_note (insn, note);
1803 }
1804 break;
1805 case MEM:
      PUT_MODE (dst, V1TImode);
1807 break;
1808
1809 default:
1810 gcc_unreachable ();
1811 }
1812
1813 switch (GET_CODE (src))
1814 {
1815 case REG:
1816 if (GET_MODE (src) == TImode)
1817 {
	  PUT_MODE (src, V1TImode);
	  fix_debug_reg_uses (src);
1820 }
1821 break;
1822
1823 case MEM:
      PUT_MODE (src, V1TImode);
1825 break;
1826
1827 case CONST_WIDE_INT:
1828 if (NONDEBUG_INSN_P (insn))
1829 {
1830 /* Since there are no instructions to store 128-bit constant,
1831 temporary register usage is required. */
1832 bool use_move;
1833 start_sequence ();
	  tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
1835 if (tmp)
1836 {
	      src = lowpart_subreg (V1TImode, tmp, TImode);
1838 use_move = true;
1839 }
1840 else
1841 {
	      src = smode_convert_cst (src, V1TImode);
1843 src = validize_mem (force_const_mem (V1TImode, src));
1844 use_move = MEM_P (dst);
1845 }
1846 rtx_insn *seq = get_insns ();
1847 end_sequence ();
1848 if (seq)
1849 emit_insn_before (seq, insn);
1850 if (use_move)
1851 {
1852 tmp = gen_reg_rtx (V1TImode);
1853 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1854 src = tmp;
1855 }
1856 }
1857 break;
1858
1859 case CONST_INT:
1860 switch (standard_sse_constant_p (src, TImode))
1861 {
1862 case 1:
1863 src = CONST0_RTX (GET_MODE (dst));
1864 break;
1865 case 2:
1866 src = CONSTM1_RTX (GET_MODE (dst));
1867 break;
1868 default:
1869 gcc_unreachable ();
1870 }
1871 if (MEM_P (dst))
1872 {
1873 tmp = gen_reg_rtx (V1TImode);
1874 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1875 src = tmp;
1876 }
1877 break;
1878
1879 case AND:
1880 if (GET_CODE (XEXP (src, 0)) == NOT)
1881 {
	  convert_op (&XEXP (XEXP (src, 0), 0), insn);
	  convert_op (&XEXP (src, 1), insn);
	  PUT_MODE (XEXP (src, 0), V1TImode);
	  PUT_MODE (src, V1TImode);
1886 break;
1887 }
1888 /* FALLTHRU */
1889
1890 case XOR:
1891 case IOR:
      convert_op (&XEXP (src, 0), insn);
      convert_op (&XEXP (src, 1), insn);
      PUT_MODE (src, V1TImode);
1895 if (MEM_P (dst))
1896 {
1897 tmp = gen_reg_rtx (V1TImode);
1898 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1899 src = tmp;
1900 }
1901 break;
1902
1903 case NOT:
1904 src = XEXP (src, 0);
      convert_op (&src, insn);
1906 tmp = gen_reg_rtx (V1TImode);
1907 emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
1908 src = gen_rtx_XOR (V1TImode, src, tmp);
1909 if (MEM_P (dst))
1910 {
1911 tmp = gen_reg_rtx (V1TImode);
1912 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1913 src = tmp;
1914 }
1915 break;
1916
1917 case COMPARE:
1918 dst = gen_rtx_REG (CCZmode, FLAGS_REG);
1919 src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
1920 break;
1921
1922 case ASHIFT:
1923 case LSHIFTRT:
1924 case ASHIFTRT:
1925 case ROTATERT:
1926 case ROTATE:
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, V1TImode);
1929 break;
1930
1931 default:
1932 gcc_unreachable ();
1933 }
1934
1935 SET_SRC (def_set) = src;
1936 SET_DEST (def_set) = dst;
1937
1938 /* Drop possible dead definitions. */
1939 PATTERN (insn) = def_set;
1940
1941 INSN_CODE (insn) = -1;
1942 recog_memoized (insn);
1943 df_insn_rescan (insn);
1944}
1945
1946/* Generate copies from defs used by the chain but not defined therein.
1947 Also populates defs_map which is used later by convert_insn. */
1948
1949void
1950scalar_chain::convert_registers ()
1951{
1952 bitmap_iterator bi;
1953 unsigned id;
1954 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1955 {
1956 rtx chain_reg = gen_reg_rtx (smode);
      defs_map.put (regno_reg_rtx[id], chain_reg);
1958 }
1959 EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
1960 for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
1961 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
1962 make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
1963}
1964
1965/* Convert whole chain creating required register
1966 conversions and copies. */
1967
1968int
1969scalar_chain::convert ()
1970{
1971 bitmap_iterator bi;
1972 unsigned id;
1973 int converted_insns = 0;
1974
  if (!dbg_cnt (stv_conversion))
1976 return 0;
1977
  if (dump_file)
    fprintf (dump_file, "Converting chain #%d...\n", chain_id);
1980
1981 convert_registers ();
1982
1983 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
1984 {
1985 rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
1986 convert_insn_common (insn);
1987 convert_insn (insn);
1988 converted_insns++;
1989 }
1990
1991 return converted_insns;
1992}
1993
1994/* Return the SET expression if INSN doesn't reference hard register.
1995 Return NULL if INSN uses or defines a hard register, excluding
1996 pseudo register pushes, hard register uses in a memory address,
1997 clobbers and flags definitions. */
1998
1999static rtx
2000pseudo_reg_set (rtx_insn *insn)
2001{
2002 rtx set = single_set (insn);
2003 if (!set)
2004 return NULL;
2005
2006 /* Check pseudo register push first. */
2007 machine_mode mode = TARGET_64BIT ? TImode : DImode;
2008 if (REG_P (SET_SRC (set))
2009 && !HARD_REGISTER_P (SET_SRC (set))
2010 && push_operand (SET_DEST (set), mode))
2011 return set;
2012
2013 df_ref ref;
2014 FOR_EACH_INSN_DEF (ref, insn)
2015 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2016 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2017 && DF_REF_REGNO (ref) != FLAGS_REG)
2018 return NULL;
2019
2020 FOR_EACH_INSN_USE (ref, insn)
2021 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2022 return NULL;
2023
2024 return set;
2025}
2026
2027/* Return true if the register REG is defined in a single DEF chain.
2028 If it is defined in more than one DEF chains, we may not be able
2029 to convert it in all chains. */
2030
2031static bool
2032single_def_chain_p (rtx reg)
2033{
2034 df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
2035 if (!ref)
2036 return false;
2037 return DF_REF_NEXT_REG (ref) == nullptr;
2038}
2039
2040/* Check if comparison INSN may be transformed into vector comparison.
2041 Currently we transform equality/inequality checks which look like:
2042 (set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y))) */
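/* For instance (a rough illustration, not an exhaustive list): on x86-64
   an equality test between two __int128 values, or a test of the form
   "(x & y) == 0", matches one of the patterns checked below and can
   typically be carried out with a single SSE4.1 PTEST sequence instead
   of a pair of 64-bit compares.  */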
2043
2044static bool
2045convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
2046{
2047 if (mode != (TARGET_64BIT ? TImode : DImode))
2048 return false;
2049
2050 if (!TARGET_SSE4_1)
2051 return false;
2052
2053 rtx def_set = single_set (insn);
2054
2055 gcc_assert (def_set);
2056
2057 rtx src = SET_SRC (def_set);
2058 rtx dst = SET_DEST (def_set);
2059
2060 gcc_assert (GET_CODE (src) == COMPARE);
2061
2062 if (GET_CODE (dst) != REG
2063 || REGNO (dst) != FLAGS_REG
2064 || GET_MODE (dst) != CCZmode)
2065 return false;
2066
2067 rtx op1 = XEXP (src, 0);
2068 rtx op2 = XEXP (src, 1);
2069
2070 /* *cmp<dwi>_doubleword. */
2071 if ((CONST_SCALAR_INT_P (op1)
2072 || ((REG_P (op1) || MEM_P (op1))
2073 && GET_MODE (op1) == mode))
2074 && (CONST_SCALAR_INT_P (op2)
2075 || ((REG_P (op2) || MEM_P (op2))
2076 && GET_MODE (op2) == mode)))
2077 return true;
2078
2079 /* *testti_doubleword. */
2080 if (op2 == const0_rtx
2081 && GET_CODE (op1) == AND
2082 && REG_P (XEXP (op1, 0)))
2083 {
2084 rtx op12 = XEXP (op1, 1);
2085 return GET_MODE (XEXP (op1, 0)) == TImode
2086 && (CONST_SCALAR_INT_P (op12)
2087 || ((REG_P (op12) || MEM_P (op12))
2088 && GET_MODE (op12) == TImode));
2089 }
2090
2091 /* *test<dwi>_not_doubleword. */
2092 if (op2 == const0_rtx
2093 && GET_CODE (op1) == AND
2094 && GET_CODE (XEXP (op1, 0)) == NOT)
2095 {
2096 rtx op11 = XEXP (XEXP (op1, 0), 0);
2097 rtx op12 = XEXP (op1, 1);
2098 return (REG_P (op11) || MEM_P (op11))
2099 && (REG_P (op12) || MEM_P (op12))
2100 && GET_MODE (op11) == mode
2101 && GET_MODE (op12) == mode;
2102 }
2103
2104 return false;
2105}
2106
2107/* The general version of scalar_to_vector_candidate_p. */
2108
2109static bool
2110general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
2111{
2112 rtx def_set = pseudo_reg_set (insn);
2113
2114 if (!def_set)
2115 return false;
2116
2117 rtx src = SET_SRC (def_set);
2118 rtx dst = SET_DEST (def_set);
2119
2120 if (GET_CODE (src) == COMPARE)
2121 return convertible_comparison_p (insn, mode);
2122
2123 /* We are interested in "mode" only. */
2124 if ((GET_MODE (src) != mode
2125 && !CONST_INT_P (src))
2126 || GET_MODE (dst) != mode)
2127 return false;
2128
2129 if (!REG_P (dst) && !MEM_P (dst))
2130 return false;
2131
2132 switch (GET_CODE (src))
2133 {
2134 case ASHIFTRT:
2135 if (mode == DImode && !TARGET_AVX512VL)
2136 return false;
2137 /* FALLTHRU */
2138
2139 case ASHIFT:
2140 case LSHIFTRT:
2141 case ROTATE:
2142 case ROTATERT:
2143 if (!CONST_INT_P (XEXP (src, 1))
2144 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
2145 return false;
2146
2147 /* Check for extend highpart case. */
2148 if (mode != DImode
2149 || GET_CODE (src) != ASHIFTRT
2150 || GET_CODE (XEXP (src, 0)) != ASHIFT)
2151 break;
2152
2153 src = XEXP (src, 0);
2154 break;
2155
2156 case SMAX:
2157 case SMIN:
2158 case UMAX:
2159 case UMIN:
2160 if ((mode == DImode && !TARGET_AVX512VL)
2161 || (mode == SImode && !TARGET_SSE4_1))
2162 return false;
2163 /* Fallthru. */
2164
2165 case AND:
2166 case IOR:
2167 case XOR:
2168 case PLUS:
2169 case MINUS:
2170 if (!REG_P (XEXP (src, 1))
2171 && !MEM_P (XEXP (src, 1))
2172 && !CONST_INT_P (XEXP (src, 1)))
2173 return false;
2174
2175 if (GET_MODE (XEXP (src, 1)) != mode
2176 && !CONST_INT_P (XEXP (src, 1)))
2177 return false;
2178
2179 /* Check for andnot case. */
2180 if (GET_CODE (src) != AND
2181 || GET_CODE (XEXP (src, 0)) != NOT)
2182 break;
2183
2184 src = XEXP (src, 0);
2185 /* FALLTHRU */
2186
2187 case NOT:
2188 break;
2189
2190 case NEG:
2191 /* Check for nabs case. */
2192 if (GET_CODE (XEXP (src, 0)) != ABS)
2193 break;
2194
2195 src = XEXP (src, 0);
2196 /* FALLTHRU */
2197
2198 case ABS:
2199 if ((mode == DImode && !TARGET_AVX512VL)
2200 || (mode == SImode && !TARGET_SSSE3))
2201 return false;
2202 break;
2203
2204 case REG:
2205 return true;
2206
2207 case MEM:
2208 case CONST_INT:
2209 return REG_P (dst);
2210
2211 case VEC_SELECT:
      /* Excluding MEM_P (dst) avoids interfering with vpextr[dq].  */
2213 return REG_P (dst)
2214 && REG_P (XEXP (src, 0))
2215 && GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
2216 : V4SImode)
2217 && GET_CODE (XEXP (src, 1)) == PARALLEL
2218 && XVECLEN (XEXP (src, 1), 0) == 1
2219 && CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));
2220
2221 default:
2222 return false;
2223 }
2224
2225 if (!REG_P (XEXP (src, 0))
2226 && !MEM_P (XEXP (src, 0))
2227 && !CONST_INT_P (XEXP (src, 0)))
2228 return false;
2229
2230 if (GET_MODE (XEXP (src, 0)) != mode
2231 && !CONST_INT_P (XEXP (src, 0)))
2232 return false;
2233
2234 return true;
2235}
2236
2237/* Check for a suitable TImode memory operand. */
2238
2239static bool
2240timode_mem_p (rtx x)
2241{
2242 return MEM_P (x)
2243 && (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
2244 || !misaligned_operand (x, TImode));
2245}
2246
2247/* The TImode version of scalar_to_vector_candidate_p. */
2248
2249static bool
2250timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2251{
2252 rtx def_set = pseudo_reg_set (insn);
2253
2254 if (!def_set)
2255 return false;
2256
2257 rtx src = SET_SRC (def_set);
2258 rtx dst = SET_DEST (def_set);
2259
2260 if (GET_CODE (src) == COMPARE)
2261 return convertible_comparison_p (insn, TImode);
2262
2263 if (GET_MODE (dst) != TImode
2264 || (GET_MODE (src) != TImode
2265 && !CONST_SCALAR_INT_P (src)))
2266 return false;
2267
2268 if (!REG_P (dst) && !MEM_P (dst))
2269 return false;
2270
2271 if (MEM_P (dst)
2272 && misaligned_operand (dst, TImode)
2273 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2274 return false;
2275
  if (REG_P (dst) && !single_def_chain_p (dst))
2277 return false;
2278
2279 switch (GET_CODE (src))
2280 {
2281 case REG:
      return single_def_chain_p (src);
2283
2284 case CONST_WIDE_INT:
2285 return true;
2286
2287 case CONST_INT:
2288 /* ??? Verify performance impact before enabling CONST_INT for
2289 __int128 store. */
2290 return standard_sse_constant_p (src, TImode);
2291
2292 case MEM:
2293 /* Memory must be aligned or unaligned load is optimal. */
2294 return (REG_P (dst)
2295 && (!misaligned_operand (src, TImode)
2296 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2297
2298 case AND:
2299 if (!MEM_P (dst)
2300 && GET_CODE (XEXP (src, 0)) == NOT
2301 && REG_P (XEXP (XEXP (src, 0), 0))
2302 && (REG_P (XEXP (src, 1))
2303 || CONST_SCALAR_INT_P (XEXP (src, 1))
2304 || timode_mem_p (XEXP (src, 1))))
2305 return true;
2306 return REG_P (XEXP (src, 0))
2307 && (REG_P (XEXP (src, 1))
2308 || CONST_SCALAR_INT_P (XEXP (src, 1))
2309 || timode_mem_p (XEXP (src, 1)));
2310
2311 case IOR:
2312 case XOR:
2313 return REG_P (XEXP (src, 0))
2314 && (REG_P (XEXP (src, 1))
2315 || CONST_SCALAR_INT_P (XEXP (src, 1))
2316 || timode_mem_p (XEXP (src, 1)));
2317
2318 case NOT:
2319 return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));
2320
2321 case ASHIFT:
2322 case LSHIFTRT:
2323 case ASHIFTRT:
2324 case ROTATERT:
2325 case ROTATE:
2326 /* Handle shifts/rotates by integer constants between 0 and 127. */
2327 return REG_P (XEXP (src, 0))
2328 && CONST_INT_P (XEXP (src, 1))
2329 && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
2330
2331 default:
2332 return false;
2333 }
2334}
2335
2336/* For a register REGNO, scan instructions for its defs and uses.
2337 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
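/* Illustrative example: if a TImode pseudo is set by a candidate insn but
   also read by an insn that is not in CANDIDATES, it cannot stay in a
   vector register for the whole chain, so it is recorded in REGS here and
   the insns touching it are later dropped from the candidate set.  */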
2338
2339static void
2340timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2341 unsigned int regno)
2342{
2343 /* Do nothing if REGNO is already in REGS or is a hard reg. */
2344 if (bitmap_bit_p (regs, regno)
2345 || HARD_REGISTER_NUM_P (regno))
2346 return;
2347
2348 for (df_ref def = DF_REG_DEF_CHAIN (regno);
2349 def;
2350 def = DF_REF_NEXT_REG (def))
2351 {
2352 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2353 {
2354 if (dump_file)
	  fprintf (dump_file,
		   "r%d has non convertible def in insn %d\n",
		   regno, DF_REF_INSN_UID (def));
2358
2359 bitmap_set_bit (regs, regno);
2360 break;
2361 }
2362 }
2363
2364 for (df_ref ref = DF_REG_USE_CHAIN (regno);
2365 ref;
2366 ref = DF_REF_NEXT_REG (ref))
2367 {
2368 /* Debug instructions are skipped. */
2369 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
2370 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2371 {
2372 if (dump_file)
	  fprintf (dump_file,
		   "r%d has non convertible use in insn %d\n",
		   regno, DF_REF_INSN_UID (ref));
2376
2377 bitmap_set_bit (regs, regno);
2378 break;
2379 }
2380 }
2381}
2382
/* Given a bitmap of insn UIDs, scan all instructions and remove an insn
   from CANDIDATES if it has both convertible and non-convertible
   definitions.

   All insns in the bitmap are conversion candidates according to
   scalar_to_vector_candidate_p.  Currently this implies all insns
   are single_set.  */
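/* A sketch of the fixed-point iteration below: first collect the TImode
   registers that have a non-convertible def or use, then drop every
   candidate insn touching such a register; dropping an insn may in turn
   make more registers non-convertible, so repeat until nothing changes.  */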
2390
2391static void
2392timode_remove_non_convertible_regs (bitmap candidates)
2393{
2394 bitmap_iterator bi;
2395 unsigned id;
2396 bitmap regs = BITMAP_ALLOC (NULL);
2397 bool changed;
2398
2399 do {
2400 changed = false;
2401 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2402 {
2403 rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2404 df_ref ref;
2405
2406 FOR_EACH_INSN_DEF (ref, insn)
2407 if (!DF_REF_REG_MEM_P (ref)
2408 && GET_MODE (DF_REF_REG (ref)) == TImode)
2409 timode_check_non_convertible_regs (candidates, regs,
2410 DF_REF_REGNO (ref));
2411
2412 FOR_EACH_INSN_USE (ref, insn)
2413 if (!DF_REF_REG_MEM_P (ref)
2414 && GET_MODE (DF_REF_REG (ref)) == TImode)
2415 timode_check_non_convertible_regs (candidates, regs,
2416 DF_REF_REGNO (ref));
2417 }
2418
2419 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2420 {
2421 for (df_ref def = DF_REG_DEF_CHAIN (id);
2422 def;
2423 def = DF_REF_NEXT_REG (def))
2424 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2425 {
2426 if (dump_file)
	    fprintf (dump_file, "Removing insn %d from candidates list\n",
		     DF_REF_INSN_UID (def));
2429
2430 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2431 changed = true;
2432 }
2433
2434 for (df_ref ref = DF_REG_USE_CHAIN (id);
2435 ref;
2436 ref = DF_REF_NEXT_REG (ref))
2437 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2438 {
2439 if (dump_file)
	    fprintf (dump_file, "Removing insn %d from candidates list\n",
		     DF_REF_INSN_UID (ref));
2442
2443 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
2444 changed = true;
2445 }
2446 }
2447 } while (changed);
2448
2449 BITMAP_FREE (regs);
2450}
2451
2452/* Main STV pass function. Find and convert scalar
2453 instructions into vector mode when profitable. */
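/* As a rough example of the transformation (assuming -mstv and SSE2 on
   x86-64), a scalar double-word operation such as

     __int128 f (__int128 a, __int128 b) { return a & b; }

   can be rewritten to use a single 128-bit PAND on a V1TImode register
   instead of two 64-bit AND instructions, at the cost of moving the
   operands into vector registers.  */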
2454
2455static unsigned int
2456convert_scalars_to_vector (bool timode_p)
2457{
2458 basic_block bb;
2459 int converted_insns = 0;
2460 auto_vec<rtx_insn *> control_flow_insns;
2461
2462 bitmap_obstack_initialize (NULL);
2463 const machine_mode cand_mode[3] = { SImode, DImode, TImode };
2464 const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
2465 bitmap_head candidates[3]; /* { SImode, DImode, TImode } */
2466 for (unsigned i = 0; i < 3; ++i)
    bitmap_initialize (&candidates[i], &bitmap_default_obstack);
2468
2469 calculate_dominance_info (CDI_DOMINATORS);
2470 df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
2471 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2472 df_analyze ();
2473
2474 /* Find all instructions we want to convert into vector mode. */
2475 if (dump_file)
    fprintf (dump_file, "Searching for mode conversion candidates...\n");
2477
2478 FOR_EACH_BB_FN (bb, cfun)
2479 {
2480 rtx_insn *insn;
2481 FOR_BB_INSNS (bb, insn)
2482 if (timode_p
2483 && timode_scalar_to_vector_candidate_p (insn))
2484 {
2485 if (dump_file)
	      fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
		       INSN_UID (insn));
2488
2489 bitmap_set_bit (&candidates[2], INSN_UID (insn));
2490 }
2491 else if (!timode_p)
2492 {
2493 /* Check {SI,DI}mode. */
2494 for (unsigned i = 0; i <= 1; ++i)
	      if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
2496 {
2497 if (dump_file)
		    fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
			     INSN_UID (insn), i == 0 ? "SImode" : "DImode");
2500
2501 bitmap_set_bit (&candidates[i], INSN_UID (insn));
2502 break;
2503 }
2504 }
2505 }
2506
2507 if (timode_p)
    timode_remove_non_convertible_regs (&candidates[2]);
2509
2510 for (unsigned i = 0; i <= 2; ++i)
    if (!bitmap_empty_p (&candidates[i]))
      break;
    else if (i == 2 && dump_file)
      fprintf (dump_file, "There are no candidates for optimization.\n");
2515
2516 for (unsigned i = 0; i <= 2; ++i)
2517 {
2518 auto_bitmap disallowed;
2519 bitmap_tree_view (&candidates[i]);
      while (!bitmap_empty_p (&candidates[i]))
2521 {
2522 unsigned uid = bitmap_first_set_bit (&candidates[i]);
2523 scalar_chain *chain;
2524
2525 if (cand_mode[i] == TImode)
2526 chain = new timode_scalar_chain;
2527 else
2528 chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
2529
2530 /* Find instructions chain we want to convert to vector mode.
2531 Check all uses and definitions to estimate all required
2532 conversions. */
	  if (chain->build (&candidates[i], uid, disallowed))
2534 {
2535 if (chain->compute_convert_gain () > 0)
2536 converted_insns += chain->convert ();
2537 else if (dump_file)
		fprintf (dump_file, "Chain #%d conversion is not profitable\n",
			 chain->chain_id);
2540 }
2541
2542 rtx_insn* iter_insn;
2543 unsigned int ii;
2544 FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
	    control_flow_insns.safe_push (iter_insn);
2546
2547 delete chain;
2548 }
2549 }
2550
2551 if (dump_file)
    fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2553
2554 for (unsigned i = 0; i <= 2; ++i)
    bitmap_release (&candidates[i]);
2556 bitmap_obstack_release (NULL);
2557 df_process_deferred_rescans ();
2558
2559 /* Conversion means we may have 128bit register spills/fills
2560 which require aligned stack. */
2561 if (converted_insns)
2562 {
2563 if (crtl->stack_alignment_needed < 128)
2564 crtl->stack_alignment_needed = 128;
2565 if (crtl->stack_alignment_estimated < 128)
2566 crtl->stack_alignment_estimated = 128;
2567
2568 crtl->stack_realign_needed
2569 = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
2570 crtl->stack_realign_tried = crtl->stack_realign_needed;
2571
2572 crtl->stack_realign_processed = true;
2573
2574 if (!crtl->drap_reg)
2575 {
2576 rtx drap_rtx = targetm.calls.get_drap_rtx ();
2577
2578 /* stack_realign_drap and drap_rtx must match. */
2579 gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
2580
2581 /* Do nothing if NULL is returned,
2582 which means DRAP is not needed. */
2583 if (drap_rtx != NULL)
2584 {
2585 crtl->args.internal_arg_pointer = drap_rtx;
2586
2587 /* Call fixup_tail_calls to clean up
2588 REG_EQUIV note if DRAP is needed. */
2589 fixup_tail_calls ();
2590 }
2591 }
2592
2593 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2594 if (TARGET_64BIT)
2595 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2596 parm; parm = DECL_CHAIN (parm))
2597 {
2598 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2599 continue;
2600 if (DECL_RTL_SET_P (parm)
2601 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2602 {
2603 rtx r = DECL_RTL (parm);
2604 if (REG_P (r))
2605 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2606 }
2607 if (DECL_INCOMING_RTL (parm)
2608 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2609 {
2610 rtx r = DECL_INCOMING_RTL (parm);
2611 if (REG_P (r))
2612 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2613 }
2614 }
2615
2616 if (!control_flow_insns.is_empty ())
2617 {
2618 free_dominance_info (CDI_DOMINATORS);
2619
2620 unsigned int i;
2621 rtx_insn* insn;
2622 FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
2623 if (control_flow_insn_p (insn))
2624 {
2625 /* Split the block after insn. There will be a fallthru
2626 edge, which is OK so we keep it. We have to create
2627 the exception edges ourselves. */
2628 bb = BLOCK_FOR_INSN (insn);
2629 split_block (bb, insn);
2630 rtl_make_eh_edge (NULL, bb, BB_END (bb));
2631 }
2632 }
2633 }
2634
2635 return 0;
2636}
2637
2638static unsigned int
2639rest_of_handle_insert_vzeroupper (void)
2640{
2641 /* vzeroupper instructions are inserted immediately after reload and
2642 postreload_cse to clean up after it a little bit to account for possible
2643 spills from 256bit or 512bit registers. The pass reuses mode switching
2644 infrastructure by re-running mode insertion pass, so disable entities
2645 that have already been processed. */
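  /* Background note: vzeroupper clears the upper bits of the ymm/zmm
     registers so that later 128-bit SSE code does not pay the AVX-SSE
     transition penalty; the mode-switching machinery invoked below
     decides where such instructions are needed.  */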
2646 for (int i = 0; i < MAX_386_ENTITIES; i++)
2647 ix86_optimize_mode_switching[i] = 0;
2648
2649 ix86_optimize_mode_switching[AVX_U128] = 1;
2650
2651 /* Call optimize_mode_switching. */
2652 g->get_passes ()->execute_pass_mode_switching ();
2653
2654 /* LRA removes all REG_DEAD/REG_UNUSED notes and normally they
2655 reappear in the IL only at the start of pass_rtl_dse2, which does
2656 df_note_add_problem (); df_analyze ();
2657 The vzeroupper is scheduled after postreload_cse pass and mode
2658 switching computes the notes as well, the problem is that e.g.
2659 pass_gcse2 doesn't maintain the notes, see PR113059 and
2660 PR112760. Remove the notes now to restore status quo ante
2661 until we figure out how to maintain the notes or what else
2662 to do. */
2663 basic_block bb;
2664 rtx_insn *insn;
2665 FOR_EACH_BB_FN (bb, cfun)
2666 FOR_BB_INSNS (bb, insn)
2667 if (NONDEBUG_INSN_P (insn))
2668 {
2669 rtx *pnote = &REG_NOTES (insn);
2670 while (*pnote != 0)
2671 {
2672 if (REG_NOTE_KIND (*pnote) == REG_DEAD
2673 || REG_NOTE_KIND (*pnote) == REG_UNUSED)
2674 *pnote = XEXP (*pnote, 1);
2675 else
2676 pnote = &XEXP (*pnote, 1);
2677 }
2678 }
2679
2680 df_remove_problem (df_note);
2681 df_analyze ();
2682 return 0;
2683}
2684
2685namespace {
2686
2687const pass_data pass_data_insert_vzeroupper =
2688{
  RTL_PASS, /* type */
  "vzeroupper", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
2698};
2699
2700class pass_insert_vzeroupper : public rtl_opt_pass
2701{
2702public:
2703 pass_insert_vzeroupper(gcc::context *ctxt)
2704 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2705 {}
2706
2707 /* opt_pass methods: */
2708 bool gate (function *) final override
2709 {
2710 return TARGET_AVX && TARGET_VZEROUPPER;
2711 }
2712
2713 unsigned int execute (function *) final override
2714 {
2715 return rest_of_handle_insert_vzeroupper ();
2716 }
2717
2718}; // class pass_insert_vzeroupper
2719
2720const pass_data pass_data_stv =
2721{
  RTL_PASS, /* type */
  "stv", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
2731};
2732
2733class pass_stv : public rtl_opt_pass
2734{
2735public:
2736 pass_stv (gcc::context *ctxt)
2737 : rtl_opt_pass (pass_data_stv, ctxt),
2738 timode_p (false)
2739 {}
2740
2741 /* opt_pass methods: */
2742 bool gate (function *) final override
2743 {
2744 return ((!timode_p || TARGET_64BIT)
2745 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2746 }
2747
2748 unsigned int execute (function *) final override
2749 {
2750 return convert_scalars_to_vector (timode_p);
2751 }
2752
2753 opt_pass *clone () final override
2754 {
2755 return new pass_stv (m_ctxt);
2756 }
2757
2758 void set_pass_param (unsigned int n, bool param) final override
2759 {
2760 gcc_assert (n == 0);
2761 timode_p = param;
2762 }
2763
2764private:
2765 bool timode_p;
2766}; // class pass_stv
2767
2768} // anon namespace
2769
2770rtl_opt_pass *
2771make_pass_insert_vzeroupper (gcc::context *ctxt)
2772{
2773 return new pass_insert_vzeroupper (ctxt);
2774}
2775
2776rtl_opt_pass *
2777make_pass_stv (gcc::context *ctxt)
2778{
2779 return new pass_stv (ctxt);
2780}
2781
2782/* Inserting ENDBR and pseudo patchable-area instructions. */
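/* Note: ENDBR (endbr32/endbr64) marks valid indirect-branch targets for
   CET indirect-branch tracking (-fcf-protection=branch), while the pseudo
   patchable-area instruction reserves NOP space at function entry for
   -fpatchable-function-entry.  */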
2783
2784static void
2785rest_of_insert_endbr_and_patchable_area (bool need_endbr,
2786 unsigned int patchable_area_size)
2787{
2788 rtx endbr;
2789 rtx_insn *insn;
2790 rtx_insn *endbr_insn = NULL;
2791 basic_block bb;
2792
2793 if (need_endbr)
2794 {
      /* Currently emit ENDBR if this is a tracked function, i.e. the
	 'nocf_check' attribute is absent from the function attributes.
	 Later an optimization will be introduced to analyze whether the
	 address of a static function is taken.  A static function whose
	 address is never taken will get a nocf_check attribute, which
	 will reduce the number of ENDBR instructions emitted.  */
      if (!lookup_attribute ("nocf_check",
			     TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
	  && (!flag_manual_endbr
	      || lookup_attribute ("cf_check",
				   DECL_ATTRIBUTES (cfun->decl)))
2806 && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
2807 || ix86_cmodel == CM_LARGE
2808 || ix86_cmodel == CM_LARGE_PIC
2809 || flag_force_indirect_call
2810 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
2811 && DECL_DLLIMPORT_P (cfun->decl))))
2812 {
2813 if (crtl->profile && flag_fentry)
2814 {
2815 /* Queue ENDBR insertion to x86_function_profiler.
2816 NB: Any patchable-area insn will be inserted after
2817 ENDBR. */
2818 cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
2819 }
2820 else
2821 {
2822 endbr = gen_nop_endbr ();
2823 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2824 rtx_insn *insn = BB_HEAD (bb);
2825 endbr_insn = emit_insn_before (endbr, insn);
2826 }
2827 }
2828 }
2829
2830 if (patchable_area_size)
2831 {
2832 if (crtl->profile && flag_fentry)
2833 {
2834 /* Queue patchable-area insertion to x86_function_profiler.
2835 NB: If there is a queued ENDBR, x86_function_profiler
2836 will also handle patchable-area. */
2837 if (!cfun->machine->insn_queued_at_entrance)
2838 cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
2839 }
2840 else
2841 {
2842 rtx patchable_area
2843 = gen_patchable_area (GEN_INT (patchable_area_size),
2844 GEN_INT (crtl->patch_area_entry == 0));
2845 if (endbr_insn)
2846 emit_insn_after (patchable_area, endbr_insn);
2847 else
2848 {
2849 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2850 insn = BB_HEAD (bb);
2851 emit_insn_before (patchable_area, insn);
2852 }
2853 }
2854 }
2855
2856 if (!need_endbr)
2857 return;
2858
2859 bb = 0;
2860 FOR_EACH_BB_FN (bb, cfun)
2861 {
2862 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2863 insn = NEXT_INSN (insn))
2864 {
2865 if (CALL_P (insn))
2866 {
2867 need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
2868 if (!need_endbr && !SIBLING_CALL_P (insn))
2869 {
2870 rtx call = get_call_rtx_from (insn);
2871 rtx fnaddr = XEXP (call, 0);
2872 tree fndecl = NULL_TREE;
2873
2874 /* Also generate ENDBRANCH for non-tail call which
2875 may return via indirect branch. */
2876 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
2877 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
2878 if (fndecl == NULL_TREE)
2879 fndecl = MEM_EXPR (fnaddr);
2880 if (fndecl
2881 && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
2882 && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
2883 fndecl = NULL_TREE;
2884 if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
2885 {
2886 tree fntype = TREE_TYPE (fndecl);
		  if (lookup_attribute ("indirect_return",
					TYPE_ATTRIBUTES (fntype)))
2889 need_endbr = true;
2890 }
2891 }
2892 if (!need_endbr)
2893 continue;
	  /* Generate ENDBRANCH after a CALL that can return more than
	     once, i.e. setjmp-like functions.  */
2896
2897 endbr = gen_nop_endbr ();
2898 emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
2899 continue;
2900 }
2901
2902 if (JUMP_P (insn) && flag_cet_switch)
2903 {
2904 rtx target = JUMP_LABEL (insn);
2905 if (target == NULL_RTX || ANY_RETURN_P (target))
2906 continue;
2907
2908 /* Check the jump is a switch table. */
	  rtx_insn *label = as_a<rtx_insn *> (target);
2910 rtx_insn *table = next_insn (label);
2911 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2912 continue;
2913
2914 /* For the indirect jump find out all places it jumps and insert
2915 ENDBRANCH there. It should be done under a special flag to
2916 control ENDBRANCH generation for switch stmts. */
2917 edge_iterator ei;
2918 edge e;
2919 basic_block dest_blk;
2920
2921 FOR_EACH_EDGE (e, ei, bb->succs)
2922 {
2923 rtx_insn *insn;
2924
2925 dest_blk = e->dest;
2926 insn = BB_HEAD (dest_blk);
2927 gcc_assert (LABEL_P (insn));
2928 endbr = gen_nop_endbr ();
2929 emit_insn_after (endbr, insn);
2930 }
2931 continue;
2932 }
2933
2934 if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2935 {
2936 endbr = gen_nop_endbr ();
2937 emit_insn_after (endbr, insn);
2938 continue;
2939 }
2940 }
2941 }
2942
2943 return;
2944}
2945
2946namespace {
2947
2948const pass_data pass_data_insert_endbr_and_patchable_area =
2949{
  RTL_PASS, /* type. */
  "endbr_and_patchable_area", /* name. */
  OPTGROUP_NONE, /* optinfo_flags. */
  TV_MACH_DEP, /* tv_id. */
  0, /* properties_required. */
  0, /* properties_provided. */
  0, /* properties_destroyed. */
  0, /* todo_flags_start. */
  0, /* todo_flags_finish. */
2959};
2960
2961class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
2962{
2963public:
2964 pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2965 : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
2966 {}
2967
2968 /* opt_pass methods: */
2969 bool gate (function *) final override
2970 {
2971 need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
2972 patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
2973 return need_endbr || patchable_area_size;
2974 }
2975
2976 unsigned int execute (function *) final override
2977 {
    timevar_push (TV_MACH_DEP);
    rest_of_insert_endbr_and_patchable_area (need_endbr,
					     patchable_area_size);
    timevar_pop (TV_MACH_DEP);
2982 return 0;
2983 }
2984
2985private:
2986 bool need_endbr;
2987 unsigned int patchable_area_size;
2988}; // class pass_insert_endbr_and_patchable_area
2989
2990} // anon namespace
2991
2992rtl_opt_pass *
2993make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2994{
2995 return new pass_insert_endbr_and_patchable_area (ctxt);
2996}
2997
2998/* At entry of the nearest common dominator for basic blocks with
2999 conversions/rcp/sqrt/rsqrt/round, generate a single
3000 vxorps %xmmN, %xmmN, %xmmN
3001 for all
3002 vcvtss2sd op, %xmmN, %xmmX
3003 vcvtsd2ss op, %xmmN, %xmmX
3004 vcvtsi2ss op, %xmmN, %xmmX
3005 vcvtsi2sd op, %xmmN, %xmmX
3006
3007 NB: We want to generate only a single vxorps to cover the whole
3008 function. The LCM algorithm isn't appropriate here since it may
3009 place a vxorps inside the loop. */
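/* For example, "vcvtsi2ss %edi, %xmmN, %xmmX" writes only the low element
   of %xmmX and keeps the remaining elements of %xmmN, so it carries a
   dependence on the previous contents of that register.  Merging into a
   register that was cleared once by vxorps removes this partial-register
   dependence for every such conversion in the function.  */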
3010
3011static unsigned int
3012remove_partial_avx_dependency (void)
3013{
  timevar_push (TV_MACH_DEP);
3015
3016 bitmap_obstack_initialize (NULL);
3017 bitmap convert_bbs = BITMAP_ALLOC (NULL);
3018
3019 basic_block bb;
3020 rtx_insn *insn, *set_insn;
3021 rtx set;
3022 rtx v4sf_const0 = NULL_RTX;
3023
3024 auto_vec<rtx_insn *> control_flow_insns;
3025
3026 /* We create invalid RTL initially so defer rescans. */
3027 df_set_flags (DF_DEFER_INSN_RESCAN);
3028
3029 FOR_EACH_BB_FN (bb, cfun)
3030 {
3031 FOR_BB_INSNS (bb, insn)
3032 {
3033 if (!NONDEBUG_INSN_P (insn))
3034 continue;
3035
3036 set = single_set (insn);
3037 if (!set)
3038 continue;
3039
3040 if (get_attr_avx_partial_xmm_update (insn)
3041 != AVX_PARTIAL_XMM_UPDATE_TRUE)
3042 continue;
3043
3044 /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
3045 SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
3046 round, to vec_dup and vec_merge with subreg. */
3047 rtx src = SET_SRC (set);
3048 rtx dest = SET_DEST (set);
3049 machine_mode dest_mode = GET_MODE (dest);
3050 bool convert_p = false;
3051 switch (GET_CODE (src))
3052 {
3053 case FLOAT:
3054 case FLOAT_EXTEND:
3055 case FLOAT_TRUNCATE:
3056 case UNSIGNED_FLOAT:
3057 convert_p = true;
3058 break;
3059 default:
3060 break;
3061 }
3062
	  /* Only handle conversions here.  */
3064 machine_mode src_mode
3065 = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
3066 switch (src_mode)
3067 {
3068 case E_SFmode:
3069 case E_DFmode:
3070 if (TARGET_USE_VECTOR_FP_CONVERTS
3071 || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
3072 continue;
3073 break;
3074 case E_SImode:
3075 case E_DImode:
3076 if (TARGET_USE_VECTOR_CONVERTS
3077 || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
3078 continue;
3079 break;
3080 case E_VOIDmode:
3081 gcc_assert (!convert_p);
3082 break;
3083 default:
3084 gcc_unreachable ();
3085 }
3086
3087 if (!v4sf_const0)
3088 v4sf_const0 = gen_reg_rtx (V4SFmode);
3089
3090 rtx zero;
3091 machine_mode dest_vecmode;
3092 switch (dest_mode)
3093 {
3094 case E_HFmode:
3095 dest_vecmode = V8HFmode;
3096 zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
3097 break;
3098 case E_SFmode:
3099 dest_vecmode = V4SFmode;
3100 zero = v4sf_const0;
3101 break;
3102 case E_DFmode:
3103 dest_vecmode = V2DFmode;
3104 zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
3105 break;
3106 default:
3107 gcc_unreachable ();
3108 }
3109
3110 /* Change source to vector mode. */
3111 src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
3112 src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
3113 GEN_INT (HOST_WIDE_INT_1U));
3114 /* Change destination to vector mode. */
3115 rtx vec = gen_reg_rtx (dest_vecmode);
3116 /* Generate an XMM vector SET. */
3117 set = gen_rtx_SET (vec, src);
3118 set_insn = emit_insn_before (set, insn);
3119 df_insn_rescan (set_insn);
3120
3121 if (cfun->can_throw_non_call_exceptions)
3122 {
3123 /* Handle REG_EH_REGION note. */
3124 rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
3125 if (note)
3126 {
		  control_flow_insns.safe_push (set_insn);
3128 add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
3129 }
3130 }
3131
3132 src = gen_rtx_SUBREG (dest_mode, vec, 0);
3133 set = gen_rtx_SET (dest, src);
3134
3135 /* Drop possible dead definitions. */
3136 PATTERN (insn) = set;
3137
3138 INSN_CODE (insn) = -1;
3139 recog_memoized (insn);
3140 df_insn_rescan (insn);
3141 bitmap_set_bit (convert_bbs, bb->index);
3142 }
3143 }
3144
3145 if (v4sf_const0)
3146 {
3147 /* (Re-)discover loops so that bb->loop_father can be used in the
3148 analysis below. */
3149 calculate_dominance_info (CDI_DOMINATORS);
3150 loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
3151
3152 /* Generate a vxorps at entry of the nearest dominator for basic
3153 blocks with conversions, which is in the fake loop that
3154 contains the whole function, so that there is only a single
3155 vxorps in the whole function. */
3156 bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
3157 convert_bbs);
3158 while (bb->loop_father->latch
3159 != EXIT_BLOCK_PTR_FOR_FN (cfun))
3160 bb = get_immediate_dominator (CDI_DOMINATORS,
3161 bb->loop_father->header);
3162
3163 set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
3164
3165 insn = BB_HEAD (bb);
3166 while (insn && !NONDEBUG_INSN_P (insn))
3167 {
3168 if (insn == BB_END (bb))
3169 {
3170 insn = NULL;
3171 break;
3172 }
3173 insn = NEXT_INSN (insn);
3174 }
3175 if (insn == BB_HEAD (bb))
3176 set_insn = emit_insn_before (set, insn);
3177 else
3178 set_insn = emit_insn_after (set,
3179 insn ? PREV_INSN (insn) : BB_END (bb));
3180 df_insn_rescan (set_insn);
3181 loop_optimizer_finalize ();
3182
3183 if (!control_flow_insns.is_empty ())
3184 {
3185 free_dominance_info (CDI_DOMINATORS);
3186
3187 unsigned int i;
3188 FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
3189 if (control_flow_insn_p (insn))
3190 {
3191 /* Split the block after insn. There will be a fallthru
3192 edge, which is OK so we keep it. We have to create
3193 the exception edges ourselves. */
3194 bb = BLOCK_FOR_INSN (insn);
3195 split_block (bb, insn);
3196 rtl_make_eh_edge (NULL, bb, BB_END (bb));
3197 }
3198 }
3199 }
3200
3201 df_process_deferred_rescans ();
3202 df_clear_flags (DF_DEFER_INSN_RESCAN);
3203 bitmap_obstack_release (NULL);
3204 BITMAP_FREE (convert_bbs);
3205
  timevar_pop (TV_MACH_DEP);
3207 return 0;
3208}
3209
3210namespace {
3211
3212const pass_data pass_data_remove_partial_avx_dependency =
3213{
  RTL_PASS, /* type */
  "rpad", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
3223};
3224
3225class pass_remove_partial_avx_dependency : public rtl_opt_pass
3226{
3227public:
3228 pass_remove_partial_avx_dependency (gcc::context *ctxt)
3229 : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
3230 {}
3231
3232 /* opt_pass methods: */
3233 bool gate (function *) final override
3234 {
3235 return (TARGET_AVX
3236 && TARGET_SSE_PARTIAL_REG_DEPENDENCY
3237 && TARGET_SSE_MATH
3238 && optimize
3239 && optimize_function_for_speed_p (cfun));
3240 }
3241
3242 unsigned int execute (function *) final override
3243 {
3244 return remove_partial_avx_dependency ();
3245 }
3246}; // class pass_rpad
3247
3248} // anon namespace
3249
3250rtl_opt_pass *
3251make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
3252{
3253 return new pass_remove_partial_avx_dependency (ctxt);
3254}
3255
3256/* This compares the priority of target features in function DECL1
3257 and DECL2. It returns positive value if DECL1 is higher priority,
3258 negative value if DECL2 is higher priority and 0 if they are the
3259 same. */
3260
3261int
3262ix86_compare_version_priority (tree decl1, tree decl2)
3263{
  unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
  unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
3266
3267 return (int)priority1 - (int)priority2;
3268}
3269
3270/* V1 and V2 point to function versions with different priorities
3271 based on the target ISA. This function compares their priorities. */
3272
3273static int
3274feature_compare (const void *v1, const void *v2)
3275{
3276 typedef struct _function_version_info
3277 {
3278 tree version_decl;
3279 tree predicate_chain;
3280 unsigned int dispatch_priority;
3281 } function_version_info;
3282
3283 const function_version_info c1 = *(const function_version_info *)v1;
3284 const function_version_info c2 = *(const function_version_info *)v2;
3285 return (c2.dispatch_priority - c1.dispatch_priority);
3286}
3287
3288/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
3289 to return a pointer to VERSION_DECL if the outcome of the expression
3290 formed by PREDICATE_CHAIN is true. This function will be called during
3291 version dispatch to decide which function version to execute. It returns
3292 the basic block at the end, to which more conditions can be added. */
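/* Roughly, for a version guarded by a single predicate the generated
   GIMPLE looks like:

     cond = <predicate> (<arg>);
     if (cond > 0)
       return (void *) &<version>;
     <fall through to the next condition / the default version>

   Multiple predicates are combined with MIN_EXPR, so the branch is taken
   only if every predicate returned a non-zero value.  */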
3293
3294static basic_block
3295add_condition_to_bb (tree function_decl, tree version_decl,
3296 tree predicate_chain, basic_block new_bb)
3297{
3298 gimple *return_stmt;
3299 tree convert_expr, result_var;
3300 gimple *convert_stmt;
3301 gimple *call_cond_stmt;
3302 gimple *if_else_stmt;
3303
3304 basic_block bb1, bb2, bb3;
3305 edge e12, e23;
3306
3307 tree cond_var, and_expr_var = NULL_TREE;
3308 gimple_seq gseq;
3309
3310 tree predicate_decl, predicate_arg;
3311
3312 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
3313
3314 gcc_assert (new_bb != NULL);
  gseq = bb_seq (new_bb);
3316
3317
3318 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
3319 build_fold_addr_expr (version_decl));
3320 result_var = create_tmp_var (ptr_type_node);
3321 convert_stmt = gimple_build_assign (result_var, convert_expr);
3322 return_stmt = gimple_build_return (result_var);
3323
3324 if (predicate_chain == NULL_TREE)
3325 {
3326 gimple_seq_add_stmt (&gseq, convert_stmt);
3327 gimple_seq_add_stmt (&gseq, return_stmt);
      set_bb_seq (new_bb, gseq);
3329 gimple_set_bb (convert_stmt, new_bb);
3330 gimple_set_bb (return_stmt, new_bb);
3331 pop_cfun ();
3332 return new_bb;
3333 }
3334
3335 while (predicate_chain != NULL)
3336 {
3337 cond_var = create_tmp_var (integer_type_node);
3338 predicate_decl = TREE_PURPOSE (predicate_chain);
3339 predicate_arg = TREE_VALUE (predicate_chain);
3340 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
      gimple_call_set_lhs (call_cond_stmt, cond_var);

      gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
3344 gimple_set_bb (call_cond_stmt, new_bb);
3345 gimple_seq_add_stmt (&gseq, call_cond_stmt);
3346
3347 predicate_chain = TREE_CHAIN (predicate_chain);
3348
3349 if (and_expr_var == NULL)
3350 and_expr_var = cond_var;
3351 else
3352 {
3353 gimple *assign_stmt;
	  /* Use MIN_EXPR to check whether any integer is zero:
	     and_expr_var = min_expr <cond_var, and_expr_var>.  */
3356 assign_stmt = gimple_build_assign (and_expr_var,
3357 build2 (MIN_EXPR, integer_type_node,
3358 cond_var, and_expr_var));
3359
	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
3361 gimple_set_bb (assign_stmt, new_bb);
3362 gimple_seq_add_stmt (&gseq, assign_stmt);
3363 }
3364 }
3365
3366 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
3367 integer_zero_node,
3368 NULL_TREE, NULL_TREE);
  gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
3370 gimple_set_bb (if_else_stmt, new_bb);
3371 gimple_seq_add_stmt (&gseq, if_else_stmt);
3372
3373 gimple_seq_add_stmt (&gseq, convert_stmt);
3374 gimple_seq_add_stmt (&gseq, return_stmt);
  set_bb_seq (new_bb, gseq);
3376
3377 bb1 = new_bb;
3378 e12 = split_block (bb1, if_else_stmt);
3379 bb2 = e12->dest;
3380 e12->flags &= ~EDGE_FALLTHRU;
3381 e12->flags |= EDGE_TRUE_VALUE;
3382
3383 e23 = split_block (bb2, return_stmt);
3384
3385 gimple_set_bb (convert_stmt, bb2);
3386 gimple_set_bb (return_stmt, bb2);
3387
3388 bb3 = e23->dest;
3389 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
3390
3391 remove_edge (e23);
3392 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
3393
3394 pop_cfun ();
3395
3396 return bb3;
3397}
3398
3399/* This function generates the dispatch function for
3400 multi-versioned functions. DISPATCH_DECL is the function which will
3401 contain the dispatch logic. FNDECLS are the function choices for
3402 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
3403 in DISPATCH_DECL in which the dispatch code is generated. */
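/* In outline, the emitted dispatcher body is: a call to
   __builtin_cpu_init, followed by one conditional block per non-default
   version (highest dispatch priority first), ending with an unconditional
   return of the default version's address.  */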
3404
3405static int
3406dispatch_function_versions (tree dispatch_decl,
3407 void *fndecls_p,
3408 basic_block *empty_bb)
3409{
3410 tree default_decl;
3411 gimple *ifunc_cpu_init_stmt;
3412 gimple_seq gseq;
3413 int ix;
3414 tree ele;
3415 vec<tree> *fndecls;
3416 unsigned int num_versions = 0;
3417 unsigned int actual_versions = 0;
3418 unsigned int i;
3419
3420 struct _function_version_info
3421 {
3422 tree version_decl;
3423 tree predicate_chain;
3424 unsigned int dispatch_priority;
3425 }*function_version_info;
3426
3427 gcc_assert (dispatch_decl != NULL
3428 && fndecls_p != NULL
3429 && empty_bb != NULL);
3430
  /* fndecls_p is actually a vector.  */
3432 fndecls = static_cast<vec<tree> *> (fndecls_p);
3433
3434 /* At least one more version other than the default. */
3435 num_versions = fndecls->length ();
3436 gcc_assert (num_versions >= 2);
3437
3438 function_version_info = (struct _function_version_info *)
3439 XNEWVEC (struct _function_version_info, (num_versions - 1));
3440
3441 /* The first version in the vector is the default decl. */
3442 default_decl = (*fndecls)[0];
3443
3444 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
3445
  gseq = bb_seq (*empty_bb);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicitly call __builtin_cpu_init here.  */
  ifunc_cpu_init_stmt
    = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
  gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
  gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
  set_bb_seq (*empty_bb, gseq);
3454
3455 pop_cfun ();
3456
3457
  for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
3459 {
3460 tree version_decl = ele;
3461 tree predicate_chain = NULL_TREE;
3462 unsigned int priority;
3463 /* Get attribute string, parse it and find the right predicate decl.
3464 The predicate function could be a lengthy combination of many
3465 features, like arch-type and various isa-variants. */
      priority = get_builtin_code_for_version (version_decl,
					       &predicate_chain);
3468
3469 if (predicate_chain == NULL_TREE)
3470 continue;
3471
3472 function_version_info [actual_versions].version_decl = version_decl;
3473 function_version_info [actual_versions].predicate_chain
3474 = predicate_chain;
3475 function_version_info [actual_versions].dispatch_priority = priority;
3476 actual_versions++;
3477 }
3478
3479 /* Sort the versions according to descending order of dispatch priority. The
3480 priority is based on the ISA. This is not a perfect solution. There
3481 could still be ambiguity. If more than one function version is suitable
3482 to execute, which one should be dispatched? In future, allow the user
3483 to specify a dispatch priority next to the version. */
3484 qsort (function_version_info, actual_versions,
3485 sizeof (struct _function_version_info), feature_compare);
3486
3487 for (i = 0; i < actual_versions; ++i)
    *empty_bb = add_condition_to_bb (dispatch_decl,
				     function_version_info[i].version_decl,
				     function_version_info[i].predicate_chain,
				     *empty_bb);

  /* dispatch default version at the end. */
  *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
				   NULL, *empty_bb);
3496
  free (function_version_info);
3498 return 0;
3499}
3500
3501/* This function changes the assembler name for functions that are
3502 versions. If DECL is a function version and has a "target"
3503 attribute, it appends the attribute string to its assembler name. */
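/* For example, a version declared with __attribute__ ((target ("avx2")))
   and original assembler name "foo" ends up as "foo.avx2" (the attribute
   string is canonicalized by sorted_attr_string), while the "default"
   version keeps its original name.  */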
3504
3505static tree
3506ix86_mangle_function_version_assembler_name (tree decl, tree id)
3507{
3508 tree version_attr;
3509 const char *orig_name, *version_string;
3510 char *attr_str, *assembler_name;
3511
3512 if (DECL_DECLARED_INLINE_P (decl)
      && lookup_attribute ("gnu_inline",
3514 DECL_ATTRIBUTES (decl)))
3515 error_at (DECL_SOURCE_LOCATION (decl),
3516 "function versions cannot be marked as %<gnu_inline%>,"
3517 " bodies have to be generated");
3518
3519 if (DECL_VIRTUAL_P (decl)
3520 || DECL_VINDEX (decl))
3521 sorry ("virtual function multiversioning not supported");
3522
  version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
3524
3525 /* target attribute string cannot be NULL. */
3526 gcc_assert (version_attr != NULL_TREE);
3527
3528 orig_name = IDENTIFIER_POINTER (id);
3529 version_string
3530 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
3531
  if (strcmp (version_string, "default") == 0)
3533 return id;
3534
3535 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
3536 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
3537
  sprintf (assembler_name, "%s.%s", orig_name, attr_str);
3539
3540 /* Allow assembler name to be modified if already set. */
3541 if (DECL_ASSEMBLER_NAME_SET_P (decl))
3542 SET_DECL_RTL (decl, NULL);
3543
3544 tree ret = get_identifier (assembler_name);
3545 XDELETEVEC (attr_str);
3546 XDELETEVEC (assembler_name);
3547 return ret;
3548}
3549
3550tree
3551ix86_mangle_decl_assembler_name (tree decl, tree id)
3552{
3553 /* For function version, add the target suffix to the assembler name. */
3554 if (TREE_CODE (decl) == FUNCTION_DECL
3555 && DECL_FUNCTION_VERSIONED (decl))
3556 id = ix86_mangle_function_version_assembler_name (decl, id);
3557#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
3558 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
3559#endif
3560
3561 return id;
3562}
3563
3564/* Make a dispatcher declaration for the multi-versioned function DECL.
3565 Calls to DECL function will be replaced with calls to the dispatcher
3566 by the front-end. Returns the decl of the dispatcher function. */
3567
3568tree
3569ix86_get_function_versions_dispatcher (void *decl)
3570{
3571 tree fn = (tree) decl;
3572 struct cgraph_node *node = NULL;
3573 struct cgraph_node *default_node = NULL;
3574 struct cgraph_function_version_info *node_v = NULL;
3575 struct cgraph_function_version_info *first_v = NULL;
3576
3577 tree dispatch_decl = NULL;
3578
3579 struct cgraph_function_version_info *default_version_info = NULL;
3580
3581 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
3582
  node = cgraph_node::get (fn);
3584 gcc_assert (node != NULL);
3585
3586 node_v = node->function_version ();
3587 gcc_assert (node_v != NULL);
3588
3589 if (node_v->dispatcher_resolver != NULL)
3590 return node_v->dispatcher_resolver;
3591
3592 /* Find the default version and make it the first node. */
3593 first_v = node_v;
3594 /* Go to the beginning of the chain. */
3595 while (first_v->prev != NULL)
3596 first_v = first_v->prev;
3597 default_version_info = first_v;
3598 while (default_version_info != NULL)
3599 {
3600 if (is_function_default_version
3601 (default_version_info->this_node->decl))
3602 break;
3603 default_version_info = default_version_info->next;
3604 }
3605
3606 /* If there is no default node, just return NULL. */
3607 if (default_version_info == NULL)
3608 return NULL;
3609
3610 /* Make default info the first node. */
3611 if (first_v != default_version_info)
3612 {
3613 default_version_info->prev->next = default_version_info->next;
3614 if (default_version_info->next)
3615 default_version_info->next->prev = default_version_info->prev;
3616 first_v->prev = default_version_info;
3617 default_version_info->next = first_v;
3618 default_version_info->prev = NULL;
3619 }
3620
3621 default_node = default_version_info->this_node;
3622
3623#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
3624 if (targetm.has_ifunc_p ())
3625 {
3626 struct cgraph_function_version_info *it_v = NULL;
3627 struct cgraph_node *dispatcher_node = NULL;
3628 struct cgraph_function_version_info *dispatcher_version_info = NULL;
3629
3630 /* Right now, the dispatching is done via ifunc. */
3631 dispatch_decl = make_dispatcher_decl (default_node->decl);
3632 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
3633
3634 dispatcher_node = cgraph_node::get_create (dispatch_decl);
3635 gcc_assert (dispatcher_node != NULL);
3636 dispatcher_node->dispatcher_function = 1;
3637 dispatcher_version_info
3638 = dispatcher_node->insert_new_function_version ();
3639 dispatcher_version_info->next = default_version_info;
3640 dispatcher_node->definition = 1;
3641
3642 /* Set the dispatcher for all the versions. */
3643 it_v = default_version_info;
3644 while (it_v != NULL)
3645 {
3646 it_v->dispatcher_resolver = dispatch_decl;
3647 it_v = it_v->next;
3648 }
3649 }
3650 else
3651#endif
3652 {
3653 error_at (DECL_SOURCE_LOCATION (default_node->decl),
3654 "multiversioning needs %<ifunc%> which is not supported "
3655 "on this target");
3656 }
3657
3658 return dispatch_decl;
3659}
3660
3661/* Make the resolver function decl to dispatch the versions of
3662 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
3663 ifunc alias that will point to the created resolver. Create an
3664 empty basic block in the resolver and store the pointer in
3665 EMPTY_BB. Return the decl of the resolver function. */
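/* For a default version "foo", the resolver created here is typically
   named along the lines of "foo.resolver" (via clone_function_name), and
   the original symbol becomes an ifunc whose resolver attribute points at
   that function.  */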
3666
3667static tree
3668make_resolver_func (const tree default_decl,
3669 const tree ifunc_alias_decl,
3670 basic_block *empty_bb)
3671{
3672 tree decl, type, t;
3673
3674 /* Create resolver function name based on default_decl. */
  tree decl_name = clone_function_name (default_decl, "resolver");
3676 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
3677
3678 /* The resolver function should return a (void *). */
3679 type = build_function_type_list (ptr_type_node, NULL_TREE);
3680
3681 decl = build_fn_decl (resolver_name, type);
3682 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
3683
3684 DECL_NAME (decl) = decl_name;
3685 TREE_USED (decl) = 1;
3686 DECL_ARTIFICIAL (decl) = 1;
3687 DECL_IGNORED_P (decl) = 1;
3688 TREE_PUBLIC (decl) = 0;
3689 DECL_UNINLINABLE (decl) = 1;
3690
3691 /* Resolver is not external, body is generated. */
3692 DECL_EXTERNAL (decl) = 0;
3693 DECL_EXTERNAL (ifunc_alias_decl) = 0;
3694
3695 DECL_CONTEXT (decl) = NULL_TREE;
3696 DECL_INITIAL (decl) = make_node (BLOCK);
3697 DECL_STATIC_CONSTRUCTOR (decl) = 0;
3698
3699 if (DECL_COMDAT_GROUP (default_decl)
3700 || TREE_PUBLIC (default_decl))
3701 {
3702 /* In this case, each translation unit with a call to this
3703 versioned function will put out a resolver. Ensure it
3704 is comdat to keep just one copy. */
3705 DECL_COMDAT (decl) = 1;
3706 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
3707 }
3708 else
3709 TREE_PUBLIC (ifunc_alias_decl) = 0;
3710
3711 /* Build result decl and add to function_decl. */
3712 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
3713 DECL_CONTEXT (t) = decl;
3714 DECL_ARTIFICIAL (t) = 1;
3715 DECL_IGNORED_P (t) = 1;
3716 DECL_RESULT (decl) = t;
3717
3718 gimplify_function_tree (decl);
3719 push_cfun (DECL_STRUCT_FUNCTION (decl));
3720 *empty_bb = init_lowered_empty_function (decl, false,
3721 profile_count::uninitialized ());
3722
  cgraph_node::add_new_function (decl, true);
  symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
3725
3726 pop_cfun ();
3727
3728 gcc_assert (ifunc_alias_decl != NULL);
3729 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
3730 DECL_ATTRIBUTES (ifunc_alias_decl)
3731 = make_attribute ("ifunc", resolver_name,
3732 DECL_ATTRIBUTES (ifunc_alias_decl));
3733
3734 /* Create the alias for dispatch to resolver here. */
  cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
3736 return decl;
3737}
3738
3739/* Generate the dispatching code body to dispatch multi-versioned function
3740 DECL. The target hook is called to process the "target" attributes and
3741 provide the code to dispatch the right function at run-time. NODE points
3742 to the dispatcher decl whose body will be created. */
3743
3744tree
3745ix86_generate_version_dispatcher_body (void *node_p)
3746{
3747 tree resolver_decl;
3748 basic_block empty_bb;
3749 tree default_ver_decl;
3750 struct cgraph_node *versn;
3751 struct cgraph_node *node;
3752
3753 struct cgraph_function_version_info *node_version_info = NULL;
3754 struct cgraph_function_version_info *versn_info = NULL;
3755
3756 node = (cgraph_node *)node_p;
3757
3758 node_version_info = node->function_version ();
3759 gcc_assert (node->dispatcher_function
3760 && node_version_info != NULL);
3761
3762 if (node_version_info->dispatcher_resolver)
3763 return node_version_info->dispatcher_resolver;
3764
3765 /* The first version in the chain corresponds to the default version. */
3766 default_ver_decl = node_version_info->next->this_node->decl;
3767
3768 /* node is going to be an alias, so remove the finalized bit. */
3769 node->definition = false;
3770
  resolver_decl = make_resolver_func (default_ver_decl,
				      node->decl, &empty_bb);
3773
3774 node_version_info->dispatcher_resolver = resolver_decl;
3775
3776 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
3777
3778 auto_vec<tree, 2> fn_ver_vec;
3779
3780 for (versn_info = node_version_info->next; versn_info;
3781 versn_info = versn_info->next)
3782 {
3783 versn = versn_info->this_node;
3784 /* Check for virtual functions here again, as by this time it should
3785 have been determined if this function needs a vtable index or
3786 not. This happens for methods in derived classes that override
3787 virtual methods in base classes but are not explicitly marked as
3788 virtual. */
3789 if (DECL_VINDEX (versn->decl))
3790 sorry ("virtual function multiversioning not supported");
3791
      fn_ver_vec.safe_push (versn->decl);
3793 }
3794
  dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
3796 cgraph_edge::rebuild_edges ();
3797 pop_cfun ();
3798 return resolver_decl;
3799}
3800
3801
3802
