/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "target.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "insn-opinit.h"
#include "recog.h"

/* Return the maximum number of instructions a cpu can issue.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
    case PROCESSOR_LUJIAZUI:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_ZNVER4:
    case PROCESSOR_ZNVER5:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_TREMONT:
    case PROCESSOR_SKYLAKE:
    case PROCESSOR_SKYLAKE_AVX512:
    case PROCESSOR_CASCADELAKE:
    case PROCESSOR_CANNONLAKE:
    case PROCESSOR_ALDERLAKE:
    case PROCESSOR_YONGFENG:
    case PROCESSOR_GENERIC:
      return 4;

    case PROCESSOR_ICELAKE_CLIENT:
    case PROCESSOR_ICELAKE_SERVER:
    case PROCESSOR_TIGERLAKE:
    case PROCESSOR_COOPERLAKE:
    case PROCESSOR_ROCKETLAKE:
      return 5;

    case PROCESSOR_SAPPHIRERAPIDS:
      return 6;

    default:
      return 1;
    }
}

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */
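
/* Illustrative example (not part of the original comments): on the in-order
   Pentium, the sequence

       add %edx, %ebx            # SET_INSN writes %ebx
       mov (%ebx,%esi), %eax     # USE_INSN needs %ebx to form its address

   suffers an address generation interlock (AGI), because the address of the
   load depends on a register written by the immediately preceding insn.
   ix86_adjust_cost uses this predicate to add a cycle of latency in that
   case.  */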

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
        rtx addr = XEXP (recog_data.operand[i], 0);
        if (modified_in_p (addr, set_insn) != 0)
          {
            /* No AGI stall if SET_INSN is a push or pop and USE_INSN
               has SP based memory (unless index reg is modified in a pop).  */
            rtx set = single_set (set_insn);
            if (set
                && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
                    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
              {
                struct ix86_address parts;
                if (ix86_decompose_address (addr, &parts)
                    && parts.base == stack_pointer_rtx
                    && (parts.index == NULL_RTX
                        || MEM_P (SET_DEST (set))
                        || !modified_in_p (parts.index, set_insn)))
                  return false;
              }
            return true;
          }
        return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing else set by DEP_INSN.  */
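
/* Illustrative example (not part of the original comments): in the pair

       cmp %eax, %ebx            # DEP_INSN sets the flags
       jne .Lfoo                 # INSN only reads the flags

   the jump depends on DEP_INSN solely through the flags register, so the
   Pentium case in ix86_adjust_cost can treat the compare and the jump as
   pairable and drop the dependence cost to zero.  */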

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn,
                      enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
           && XVECLEN (PATTERN (dep_insn), 0) == 2
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if ADDR is found in INSN.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
        return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
        {
        case 'e':
          if (exact_dependency_1 (addr, XEXP (insn, i)))
            return true;
          break;
        case 'E':
          for (j = 0; j < XVECLEN (insn, i); j++)
            if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
              return true;
          break;
        }
    }
  return false;
}

/* Return true if there exists an exact dependency between STORE and LOAD,
   i.e. the same memory address is used by both of them.  */
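
/* Illustrative example (not part of the original comments): in the pair

       mov %ax, 8(%esp)          # STORE writes a 16-bit value
       mov 8(%esp), %bx          # LOAD reads the same address

   the load hits exactly the bytes written by the store.  ix86_adjust_cost
   uses this helper to raise the cost of narrow (QImode/HImode) cases on
   Silvermont-class cores, where store forwarding is expensive.  */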
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}


/* This function corrects the value of COST (latency) based on the
   relationship between INSN and DEP_INSN through a dependence of type
   DEP_TYPE, and strength DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be
   ready earlier than values of registers used in the actual operation.  */
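
/* Illustrative example (not part of the original comments): if DEP_INSN is
   "add $4, %eax" and INSN is "mov (%eax), %ebx", the dependence feeds the
   address of INSN's memory operand, so the full load latency applies and
   COST is kept (Pentium even adds a cycle for the AGI).  If instead INSN
   were "add (%ecx), %eax", an out-of-order core can start the load early,
   and several cases below subtract an estimated load latency from COST.  */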

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
                  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
        {
          rtx addr = PATTERN (insn);

          if (GET_CODE (addr) == PARALLEL)
            addr = XVECEXP (addr, 0, 0);

          gcc_assert (GET_CODE (addr) == SET);

          addr = SET_SRC (addr);
          if (modified_in_p (addr, dep_insn))
            cost += 1;
        }
      else if (ix86_agi_dependent (dep_insn, insn))
        cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
        cost = 0;

      /* Floating point stores require value to be ready one cycle earlier.  */
      if (insn_type == TYPE_FMOV
          && get_attr_memory (insn) == MEMORY_STORE
          && !ix86_agi_dependent (dep_insn, insn))
        cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
          && (set = single_set (dep_insn)) != NULL_RTX
          && (set2 = single_set (insn)) != NULL_RTX
          && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
          && MEM_P (SET_DEST (set2)))
        cost += 1;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction,
         provided the previous instruction is not needed to compute the
         address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load at a
             time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 1)
            cost--;
        }
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
         the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction,
         provided the previous instruction is not needed to compute the
         address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load at a
             time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 2)
            cost -= 2;
          else
            cost = 1;
        }
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction,
         provided the previous instruction is not needed to compute the
         address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost = 3;

          /* Because of the difference between the length of the integer and
             floating point unit pipeline preparation stages, the memory
             operands for floating point are cheaper.

             ??? For Athlon the difference is most probably 2.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 3;
          else
            loadcost = TARGET_CPU_P (ATHLON) ? 2 : 0;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_ZNVER4:
    case PROCESSOR_ZNVER5:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction,
         provided the previous instruction is not needed to compute the
         address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;

          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
            loadcost = 7;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_YONGFENG:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_LUJIAZUI:
      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction,
         provided the previous instruction is not needed to compute the
         address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          int loadcost = 4;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_TREMONT:
    case PROCESSOR_ALDERLAKE:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction,
         provided the previous instruction is not needed to compute the
         address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          if (cost >= 4)
            cost -= 4;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
        return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
        {
          enum attr_unit unit = get_attr_unit (dep_insn);
          if (unit == UNIT_INTEGER && cost == 1)
            {
              if (memory == MEMORY_LOAD)
                cost = 3;
              else
                {
                  /* Increase the cost of ld/st for short int types only,
                     because of the store forwarding issue.  */
                  rtx set = single_set (dep_insn);
                  if (set && (GET_MODE (SET_DEST (set)) == QImode
                              || GET_MODE (SET_DEST (set)) == HImode))
                    {
                      /* Increase the cost of the store/load insn if an exact
                         dependence exists and INSN is a load.  */
                      enum attr_memory insn_memory = get_attr_memory (insn);
                      if (insn_memory == MEMORY_LOAD
                          && exact_store_load_dependency (dep_insn, insn))
                        cost = 3;
                    }
                }
            }
        }
      break;

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far as the
     number of instructions that can be executed in a cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule, to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to the
   "Intel Architectures Optimization Reference Manual".  */
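
/* Illustrative example (not part of the original comments): on cores with
   TARGET_FUSE_CMP_AND_BRANCH, a pair such as

       cmp $100, %eax            # CONDGEN
       jne .Lloop                # CONDJMP

   can decode into a single macro-fused uop.  The checks below reject
   combinations the hardware cannot fuse, e.g. cmp/test with a MEM-IMM
   operand pair, RIP-relative memory operands, ALU ops with a memory
   destination, and inc/dec followed by an unsigned branch condition.  */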

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  enum attr_type condgen_type;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
         a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      goto handle_stack_protect_test;
    }
  else if (condgen_type != TYPE_TEST
           && condgen_type != TYPE_ICMP
           && condgen_type != TYPE_INCDEC
           && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
        if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
          {
            rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
            if (GET_CODE (set_src) == COMPARE)
              compare_set = XVECEXP (pat, 0, i);
            else
              alu_set = XVECEXP (pat, 0, i);
          }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
        return false;
    }

 handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* The following handles the case of macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for an alu op with a memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
    return false;

  return true;
}
