/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2017 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "target.h"

/* Return the maximum number of instructions a cpu can issue.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
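  /* Walk USE_INSN's operands looking for a memory reference whose
     address uses something written by SET_INSN.  */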
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
        rtx addr = XEXP (recog_data.operand[i], 0);
        if (modified_in_p (addr, set_insn) != 0)
          {
            /* No AGI stall if SET_INSN is a push or pop and USE_INSN
               has SP based memory (unless index reg is modified in a pop).  */
            rtx set = single_set (set_insn);
            if (set
                && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
                    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
              {
                struct ix86_address parts;
                if (ix86_decompose_address (addr, &parts)
                    && parts.base == stack_pointer_rtx
                    && (parts.index == NULL_RTX
                        || MEM_P (SET_DEST (set))
                        || !modified_in_p (parts.index, set_insn)))
                  return false;
              }
            return true;
          }
        return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing else set by DEP_INSN.  */

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

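  /* Find what DEP_INSN writes: either a lone SET, or a two-element
     PARALLEL of SETs (typically the flags plus an arithmetic result).  */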
  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
           && XVECLEN (PATTERN (dep_insn), 0) == 2
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if addr is found in insn.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
        return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case CC0:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

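  /* Not a leaf rtx: recurse into every 'e' (rtx) and 'E' (rtx vector)
     operand slot of INSN.  */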
  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
        {
        case 'e':
          if (exact_dependency_1 (addr, XEXP (insn, i)))
            return true;
          break;
        case 'E':
          for (j = 0; j < XVECLEN (insn, i); j++)
            if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
              return true;
          break;
        }
    }
  return false;
}

/* Return true if there exists exact dependency for store & load, i.e.
   the same memory address is used in them.  */
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}


/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be ready
   earlier than values of registers used in the actual operation.  */
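/* For instance, in a memory-destination add such as "addl %eax, (%ebx,%ecx)"
   the values of %ebx and %ecx feed address generation and must be ready
   before %eax, which is only consumed by the addition itself (illustrative
   example, not taken from the code below).  */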

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
                  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
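      /* For an LEA the address computation is the SET_SRC of its pattern;
         for other insns, check whether DEP_INSN writes a register that is
         used in the address of one of INSN's memory operands.  */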
      if (insn_type == TYPE_LEA)
        {
          rtx addr = PATTERN (insn);

          if (GET_CODE (addr) == PARALLEL)
            addr = XVECEXP (addr, 0, 0);

          gcc_assert (GET_CODE (addr) == SET);

          addr = SET_SRC (addr);
          if (modified_in_p (addr, dep_insn))
            cost += 1;
        }
      else if (ix86_agi_dependent (dep_insn, insn))
        cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
        cost = 0;

      /* Floating point stores require value to be ready one cycle earlier.  */
      if (insn_type == TYPE_FMOV
          && get_attr_memory (insn) == MEMORY_STORE
          && !ix86_agi_dependent (dep_insn, insn))
        cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
          && (set = single_set (dep_insn)) != NULL_RTX
          && (set2 = single_set (insn)) != NULL_RTX
          && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
          && MEM_P (SET_DEST (set2)))
        cost += 1;

      memory = get_attr_memory (insn);

      /* Model the reorder buffer's ability to hide the latency of a load by
         executing it in parallel with the previous instruction, provided the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load at a
             time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 1)
            cost--;
        }
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
         the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      memory = get_attr_memory (insn);

      /* Model the reorder buffer's ability to hide the latency of a load by
         executing it in parallel with the previous instruction, provided the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load at a
             time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 2)
            cost -= 2;
          else
            cost = 1;
        }
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Model the reorder buffer's ability to hide the latency of a load by
         executing it in parallel with the previous instruction, provided the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost = 3;

          /* Because of the difference between the length of the integer and
             floating point unit pipeline preparation stages, the memory
             operands for floating point are cheaper.

             ??? For Athlon the difference is most probably 2.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 3;
          else
            loadcost = TARGET_ATHLON ? 2 : 0;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_ZNVER1:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Model the reorder buffer's ability to hide the latency of a load by
         executing it in parallel with the previous instruction, provided the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;

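          /* Assume an integer load takes about 4 cycles and an FP/vector
             load about 7 on Znver1, so up to that much of the dependent
             insn's latency can be hidden (assumed latencies, matching the
             constants below).  */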
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
            loadcost = 7;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Model the reorder buffer's ability to hide the latency of a load by
         executing it in parallel with the previous instruction, provided the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          if (cost >= 4)
            cost -= 4;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
        return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
        {
          enum attr_unit unit = get_attr_unit (dep_insn);
          if (unit == UNIT_INTEGER && cost == 1)
            {
              if (memory == MEMORY_LOAD)
                cost = 3;
              else
                {
                  /* Increase cost of ld/st for short int types only
                     because of store forwarding issue.  */
                  rtx set = single_set (dep_insn);
                  if (set && (GET_MODE (SET_DEST (set)) == QImode
                              || GET_MODE (SET_DEST (set)) == HImode))
                    {
                      /* Increase cost of store/load insn if exact
                         dependence exists and it is load insn.  */
                      enum attr_memory insn_memory = get_attr_memory (insn);
                      if (insn_memory == MEMORY_LOAD
                          && exact_store_load_dependency (dep_insn, insn))
                        cost = 3;
                    }
                }
            }
        }
      break;

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far as the
     number of instructions that can be executed in a cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to
   "Intel Architectures Optimization Reference Manual".  */

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;

  if (!any_condjump_p (condjmp))
    return false;

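  /* The jump must read the fixed condition-code register and CONDGEN must
     write it; otherwise the two insns cannot fuse.  */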
  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  if (get_attr_type (condgen) != TYPE_TEST
      && get_attr_type (condgen) != TYPE_ICMP
      && get_attr_type (condgen) != TYPE_INCDEC
      && get_attr_type (condgen) != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX
      && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

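  /* When CONDGEN has no single SET it may be a PARALLEL that pairs an ALU
     operation with a flags-setting COMPARE; pick out both parts below.  */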
  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
        if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
          {
            rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
            if (GET_CODE (set_src) == COMPARE)
              compare_set = XVECEXP (pat, 0, i);
            else
              alu_set = XVECEXP (pat, 0, i);
          }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0))
       && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1))
          && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
        return false;
    }

  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE
          || ccode == GT
          || ccode == LE
          || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (get_attr_type (condgen) == TYPE_TEST
      || get_attr_type (condgen) == TYPE_ICMP)
    return true;

  /* The following handles the case of macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for alu op with memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (get_attr_type (condgen) == TYPE_INCDEC
      && (ccode == GEU
          || ccode == GTU
          || ccode == LEU
          || ccode == LTU))
    return false;

  return true;
}