1 | /* Definitions of x86 tunable features. |
2 | Copyright (C) 2013-2023 Free Software Foundation, Inc. |
3 | |
4 | This file is part of GCC. |
5 | |
6 | GCC is free software; you can redistribute it and/or modify |
7 | it under the terms of the GNU General Public License as published by |
8 | the Free Software Foundation; either version 3, or (at your option) |
9 | any later version. |
10 | |
11 | GCC is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | GNU General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU General Public License and |
17 | a copy of the GCC Runtime Library Exception along with this program; |
18 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
19 | <http://www.gnu.org/licenses/>. */ |
20 | |
21 | /* Tuning for a given CPU XXXX consists of: |
22 | - adding new CPU into: |
23 | - adding PROCESSOR_XXX to processor_type (in i386.h) |
24 | - possibly adding XXX into CPU attribute in i386.md |
25 | - adding XXX to processor_alias_table (in i386.cc) |
26 | - introducing ix86_XXX_cost in i386.cc |
   - Stringop generation table can be built based on the test_stringop
     script (once the rest of the tuning is complete)
29 | - designing a scheduler model in |
30 | - XXXX.md file |
      - Updating ix86_issue_rate and ix86_adjust_cost in i386.cc
32 | - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder |
33 | and ix86_sched_init_global if those tricks are needed. |
   - Tuning the flags below.  Those are split into sections and each
     section is very roughly ordered by importance.  */
36 | |
37 | /*****************************************************************************/ |
38 | /* Scheduling flags. */ |
39 | /*****************************************************************************/ |
40 | |
41 | /* X86_TUNE_SCHEDULE: Enable scheduling. */ |
42 | DEF_TUNE (X86_TUNE_SCHEDULE, "schedule" , |
43 | m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT |
44 | | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN |
45 | | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID |
46 | | m_CORE_ATOM | m_GENERIC) |
47 | |
/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
   on modern chips.  Prefer stores affecting the whole integer register
   over partial stores.  For example, prefer MOVZBL or MOVQ over MOVB
   to load an 8-bit value.  */
52 | DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency" , |
53 | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 |
54 | | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL |
55 | | m_KNL | m_KNM | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT |
56 | | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC) |
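
/* As an illustrative sketch (assumed AT&T syntax; the exact insns depend
   on the target and register allocation), this tuning prefers
       movzbl  (%rdi), %eax        # writes all of %eax, so the rename
                                   # stage can break the dependency
   over
       movb    (%rdi), %al         # partial write that must merge with
                                   # the old contents of %eax
   when loading an 8-bit value.  */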
57 | |
58 | /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store |
59 | destinations to be 128bit to allow register renaming on 128bit SSE units, |
60 | but usually results in one extra microop on 64bit SSE units. |
   Experimental results show that disabling this option on P4 brings over 20%
62 | SPECfp regression, while enabling it on K8 brings roughly 2.4% regression |
63 | that can be partly masked by careful scheduling of moves. */ |
64 | DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency" , |
65 | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 |
66 | | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID |
67 | | m_CORE_ATOM | m_GENERIC) |
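
/* Illustrative sketch (an assumption about the concrete insn choice):
   for a scalar reg-reg copy this tuning prefers the full-register form
       movaps  %xmm1, %xmm0        # rewrites all of %xmm0
   over
       movss   %xmm1, %xmm0        # writes only the low 32 bits and so
                                   # depends on the old value of %xmm0
   allowing register renaming on 128bit SSE units.  */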
68 | |
/* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids a
   partial write to the destination in scalar SSE conversions from FP
   to FP.  */
72 | DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY, |
73 | "sse_partial_reg_fp_converts_dependency" , |
74 | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 |
75 | | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM |
76 | | m_GENERIC) |
77 | |
/* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids a partial
   write to the destination in scalar SSE conversions from integer to FP.  */
80 | DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY, |
81 | "sse_partial_reg_converts_dependency" , |
82 | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 |
83 | | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM |
84 | | m_GENERIC) |
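
/* For both conversion knobs above the fix is to clear the destination
   first.  A minimal sketch (assumed AT&T syntax), here for the
   integer-to-FP case:
       pxor     %xmm0, %xmm0       # zero idiom breaks the dependency
       cvtsi2ss %edi, %xmm0        # still writes only the low element,
                                   # but no longer waits on the old %xmm0
   instead of emitting the cvtsi2ss alone.  */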
85 | |
/* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts a zero idiom before
   several insns to break a false dependency on the destination register
   for the GLC microarchitecture.  */
89 | DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC, |
90 | "dest_false_dep_for_glc" , m_SAPPHIRERAPIDS | m_CORE_HYBRID |
91 | | m_CORE_ATOM) |
92 | |
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in the proper format,
   leaving the upper part undefined.  */
97 | DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs" , m_ATHLON_K8) |
98 | |
/* X86_TUNE_PARTIAL_FLAG_REG_STALL: This flag disables use of flags
   set by instructions affecting just some flags (in particular shifts).
   This is because Core2 resolves dependencies on the whole flags register
   and such sequences introduce a false dependency on a previous
   instruction that sets the full flags.

   The flag does not affect generation of INC and DEC, which is controlled
   by X86_TUNE_USE_INCDEC.  */
107 | |
108 | DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall" , |
109 | m_CORE2) |
110 | |
/* X86_TUNE_MOVX: Enable zero extension of integer registers to avoid
   partial dependencies.  */
113 | DEF_TUNE (X86_TUNE_MOVX, "movx" , |
114 | m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE |
115 | | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL |
116 | | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN |
117 | | m_CORE_AVX2 | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC) |
118 | |
119 | /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by |
120 | full sized loads. */ |
121 | DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall" , |
122 | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL |
123 | | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE |
124 | | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC) |
125 | |
/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
   conditional jump instruction for 32-bit targets.  */
128 | DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32" , |
129 | m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC) |
130 | |
131 | /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent |
132 | conditional jump instruction for TARGET_64BIT. */ |
133 | DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64" , |
134 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER |
135 | | m_ZNVER | m_ZHAOXIN | m_GENERIC) |
136 | |
/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
   subsequent conditional jump instruction when the conditional jump
   checks the sign flag (SF) or overflow flag (OF).  */
140 | DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags" , |
141 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER |
142 | | m_ZNVER | m_ZHAOXIN | m_GENERIC) |
143 | |
/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse an ALU instruction with a subsequent
   conditional jump instruction when the ALU instruction produces the CC
   flags consumed by the conditional jump instruction.  */
147 | DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch" , |
148 | m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC) |
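
/* A sketch of the fusion opportunity targeted by the tunings above
   (assumed AT&T syntax): on fusion-capable cores a pair such as
       subl    $1, %ecx            # ALU insn producing the CC flags
       jne     .Lloop              # conditional jump consuming them
   decodes into a single macro-fused uop instead of two.  */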
149 | |
150 | |
151 | /*****************************************************************************/ |
152 | /* Function prologue, epilogue and function calling sequences. */ |
153 | /*****************************************************************************/ |
154 | |
/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
   arguments in the prologue/epilogue instead of separately for each call
   by push/pop instructions.
   This increases code size by about 5% in 32bit mode, less so in 64bit mode
   because parameters are passed in registers.  It is a considerable
   win for targets without a stack engine, where the lack of one prevents
   multiple push operations from executing in parallel.  */
162 | |
163 | DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args" , |
164 | m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL |
165 | | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8 | m_ZHAOXIN) |
166 | |
/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
   considered to be on the critical path.  */
169 | DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move" , |
170 | m_PPRO | m_ATHLON_K8) |
171 | |
/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
   considered to be on the critical path.  */
174 | DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move" , |
175 | m_PPRO | m_ATHLON_K8) |
176 | |
177 | /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */ |
178 | DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave" , |
179 | m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN |
180 | | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC) |
181 | |
/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
   Some chips, like the 486 and Pentium, work faster with separate load
   and push instructions.  */
185 | DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory" , |
186 | m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE |
187 | | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC) |
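
/* Illustratively (assumed AT&T syntax), with this tuning enabled GCC
   may emit
       pushl   8(%ebp)
   where chips preferring separate load and push would instead get
       movl    8(%ebp), %eax
       pushl   %eax  */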
188 | |
189 | /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred |
190 | over esp subtraction. */ |
191 | DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push" , m_386 | m_486 | m_PENT |
192 | | m_LAKEMONT | m_K6_GEODE) |
193 | |
/* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
195 | over esp subtraction. */ |
196 | DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push" , m_PENT | m_LAKEMONT |
197 | | m_K6_GEODE) |
198 | |
199 | /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred |
200 | over esp addition. */ |
201 | DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop" , m_386 | m_486 | m_PENT |
202 | | m_LAKEMONT | m_PPRO) |
203 | |
204 | /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred |
205 | over esp addition. */ |
206 | DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop" , m_PENT | m_LAKEMONT) |
207 | |
208 | /*****************************************************************************/ |
209 | /* Branch predictor tuning */ |
210 | /*****************************************************************************/ |
211 | |
/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
   instructions long.  */
214 | DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function" , m_BONNELL) |
215 | |
/* X86_TUNE_PAD_RETURNS: Place a NOP before every RET that is a destination
   of a conditional jump or directly preceded by another jump instruction.
   This is important for AMD K8-AMDFAM10 because the branch prediction
   architecture expects at most one jump per 2-byte window.  Failing to
   pad returns leads to a misaligned return stack.  */
221 | DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns" , |
222 | m_ATHLON_K8 | m_AMDFAM10) |
223 | |
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in a 16-byte window.  */
226 | DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit" , |
227 | m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM |
228 | | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10) |
229 | |
230 | /*****************************************************************************/ |
231 | /* Integer instruction selection tuning */ |
232 | /*****************************************************************************/ |
233 | |
234 | /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching |
235 | at -O3. For the moment, the prefetching seems badly tuned for Intel |
236 | chips. */ |
237 | DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial" , |
238 | m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER) |
239 | |
/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core 2 and Core i7.  */
242 | DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall" , m_CORE_ALL | m_ZHAOXIN | m_GENERIC) |
243 | |
244 | /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such |
245 | as "add mem, reg". */ |
246 | DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify" , ~(m_PENT | m_LAKEMONT | m_PPRO)) |
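
/* For example (a sketch in AT&T syntax), read-modify selection allows
       addl    (%rdi), %eax        # load and add in a single insn
   instead of the split form
       movl    (%rdi), %edx
       addl    %edx, %eax  */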
247 | |
/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.

   Core 2 and Nehalem suffer a stall of 7 cycles on partial flag register
   accesses.  Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell
   this extra uop is output only when the flag values really need to be
   merged, which is not the case for GCC generated code.  */
254 | DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec" , |
255 | ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE |
256 | | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GOLDMONT |
257 | | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM |
258 | | m_ZHAOXIN | m_GENERIC)) |
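
/* Illustratively, on the chips above GCC emits
       addl    $1, %eax            # writes all flags, including CF
   rather than
       incl    %eax                # leaves CF untouched, so a later
                                   # consumer of the full flags register
                                   # must merge in the old CF
   trading one byte of encoding for the partial-flags merge.  */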
259 | |
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
262 | DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves" , |
263 | ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT |
264 | | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN |
265 | | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID |
266 | | m_CORE_ATOM | m_GENERIC)) |
267 | |
268 | /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag |
269 | will impact LEA instruction selection. */ |
270 | DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu" , m_BONNELL | m_SILVERMONT | m_KNL |
271 | | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_ZHAOXIN) |
272 | |
273 | /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */ |
274 | DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr" , |
275 | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS |
276 | | m_KNL | m_KNM) |
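
/* As a rough sketch (the exact split is an assumption and depends on the
   address form), on the chips above a complex LEA such as
       leal    4(%edx,%eax,4), %ecx
   may be replaced by simpler AGU-friendly steps like
       sall    $2, %eax
       addl    %edx, %eax
       leal    4(%eax), %ecx  */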
277 | |
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: IMUL of a 32-bit constant and a memory
   operand takes the vector path on AMD machines.
   FIXME: Do we need to enable this for core?  */
281 | DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem" , |
282 | m_K8 | m_AMDFAM10) |
283 | |
/* X86_TUNE_SLOW_IMUL_IMM8: IMUL of an 8-bit constant takes the vector path
   on AMD machines.
   FIXME: Do we need to enable this for core?  */
287 | DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8" , |
288 | m_K8 | m_AMDFAM10) |
289 | |
290 | /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for |
291 | a conditional move. */ |
292 | DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove" , |
293 | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL |
294 | | m_KNM | m_INTEL) |
295 | |
296 | /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such |
297 | as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */ |
298 | DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop" , m_386 | m_P4_NOCONA) |
299 | |
300 | /* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to |
301 | move/set sequences of bytes with known size. */ |
302 | DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, |
303 | "prefer_known_rep_movsb_stosb" , |
304 | m_SKYLAKE | m_CORE_HYBRID | m_CORE_ATOM | m_TREMONT | m_CORE_AVX512 |
305 | | m_ZHAOXIN) |
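
/* Sketch of the intent (the size and registers are illustrative): for a
   fixed-size copy such as memcpy (dst, src, 32) the expander may emit
       movl    $32, %ecx
       rep movsb                   # fast on ERMS/FSRM-capable cores
   instead of calling the library memcpy or expanding a move loop.  */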
306 | |
/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
   compact prologues and epilogues by issuing misaligned moves.  This
   requires the target to handle misaligned moves and partial memory stalls
   reasonably well.
   FIXME: This may actually be a win on more targets than listed here.  */
312 | DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES, |
313 | "misaligned_move_string_pro_epilogues" , |
314 | m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT |
315 | | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC) |
316 | |
317 | /* X86_TUNE_USE_SAHF: Controls use of SAHF. */ |
318 | DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf" , |
319 | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT |
320 | | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER |
321 | | m_BTVER | m_ZNVER | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS |
322 | | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC) |
323 | |
/* X86_TUNE_USE_CLTD: Controls use of the CLTD and CQTO instructions.  */
325 | DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd" , |
326 | ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL |
327 | | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS)) |
328 | |
329 | /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */ |
330 | DEF_TUNE (X86_TUNE_USE_BT, "use_bt" , |
331 | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL |
332 | | m_LAKEMONT | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT |
333 | | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM |
334 | | m_GENERIC) |
335 | |
336 | /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency |
337 | for bit-manipulation instructions. */ |
338 | DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi" , |
339 | m_SANDYBRIDGE | m_HASWELL | m_SKYLAKE | m_SKYLAKE_AVX512 |
340 | | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE |
341 | | m_ZHAOXIN | m_GENERIC) |
342 | |
/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
   unrolling small loops less important.  For such architectures we adjust
   the unroll factor so that the unrolled loop fits the loop buffer.  */
347 | DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor" , m_BDVER3 | m_BDVER4) |
348 | |
/* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in
   an if-converted sequence to one.  */
351 | DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn" , |
352 | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT |
353 | | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM |
354 | | m_ZHAOXIN | m_GENERIC) |
355 | |
356 | /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence. */ |
357 | DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence" , |
358 | m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID |
359 | | m_CORE_ATOM | m_GENERIC) |
360 | |
/* X86_TUNE_EXPAND_ABS: This enables a new abs pattern that computes
   abs (x) as ((((signed) x >> (W-1)) ^ x) - ((signed) x >> (W-1)))
   instead of cmove or SSE max/abs instructions.  */
364 | DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs" , |
365 | m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT |
366 | | m_GOLDMONT_PLUS | m_ZHAOXIN) |
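
/* A minimal C sketch of the expansion above for 32-bit x (so W = 32),
   assuming an arithmetic right shift of signed values:

     int mask = x >> 31;            // 0 for x >= 0, -1 for x < 0
     int abs_x = (x ^ mask) - mask; // flip the bits and add 1 when negative

   This needs no conditional move and no SSE max/abs instruction.  */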
367 | |
368 | /*****************************************************************************/ |
369 | /* 387 instruction selection tuning */ |
370 | /*****************************************************************************/ |
371 | |
/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with a 16bit
   integer operand.
   FIXME: Why is this disabled for modern chips?  */
375 | DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop" , |
376 | m_386 | m_486 | m_K6_GEODE) |
377 | |
378 | /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit |
379 | integer operand. */ |
380 | DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop" , |
381 | ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL |
382 | | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE |
383 | | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT |
384 | | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)) |
385 | |
/* X86_TUNE_USE_FFREEP: Use the ffreep instruction instead of fstp.  */
387 | DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep" , m_AMD_MULTIPLE | m_ZHAOXIN) |
388 | |
389 | /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. */ |
390 | DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants" , |
391 | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT |
392 | | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_ZHAOXIN |
393 | | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID |
394 | | m_CORE_ATOM | m_GENERIC) |
395 | |
396 | /*****************************************************************************/ |
397 | /* SSE instruction selection tuning */ |
398 | /*****************************************************************************/ |
399 | |
400 | /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE |
401 | regs instead of memory. */ |
402 | DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill" , |
403 | m_CORE_ALL) |
404 | |
/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads
   instead of a sequence that loads the register by parts.  */
407 | DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal" , |
408 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM |
409 | | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID |
410 | | m_CORE_ATOM | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_ZHAOXIN |
411 | | m_GENERIC) |
412 | |
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
   instead of a sequence that stores the register by parts.  */
415 | DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal" , |
416 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM |
417 | | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID |
418 | | m_CORE_ATOM | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC) |
419 | |
420 | /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single |
421 | precision 128bit instructions instead of double where possible. */ |
422 | DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal" , |
423 | m_BDVER | m_ZNVER) |
424 | |
/* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128bit stores.  */
426 | DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores" , |
427 | m_AMD_MULTIPLE | m_ZHAOXIN | m_CORE_ALL | m_TREMONT | m_CORE_HYBRID |
428 | | m_CORE_ATOM | m_GENERIC) |
429 | |
/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load zero, as opposed to
   xorps/xorpd and other variants.  */
432 | DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor" , |
433 | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER |
434 | | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC) |
435 | |
/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
   to SSE registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.
   Enable this flag for generic - the only relevant architecture preferring
   no inter-unit moves is Bulldozer.  While this causes a small regression
   in SPECfp scores (under 0.3%), disabling inter-unit moves noticeably
   penalizes hand-written vectorized code that uses e.g. _mm_set_epi16.  */
443 | DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec" , |
444 | ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)) |
445 | |
/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
   to integer registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
449 | DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec" , |
450 | ~m_ATHLON_K8) |
451 | |
/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   to use both SSE and integer registers at the same time.  */
454 | DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions" , |
455 | ~(m_AMDFAM10 | m_BDVER)) |
456 | |
/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split the memory operand
   of FP conversions to a register into a separate load.  */
459 | DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts" , |
460 | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS |
461 | | m_INTEL) |
462 | |
/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer packed vector SSE conversions
   from FP to FP.  This form of the instruction avoids a partial write to
   the destination.  */
466 | DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts" , |
467 | m_AMDFAM10) |
468 | |
469 | /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion |
470 | from integer to FP. */ |
471 | DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts" , m_AMDFAM10) |
472 | |
/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with a slow pshufb instruction.  */
474 | DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb" , |
475 | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT |
476 | | m_GOLDMONT_PLUS | m_INTEL) |
477 | |
478 | /* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. */ |
479 | DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes" , |
480 | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID |
481 | | m_CORE_ATOM | m_INTEL) |
482 | |
483 | /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2 |
484 | elements. */ |
485 | DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts" , |
486 | ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID |
487 | | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) |
488 | |
/* X86_TUNE_USE_SCATTER_2PARTS: Use scatter instructions for vectors with 2
   elements.  */
491 | DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts" , |
492 | ~(m_ZNVER4)) |
493 | |
494 | /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4 |
495 | elements. */ |
496 | DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts" , |
497 | ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID |
498 | | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) |
499 | |
/* X86_TUNE_USE_SCATTER_4PARTS: Use scatter instructions for vectors with 4
   elements.  */
502 | DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts" , |
503 | ~(m_ZNVER4)) |
504 | |
/* X86_TUNE_USE_GATHER_8PARTS: Use gather instructions for vectors with 8
   or more elements.  */
507 | DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts" , |
508 | ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM |
509 | | m_YONGFENG | m_GENERIC | m_GDS)) |
510 | |
/* X86_TUNE_USE_SCATTER_8PARTS: Use scatter instructions for vectors with 8
   or more elements.  */
513 | DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts" , |
514 | ~(m_ZNVER4)) |
515 | |
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
   smaller FMA chains.  */
518 | DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains" , m_ZNVER1 | m_ZNVER2 | m_ZNVER3 |
519 | | m_YONGFENG) |
520 | |
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
   smaller FMA chains.  */
523 | DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains" , m_ZNVER2 | m_ZNVER3 |
524 | | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM) |
525 | |
/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
   smaller FMA chains.  */
528 | DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains" , m_NONE) |
529 | |
/* X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD: Prefer haddpd
   for v2df vector reductions.  */
DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
	  "v2df_reduction_prefer_haddpd" , m_NONE)
534 | |
535 | /*****************************************************************************/ |
/* AVX instruction selection tuning (some of the SSE flags affect AVX, too) */
537 | /*****************************************************************************/ |
538 | |
539 | /* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are |
540 | split. */ |
541 | DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal" , |
542 | ~(m_NEHALEM | m_SANDYBRIDGE)) |
543 | |
544 | /* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are |
545 | split. */ |
546 | DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal" , |
547 | ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1)) |
548 | |
549 | /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops. */ |
DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs" , m_BDVER | m_BTVER2
	  | m_ZNVER1)
552 | |
553 | /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for |
554 | the auto-vectorizer. */ |
555 | DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal" , m_BDVER | m_BTVER2 |
556 | | m_ZNVER1) |
557 | |
558 | /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX |
559 | instructions in the auto-vectorizer. */ |
560 | DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal" , m_CORE_AVX512) |
561 | |
/* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops.  */
563 | DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs" , m_ZNVER4) |
564 | |
565 | /* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit |
566 | AVX instructions. */ |
567 | DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces" , |
568 | m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3) |
569 | |
570 | /* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit |
571 | AVX instructions. */ |
572 | DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces" , |
573 | m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3) |
574 | |
575 | /* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit |
576 | AVX instructions. */ |
577 | DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces" , |
578 | m_SAPPHIRERAPIDS | m_ZNVER4) |
579 | |
580 | /* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit |
581 | AVX instructions. */ |
582 | DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces" , |
583 | m_SAPPHIRERAPIDS | m_ZNVER4) |
584 | |
/*****************************************************************************/
/* Historical relics: tuning flags that help specific old CPU designs       */
588 | /*****************************************************************************/ |
589 | |
/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double a value in
   an integer register.  */
592 | DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add" , ~m_386) |
593 | |
/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
   such as fsqrt, fprem, fsin, fcos, fsincos etc.
   Should be enabled for all targets that always have a coprocessor.  */
597 | DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387" , |
598 | ~(m_386 | m_486 | m_LAKEMONT)) |
599 | |
600 | /* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for |
601 | inline strlen. This affects only -minline-all-stringops mode. By |
602 | default we always dispatch to a library since our internal strlen |
603 | is bad. */ |
604 | DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen" , ~m_386) |
605 | |
606 | /* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of |
607 | longer "sal $1, reg". */ |
608 | DEF_TUNE (X86_TUNE_SHIFT1, "shift1" , ~m_486) |
609 | |
/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
   of movzbl/movzwl.  */
612 | DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and" , |
613 | m_486 | m_PENT) |
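
/* Illustratively (assumed AT&T syntax), on the 486/Pentium this tuning
   zero-extends with
       andl    $255, %eax          # clear bits 8..31 in place
   instead of
       movzbl  %al, %eax  */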
614 | |
/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for
   HImode and SImode multiplies, but the 386 and 486 do HImode multiplies
   faster.  */
617 | DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul" , |
618 | ~(m_386 | m_486)) |
619 | |
/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
   into 16bit/8bit when the resulting sequence is shorter.  For example,
   "and $-65536, reg" can become a 16bit store of 0.  */
623 | DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix" , |
624 | ~(m_386 | m_486 | m_PENT | m_LAKEMONT)) |
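
/* Continuing the example from the comment above (assumed AT&T syntax):
       andl    $-65536, %eax       # clears the low 16 bits
   can be demoted, via the operand-size prefix, to the shorter
       movw    $0, %ax  */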
625 | |
626 | /* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions |
627 | such as "add $1, mem". */ |
628 | DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write" , |
629 | ~(m_PENT | m_LAKEMONT)) |
630 | |
/* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
   than via a MOV.  */
633 | DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or" , m_PENT | m_LAKEMONT) |
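
/* That is (an illustrative encoding note), this tuning prefers
       orl     $-1, %eax           # 3 bytes; pairs well on Pentium
   over
       movl    $-1, %eax           # 5 bytes
   at the cost of OR clobbering the flags while MOV does not.  */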
634 | |
635 | /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is, |
636 | but one byte longer. */ |
637 | DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable" , m_PENT | m_LAKEMONT) |
638 | |
/* X86_TUNE_PARTIAL_REG_STALL: Pentium Pro, unlike later chips, handled
   use of partial registers by renaming.  This improved performance of 16bit
   code where the upper halves of registers are not used.  It also leads to
   a penalty whenever a 16bit store is followed by a 32bit use.  This flag
   disables production of such sequences in common cases.
   See also X86_TUNE_HIMODE_MATH.

   In the current implementation the partial register stalls are not
   eliminated very well - they can be introduced via subregs synthesized
   by combine and can happen in caller/callee saving sequences.  */
649 | DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall" , m_PPRO) |
650 | |
/* X86_TUNE_PARTIAL_MEMORY_READ_STALL: Reading a (possibly unaligned) part
   of a memory location after a large write to the same address causes
   a store-to-load forwarding stall.  */
654 | DEF_TUNE (X86_TUNE_PARTIAL_MEMORY_READ_STALL, "partial_memory_read_stall" , |
655 | m_386 | m_486 | m_PENT | m_LAKEMONT | m_PPRO | m_P4_NOCONA | m_CORE2 |
656 | | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT |
657 | | m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10) |
658 | |
659 | /* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to |
660 | corresponding 32bit arithmetic. */ |
661 | DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode" , |
662 | ~m_PPRO) |
663 | |
/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
   partial register stalls on PentiumPro targets.  */
666 | DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs" , m_PPRO) |
667 | |
668 | /* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic. |
669 | On PPro this flag is meant to avoid partial register stalls. */ |
670 | DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math" , ~m_PPRO) |
671 | |
672 | /* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates |
673 | directly to memory. */ |
674 | DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves" , m_PPRO) |
675 | |
676 | /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */ |
677 | DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb" , m_PENT4) |
678 | |
/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
   an integer register.  */
681 | DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0" , m_K6) |
682 | |
/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
686 | DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode" , m_K6) |
687 | |
688 | /* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded |
689 | forms of instructions on K8 targets. */ |
690 | DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode" , |
691 | m_K8) |
692 | |
693 | /*****************************************************************************/ |
694 | /* This never worked well before. */ |
695 | /*****************************************************************************/ |
696 | |
/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
   on simulation results.  But after P4 was made, no performance benefit
   was observed with branch hints.  They also increase code size.
   As a result, icc never generates branch hints.  */
701 | DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints" , m_NONE) |
702 | |
703 | /* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. */ |
704 | DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math" , m_ALL) |
705 | |
706 | /* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit |
707 | arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme |
708 | is usually used for RISC targets. */ |
709 | DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs" , m_NONE) |
710 | |
711 | /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion |
712 | before a transfer of control flow out of the function. */ |
713 | DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper" , ~m_KNL) |
714 | |
/* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
   modifications on architectures where these operations are slow.  */
717 | DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc" , m_PENT4) |
718 | |
/* X86_TUNE_USE_RCR: Controls use of the rcr 1 instruction instead of shrd.  */
720 | DEF_TUNE (X86_TUNE_USE_RCR, "use_rcr" , m_AMD_MULTIPLE) |
721 | |