x86-tune.def source code [gcc/config/i386/x86-tune.def]

1	/ Definitions of x86 tunable features.*
2	Copyright (C) 2013-2023 Free Software Foundation, Inc.
3
4	This file is part of GCC.
5
6	GCC is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 3, or (at your option)
9	any later version.
10
11	GCC is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License and
17	a copy of the GCC Runtime Library Exception along with this program;
18	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
19	<http://www.gnu.org/licenses/>. /*
20
21	/ Tuning for a given CPU XXXX consists of:*
22	- adding new CPU into:
23	- adding PROCESSOR_XXX to processor_type (in i386.h)
24	- possibly adding XXX into CPU attribute in i386.md
25	- adding XXX to processor_alias_table (in i386.cc)
26	- introducing ix86_XXX_cost in i386.cc
27	- Stringop generation table can be build based on test_stringop
28	- script (once rest of tuning is complete)
29	- designing a scheduler model in
30	- XXXX.md file
31	- Updating ix86_issue_rate and ix86_adjust_cost in i386.md
32	- possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
33	and ix86_sched_init_global if those tricks are needed.
34	- Tunning the flags bellow. Those are split into sections and each
35	section is very roughly ordered by importance. /*
36
37	/***************************************************************************/
38	/ Scheduling flags. /
39	/***************************************************************************/
40
41	/ X86_TUNE_SCHEDULE: Enable scheduling. /
42	DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
43	m_PENT \| m_LAKEMONT \| m_PPRO \| m_CORE_ALL \| m_BONNELL \| m_SILVERMONT
44	\| m_INTEL \| m_KNL \| m_KNM \| m_K6_GEODE \| m_AMD_MULTIPLE \| m_ZHAOXIN
45	\| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_TREMONT \| m_CORE_HYBRID
46	\| m_CORE_ATOM \| m_GENERIC)
47
48	/ X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming*
49	on modern chips. Prefer stores affecting whole integer register
50	over partial stores. For example prefer MOVZBL or MOVQ to load 8bit
51	value over movb. /*
52	DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
53	m_P4_NOCONA \| m_CORE2 \| m_NEHALEM \| m_SANDYBRIDGE \| m_CORE_AVX2
54	\| m_BONNELL \| m_SILVERMONT \| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_INTEL
55	\| m_KNL \| m_KNM \| m_AMD_MULTIPLE \| m_ZHAOXIN \| m_TREMONT
56	\| m_CORE_HYBRID \| m_CORE_ATOM \| m_GENERIC)
57
58	/ X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store*
59	destinations to be 128bit to allow register renaming on 128bit SSE units,
60	but usually results in one extra microop on 64bit SSE units.
61	Experimental results shows that disabling this option on P4 brings over 20%
62	SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
63	that can be partly masked by careful scheduling of moves. /*
64	DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
65	m_PPRO \| m_P4_NOCONA \| m_CORE_ALL \| m_BONNELL \| m_AMDFAM10
66	\| m_BDVER \| m_ZNVER \| m_ZHAOXIN \| m_TREMONT \| m_CORE_HYBRID
67	\| m_CORE_ATOM \| m_GENERIC)
68
69	/ X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids*
70	partial write to the destination in scalar SSE conversion from FP
71	to FP. /*
72	DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY,
73	"sse_partial_reg_fp_converts_dependency",
74	m_PPRO \| m_P4_NOCONA \| m_CORE_ALL \| m_BONNELL \| m_AMDFAM10
75	\| m_BDVER \| m_ZNVER \| m_ZHAOXIN \| m_CORE_HYBRID \| m_CORE_ATOM
76	\| m_GENERIC)
77
78	/ X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial*
79	write to the destination in scalar SSE conversion from integer to FP. /*
80	DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
81	"sse_partial_reg_converts_dependency",
82	m_PPRO \| m_P4_NOCONA \| m_CORE_ALL \| m_BONNELL \| m_AMDFAM10
83	\| m_BDVER \| m_ZNVER \| m_ZHAOXIN \| m_CORE_HYBRID \| m_CORE_ATOM
84	\| m_GENERIC)
85
86	/ X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before*
87	several insns to break false dependency on the dest register for GLC
88	micro-architecture. /*
89	DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC,
90	"dest_false_dep_for_glc", m_SAPPHIRERAPIDS \| m_CORE_HYBRID
91	\| m_CORE_ATOM)
92
93	/ X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies*
94	are resolved on SSE register parts instead of whole registers, so we may
95	maintain just lower part of scalar values in proper format leaving the
96	upper part undefined. /*
97	DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
98
99	/ X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of flags*
100	set by instructions affecting just some flags (in particular shifts).
101	This is because Core2 resolves dependencies on whole flags register
102	and such sequences introduce false dependency on previous instruction
103	setting full flags.
104
105	The flags does not affect generation of INC and DEC that is controlled
106	by X86_TUNE_USE_INCDEC. /*
107
108	DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
109	m_CORE2)
110
111	/ X86_TUNE_MOVX: Enable to zero extend integer registers to avoid*
112	partial dependencies. /*
113	DEF_TUNE (X86_TUNE_MOVX, "movx",
114	m_PPRO \| m_P4_NOCONA \| m_CORE2 \| m_NEHALEM \| m_SANDYBRIDGE
115	\| m_BONNELL \| m_SILVERMONT \| m_GOLDMONT \| m_KNL \| m_KNM \| m_INTEL
116	\| m_GOLDMONT_PLUS \| m_GEODE \| m_AMD_MULTIPLE \| m_ZHAOXIN
117	\| m_CORE_AVX2 \| m_TREMONT \| m_CORE_HYBRID \| m_CORE_ATOM \| m_GENERIC)
118
119	/ X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by*
120	full sized loads. /*
121	DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
122	m_P4_NOCONA \| m_CORE_ALL \| m_BONNELL \| m_SILVERMONT \| m_INTEL
123	\| m_KNL \| m_KNM \| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_AMD_MULTIPLE
124	\| m_ZHAOXIN \| m_TREMONT \| m_CORE_HYBRID \| m_CORE_ATOM \| m_GENERIC)
125
126	/ X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent*
127	conditional jump instruction for 32 bit TARGET. /*
128	DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
129	m_CORE_ALL \| m_BDVER \| m_ZNVER \| m_ZHAOXIN \| m_GENERIC)
130
131	/ X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent*
132	conditional jump instruction for TARGET_64BIT. /*
133	DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
134	m_NEHALEM \| m_SANDYBRIDGE \| m_CORE_AVX2 \| m_BDVER
135	\| m_ZNVER \| m_ZHAOXIN \| m_GENERIC)
136
137	/ X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a*
138	subsequent conditional jump instruction when the condition jump
139	check sign flag (SF) or overflow flag (OF). /*
140	DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
141	m_NEHALEM \| m_SANDYBRIDGE \| m_CORE_AVX2 \| m_BDVER
142	\| m_ZNVER \| m_ZHAOXIN \| m_GENERIC)
143
144	/ X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional*
145	jump instruction when the alu instruction produces the CCFLAG consumed by
146	the conditional jump instruction. /*
147	DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
148	m_SANDYBRIDGE \| m_CORE_AVX2 \| m_ZHAOXIN \| m_GENERIC)
149
150
151	/***************************************************************************/
152	/ Function prologue, epilogue and function calling sequences. /
153	/***************************************************************************/
154
155	/ X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing*
156	arguments in prologue/epilogue instead of separately for each call
157	by push/pop instructions.
158	This increase code size by about 5% in 32bit mode, less so in 64bit mode
159	because parameters are passed in registers. It is considerable
160	win for targets without stack engine that prevents multple push operations
161	to happen in parallel. /*
162
163	DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
164	m_PPRO \| m_P4_NOCONA \| m_BONNELL \| m_SILVERMONT \| m_KNL \| m_KNM \| m_INTEL
165	\| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_ATHLON_K8 \| m_ZHAOXIN)
166
167	/ X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are*
168	considered on critical path. /*
169	DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
170	m_PPRO \| m_ATHLON_K8)
171
172	/ X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in epilogues that are*
173	considered on critical path. /*
174	DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
175	m_PPRO \| m_ATHLON_K8)
176
177	/ X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. /
178	DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
179	m_386 \| m_CORE_ALL \| m_K6_GEODE \| m_AMD_MULTIPLE \| m_ZHAOXIN
180	\| m_TREMONT \| m_CORE_HYBRID \| m_CORE_ATOM \| m_GENERIC)
181
182	/ X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.*
183	Some chips, like 486 and Pentium works faster with separate load
184	and push instructions. /*
185	DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
186	m_386 \| m_P4_NOCONA \| m_CORE_ALL \| m_K6_GEODE \| m_AMD_MULTIPLE
187	\| m_ZHAOXIN \| m_TREMONT \| m_CORE_HYBRID \| m_CORE_ATOM \| m_GENERIC)
188
189	/ X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred*
190	over esp subtraction. /*
191	DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 \| m_486 \| m_PENT
192	\| m_LAKEMONT \| m_K6_GEODE)
193
194	/ X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred*
195	over esp subtraction. /*
196	DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT \| m_LAKEMONT
197	\| m_K6_GEODE)
198
199	/ X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred*
200	over esp addition. /*
201	DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 \| m_486 \| m_PENT
202	\| m_LAKEMONT \| m_PPRO)
203
204	/ X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred*
205	over esp addition. /*
206	DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT \| m_LAKEMONT)
207
208	/***************************************************************************/
209	/ Branch predictor tuning /
210	/***************************************************************************/
211
212	/ X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4*
213	instructions long. /*
214	DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)
215
216	/ X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination*
217	of conditional jump or directly preceded by other jump instruction.
218	This is important for AND K8-AMDFAM10 because the branch prediction
219	architecture expect at most one jump per 2 byte window. Failing to
220	pad returns leads to misaligned return stack. /*
221	DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
222	m_ATHLON_K8 \| m_AMDFAM10)
223
224	/ X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more*
225	than 4 branch instructions in the 16 byte window. /*
226	DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
227	m_PPRO \| m_P4_NOCONA \| m_BONNELL \| m_SILVERMONT \| m_KNL \| m_KNM
228	\| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_INTEL \| m_ATHLON_K8 \| m_AMDFAM10)
229
230	/***************************************************************************/
231	/ Integer instruction selection tuning /
232	/***************************************************************************/
233
234	/ X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching*
235	at -O3. For the moment, the prefetching seems badly tuned for Intel
236	chips. /*
237	DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
238	m_K6_GEODE \| m_ATHLON_K8 \| m_AMDFAM10 \| m_BDVER \| m_BTVER)
239
240	/ X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall*
241	on 16-bit immediate moves into memory on Core2 and Corei7. /*
242	DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL \| m_ZHAOXIN \| m_GENERIC)
243
244	/ X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such*
245	as "add mem, reg". /*
246	DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT \| m_LAKEMONT \| m_PPRO))
247
248	/ X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.*
249
250	Core2 and nehalem has stall of 7 cycles for partial flag register stalls.
251	Sandy bridge and Ivy bridge generate extra uop. On Haswell this extra uop
252	is output only when the values needs to be really merged, which is not
253	done by GCC generated code. /*
254	DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
255	~(m_P4_NOCONA \| m_CORE2 \| m_NEHALEM \| m_SANDYBRIDGE
256	\| m_BONNELL \| m_SILVERMONT \| m_INTEL \| m_KNL \| m_KNM \| m_GOLDMONT
257	\| m_GOLDMONT_PLUS \| m_TREMONT \| m_CORE_HYBRID \| m_CORE_ATOM
258	\| m_ZHAOXIN \| m_GENERIC))
259
260	/ X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred*
261	for DFmode copies /*
262	DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
263	~(m_PPRO \| m_P4_NOCONA \| m_CORE_ALL \| m_BONNELL \| m_SILVERMONT
264	\| m_KNL \| m_KNM \| m_INTEL \| m_GEODE \| m_AMD_MULTIPLE \| m_ZHAOXIN
265	\| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_TREMONT \| m_CORE_HYBRID
266	\| m_CORE_ATOM \| m_GENERIC))
267
268	/ X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag*
269	will impact LEA instruction selection. /*
270	DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL \| m_SILVERMONT \| m_KNL
271	\| m_KNM \| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_INTEL \| m_ZHAOXIN)
272
273	/ X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. /
274	DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
275	m_BONNELL \| m_SILVERMONT \| m_GOLDMONT \| m_GOLDMONT_PLUS
276	\| m_KNL \| m_KNM)
277
278	/ X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is*
279	vector path on AMD machines.
280	FIXME: Do we need to enable this for core? /*
281	DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
282	m_K8 \| m_AMDFAM10)
283
284	/ X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD*
285	machines.
286	FIXME: Do we need to enable this for core? /*
287	DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
288	m_K8 \| m_AMDFAM10)
289
290	/ X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for*
291	a conditional move. /*
292	DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
293	m_BONNELL \| m_SILVERMONT \| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_KNL
294	\| m_KNM \| m_INTEL)
295
296	/ X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such*
297	as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. /*
298	DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 \| m_P4_NOCONA)
299
300	/ X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to*
301	move/set sequences of bytes with known size. /*
302	DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
303	"prefer_known_rep_movsb_stosb",
304	m_SKYLAKE \| m_CORE_HYBRID \| m_CORE_ATOM \| m_TREMONT \| m_CORE_AVX512
305	\| m_ZHAOXIN)
306
307	/ X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of*
308	compact prologues and epilogues by issuing a misaligned moves. This
309	requires target to handle misaligned moves and partial memory stalls
310	reasonably well.
311	FIXME: This may actualy be a win on more targets than listed here. /*
312	DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
313	"misaligned_move_string_pro_epilogues",
314	m_386 \| m_486 \| m_CORE_ALL \| m_AMD_MULTIPLE \| m_ZHAOXIN \| m_TREMONT
315	\| m_CORE_HYBRID \| m_CORE_ATOM \| m_GENERIC)
316
317	/ X86_TUNE_USE_SAHF: Controls use of SAHF. /
318	DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
319	m_PPRO \| m_P4_NOCONA \| m_CORE_ALL \| m_BONNELL \| m_SILVERMONT
320	\| m_KNL \| m_KNM \| m_INTEL \| m_K6_GEODE \| m_K8 \| m_AMDFAM10 \| m_BDVER
321	\| m_BTVER \| m_ZNVER \| m_ZHAOXIN \| m_GOLDMONT \| m_GOLDMONT_PLUS
322	\| m_TREMONT \| m_CORE_HYBRID \| m_CORE_ATOM \| m_GENERIC)
323
324	/ X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. /
325	DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
326	~(m_PENT \| m_LAKEMONT \| m_BONNELL \| m_SILVERMONT \| m_KNL \| m_KNM \| m_INTEL
327	\| m_K6 \| m_GOLDMONT \| m_GOLDMONT_PLUS))
328
329	/ X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. /
330	DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
331	m_CORE_ALL \| m_BONNELL \| m_SILVERMONT \| m_KNL \| m_KNM \| m_INTEL
332	\| m_LAKEMONT \| m_AMD_MULTIPLE \| m_ZHAOXIN \| m_GOLDMONT
333	\| m_GOLDMONT_PLUS \| m_TREMONT \| m_CORE_HYBRID \| m_CORE_ATOM
334	\| m_GENERIC)
335
336	/ X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency*
337	for bit-manipulation instructions. /*
338	DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
339	m_SANDYBRIDGE \| m_HASWELL \| m_SKYLAKE \| m_SKYLAKE_AVX512
340	\| m_CANNONLAKE \| m_CASCADELAKE \| m_COOPERLAKE
341	\| m_ZHAOXIN \| m_GENERIC)
342
343	/ X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based*
344	on hardware capabilities. Bdver3 hardware has a loop buffer which makes
345	unrolling small loop less important. For, such architectures we adjust
346	the unroll factor so that the unrolled loop fits the loop buffer. /*
347	DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 \| m_BDVER4)
348
349	/ X86_TUNE_ONE_IF_CONV_INSNS: Restrict a number of cmov insns in*
350	if-converted sequence to one. /*
351	DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
352	m_SILVERMONT \| m_KNL \| m_KNM \| m_INTEL \| m_CORE_ALL \| m_GOLDMONT
353	\| m_GOLDMONT_PLUS \| m_TREMONT \| m_CORE_HYBRID \| m_CORE_ATOM
354	\| m_ZHAOXIN \| m_GENERIC)
355
356	/ X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence. /
357	DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
358	m_CORE_ALL \| m_BDVER \| m_ZNVER \| m_ZHAOXIN \| m_TREMONT \| m_CORE_HYBRID
359	\| m_CORE_ATOM \| m_GENERIC)
360
361	/ X86_TUNE_EXPAND_ABS: This enables a new abs pattern by*
362	generating instructions for abs (x) = (((signed) x >> (W-1) ^ x) -
363	(signed) x >> (W-1)) instead of cmove or SSE max/abs instructions. /*
364	DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
365	m_CORE_ALL \| m_SILVERMONT \| m_KNL \| m_KNM \| m_GOLDMONT
366	\| m_GOLDMONT_PLUS \| m_ZHAOXIN)
367
368	/***************************************************************************/
369	/ 387 instruction selection tuning /
370	/***************************************************************************/
371
372	/ X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit*
373	integer operand.
374	FIXME: Why this is disabled for modern chips? /*
375	DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
376	m_386 \| m_486 \| m_K6_GEODE)
377
378	/ X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit*
379	integer operand. /*
380	DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
381	~(m_PENT \| m_LAKEMONT \| m_PPRO \| m_CORE_ALL \| m_BONNELL
382	\| m_SILVERMONT \| m_KNL \| m_KNM \| m_INTEL \| m_AMD_MULTIPLE
383	\| m_ZHAOXIN \| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_TREMONT
384	\| m_CORE_HYBRID \| m_CORE_ATOM \| m_GENERIC))
385
386	/ X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp. /
387	DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE \| m_ZHAOXIN)
388
389	/ X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. /
390	DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
391	m_PPRO \| m_P4_NOCONA \| m_CORE_ALL \| m_BONNELL \| m_SILVERMONT
392	\| m_KNL \| m_KNM \| m_INTEL \| m_K6_GEODE \| m_ATHLON_K8 \| m_ZHAOXIN
393	\| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_TREMONT \| m_CORE_HYBRID
394	\| m_CORE_ATOM \| m_GENERIC)
395
396	/***************************************************************************/
397	/ SSE instruction selection tuning /
398	/***************************************************************************/
399
400	/ X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE*
401	regs instead of memory. /*
402	DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
403	m_CORE_ALL)
404
405	/ X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead*
406	of a sequence loading registers by parts. /*
407	DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
408	m_NEHALEM \| m_SANDYBRIDGE \| m_CORE_AVX2 \| m_SILVERMONT \| m_KNL \| m_KNM
409	\| m_INTEL \| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_TREMONT \| m_CORE_HYBRID
410	\| m_CORE_ATOM \| m_AMDFAM10 \| m_BDVER \| m_BTVER \| m_ZNVER \| m_ZHAOXIN
411	\| m_GENERIC)
412
413	/ X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores*
414	instead of a sequence loading registers by parts. /*
415	DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
416	m_NEHALEM \| m_SANDYBRIDGE \| m_CORE_AVX2 \| m_SILVERMONT \| m_KNL \| m_KNM
417	\| m_INTEL \| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_TREMONT \| m_CORE_HYBRID
418	\| m_CORE_ATOM \| m_BDVER \| m_ZNVER \| m_ZHAOXIN \| m_GENERIC)
419
420	/ X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single*
421	precision 128bit instructions instead of double where possible. /*
422	DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
423	m_BDVER \| m_ZNVER)
424
425	/ X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. /
426	DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
427	m_AMD_MULTIPLE \| m_ZHAOXIN \| m_CORE_ALL \| m_TREMONT \| m_CORE_HYBRID
428	\| m_CORE_ATOM \| m_GENERIC)
429
430	/ X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to*
431	xorps/xorpd and other variants. /*
432	DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
433	m_PPRO \| m_P4_NOCONA \| m_CORE_ALL \| m_BDVER \| m_BTVER \| m_ZNVER
434	\| m_ZHAOXIN \| m_TREMONT \| m_CORE_HYBRID \| m_CORE_ATOM \| m_GENERIC)
435
436	/ X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer*
437	to SSE registers. If disabled, the moves will be done by storing
438	the value to memory and reloading.
439	Enable this flag for generic - the only relevant architecture preferring
440	no inter-unit moves is Buldozer. While this makes small regression on SPECfp
441	scores (sub 0.3%), disabling inter-unit moves penalizes noticeably hand
442	written vectorized code which use i.e. _mm_set_epi16. /*
443	DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
444	~(m_ATHLON_K8 \| m_AMDFAM10 \| m_BDVER \| m_BTVER))
445
446	/ X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from SSE*
447	to integer registers. If disabled, the moves will be done by storing
448	the value to memory and reloading. /*
449	DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
450	~m_ATHLON_K8)
451
452	/ X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions*
453	to use both SSE and integer registers at a same time. /*
454	DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
455	~(m_AMDFAM10 \| m_BDVER))
456
457	/ X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for*
458	fp converts to destination register. /*
459	DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
460	m_SILVERMONT \| m_KNL \| m_KNM \| m_GOLDMONT \| m_GOLDMONT_PLUS
461	\| m_INTEL)
462
463	/ X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion*
464	from FP to FP. This form of instructions avoids partial write to the
465	destination. /*
466	DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
467	m_AMDFAM10)
468
469	/ X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion*
470	from integer to FP. /*
471	DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
472
473	/ X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. /
474	DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
475	m_BONNELL \| m_SILVERMONT \| m_KNL \| m_KNM \| m_GOLDMONT
476	\| m_GOLDMONT_PLUS \| m_INTEL)
477
478	/ X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. /
479	DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
480	m_SILVERMONT \| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_TREMONT \| m_CORE_HYBRID
481	\| m_CORE_ATOM \| m_INTEL)
482
483	/ X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2*
484	elements. /*
485	DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
486	~(m_ZNVER1 \| m_ZNVER2 \| m_ZNVER3 \| m_ZNVER4 \| m_CORE_HYBRID
487	\| m_YONGFENG \| m_CORE_ATOM \| m_GENERIC \| m_GDS))
488
489	/ X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2*
490	elements. /*
491	DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
492	~(m_ZNVER4))
493
494	/ X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4*
495	elements. /*
496	DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
497	~(m_ZNVER1 \| m_ZNVER2 \| m_ZNVER3 \| m_ZNVER4 \| m_CORE_HYBRID
498	\| m_YONGFENG \| m_CORE_ATOM \| m_GENERIC \| m_GDS))
499
500	/ X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4*
501	elements. /*
502	DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
503	~(m_ZNVER4))
504
505	/ X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more*
506	elements. /*
507	DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
508	~(m_ZNVER1 \| m_ZNVER2 \| m_ZNVER4 \| m_CORE_HYBRID \| m_CORE_ATOM
509	\| m_YONGFENG \| m_GENERIC \| m_GDS))
510
511	/ X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more*
512	elements. /*
513	DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
514	~(m_ZNVER4))
515
516	/ X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or*
517	smaller FMA chain. /*
518	DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 \| m_ZNVER2 \| m_ZNVER3
519	\| m_YONGFENG)
520
521	/ X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or*
522	smaller FMA chain. /*
523	DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 \| m_ZNVER3
524	\| m_CORE_HYBRID \| m_SAPPHIRERAPIDS \| m_CORE_ATOM)
525
526	/ X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or*
527	smaller FMA chain. /*
528	DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
529
530	/ X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd*
531	for v2df vector reduction. /*
532	DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
533	"v2df_reduction_prefer_haddpd", m_NONE)
534
535	/***************************************************************************/
536	/ AVX instruction selection tuning (some of SSE flags affects AVX, too) /
537	/***************************************************************************/
538
539	/ X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are*
540	split. /*
541	DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
542	~(m_NEHALEM \| m_SANDYBRIDGE))
543
544	/ X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are*
545	split. /*
546	DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
547	~(m_NEHALEM \| m_SANDYBRIDGE \| m_BDVER \| m_ZNVER1))
548
549	/ X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops. /
550	DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER \| m_BTVER2
551	\| m_ZNVER1)
552
553	/ X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for*
554	the auto-vectorizer. /*
555	DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER \| m_BTVER2
556	\| m_ZNVER1)
557
558	/ X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX*
559	instructions in the auto-vectorizer. /*
560	DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
561
562	/ X86_TUNE_AVX256_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops. /
563	DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
564
565	/ X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit*
566	AVX instructions. /*
567	DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
568	m_CORE_HYBRID \| m_CORE_AVX2 \| m_ZNVER1 \| m_ZNVER2 \| m_ZNVER3)
569
570	/ X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit*
571	AVX instructions. /*
572	DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
573	m_CORE_HYBRID \| m_CORE_AVX2 \| m_ZNVER1 \| m_ZNVER2 \| m_ZNVER3)
574
575	/ X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit*
576	AVX instructions. /*
577	DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
578	m_SAPPHIRERAPIDS \| m_ZNVER4)
579
580	/ X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit*
581	AVX instructions. /*
582	DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
583	m_SAPPHIRERAPIDS \| m_ZNVER4)
584
585	/***************************************************************************/
586	/***************************************************************************/
587	/ Historical relics: tuning flags that helps a specific old CPU designs /
588	/***************************************************************************/
589
590	/ X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in*
591	an integer register. /*
592	DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
593
594	/ X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,*
595	such as fsqrt, fprem, fsin, fcos, fsincos etc.
596	Should be enabled for all targets that always has coprocesor. /*
597	DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
598	~(m_386 \| m_486 \| m_LAKEMONT))
599
600	/ X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for*
601	inline strlen. This affects only -minline-all-stringops mode. By
602	default we always dispatch to a library since our internal strlen
603	is bad. /*
604	DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)
605
606	/ X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of*
607	longer "sal $1, reg". /*
608	DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
609
610	/ X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead*
611	of mozbl/movwl. /*
612	DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
613	m_486 \| m_PENT)
614
615	/ X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode*
616	and SImode multiply, but 386 and 486 do HImode multiply faster. /*
617	DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
618	~(m_386 \| m_486))
619
620	/ X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic*
621	into 16bit/8bit when resulting sequence is shorter. For example
622	for "and $-65536, reg" to 16bit store of 0. /*
623	DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
624	~(m_386 \| m_486 \| m_PENT \| m_LAKEMONT))
625
626	/ X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions*
627	such as "add $1, mem". /*
628	DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
629	~(m_PENT \| m_LAKEMONT))
630
631	/ X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR*
632	than a MOV. /*
633	DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT \| m_LAKEMONT)
634
635	/ X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,*
636	but one byte longer. /*
637	DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT \| m_LAKEMONT)
638
639	/ X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled*
640	use of partial registers by renaming. This improved performance of 16bit
641	code where upper halves of registers are not used. It also leads to
642	an penalty whenever a 16bit store is followed by 32bit use. This flag
643	disables production of such sequences in common cases.
644	See also X86_TUNE_HIMODE_MATH.
645
646	In current implementation the partial register stalls are not eliminated
647	very well - they can be introduced via subregs synthesized by combine
648	and can happen in caller/callee saving sequences. /*
649	DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
650
651	/ X86_TUNE_PARTIAL_MEMORY_READ_STALL: Reading (possible unaligned) part of*
652	memory location after a large write to the same address causes
653	store-to-load forwarding stall. /*
654	DEF_TUNE (X86_TUNE_PARTIAL_MEMORY_READ_STALL, "partial_memory_read_stall",
655	m_386 \| m_486 \| m_PENT \| m_LAKEMONT \| m_PPRO \| m_P4_NOCONA \| m_CORE2
656	\| m_SILVERMONT \| m_GOLDMONT \| m_GOLDMONT_PLUS \| m_TREMONT
657	\| m_K6_GEODE \| m_ATHLON_K8 \| m_AMDFAM10)
658
659	/ X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to*
660	corresponding 32bit arithmetic. /*
661	DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
662	~m_PPRO)
663
664	/ X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit artihmetic. Again we avoid*
665	partial register stalls on PentiumPro targets. /*
666	DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
667
668	/ X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.*
669	On PPro this flag is meant to avoid partial register stalls. /*
670	DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
671
672	/ X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates*
673	directly to memory. /*
674	DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
675
676	/ X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. /
677	DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
678
679	/ X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear*
680	integer register. /*
681	DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
682
683	/ X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory*
684	operand that cannot be represented using a modRM byte. The XOR
685	replacement is long decoded, so this split helps here as well. /*
686	DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
687
688	/ X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded*
689	forms of instructions on K8 targets. /*
690	DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
691	m_K8)
692
693	/***************************************************************************/
694	/ This never worked well before. /
695	/***************************************************************************/
696
697	/ X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based*
698	on simulation result. But after P4 was made, no performance benefit
699	was observed with branch hints. It also increases the code size.
700	As a result, icc never generates branch hints. /*
701	DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", m_NONE)
702
703	/ X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. /
704	DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)
705
706	/ X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit*
707	arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme
708	is usually used for RISC targets. /*
709	DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
710
711	/ X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion*
712	before a transfer of control flow out of the function. /*
713	DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
714
715	/ X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag*
716	modifications on architectures where theses operations are slow. /*
717	DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)
718
719	/ X86_TUNE_USE_RCR: Controls use of rcr 1 instruction instead of shrd. /
720	DEF_TUNE (X86_TUNE_USE_RCR, "use_rcr", m_AMD_MULTIPLE)
721

source code of gcc/config/i386/x86-tune.def