/* Costs of operations of individual x86 CPUs.
   Copyright (C) 1988-2023 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* Processor costs (relative to an add) */
/* Size-based cost unit: maps an instruction length of N bytes to 2 * N.
   We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes,
   so a 2-byte instruction costs the same as COSTS_N_INSNS (1).  */
#define COSTS_N_BYTES(N) ((N) * 2)

/* Placeholder stringop_algs initializer: libcall as the first member and a
   single {-1, libcall, false} entry in the size table.  The per-CPU tables
   in this file use it as their second array element.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}

30static stringop_algs ix86_size_memcpy[2] = {
31 {.unknown_size: rep_prefix_1_byte, .size: {{-1, rep_prefix_1_byte, false}}},
32 {.unknown_size: rep_prefix_1_byte, .size: {{-1, rep_prefix_1_byte, false}}}};
33static stringop_algs ix86_size_memset[2] = {
34 {.unknown_size: rep_prefix_1_byte, .size: {{-1, rep_prefix_1_byte, false}}},
35 {.unknown_size: rep_prefix_1_byte, .size: {{-1, rep_prefix_1_byte, false}}}};
36
37const
38struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39 .hard_register: {
40 /* Start of register allocator costs. integer->integer move cost is 2. */
41 .movzbl_load: 2, /* cost for loading QImode using movzbl */
42 .int_load: {2, 2, 2}, /* cost of loading integer registers
43 in QImode, HImode and SImode.
44 Relative to reg-reg move (2). */
45 .int_store: {2, 2, 2}, /* cost of storing integer registers */
46 .fp_move: 2, /* cost of reg,reg fld/fst */
47 .fp_load: {2, 2, 2}, /* cost of loading fp registers
48 in SFmode, DFmode and XFmode */
49 .fp_store: {2, 2, 2}, /* cost of storing fp registers
50 in SFmode, DFmode and XFmode */
51 .mmx_move: 3, /* cost of moving MMX register */
52 .mmx_load: {3, 3}, /* cost of loading MMX registers
53 in SImode and DImode */
54 .mmx_store: {3, 3}, /* cost of storing MMX registers
55 in SImode and DImode */
56 .xmm_move: 3, .ymm_move: 3, .zmm_move: 3, /* cost of moving XMM,YMM,ZMM register */
57 .sse_load: {3, 3, 3, 3, 3}, /* cost of loading SSE registers
58 in 32,64,128,256 and 512-bit */
59 .sse_store: {3, 3, 3, 3, 3}, /* cost of storing SSE registers
60 in 32,64,128,256 and 512-bit */
61 .sse_to_integer: 3, .integer_to_sse: 3, /* SSE->integer and integer->SSE moves */
62 .mask_to_integer: 3, .integer_to_mask: 3, /* mask->integer and integer->mask moves */
63 .mask_load: {2, 2, 2}, /* cost of loading mask register
64 in QImode, HImode, SImode. */
65 .mask_store: {2, 2, 2}, /* cost if storing mask register
66 in QImode, HImode, SImode. */
67 .mask_move: 2, /* cost of moving mask register. */
68 /* End of register allocator costs. */
69 },
70
71 COSTS_N_BYTES (2), /* cost of an add instruction */
72 COSTS_N_BYTES (3), /* cost of a lea instruction */
73 COSTS_N_BYTES (2), /* variable shift costs */
74 COSTS_N_BYTES (3), /* constant shift costs */
75 .mult_init: {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
76 COSTS_N_BYTES (3), /* HI */
77 COSTS_N_BYTES (3), /* SI */
78 COSTS_N_BYTES (3), /* DI */
79 COSTS_N_BYTES (5)}, /* other */
80 .mult_bit: 0, /* cost of multiply per each bit set */
81 .divide: {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
82 COSTS_N_BYTES (3), /* HI */
83 COSTS_N_BYTES (3), /* SI */
84 COSTS_N_BYTES (3), /* DI */
85 COSTS_N_BYTES (5)}, /* other */
86 COSTS_N_BYTES (3), /* cost of movsx */
87 COSTS_N_BYTES (3), /* cost of movzx */
88 .large_insn: 0, /* "large" insn */
89 .move_ratio: 2, /* MOVE_RATIO */
90 .clear_ratio: 2, /* CLEAR_RATIO */
91 .int_load: {2, 2, 2}, /* cost of loading integer registers
92 in QImode, HImode and SImode.
93 Relative to reg-reg move (2). */
94 .int_store: {2, 2, 2}, /* cost of storing integer registers */
95 .sse_load: {3, 3, 3, 3, 3}, /* cost of loading SSE register
96 in 32bit, 64bit, 128bit, 256bit and 512bit */
97 .sse_store: {3, 3, 3, 3, 3}, /* cost of storing SSE register
98 in 32bit, 64bit, 128bit, 256bit and 512bit */
99 .sse_unaligned_load: {3, 3, 3, 3, 3}, /* cost of unaligned SSE load
100 in 128bit, 256bit and 512bit */
101 .sse_unaligned_store: {3, 3, 3, 3, 3}, /* cost of unaligned SSE store
102 in 128bit, 256bit and 512bit */
103 .xmm_move: 3, .ymm_move: 3, .zmm_move: 3, /* cost of moving XMM,YMM,ZMM register */
104 .sse_to_integer: 3, /* cost of moving SSE register to integer. */
105 .gather_static: 5, .gather_per_elt: 0, /* Gather load static, per_elt. */
106 .scatter_static: 5, .scatter_per_elt: 0, /* Gather store static, per_elt. */
107 .l1_cache_size: 0, /* size of l1 cache */
108 .l2_cache_size: 0, /* size of l2 cache */
109 .prefetch_block: 0, /* size of prefetch block */
110 .simultaneous_prefetches: 0, /* number of parallel prefetches */
111 .branch_cost: 2, /* Branch cost */
112 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
113 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
114 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
115 COSTS_N_BYTES (2), /* cost of FABS instruction. */
116 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
117 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
118
119 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
120 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
121 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
122 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
123 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
124 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
125 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
126 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
127 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
128 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
129 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
130 .memcpy: ix86_size_memcpy,
131 .memset: ix86_size_memset,
132 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
133 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
134 NULL, /* Loop alignment. */
135 NULL, /* Jump alignment. */
136 NULL, /* Label alignment. */
137 NULL, /* Func alignment. */
138 .small_unroll_ninsns: 4, /* Small unroll limit. */
139 .small_unroll_factor: 2, /* Small unroll factor. */
140};
141
142/* Processor costs (relative to an add) */
143static stringop_algs i386_memcpy[2] = {
144 {.unknown_size: rep_prefix_1_byte, .size: {{-1, rep_prefix_1_byte, false}}},
145 DUMMY_STRINGOP_ALGS};
146static stringop_algs i386_memset[2] = {
147 {.unknown_size: rep_prefix_1_byte, .size: {{-1, rep_prefix_1_byte, false}}},
148 DUMMY_STRINGOP_ALGS};
149
150static const
151struct processor_costs i386_cost = { /* 386 specific costs */
152 .hard_register: {
153 /* Start of register allocator costs. integer->integer move cost is 2. */
154 .movzbl_load: 4, /* cost for loading QImode using movzbl */
155 .int_load: {2, 4, 2}, /* cost of loading integer registers
156 in QImode, HImode and SImode.
157 Relative to reg-reg move (2). */
158 .int_store: {2, 4, 2}, /* cost of storing integer registers */
159 .fp_move: 2, /* cost of reg,reg fld/fst */
160 .fp_load: {8, 8, 8}, /* cost of loading fp registers
161 in SFmode, DFmode and XFmode */
162 .fp_store: {8, 8, 8}, /* cost of storing fp registers
163 in SFmode, DFmode and XFmode */
164 .mmx_move: 2, /* cost of moving MMX register */
165 .mmx_load: {4, 8}, /* cost of loading MMX registers
166 in SImode and DImode */
167 .mmx_store: {4, 8}, /* cost of storing MMX registers
168 in SImode and DImode */
169 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
170 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE registers
171 in 32,64,128,256 and 512-bit */
172 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE registers
173 in 32,64,128,256 and 512-bit */
174 .sse_to_integer: 3, .integer_to_sse: 3, /* SSE->integer and integer->SSE moves */
175 .mask_to_integer: 3, .integer_to_mask: 3, /* mask->integer and integer->mask moves */
176 .mask_load: {2, 4, 2}, /* cost of loading mask register
177 in QImode, HImode, SImode. */
178 .mask_store: {2, 4, 2}, /* cost if storing mask register
179 in QImode, HImode, SImode. */
180 .mask_move: 2, /* cost of moving mask register. */
181 /* End of register allocator costs. */
182 },
183
184 COSTS_N_INSNS (1), /* cost of an add instruction */
185 COSTS_N_INSNS (1), /* cost of a lea instruction */
186 COSTS_N_INSNS (3), /* variable shift costs */
187 COSTS_N_INSNS (2), /* constant shift costs */
188 .mult_init: {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
189 COSTS_N_INSNS (6), /* HI */
190 COSTS_N_INSNS (6), /* SI */
191 COSTS_N_INSNS (6), /* DI */
192 COSTS_N_INSNS (6)}, /* other */
193 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
194 .divide: {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
195 COSTS_N_INSNS (23), /* HI */
196 COSTS_N_INSNS (23), /* SI */
197 COSTS_N_INSNS (23), /* DI */
198 COSTS_N_INSNS (23)}, /* other */
199 COSTS_N_INSNS (3), /* cost of movsx */
200 COSTS_N_INSNS (2), /* cost of movzx */
201 .large_insn: 15, /* "large" insn */
202 .move_ratio: 3, /* MOVE_RATIO */
203 .clear_ratio: 3, /* CLEAR_RATIO */
204 .int_load: {2, 4, 2}, /* cost of loading integer registers
205 in QImode, HImode and SImode.
206 Relative to reg-reg move (2). */
207 .int_store: {2, 4, 2}, /* cost of storing integer registers */
208 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE register
209 in 32bit, 64bit, 128bit, 256bit and 512bit */
210 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE register
211 in 32bit, 64bit, 128bit, 256bit and 512bit */
212 .sse_unaligned_load: {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
213 .sse_unaligned_store: {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
214 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
215 .sse_to_integer: 3, /* cost of moving SSE register to integer. */
216 .gather_static: 4, .gather_per_elt: 4, /* Gather load static, per_elt. */
217 .scatter_static: 4, .scatter_per_elt: 4, /* Gather store static, per_elt. */
218 .l1_cache_size: 0, /* size of l1 cache */
219 .l2_cache_size: 0, /* size of l2 cache */
220 .prefetch_block: 0, /* size of prefetch block */
221 .simultaneous_prefetches: 0, /* number of parallel prefetches */
222 .branch_cost: 1, /* Branch cost */
223 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
224 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
225 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
226 COSTS_N_INSNS (22), /* cost of FABS instruction. */
227 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
228 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
229
230 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
231 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
232 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
233 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
234 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
235 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
236 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
237 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
238 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
239 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
240 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
241 .memcpy: i386_memcpy,
242 .memset: i386_memset,
243 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
244 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
245 .align_loop: "4", /* Loop alignment. */
246 .align_jump: "4", /* Jump alignment. */
247 NULL, /* Label alignment. */
248 .align_func: "4", /* Func alignment. */
249 .small_unroll_ninsns: 4, /* Small unroll limit. */
250 .small_unroll_factor: 2, /* Small unroll factor. */
251};
252
253static stringop_algs i486_memcpy[2] = {
254 {.unknown_size: rep_prefix_4_byte, .size: {{-1, rep_prefix_4_byte, false}}},
255 DUMMY_STRINGOP_ALGS};
256static stringop_algs i486_memset[2] = {
257 {.unknown_size: rep_prefix_4_byte, .size: {{-1, rep_prefix_4_byte, false}}},
258 DUMMY_STRINGOP_ALGS};
259
260static const
261struct processor_costs i486_cost = { /* 486 specific costs */
262 .hard_register: {
263 /* Start of register allocator costs. integer->integer move cost is 2. */
264 .movzbl_load: 4, /* cost for loading QImode using movzbl */
265 .int_load: {2, 4, 2}, /* cost of loading integer registers
266 in QImode, HImode and SImode.
267 Relative to reg-reg move (2). */
268 .int_store: {2, 4, 2}, /* cost of storing integer registers */
269 .fp_move: 2, /* cost of reg,reg fld/fst */
270 .fp_load: {8, 8, 8}, /* cost of loading fp registers
271 in SFmode, DFmode and XFmode */
272 .fp_store: {8, 8, 8}, /* cost of storing fp registers
273 in SFmode, DFmode and XFmode */
274 .mmx_move: 2, /* cost of moving MMX register */
275 .mmx_load: {4, 8}, /* cost of loading MMX registers
276 in SImode and DImode */
277 .mmx_store: {4, 8}, /* cost of storing MMX registers
278 in SImode and DImode */
279 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
280 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE registers
281 in 32,64,128,256 and 512-bit */
282 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE registers
283 in 32,64,128,256 and 512-bit */
284 .sse_to_integer: 3, .integer_to_sse: 3, /* SSE->integer and integer->SSE moves */
285 .mask_to_integer: 3, .integer_to_mask: 3, /* mask->integer and integer->mask moves */
286 .mask_load: {2, 4, 2}, /* cost of loading mask register
287 in QImode, HImode, SImode. */
288 .mask_store: {2, 4, 2}, /* cost if storing mask register
289 in QImode, HImode, SImode. */
290 .mask_move: 2, /* cost of moving mask register. */
291 /* End of register allocator costs. */
292 },
293
294 COSTS_N_INSNS (1), /* cost of an add instruction */
295 COSTS_N_INSNS (1), /* cost of a lea instruction */
296 COSTS_N_INSNS (3), /* variable shift costs */
297 COSTS_N_INSNS (2), /* constant shift costs */
298 .mult_init: {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
299 COSTS_N_INSNS (12), /* HI */
300 COSTS_N_INSNS (12), /* SI */
301 COSTS_N_INSNS (12), /* DI */
302 COSTS_N_INSNS (12)}, /* other */
303 .mult_bit: 1, /* cost of multiply per each bit set */
304 .divide: {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
305 COSTS_N_INSNS (40), /* HI */
306 COSTS_N_INSNS (40), /* SI */
307 COSTS_N_INSNS (40), /* DI */
308 COSTS_N_INSNS (40)}, /* other */
309 COSTS_N_INSNS (3), /* cost of movsx */
310 COSTS_N_INSNS (2), /* cost of movzx */
311 .large_insn: 15, /* "large" insn */
312 .move_ratio: 3, /* MOVE_RATIO */
313 .clear_ratio: 3, /* CLEAR_RATIO */
314 .int_load: {2, 4, 2}, /* cost of loading integer registers
315 in QImode, HImode and SImode.
316 Relative to reg-reg move (2). */
317 .int_store: {2, 4, 2}, /* cost of storing integer registers */
318 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE register
319 in 32bit, 64bit, 128bit, 256bit and 512bit */
320 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE register
321 in 32bit, 64bit, 128bit, 256bit and 512bit */
322 .sse_unaligned_load: {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
323 .sse_unaligned_store: {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
324 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
325 .sse_to_integer: 3, /* cost of moving SSE register to integer. */
326 .gather_static: 4, .gather_per_elt: 4, /* Gather load static, per_elt. */
327 .scatter_static: 4, .scatter_per_elt: 4, /* Gather store static, per_elt. */
328 .l1_cache_size: 4, /* size of l1 cache. 486 has 8kB cache
329 shared for code and data, so 4kB is
330 not really precise. */
331 .l2_cache_size: 4, /* size of l2 cache */
332 .prefetch_block: 0, /* size of prefetch block */
333 .simultaneous_prefetches: 0, /* number of parallel prefetches */
334 .branch_cost: 1, /* Branch cost */
335 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
336 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
337 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
338 COSTS_N_INSNS (3), /* cost of FABS instruction. */
339 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
340 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
341
342 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
343 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
344 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
345 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
346 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
347 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
348 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
349 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
350 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
351 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
352 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
353 .memcpy: i486_memcpy,
354 .memset: i486_memset,
355 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
356 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
357 .align_loop: "16", /* Loop alignment. */
358 .align_jump: "16", /* Jump alignment. */
359 .align_label: "0:0:8", /* Label alignment. */
360 .align_func: "16", /* Func alignment. */
361 .small_unroll_ninsns: 4, /* Small unroll limit. */
362 .small_unroll_factor: 2, /* Small unroll factor. */
363};
364
365static stringop_algs pentium_memcpy[2] = {
366 {.unknown_size: libcall, .size: {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
367 DUMMY_STRINGOP_ALGS};
368static stringop_algs pentium_memset[2] = {
369 {.unknown_size: libcall, .size: {{-1, rep_prefix_4_byte, false}}},
370 DUMMY_STRINGOP_ALGS};
371
372static const
373struct processor_costs pentium_cost = {
374 .hard_register: {
375 /* Start of register allocator costs. integer->integer move cost is 2. */
376 .movzbl_load: 6, /* cost for loading QImode using movzbl */
377 .int_load: {2, 4, 2}, /* cost of loading integer registers
378 in QImode, HImode and SImode.
379 Relative to reg-reg move (2). */
380 .int_store: {2, 4, 2}, /* cost of storing integer registers */
381 .fp_move: 2, /* cost of reg,reg fld/fst */
382 .fp_load: {2, 2, 6}, /* cost of loading fp registers
383 in SFmode, DFmode and XFmode */
384 .fp_store: {4, 4, 6}, /* cost of storing fp registers
385 in SFmode, DFmode and XFmode */
386 .mmx_move: 8, /* cost of moving MMX register */
387 .mmx_load: {8, 8}, /* cost of loading MMX registers
388 in SImode and DImode */
389 .mmx_store: {8, 8}, /* cost of storing MMX registers
390 in SImode and DImode */
391 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
392 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE registers
393 in 32,64,128,256 and 512-bit */
394 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE registers
395 in 32,64,128,256 and 512-bit */
396 .sse_to_integer: 3, .integer_to_sse: 3, /* SSE->integer and integer->SSE moves */
397 .mask_to_integer: 3, .integer_to_mask: 3, /* mask->integer and integer->mask moves */
398 .mask_load: {2, 4, 2}, /* cost of loading mask register
399 in QImode, HImode, SImode. */
400 .mask_store: {2, 4, 2}, /* cost if storing mask register
401 in QImode, HImode, SImode. */
402 .mask_move: 2, /* cost of moving mask register. */
403 /* End of register allocator costs. */
404 },
405
406 COSTS_N_INSNS (1), /* cost of an add instruction */
407 COSTS_N_INSNS (1), /* cost of a lea instruction */
408 COSTS_N_INSNS (4), /* variable shift costs */
409 COSTS_N_INSNS (1), /* constant shift costs */
410 .mult_init: {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
411 COSTS_N_INSNS (11), /* HI */
412 COSTS_N_INSNS (11), /* SI */
413 COSTS_N_INSNS (11), /* DI */
414 COSTS_N_INSNS (11)}, /* other */
415 .mult_bit: 0, /* cost of multiply per each bit set */
416 .divide: {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
417 COSTS_N_INSNS (25), /* HI */
418 COSTS_N_INSNS (25), /* SI */
419 COSTS_N_INSNS (25), /* DI */
420 COSTS_N_INSNS (25)}, /* other */
421 COSTS_N_INSNS (3), /* cost of movsx */
422 COSTS_N_INSNS (2), /* cost of movzx */
423 .large_insn: 8, /* "large" insn */
424 .move_ratio: 6, /* MOVE_RATIO */
425 .clear_ratio: 6, /* CLEAR_RATIO */
426 .int_load: {2, 4, 2}, /* cost of loading integer registers
427 in QImode, HImode and SImode.
428 Relative to reg-reg move (2). */
429 .int_store: {2, 4, 2}, /* cost of storing integer registers */
430 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE register
431 in 32bit, 64bit, 128bit, 256bit and 512bit */
432 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE register
433 in 32bit, 64bit, 128bit, 256bit and 512bit */
434 .sse_unaligned_load: {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
435 .sse_unaligned_store: {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
436 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
437 .sse_to_integer: 3, /* cost of moving SSE register to integer. */
438 .gather_static: 4, .gather_per_elt: 4, /* Gather load static, per_elt. */
439 .scatter_static: 4, .scatter_per_elt: 4, /* Gather store static, per_elt. */
440 .l1_cache_size: 8, /* size of l1 cache. */
441 .l2_cache_size: 8, /* size of l2 cache */
442 .prefetch_block: 0, /* size of prefetch block */
443 .simultaneous_prefetches: 0, /* number of parallel prefetches */
444 .branch_cost: 2, /* Branch cost */
445 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
446 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
447 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
448 COSTS_N_INSNS (1), /* cost of FABS instruction. */
449 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
450 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
451
452 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
453 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
454 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
455 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
456 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
457 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
458 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
459 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
460 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
461 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
462 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
463 .memcpy: pentium_memcpy,
464 .memset: pentium_memset,
465 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
466 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
467 .align_loop: "16:8:8", /* Loop alignment. */
468 .align_jump: "16:8:8", /* Jump alignment. */
469 .align_label: "0:0:8", /* Label alignment. */
470 .align_func: "16", /* Func alignment. */
471 .small_unroll_ninsns: 4, /* Small unroll limit. */
472 .small_unroll_factor: 2, /* Small unroll factor. */
473};
474
475static const
476struct processor_costs lakemont_cost = {
477 .hard_register: {
478 /* Start of register allocator costs. integer->integer move cost is 2. */
479 .movzbl_load: 6, /* cost for loading QImode using movzbl */
480 .int_load: {2, 4, 2}, /* cost of loading integer registers
481 in QImode, HImode and SImode.
482 Relative to reg-reg move (2). */
483 .int_store: {2, 4, 2}, /* cost of storing integer registers */
484 .fp_move: 2, /* cost of reg,reg fld/fst */
485 .fp_load: {2, 2, 6}, /* cost of loading fp registers
486 in SFmode, DFmode and XFmode */
487 .fp_store: {4, 4, 6}, /* cost of storing fp registers
488 in SFmode, DFmode and XFmode */
489 .mmx_move: 8, /* cost of moving MMX register */
490 .mmx_load: {8, 8}, /* cost of loading MMX registers
491 in SImode and DImode */
492 .mmx_store: {8, 8}, /* cost of storing MMX registers
493 in SImode and DImode */
494 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
495 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE registers
496 in 32,64,128,256 and 512-bit */
497 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE registers
498 in 32,64,128,256 and 512-bit */
499 .sse_to_integer: 3, .integer_to_sse: 3, /* SSE->integer and integer->SSE moves */
500 .mask_to_integer: 3, .integer_to_mask: 3, /* mask->integer and integer->mask moves */
501 .mask_load: {2, 4, 2}, /* cost of loading mask register
502 in QImode, HImode, SImode. */
503 .mask_store: {2, 4, 2}, /* cost if storing mask register
504 in QImode, HImode, SImode. */
505 .mask_move: 2, /* cost of moving mask register. */
506 /* End of register allocator costs. */
507 },
508
509 COSTS_N_INSNS (1), /* cost of an add instruction */
510 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
511 COSTS_N_INSNS (1), /* variable shift costs */
512 COSTS_N_INSNS (1), /* constant shift costs */
513 .mult_init: {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
514 COSTS_N_INSNS (11), /* HI */
515 COSTS_N_INSNS (11), /* SI */
516 COSTS_N_INSNS (11), /* DI */
517 COSTS_N_INSNS (11)}, /* other */
518 .mult_bit: 0, /* cost of multiply per each bit set */
519 .divide: {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
520 COSTS_N_INSNS (25), /* HI */
521 COSTS_N_INSNS (25), /* SI */
522 COSTS_N_INSNS (25), /* DI */
523 COSTS_N_INSNS (25)}, /* other */
524 COSTS_N_INSNS (3), /* cost of movsx */
525 COSTS_N_INSNS (2), /* cost of movzx */
526 .large_insn: 8, /* "large" insn */
527 .move_ratio: 17, /* MOVE_RATIO */
528 .clear_ratio: 6, /* CLEAR_RATIO */
529 .int_load: {2, 4, 2}, /* cost of loading integer registers
530 in QImode, HImode and SImode.
531 Relative to reg-reg move (2). */
532 .int_store: {2, 4, 2}, /* cost of storing integer registers */
533 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE register
534 in 32bit, 64bit, 128bit, 256bit and 512bit */
535 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE register
536 in 32bit, 64bit, 128bit, 256bit and 512bit */
537 .sse_unaligned_load: {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
538 .sse_unaligned_store: {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
539 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
540 .sse_to_integer: 3, /* cost of moving SSE register to integer. */
541 .gather_static: 4, .gather_per_elt: 4, /* Gather load static, per_elt. */
542 .scatter_static: 4, .scatter_per_elt: 4, /* Gather store static, per_elt. */
543 .l1_cache_size: 8, /* size of l1 cache. */
544 .l2_cache_size: 8, /* size of l2 cache */
545 .prefetch_block: 0, /* size of prefetch block */
546 .simultaneous_prefetches: 0, /* number of parallel prefetches */
547 .branch_cost: 2, /* Branch cost */
548 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
549 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
550 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
551 COSTS_N_INSNS (1), /* cost of FABS instruction. */
552 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
553 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
554
555 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
556 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
557 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
558 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
559 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
560 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
561 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
562 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
563 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
564 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
565 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
566 .memcpy: pentium_memcpy,
567 .memset: pentium_memset,
568 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
569 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
570 .align_loop: "16:8:8", /* Loop alignment. */
571 .align_jump: "16:8:8", /* Jump alignment. */
572 .align_label: "0:0:8", /* Label alignment. */
573 .align_func: "16", /* Func alignment. */
574 .small_unroll_ninsns: 4, /* Small unroll limit. */
575 .small_unroll_factor: 2, /* Small unroll factor. */
576};
577
578/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
579 (we ensure the alignment). For small blocks inline loop is still a
580 noticeable win, for bigger blocks either rep movsl or rep movsb is
581 way to go. Rep movsb has apparently more expensive startup time in CPU,
582 but after 4K the difference is down in the noise. */
583static stringop_algs pentiumpro_memcpy[2] = {
584 {.unknown_size: rep_prefix_4_byte, .size: {{128, loop, false}, {1024, unrolled_loop, false},
585 {8192, rep_prefix_4_byte, false},
586 {-1, rep_prefix_1_byte, false}}},
587 DUMMY_STRINGOP_ALGS};
588static stringop_algs pentiumpro_memset[2] = {
589 {.unknown_size: rep_prefix_4_byte, .size: {{1024, unrolled_loop, false},
590 {8192, rep_prefix_4_byte, false},
591 {-1, libcall, false}}},
592 DUMMY_STRINGOP_ALGS};
593static const
594struct processor_costs pentiumpro_cost = {
595 .hard_register: {
596 /* Start of register allocator costs. integer->integer move cost is 2. */
597 .movzbl_load: 2, /* cost for loading QImode using movzbl */
598 .int_load: {4, 4, 4}, /* cost of loading integer registers
599 in QImode, HImode and SImode.
600 Relative to reg-reg move (2). */
601 .int_store: {2, 2, 2}, /* cost of storing integer registers */
602 .fp_move: 2, /* cost of reg,reg fld/fst */
603 .fp_load: {2, 2, 6}, /* cost of loading fp registers
604 in SFmode, DFmode and XFmode */
605 .fp_store: {4, 4, 6}, /* cost of storing fp registers
606 in SFmode, DFmode and XFmode */
607 .mmx_move: 2, /* cost of moving MMX register */
608 .mmx_load: {2, 2}, /* cost of loading MMX registers
609 in SImode and DImode */
610 .mmx_store: {2, 2}, /* cost of storing MMX registers
611 in SImode and DImode */
612 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
613 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE registers
614 in 32,64,128,256 and 512-bit */
615 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE registers
616 in 32,64,128,256 and 512-bit */
617 .sse_to_integer: 3, .integer_to_sse: 3, /* SSE->integer and integer->SSE moves */
618 .mask_to_integer: 3, .integer_to_mask: 3, /* mask->integer and integer->mask moves */
619 .mask_load: {4, 4, 4}, /* cost of loading mask register
620 in QImode, HImode, SImode. */
621 .mask_store: {2, 2, 2}, /* cost if storing mask register
622 in QImode, HImode, SImode. */
623 .mask_move: 2, /* cost of moving mask register. */
624 /* End of register allocator costs. */
625 },
626
627 COSTS_N_INSNS (1), /* cost of an add instruction */
628 COSTS_N_INSNS (1), /* cost of a lea instruction */
629 COSTS_N_INSNS (1), /* variable shift costs */
630 COSTS_N_INSNS (1), /* constant shift costs */
631 .mult_init: {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
632 COSTS_N_INSNS (4), /* HI */
633 COSTS_N_INSNS (4), /* SI */
634 COSTS_N_INSNS (4), /* DI */
635 COSTS_N_INSNS (4)}, /* other */
636 .mult_bit: 0, /* cost of multiply per each bit set */
637 .divide: {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
638 COSTS_N_INSNS (17), /* HI */
639 COSTS_N_INSNS (17), /* SI */
640 COSTS_N_INSNS (17), /* DI */
641 COSTS_N_INSNS (17)}, /* other */
642 COSTS_N_INSNS (1), /* cost of movsx */
643 COSTS_N_INSNS (1), /* cost of movzx */
644 .large_insn: 8, /* "large" insn */
645 .move_ratio: 6, /* MOVE_RATIO */
646 .clear_ratio: 6, /* CLEAR_RATIO */
647 .int_load: {4, 4, 4}, /* cost of loading integer registers
648 in QImode, HImode and SImode.
649 Relative to reg-reg move (2). */
650 .int_store: {2, 2, 2}, /* cost of storing integer registers */
651 .sse_load: {4, 8, 16, 32, 64}, /* cost of loading SSE register
652 in 32bit, 64bit, 128bit, 256bit and 512bit */
653 .sse_store: {4, 8, 16, 32, 64}, /* cost of storing SSE register
654 in 32bit, 64bit, 128bit, 256bit and 512bit */
655 .sse_unaligned_load: {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
656 .sse_unaligned_store: {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
657 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
658 .sse_to_integer: 3, /* cost of moving SSE register to integer. */
659 .gather_static: 4, .gather_per_elt: 4, /* Gather load static, per_elt. */
660 .scatter_static: 4, .scatter_per_elt: 4, /* Gather store static, per_elt. */
661 .l1_cache_size: 8, /* size of l1 cache. */
662 .l2_cache_size: 256, /* size of l2 cache */
663 .prefetch_block: 32, /* size of prefetch block */
664 .simultaneous_prefetches: 6, /* number of parallel prefetches */
665 .branch_cost: 2, /* Branch cost */
666 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
667 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
668 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
669 COSTS_N_INSNS (2), /* cost of FABS instruction. */
670 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
671 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
672
673 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
674 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
675 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
676 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
677 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
678 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
679 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
680 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
681 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
682 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
683 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
684 .memcpy: pentiumpro_memcpy,
685 .memset: pentiumpro_memset,
686 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
687 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
688 .align_loop: "16", /* Loop alignment. */
689 .align_jump: "16:11:8", /* Jump alignment. */
690 .align_label: "0:0:8", /* Label alignment. */
691 .align_func: "16", /* Func alignment. */
692 .small_unroll_ninsns: 4, /* Small unroll limit. */
693 .small_unroll_factor: 2, /* Small unroll factor. */
694};
695
696static stringop_algs geode_memcpy[2] = {
697 {.unknown_size: libcall, .size: {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
698 DUMMY_STRINGOP_ALGS};
699static stringop_algs geode_memset[2] = {
700 {.unknown_size: libcall, .size: {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
701 DUMMY_STRINGOP_ALGS};
702static const
703struct processor_costs geode_cost = {
704 .hard_register: {
705 /* Start of register allocator costs. integer->integer move cost is 2. */
706 .movzbl_load: 2, /* cost for loading QImode using movzbl */
707 .int_load: {2, 2, 2}, /* cost of loading integer registers
708 in QImode, HImode and SImode.
709 Relative to reg-reg move (2). */
710 .int_store: {2, 2, 2}, /* cost of storing integer registers */
711 .fp_move: 2, /* cost of reg,reg fld/fst */
712 .fp_load: {2, 2, 2}, /* cost of loading fp registers
713 in SFmode, DFmode and XFmode */
714 .fp_store: {4, 6, 6}, /* cost of storing fp registers
715 in SFmode, DFmode and XFmode */
716 .mmx_move: 2, /* cost of moving MMX register */
717 .mmx_load: {2, 2}, /* cost of loading MMX registers
718 in SImode and DImode */
719 .mmx_store: {2, 2}, /* cost of storing MMX registers
720 in SImode and DImode */
721 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
722 .sse_load: {2, 2, 8, 16, 32}, /* cost of loading SSE registers
723 in 32,64,128,256 and 512-bit */
724 .sse_store: {2, 2, 8, 16, 32}, /* cost of storing SSE registers
725 in 32,64,128,256 and 512-bit */
726 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
727 .mask_to_integer: 6, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
728 .mask_load: {2, 2, 2}, /* cost of loading mask register
729 in QImode, HImode, SImode. */
730 .mask_store: {2, 2, 2}, /* cost if storing mask register
731 in QImode, HImode, SImode. */
732 .mask_move: 2, /* cost of moving mask register. */
733 /* End of register allocator costs. */
734 },
735
736 COSTS_N_INSNS (1), /* cost of an add instruction */
737 COSTS_N_INSNS (1), /* cost of a lea instruction */
738 COSTS_N_INSNS (2), /* variable shift costs */
739 COSTS_N_INSNS (1), /* constant shift costs */
740 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
741 COSTS_N_INSNS (4), /* HI */
742 COSTS_N_INSNS (7), /* SI */
743 COSTS_N_INSNS (7), /* DI */
744 COSTS_N_INSNS (7)}, /* other */
745 .mult_bit: 0, /* cost of multiply per each bit set */
746 .divide: {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
747 COSTS_N_INSNS (23), /* HI */
748 COSTS_N_INSNS (39), /* SI */
749 COSTS_N_INSNS (39), /* DI */
750 COSTS_N_INSNS (39)}, /* other */
751 COSTS_N_INSNS (1), /* cost of movsx */
752 COSTS_N_INSNS (1), /* cost of movzx */
753 .large_insn: 8, /* "large" insn */
754 .move_ratio: 4, /* MOVE_RATIO */
755 .clear_ratio: 4, /* CLEAR_RATIO */
756 .int_load: {2, 2, 2}, /* cost of loading integer registers
757 in QImode, HImode and SImode.
758 Relative to reg-reg move (2). */
759 .int_store: {2, 2, 2}, /* cost of storing integer registers */
760 .sse_load: {2, 2, 8, 16, 32}, /* cost of loading SSE register
761 in 32bit, 64bit, 128bit, 256bit and 512bit */
762 .sse_store: {2, 2, 8, 16, 32}, /* cost of storing SSE register
763 in 32bit, 64bit, 128bit, 256bit and 512bit */
764 .sse_unaligned_load: {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
765 .sse_unaligned_store: {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
766 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
767 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
768 .gather_static: 2, .gather_per_elt: 2, /* Gather load static, per_elt. */
769 .scatter_static: 2, .scatter_per_elt: 2, /* Gather store static, per_elt. */
770 .l1_cache_size: 64, /* size of l1 cache. */
771 .l2_cache_size: 128, /* size of l2 cache. */
772 .prefetch_block: 32, /* size of prefetch block */
773 .simultaneous_prefetches: 1, /* number of parallel prefetches */
774 .branch_cost: 1, /* Branch cost */
775 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
776 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
777 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
778 COSTS_N_INSNS (1), /* cost of FABS instruction. */
779 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
780 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
781
782 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
783 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
784 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
785 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
786 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
787 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
788 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
789 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
790 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
791 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
792 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
793 .memcpy: geode_memcpy,
794 .memset: geode_memset,
795 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
796 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
797 NULL, /* Loop alignment. */
798 NULL, /* Jump alignment. */
799 NULL, /* Label alignment. */
800 NULL, /* Func alignment. */
801 .small_unroll_ninsns: 4, /* Small unroll limit. */
802 .small_unroll_factor: 2, /* Small unroll factor. */
803};
804
805static stringop_algs k6_memcpy[2] = {
806 {.unknown_size: libcall, .size: {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
807 DUMMY_STRINGOP_ALGS};
808static stringop_algs k6_memset[2] = {
809 {.unknown_size: libcall, .size: {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
810 DUMMY_STRINGOP_ALGS};
811static const
812struct processor_costs k6_cost = {
813 .hard_register: {
814 /* Start of register allocator costs. integer->integer move cost is 2. */
815 .movzbl_load: 3, /* cost for loading QImode using movzbl */
816 .int_load: {4, 5, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 .int_store: {2, 3, 2}, /* cost of storing integer registers */
820 .fp_move: 4, /* cost of reg,reg fld/fst */
821 .fp_load: {6, 6, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 .fp_store: {4, 4, 4}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 .mmx_move: 2, /* cost of moving MMX register */
826 .mmx_load: {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 .mmx_store: {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
831 .sse_load: {2, 2, 8, 16, 32}, /* cost of loading SSE registers
832 in 32,64,128,256 and 512-bit */
833 .sse_store: {2, 2, 8, 16, 32}, /* cost of storing SSE registers
834 in 32,64,128,256 and 512-bit */
835 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
836 .mask_to_integer: 6, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
837 .mask_load: {4, 5, 4}, /* cost of loading mask register
838 in QImode, HImode, SImode. */
839 .mask_store: {2, 3, 2}, /* cost if storing mask register
840 in QImode, HImode, SImode. */
841 .mask_move: 2, /* cost of moving mask register. */
842 /* End of register allocator costs. */
843 },
844
845 COSTS_N_INSNS (1), /* cost of an add instruction */
846 COSTS_N_INSNS (2), /* cost of a lea instruction */
847 COSTS_N_INSNS (1), /* variable shift costs */
848 COSTS_N_INSNS (1), /* constant shift costs */
849 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
850 COSTS_N_INSNS (3), /* HI */
851 COSTS_N_INSNS (3), /* SI */
852 COSTS_N_INSNS (3), /* DI */
853 COSTS_N_INSNS (3)}, /* other */
854 .mult_bit: 0, /* cost of multiply per each bit set */
855 .divide: {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
856 COSTS_N_INSNS (18), /* HI */
857 COSTS_N_INSNS (18), /* SI */
858 COSTS_N_INSNS (18), /* DI */
859 COSTS_N_INSNS (18)}, /* other */
860 COSTS_N_INSNS (2), /* cost of movsx */
861 COSTS_N_INSNS (2), /* cost of movzx */
862 .large_insn: 8, /* "large" insn */
863 .move_ratio: 4, /* MOVE_RATIO */
864 .clear_ratio: 4, /* CLEAR_RATIO */
865 .int_load: {4, 5, 4}, /* cost of loading integer registers
866 in QImode, HImode and SImode.
867 Relative to reg-reg move (2). */
868 .int_store: {2, 3, 2}, /* cost of storing integer registers */
869 .sse_load: {2, 2, 8, 16, 32}, /* cost of loading SSE register
870 in 32bit, 64bit, 128bit, 256bit and 512bit */
871 .sse_store: {2, 2, 8, 16, 32}, /* cost of storing SSE register
872 in 32bit, 64bit, 128bit, 256bit and 512bit */
873 .sse_unaligned_load: {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
874 .sse_unaligned_store: {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
875 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
876 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
877 .gather_static: 2, .gather_per_elt: 2, /* Gather load static, per_elt. */
878 .scatter_static: 2, .scatter_per_elt: 2, /* Gather store static, per_elt. */
879 .l1_cache_size: 32, /* size of l1 cache. */
880 .l2_cache_size: 32, /* size of l2 cache. Some models
881 have integrated l2 cache, but
882 optimizing for k6 is not important
883 enough to worry about that. */
884 .prefetch_block: 32, /* size of prefetch block */
885 .simultaneous_prefetches: 1, /* number of parallel prefetches */
886 .branch_cost: 1, /* Branch cost */
887 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
888 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
889 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
890 COSTS_N_INSNS (2), /* cost of FABS instruction. */
891 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
892 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
893
894 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
895 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
896 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
897 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
898 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
899 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
900 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
901 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
902 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
903 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
904 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
905 .memcpy: k6_memcpy,
906 .memset: k6_memset,
907 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
908 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
909 .align_loop: "32:8:8", /* Loop alignment. */
910 .align_jump: "32:8:8", /* Jump alignment. */
911 .align_label: "0:0:8", /* Label alignment. */
912 .align_func: "32", /* Func alignment. */
913 .small_unroll_ninsns: 4, /* Small unroll limit. */
914 .small_unroll_factor: 2, /* Small unroll factor. */
915};
916
917/* For some reason, Athlon deals better with REP prefix (relative to loops)
918 compared to K8. Alignment becomes important after 8 bytes for memcpy and
919 128 bytes for memset. */
920static stringop_algs athlon_memcpy[2] = {
921 {.unknown_size: libcall, .size: {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
922 DUMMY_STRINGOP_ALGS};
923static stringop_algs athlon_memset[2] = {
924 {.unknown_size: libcall, .size: {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
925 DUMMY_STRINGOP_ALGS};
926static const
927struct processor_costs athlon_cost = {
928 .hard_register: {
929 /* Start of register allocator costs. integer->integer move cost is 2. */
930 .movzbl_load: 4, /* cost for loading QImode using movzbl */
931 .int_load: {3, 4, 3}, /* cost of loading integer registers
932 in QImode, HImode and SImode.
933 Relative to reg-reg move (2). */
934 .int_store: {3, 4, 3}, /* cost of storing integer registers */
935 .fp_move: 4, /* cost of reg,reg fld/fst */
936 .fp_load: {4, 4, 12}, /* cost of loading fp registers
937 in SFmode, DFmode and XFmode */
938 .fp_store: {6, 6, 8}, /* cost of storing fp registers
939 in SFmode, DFmode and XFmode */
940 .mmx_move: 2, /* cost of moving MMX register */
941 .mmx_load: {4, 4}, /* cost of loading MMX registers
942 in SImode and DImode */
943 .mmx_store: {4, 4}, /* cost of storing MMX registers
944 in SImode and DImode */
945 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
946 .sse_load: {4, 4, 12, 12, 24}, /* cost of loading SSE registers
947 in 32,64,128,256 and 512-bit */
948 .sse_store: {4, 4, 10, 10, 20}, /* cost of storing SSE registers
949 in 32,64,128,256 and 512-bit */
950 .sse_to_integer: 5, .integer_to_sse: 5, /* SSE->integer and integer->SSE moves */
951 .mask_to_integer: 5, .integer_to_mask: 5, /* mask->integer and integer->mask moves */
952 .mask_load: {3, 4, 3}, /* cost of loading mask register
953 in QImode, HImode, SImode. */
954 .mask_store: {3, 4, 3}, /* cost if storing mask register
955 in QImode, HImode, SImode. */
956 .mask_move: 2, /* cost of moving mask register. */
957 /* End of register allocator costs. */
958 },
959
960 COSTS_N_INSNS (1), /* cost of an add instruction */
961 COSTS_N_INSNS (2), /* cost of a lea instruction */
962 COSTS_N_INSNS (1), /* variable shift costs */
963 COSTS_N_INSNS (1), /* constant shift costs */
964 .mult_init: {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
965 COSTS_N_INSNS (5), /* HI */
966 COSTS_N_INSNS (5), /* SI */
967 COSTS_N_INSNS (5), /* DI */
968 COSTS_N_INSNS (5)}, /* other */
969 .mult_bit: 0, /* cost of multiply per each bit set */
970 .divide: {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
971 COSTS_N_INSNS (26), /* HI */
972 COSTS_N_INSNS (42), /* SI */
973 COSTS_N_INSNS (74), /* DI */
974 COSTS_N_INSNS (74)}, /* other */
975 COSTS_N_INSNS (1), /* cost of movsx */
976 COSTS_N_INSNS (1), /* cost of movzx */
977 .large_insn: 8, /* "large" insn */
978 .move_ratio: 9, /* MOVE_RATIO */
979 .clear_ratio: 6, /* CLEAR_RATIO */
980 .int_load: {3, 4, 3}, /* cost of loading integer registers
981 in QImode, HImode and SImode.
982 Relative to reg-reg move (2). */
983 .int_store: {3, 4, 3}, /* cost of storing integer registers */
984 .sse_load: {4, 4, 12, 12, 24}, /* cost of loading SSE register
985 in 32bit, 64bit, 128bit, 256bit and 512bit */
986 .sse_store: {4, 4, 10, 10, 20}, /* cost of storing SSE register
987 in 32bit, 64bit, 128bit, 256bit and 512bit */
988 .sse_unaligned_load: {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
989 .sse_unaligned_store: {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
990 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
991 .sse_to_integer: 5, /* cost of moving SSE register to integer. */
992 .gather_static: 4, .gather_per_elt: 4, /* Gather load static, per_elt. */
993 .scatter_static: 4, .scatter_per_elt: 4, /* Gather store static, per_elt. */
994 .l1_cache_size: 64, /* size of l1 cache. */
995 .l2_cache_size: 256, /* size of l2 cache. */
996 .prefetch_block: 64, /* size of prefetch block */
997 .simultaneous_prefetches: 6, /* number of parallel prefetches */
998 .branch_cost: 5, /* Branch cost */
999 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1000 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1001 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1002 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1003 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1004 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1005
1006 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1007 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1008 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1009 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1010 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1011 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1012 /* 11-16 */
1013 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1014 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
1015 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1016 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
1017 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
1018 .memcpy: athlon_memcpy,
1019 .memset: athlon_memset,
1020 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1021 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1022 .align_loop: "16:8:8", /* Loop alignment. */
1023 .align_jump: "16:8:8", /* Jump alignment. */
1024 .align_label: "0:0:8", /* Label alignment. */
1025 .align_func: "16", /* Func alignment. */
1026 .small_unroll_ninsns: 4, /* Small unroll limit. */
1027 .small_unroll_factor: 2, /* Small unroll factor. */
1028};
1029
/* K8 has optimized REP instruction for medium sized blocks, but for very
   small blocks it is better to use loop.  For large blocks, libcall can
   do non-temporal accesses and beat inline considerably.  */
1033static stringop_algs k8_memcpy[2] = {
1034 {.unknown_size: libcall, .size: {{6, loop, false}, {14, unrolled_loop, false},
1035 {-1, rep_prefix_4_byte, false}}},
1036 {.unknown_size: libcall, .size: {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1037 {-1, libcall, false}}}};
1038static stringop_algs k8_memset[2] = {
1039 {.unknown_size: libcall, .size: {{8, loop, false}, {24, unrolled_loop, false},
1040 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1041 {.unknown_size: libcall, .size: {{48, unrolled_loop, false},
1042 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1043static const
1044struct processor_costs k8_cost = {
1045 .hard_register: {
1046 /* Start of register allocator costs. integer->integer move cost is 2. */
1047 .movzbl_load: 4, /* cost for loading QImode using movzbl */
1048 .int_load: {3, 4, 3}, /* cost of loading integer registers
1049 in QImode, HImode and SImode.
1050 Relative to reg-reg move (2). */
1051 .int_store: {3, 4, 3}, /* cost of storing integer registers */
1052 .fp_move: 4, /* cost of reg,reg fld/fst */
1053 .fp_load: {4, 4, 12}, /* cost of loading fp registers
1054 in SFmode, DFmode and XFmode */
1055 .fp_store: {6, 6, 8}, /* cost of storing fp registers
1056 in SFmode, DFmode and XFmode */
1057 .mmx_move: 2, /* cost of moving MMX register */
1058 .mmx_load: {3, 3}, /* cost of loading MMX registers
1059 in SImode and DImode */
1060 .mmx_store: {4, 4}, /* cost of storing MMX registers
1061 in SImode and DImode */
1062 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
1063 .sse_load: {4, 3, 12, 12, 24}, /* cost of loading SSE registers
1064 in 32,64,128,256 and 512-bit */
1065 .sse_store: {4, 4, 10, 10, 20}, /* cost of storing SSE registers
1066 in 32,64,128,256 and 512-bit */
1067 .sse_to_integer: 5, .integer_to_sse: 5, /* SSE->integer and integer->SSE moves */
1068 .mask_to_integer: 5, .integer_to_mask: 5, /* mask->integer and integer->mask moves */
1069 .mask_load: {3, 4, 3}, /* cost of loading mask register
1070 in QImode, HImode, SImode. */
1071 .mask_store: {3, 4, 3}, /* cost if storing mask register
1072 in QImode, HImode, SImode. */
1073 .mask_move: 2, /* cost of moving mask register. */
1074 /* End of register allocator costs. */
1075 },
1076
1077 COSTS_N_INSNS (1), /* cost of an add instruction */
1078 COSTS_N_INSNS (2), /* cost of a lea instruction */
1079 COSTS_N_INSNS (1), /* variable shift costs */
1080 COSTS_N_INSNS (1), /* constant shift costs */
1081 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1082 COSTS_N_INSNS (4), /* HI */
1083 COSTS_N_INSNS (3), /* SI */
1084 COSTS_N_INSNS (4), /* DI */
1085 COSTS_N_INSNS (5)}, /* other */
1086 .mult_bit: 0, /* cost of multiply per each bit set */
1087 .divide: {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1088 COSTS_N_INSNS (26), /* HI */
1089 COSTS_N_INSNS (42), /* SI */
1090 COSTS_N_INSNS (74), /* DI */
1091 COSTS_N_INSNS (74)}, /* other */
1092 COSTS_N_INSNS (1), /* cost of movsx */
1093 COSTS_N_INSNS (1), /* cost of movzx */
1094 .large_insn: 8, /* "large" insn */
1095 .move_ratio: 9, /* MOVE_RATIO */
1096 .clear_ratio: 6, /* CLEAR_RATIO */
1097 .int_load: {3, 4, 3}, /* cost of loading integer registers
1098 in QImode, HImode and SImode.
1099 Relative to reg-reg move (2). */
1100 .int_store: {3, 4, 3}, /* cost of storing integer registers */
1101 .sse_load: {4, 3, 12, 12, 24}, /* cost of loading SSE register
1102 in 32bit, 64bit, 128bit, 256bit and 512bit */
1103 .sse_store: {4, 4, 10, 10, 20}, /* cost of storing SSE register
1104 in 32bit, 64bit, 128bit, 256bit and 512bit */
1105 .sse_unaligned_load: {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
1106 .sse_unaligned_store: {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
1107 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
1108 .sse_to_integer: 5, /* cost of moving SSE register to integer. */
1109 .gather_static: 4, .gather_per_elt: 4, /* Gather load static, per_elt. */
1110 .scatter_static: 4, .scatter_per_elt: 4, /* Gather store static, per_elt. */
1111 .l1_cache_size: 64, /* size of l1 cache. */
1112 .l2_cache_size: 512, /* size of l2 cache. */
1113 .prefetch_block: 64, /* size of prefetch block */
1114 /* New AMD processors never drop prefetches; if they cannot be performed
1115 immediately, they are queued. We set number of simultaneous prefetches
1116 to a large constant to reflect this (it probably is not a good idea not
1117 to limit number of prefetches at all, as their execution also takes some
1118 time). */
1119 .simultaneous_prefetches: 100, /* number of parallel prefetches */
1120 .branch_cost: 3, /* Branch cost */
1121 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1122 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1123 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1124 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1125 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1126 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1127
1128 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1129 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1130 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1131 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1132 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1133 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1134 /* 11-16 */
1135 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1136 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1137 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1138 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1139 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
1140 .memcpy: k8_memcpy,
1141 .memset: k8_memset,
1142 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1143 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1144 .align_loop: "16:8:8", /* Loop alignment. */
1145 .align_jump: "16:8:8", /* Jump alignment. */
1146 .align_label: "0:0:8", /* Label alignment. */
1147 .align_func: "16", /* Func alignment. */
1148 .small_unroll_ninsns: 4, /* Small unroll limit. */
1149 .small_unroll_factor: 2, /* Small unroll factor. */
1150};
1151
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
   very small blocks it is better to use loop.  For large blocks, libcall can
   do non-temporal accesses and beat inline considerably.  */
1155static stringop_algs amdfam10_memcpy[2] = {
1156 {.unknown_size: libcall, .size: {{6, loop, false}, {14, unrolled_loop, false},
1157 {-1, rep_prefix_4_byte, false}}},
1158 {.unknown_size: libcall, .size: {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1159 {-1, libcall, false}}}};
1160static stringop_algs amdfam10_memset[2] = {
1161 {.unknown_size: libcall, .size: {{8, loop, false}, {24, unrolled_loop, false},
1162 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1163 {.unknown_size: libcall, .size: {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1164 {-1, libcall, false}}}};
1165struct processor_costs amdfam10_cost = {
1166 .hard_register: {
1167 /* Start of register allocator costs. integer->integer move cost is 2. */
1168 .movzbl_load: 4, /* cost for loading QImode using movzbl */
1169 .int_load: {3, 4, 3}, /* cost of loading integer registers
1170 in QImode, HImode and SImode.
1171 Relative to reg-reg move (2). */
1172 .int_store: {3, 4, 3}, /* cost of storing integer registers */
1173 .fp_move: 4, /* cost of reg,reg fld/fst */
1174 .fp_load: {4, 4, 12}, /* cost of loading fp registers
1175 in SFmode, DFmode and XFmode */
1176 .fp_store: {6, 6, 8}, /* cost of storing fp registers
1177 in SFmode, DFmode and XFmode */
1178 .mmx_move: 2, /* cost of moving MMX register */
1179 .mmx_load: {3, 3}, /* cost of loading MMX registers
1180 in SImode and DImode */
1181 .mmx_store: {4, 4}, /* cost of storing MMX registers
1182 in SImode and DImode */
1183 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
1184 .sse_load: {4, 4, 3, 6, 12}, /* cost of loading SSE registers
1185 in 32,64,128,256 and 512-bit */
1186 .sse_store: {4, 4, 5, 10, 20}, /* cost of storing SSE registers
1187 in 32,64,128,256 and 512-bit */
1188 .sse_to_integer: 3, .integer_to_sse: 3, /* SSE->integer and integer->SSE moves */
1189 .mask_to_integer: 3, .integer_to_mask: 3, /* mask->integer and integer->mask moves */
1190 .mask_load: {3, 4, 3}, /* cost of loading mask register
1191 in QImode, HImode, SImode. */
1192 .mask_store: {3, 4, 3}, /* cost if storing mask register
1193 in QImode, HImode, SImode. */
1194 .mask_move: 2, /* cost of moving mask register. */
1195
1196 /* On K8:
1197 MOVD reg64, xmmreg Double FSTORE 4
1198 MOVD reg32, xmmreg Double FSTORE 4
1199 On AMDFAM10:
1200 MOVD reg64, xmmreg Double FADD 3
1201 1/1 1/1
1202 MOVD reg32, xmmreg Double FADD 3
1203 1/1 1/1 */
1204 /* End of register allocator costs. */
1205 },
1206
1207 COSTS_N_INSNS (1), /* cost of an add instruction */
1208 COSTS_N_INSNS (2), /* cost of a lea instruction */
1209 COSTS_N_INSNS (1), /* variable shift costs */
1210 COSTS_N_INSNS (1), /* constant shift costs */
1211 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1212 COSTS_N_INSNS (4), /* HI */
1213 COSTS_N_INSNS (3), /* SI */
1214 COSTS_N_INSNS (4), /* DI */
1215 COSTS_N_INSNS (5)}, /* other */
1216 .mult_bit: 0, /* cost of multiply per each bit set */
1217 .divide: {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1218 COSTS_N_INSNS (35), /* HI */
1219 COSTS_N_INSNS (51), /* SI */
1220 COSTS_N_INSNS (83), /* DI */
1221 COSTS_N_INSNS (83)}, /* other */
1222 COSTS_N_INSNS (1), /* cost of movsx */
1223 COSTS_N_INSNS (1), /* cost of movzx */
1224 .large_insn: 8, /* "large" insn */
1225 .move_ratio: 9, /* MOVE_RATIO */
1226 .clear_ratio: 6, /* CLEAR_RATIO */
1227 .int_load: {3, 4, 3}, /* cost of loading integer registers
1228 in QImode, HImode and SImode.
1229 Relative to reg-reg move (2). */
1230 .int_store: {3, 4, 3}, /* cost of storing integer registers */
1231 .sse_load: {4, 4, 3, 6, 12}, /* cost of loading SSE register
1232 in 32bit, 64bit, 128bit, 256bit and 512bit */
1233 .sse_store: {4, 4, 5, 10, 20}, /* cost of storing SSE register
1234 in 32bit, 64bit, 128bit, 256bit and 512bit */
1235 .sse_unaligned_load: {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
1236 .sse_unaligned_store: {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1237 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
1238 .sse_to_integer: 3, /* cost of moving SSE register to integer. */
1239 .gather_static: 4, .gather_per_elt: 4, /* Gather load static, per_elt. */
1240 .scatter_static: 4, .scatter_per_elt: 4, /* Gather store static, per_elt. */
1241 .l1_cache_size: 64, /* size of l1 cache. */
1242 .l2_cache_size: 512, /* size of l2 cache. */
1243 .prefetch_block: 64, /* size of prefetch block */
1244 /* New AMD processors never drop prefetches; if they cannot be performed
1245 immediately, they are queued. We set number of simultaneous prefetches
1246 to a large constant to reflect this (it probably is not a good idea not
1247 to limit number of prefetches at all, as their execution also takes some
1248 time). */
1249 .simultaneous_prefetches: 100, /* number of parallel prefetches */
1250 .branch_cost: 2, /* Branch cost */
1251 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1252 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1253 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1254 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1255 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1256 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1257
1258 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1259 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1260 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1261 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1262 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1263 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1264 /* 11-16 */
1265 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1266 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1267 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1268 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1269 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
1270 .memcpy: amdfam10_memcpy,
1271 .memset: amdfam10_memset,
1272 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1273 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1274 .align_loop: "32:25:8", /* Loop alignment. */
1275 .align_jump: "32:8:8", /* Jump alignment. */
1276 .align_label: "0:0:8", /* Label alignment. */
1277 .align_func: "32", /* Func alignment. */
1278 .small_unroll_ninsns: 4, /* Small unroll limit. */
1279 .small_unroll_factor: 2, /* Small unroll factor. */
1280};
1281
/* BDVER has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can use non-temporal accesses and beat inline code considerably.  */
1285static stringop_algs bdver_memcpy[2] = {
1286 {.unknown_size: libcall, .size: {{6, loop, false}, {14, unrolled_loop, false},
1287 {-1, rep_prefix_4_byte, false}}},
1288 {.unknown_size: libcall, .size: {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1289 {-1, libcall, false}}}};
1290static stringop_algs bdver_memset[2] = {
1291 {.unknown_size: libcall, .size: {{8, loop, false}, {24, unrolled_loop, false},
1292 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1293 {.unknown_size: libcall, .size: {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1294 {-1, libcall, false}}}};
1295
1296const struct processor_costs bdver_cost = {
1297 .hard_register: {
1298 /* Start of register allocator costs. integer->integer move cost is 2. */
1299 .movzbl_load: 8, /* cost for loading QImode using movzbl */
1300 .int_load: {8, 8, 8}, /* cost of loading integer registers
1301 in QImode, HImode and SImode.
1302 Relative to reg-reg move (2). */
1303 .int_store: {8, 8, 8}, /* cost of storing integer registers */
1304 .fp_move: 4, /* cost of reg,reg fld/fst */
1305 .fp_load: {12, 12, 28}, /* cost of loading fp registers
1306 in SFmode, DFmode and XFmode */
1307 .fp_store: {10, 10, 18}, /* cost of storing fp registers
1308 in SFmode, DFmode and XFmode */
1309 .mmx_move: 4, /* cost of moving MMX register */
1310 .mmx_load: {12, 12}, /* cost of loading MMX registers
1311 in SImode and DImode */
1312 .mmx_store: {10, 10}, /* cost of storing MMX registers
1313 in SImode and DImode */
1314 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
1315 .sse_load: {12, 12, 10, 40, 60}, /* cost of loading SSE registers
1316 in 32,64,128,256 and 512-bit */
1317 .sse_store: {10, 10, 10, 40, 60}, /* cost of storing SSE registers
1318 in 32,64,128,256 and 512-bit */
1319 .sse_to_integer: 16, .integer_to_sse: 20, /* SSE->integer and integer->SSE moves */
1320 .mask_to_integer: 16, .integer_to_mask: 20, /* mask->integer and integer->mask moves */
1321 .mask_load: {8, 8, 8}, /* cost of loading mask register
1322 in QImode, HImode, SImode. */
1323 .mask_store: {8, 8, 8}, /* cost if storing mask register
1324 in QImode, HImode, SImode. */
1325 .mask_move: 2, /* cost of moving mask register. */
1326 /* End of register allocator costs. */
1327 },
1328
1329 COSTS_N_INSNS (1), /* cost of an add instruction */
1330 COSTS_N_INSNS (1), /* cost of a lea instruction */
1331 COSTS_N_INSNS (1), /* variable shift costs */
1332 COSTS_N_INSNS (1), /* constant shift costs */
1333 .mult_init: {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1334 COSTS_N_INSNS (4), /* HI */
1335 COSTS_N_INSNS (4), /* SI */
1336 COSTS_N_INSNS (6), /* DI */
1337 COSTS_N_INSNS (6)}, /* other */
1338 .mult_bit: 0, /* cost of multiply per each bit set */
1339 .divide: {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1340 COSTS_N_INSNS (35), /* HI */
1341 COSTS_N_INSNS (51), /* SI */
1342 COSTS_N_INSNS (83), /* DI */
1343 COSTS_N_INSNS (83)}, /* other */
1344 COSTS_N_INSNS (1), /* cost of movsx */
1345 COSTS_N_INSNS (1), /* cost of movzx */
1346 .large_insn: 8, /* "large" insn */
1347 .move_ratio: 9, /* MOVE_RATIO */
1348 .clear_ratio: 6, /* CLEAR_RATIO */
1349 .int_load: {8, 8, 8}, /* cost of loading integer registers
1350 in QImode, HImode and SImode.
1351 Relative to reg-reg move (2). */
1352 .int_store: {8, 8, 8}, /* cost of storing integer registers */
1353 .sse_load: {12, 12, 10, 40, 60}, /* cost of loading SSE register
1354 in 32bit, 64bit, 128bit, 256bit and 512bit */
1355 .sse_store: {10, 10, 10, 40, 60}, /* cost of storing SSE register
1356 in 32bit, 64bit, 128bit, 256bit and 512bit */
1357 .sse_unaligned_load: {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
1358 .sse_unaligned_store: {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
1359 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
1360 .sse_to_integer: 16, /* cost of moving SSE register to integer. */
1361 .gather_static: 12, .gather_per_elt: 12, /* Gather load static, per_elt. */
1362 .scatter_static: 10, .scatter_per_elt: 10, /* Gather store static, per_elt. */
1363 .l1_cache_size: 16, /* size of l1 cache. */
1364 .l2_cache_size: 2048, /* size of l2 cache. */
1365 .prefetch_block: 64, /* size of prefetch block */
1366 /* New AMD processors never drop prefetches; if they cannot be performed
1367 immediately, they are queued. We set number of simultaneous prefetches
1368 to a large constant to reflect this (it probably is not a good idea not
1369 to limit number of prefetches at all, as their execution also takes some
1370 time). */
1371 .simultaneous_prefetches: 100, /* number of parallel prefetches */
1372 .branch_cost: 2, /* Branch cost */
1373 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1374 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1375 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1376 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1377 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1378 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1379
1380 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1381 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1382 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1383 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1384 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1385 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1386 /* 9-24 */
1387 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1388 /* 9-27 */
1389 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1390 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1391 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1392 .reassoc_int: 1, .reassoc_fp: 2, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
1393 .memcpy: bdver_memcpy,
1394 .memset: bdver_memset,
1395 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1396 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1397 .align_loop: "16:11:8", /* Loop alignment. */
1398 .align_jump: "16:8:8", /* Jump alignment. */
1399 .align_label: "0:0:8", /* Label alignment. */
1400 .align_func: "11", /* Func alignment. */
1401 .small_unroll_ninsns: 4, /* Small unroll limit. */
1402 .small_unroll_factor: 2, /* Small unroll factor. */
1403};
1404
1405
/* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can use non-temporal accesses and beat inline code considerably.  */
1409static stringop_algs znver1_memcpy[2] = {
1410 /* 32-bit tuning. */
1411 {.unknown_size: libcall, .size: {{6, loop, false},
1412 {14, unrolled_loop, false},
1413 {-1, libcall, false}}},
1414 /* 64-bit tuning. */
1415 {.unknown_size: libcall, .size: {{16, loop, false},
1416 {128, rep_prefix_8_byte, false},
1417 {-1, libcall, false}}}};
1418static stringop_algs znver1_memset[2] = {
1419 /* 32-bit tuning. */
1420 {.unknown_size: libcall, .size: {{8, loop, false},
1421 {24, unrolled_loop, false},
1422 {128, rep_prefix_4_byte, false},
1423 {-1, libcall, false}}},
1424 /* 64-bit tuning. */
1425 {.unknown_size: libcall, .size: {{48, unrolled_loop, false},
1426 {128, rep_prefix_8_byte, false},
1427 {-1, libcall, false}}}};
1428struct processor_costs znver1_cost = {
1429 .hard_register: {
1430 /* Start of register allocator costs. integer->integer move cost is 2. */
1431
1432 /* reg-reg moves are done by renaming and thus they are even cheaper than
1433 1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
1434 to doubles of latencies, we do not model this correctly. It does not
1435 seem to make practical difference to bump prices up even more. */
1436 .movzbl_load: 6, /* cost for loading QImode using
1437 movzbl. */
1438 .int_load: {6, 6, 6}, /* cost of loading integer registers
1439 in QImode, HImode and SImode.
1440 Relative to reg-reg move (2). */
1441 .int_store: {8, 8, 8}, /* cost of storing integer
1442 registers. */
1443 .fp_move: 2, /* cost of reg,reg fld/fst. */
1444 .fp_load: {6, 6, 16}, /* cost of loading fp registers
1445 in SFmode, DFmode and XFmode. */
1446 .fp_store: {8, 8, 16}, /* cost of storing fp registers
1447 in SFmode, DFmode and XFmode. */
1448 .mmx_move: 2, /* cost of moving MMX register. */
1449 .mmx_load: {6, 6}, /* cost of loading MMX registers
1450 in SImode and DImode. */
1451 .mmx_store: {8, 8}, /* cost of storing MMX registers
1452 in SImode and DImode. */
1453 .xmm_move: 2, .ymm_move: 3, .zmm_move: 6, /* cost of moving XMM,YMM,ZMM register. */
1454 .sse_load: {6, 6, 6, 12, 24}, /* cost of loading SSE registers
1455 in 32,64,128,256 and 512-bit. */
1456 .sse_store: {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1457 in 32,64,128,256 and 512-bit. */
1458 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves. */
1459 .mask_to_integer: 8, .integer_to_mask: 8, /* mask->integer and integer->mask moves */
1460 .mask_load: {6, 6, 6}, /* cost of loading mask register
1461 in QImode, HImode, SImode. */
1462 .mask_store: {8, 8, 8}, /* cost if storing mask register
1463 in QImode, HImode, SImode. */
1464 .mask_move: 2, /* cost of moving mask register. */
1465 /* End of register allocator costs. */
1466 },
1467
1468 COSTS_N_INSNS (1), /* cost of an add instruction. */
1469 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1470 COSTS_N_INSNS (1), /* variable shift costs. */
1471 COSTS_N_INSNS (1), /* constant shift costs. */
1472 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1473 COSTS_N_INSNS (3), /* HI. */
1474 COSTS_N_INSNS (3), /* SI. */
1475 COSTS_N_INSNS (3), /* DI. */
1476 COSTS_N_INSNS (3)}, /* other. */
1477 .mult_bit: 0, /* cost of multiply per each bit
1478 set. */
1479 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1480 bound. */
1481 .divide: {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1482 COSTS_N_INSNS (22), /* HI. */
1483 COSTS_N_INSNS (30), /* SI. */
1484 COSTS_N_INSNS (45), /* DI. */
1485 COSTS_N_INSNS (45)}, /* other. */
1486 COSTS_N_INSNS (1), /* cost of movsx. */
1487 COSTS_N_INSNS (1), /* cost of movzx. */
1488 .large_insn: 8, /* "large" insn. */
1489 .move_ratio: 9, /* MOVE_RATIO. */
1490 .clear_ratio: 6, /* CLEAR_RATIO */
1491 .int_load: {6, 6, 6}, /* cost of loading integer registers
1492 in QImode, HImode and SImode.
1493 Relative to reg-reg move (2). */
1494 .int_store: {8, 8, 8}, /* cost of storing integer
1495 registers. */
1496 .sse_load: {6, 6, 6, 12, 24}, /* cost of loading SSE register
1497 in 32bit, 64bit, 128bit, 256bit and 512bit */
1498 .sse_store: {8, 8, 8, 16, 32}, /* cost of storing SSE register
1499 in 32bit, 64bit, 128bit, 256bit and 512bit */
1500 .sse_unaligned_load: {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
1501 .sse_unaligned_store: {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
1502 .xmm_move: 2, .ymm_move: 3, .zmm_move: 6, /* cost of moving XMM,YMM,ZMM register. */
1503 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
1504 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1505 throughput 12. Approx 9 uops do not depend on vector size and every load
1506 is 7 uops. */
1507 .gather_static: 18, .gather_per_elt: 8, /* Gather load static, per_elt. */
1508 .scatter_static: 18, .scatter_per_elt: 10, /* Gather store static, per_elt. */
1509 .l1_cache_size: 32, /* size of l1 cache. */
1510 .l2_cache_size: 512, /* size of l2 cache. */
1511 .prefetch_block: 64, /* size of prefetch block. */
1512 /* New AMD processors never drop prefetches; if they cannot be performed
1513 immediately, they are queued. We set number of simultaneous prefetches
1514 to a large constant to reflect this (it probably is not a good idea not
1515 to limit number of prefetches at all, as their execution also takes some
1516 time). */
1517 .simultaneous_prefetches: 100, /* number of parallel prefetches. */
1518 .branch_cost: 3, /* Branch cost. */
1519 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1520 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1521 /* Latency of fdiv is 8-15. */
1522 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1523 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1524 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1525 /* Latency of fsqrt is 4-10. */
1526 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1527
1528 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1529 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1530 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1531 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1532 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1533 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1534 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1535 /* 9-13 */
1536 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1537 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1538 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1539 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1540 and it can execute 2 integer additions and 2 multiplications thus
1541 reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
1542 that 4 works better than 6 probably due to register pressure.
1543
1544 Integer vector operations are taken by FP unit and execute 3 vector
1545 plus/minus operations per cycle but only one multiply. This is adjusted
1546 in ix86_reassociation_width. */
1547 .reassoc_int: 4, .reassoc_fp: 4, .reassoc_vec_int: 3, .reassoc_vec_fp: 6, /* reassoc int, fp, vec_int, vec_fp. */
1548 .memcpy: znver1_memcpy,
1549 .memset: znver1_memset,
1550 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1551 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1552 .align_loop: "16", /* Loop alignment. */
1553 .align_jump: "16", /* Jump alignment. */
1554 .align_label: "0:0:8", /* Label alignment. */
1555 .align_func: "16", /* Func alignment. */
1556 .small_unroll_ninsns: 4, /* Small unroll limit. */
1557 .small_unroll_factor: 2, /* Small unroll factor. */
1558};
1559
/* ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can use non-temporal accesses and beat inline code considerably.  */
1563static stringop_algs znver2_memcpy[2] = {
1564 /* 32-bit tuning. */
1565 {.unknown_size: libcall, .size: {{6, loop, false},
1566 {14, unrolled_loop, false},
1567 {-1, libcall, false}}},
1568 /* 64-bit tuning. */
1569 {.unknown_size: libcall, .size: {{16, loop, false},
1570 {64, rep_prefix_4_byte, false},
1571 {-1, libcall, false}}}};
1572static stringop_algs znver2_memset[2] = {
1573 /* 32-bit tuning. */
1574 {.unknown_size: libcall, .size: {{8, loop, false},
1575 {24, unrolled_loop, false},
1576 {128, rep_prefix_4_byte, false},
1577 {-1, libcall, false}}},
1578 /* 64-bit tuning. */
1579 {.unknown_size: libcall, .size: {{24, rep_prefix_4_byte, false},
1580 {128, rep_prefix_8_byte, false},
1581 {-1, libcall, false}}}};
1582
1583struct processor_costs znver2_cost = {
1584 .hard_register: {
1585 /* Start of register allocator costs. integer->integer move cost is 2. */
1586
1587 /* reg-reg moves are done by renaming and thus they are even cheaper than
1588 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1589 to doubles of latencies, we do not model this correctly. It does not
1590 seem to make practical difference to bump prices up even more. */
1591 .movzbl_load: 6, /* cost for loading QImode using
1592 movzbl. */
1593 .int_load: {6, 6, 6}, /* cost of loading integer registers
1594 in QImode, HImode and SImode.
1595 Relative to reg-reg move (2). */
1596 .int_store: {8, 8, 8}, /* cost of storing integer
1597 registers. */
1598 .fp_move: 2, /* cost of reg,reg fld/fst. */
1599 .fp_load: {6, 6, 16}, /* cost of loading fp registers
1600 in SFmode, DFmode and XFmode. */
1601 .fp_store: {8, 8, 16}, /* cost of storing fp registers
1602 in SFmode, DFmode and XFmode. */
1603 .mmx_move: 2, /* cost of moving MMX register. */
1604 .mmx_load: {6, 6}, /* cost of loading MMX registers
1605 in SImode and DImode. */
1606 .mmx_store: {8, 8}, /* cost of storing MMX registers
1607 in SImode and DImode. */
1608 .xmm_move: 2, .ymm_move: 2, .zmm_move: 3, /* cost of moving XMM,YMM,ZMM
1609 register. */
1610 .sse_load: {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1611 in 32,64,128,256 and 512-bit. */
1612 .sse_store: {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1613 in 32,64,128,256 and 512-bit. */
1614 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE
1615 moves. */
1616 .mask_to_integer: 8, .integer_to_mask: 8, /* mask->integer and integer->mask moves */
1617 .mask_load: {6, 6, 6}, /* cost of loading mask register
1618 in QImode, HImode, SImode. */
1619 .mask_store: {8, 8, 8}, /* cost if storing mask register
1620 in QImode, HImode, SImode. */
1621 .mask_move: 2, /* cost of moving mask register. */
1622 /* End of register allocator costs. */
1623 },
1624
1625 COSTS_N_INSNS (1), /* cost of an add instruction. */
1626 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1627 COSTS_N_INSNS (1), /* variable shift costs. */
1628 COSTS_N_INSNS (1), /* constant shift costs. */
1629 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1630 COSTS_N_INSNS (3), /* HI. */
1631 COSTS_N_INSNS (3), /* SI. */
1632 COSTS_N_INSNS (3), /* DI. */
1633 COSTS_N_INSNS (3)}, /* other. */
1634 .mult_bit: 0, /* cost of multiply per each bit
1635 set. */
1636 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1637 bound. */
1638 .divide: {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1639 COSTS_N_INSNS (22), /* HI. */
1640 COSTS_N_INSNS (30), /* SI. */
1641 COSTS_N_INSNS (45), /* DI. */
1642 COSTS_N_INSNS (45)}, /* other. */
1643 COSTS_N_INSNS (1), /* cost of movsx. */
1644 COSTS_N_INSNS (1), /* cost of movzx. */
1645 .large_insn: 8, /* "large" insn. */
1646 .move_ratio: 9, /* MOVE_RATIO. */
1647 .clear_ratio: 6, /* CLEAR_RATIO */
1648 .int_load: {6, 6, 6}, /* cost of loading integer registers
1649 in QImode, HImode and SImode.
1650 Relative to reg-reg move (2). */
1651 .int_store: {8, 8, 8}, /* cost of storing integer
1652 registers. */
1653 .sse_load: {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1654 in 32bit, 64bit, 128bit, 256bit and 512bit */
1655 .sse_store: {8, 8, 8, 8, 16}, /* cost of storing SSE register
1656 in 32bit, 64bit, 128bit, 256bit and 512bit */
1657 .sse_unaligned_load: {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1658 .sse_unaligned_store: {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1659 .xmm_move: 2, .ymm_move: 2, .zmm_move: 3, /* cost of moving XMM,YMM,ZMM
1660 register. */
1661 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
1662 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1663 throughput 12. Approx 9 uops do not depend on vector size and every load
1664 is 7 uops. */
1665 .gather_static: 18, .gather_per_elt: 8, /* Gather load static, per_elt. */
1666 .scatter_static: 18, .scatter_per_elt: 10, /* Gather store static, per_elt. */
1667 .l1_cache_size: 32, /* size of l1 cache. */
1668 .l2_cache_size: 512, /* size of l2 cache. */
1669 .prefetch_block: 64, /* size of prefetch block. */
1670 /* New AMD processors never drop prefetches; if they cannot be performed
1671 immediately, they are queued. We set number of simultaneous prefetches
1672 to a large constant to reflect this (it probably is not a good idea not
1673 to limit number of prefetches at all, as their execution also takes some
1674 time). */
1675 .simultaneous_prefetches: 100, /* number of parallel prefetches. */
1676 .branch_cost: 3, /* Branch cost. */
1677 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1678 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1679 /* Latency of fdiv is 8-15. */
1680 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1681 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1682 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1683 /* Latency of fsqrt is 4-10. */
1684 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1685
1686 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1687 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1688 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1689 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1690 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1691 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1692 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1693 /* 9-13. */
1694 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1695 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1696 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1697 /* Zen can execute 4 integer operations per cycle. FP operations
1698 take 3 cycles and it can execute 2 integer additions and 2
1699 multiplications thus reassociation may make sense up to with of 6.
1700 SPEC2k6 bencharks suggests
1701 that 4 works better than 6 probably due to register pressure.
1702
1703 Integer vector operations are taken by FP unit and execute 3 vector
1704 plus/minus operations per cycle but only one multiply. This is adjusted
1705 in ix86_reassociation_width. */
1706 .reassoc_int: 4, .reassoc_fp: 4, .reassoc_vec_int: 3, .reassoc_vec_fp: 6, /* reassoc int, fp, vec_int, vec_fp. */
1707 .memcpy: znver2_memcpy,
1708 .memset: znver2_memset,
1709 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1710 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1711 .align_loop: "16", /* Loop alignment. */
1712 .align_jump: "16", /* Jump alignment. */
1713 .align_label: "0:0:8", /* Label alignment. */
1714 .align_func: "16", /* Func alignment. */
1715 .small_unroll_ninsns: 4, /* Small unroll limit. */
1716 .small_unroll_factor: 2, /* Small unroll factor. */
1717};
1718
1719struct processor_costs znver3_cost = {
1720 .hard_register: {
1721 /* Start of register allocator costs. integer->integer move cost is 2. */
1722
1723 /* reg-reg moves are done by renaming and thus they are even cheaper than
1724 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1725 to doubles of latencies, we do not model this correctly. It does not
1726 seem to make practical difference to bump prices up even more. */
1727 .movzbl_load: 6, /* cost for loading QImode using
1728 movzbl. */
1729 .int_load: {6, 6, 6}, /* cost of loading integer registers
1730 in QImode, HImode and SImode.
1731 Relative to reg-reg move (2). */
1732 .int_store: {8, 8, 8}, /* cost of storing integer
1733 registers. */
1734 .fp_move: 2, /* cost of reg,reg fld/fst. */
1735 .fp_load: {6, 6, 16}, /* cost of loading fp registers
1736 in SFmode, DFmode and XFmode. */
1737 .fp_store: {8, 8, 16}, /* cost of storing fp registers
1738 in SFmode, DFmode and XFmode. */
1739 .mmx_move: 2, /* cost of moving MMX register. */
1740 .mmx_load: {6, 6}, /* cost of loading MMX registers
1741 in SImode and DImode. */
1742 .mmx_store: {8, 8}, /* cost of storing MMX registers
1743 in SImode and DImode. */
1744 .xmm_move: 2, .ymm_move: 2, .zmm_move: 3, /* cost of moving XMM,YMM,ZMM
1745 register. */
1746 .sse_load: {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1747 in 32,64,128,256 and 512-bit. */
1748 .sse_store: {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1749 in 32,64,128,256 and 512-bit. */
1750 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE
1751 moves. */
1752 .mask_to_integer: 8, .integer_to_mask: 8, /* mask->integer and integer->mask moves */
1753 .mask_load: {6, 6, 6}, /* cost of loading mask register
1754 in QImode, HImode, SImode. */
1755 .mask_store: {8, 8, 8}, /* cost if storing mask register
1756 in QImode, HImode, SImode. */
1757 .mask_move: 2, /* cost of moving mask register. */
1758 /* End of register allocator costs. */
1759 },
1760
1761 COSTS_N_INSNS (1), /* cost of an add instruction. */
1762 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1763 COSTS_N_INSNS (1), /* variable shift costs. */
1764 COSTS_N_INSNS (1), /* constant shift costs. */
1765 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1766 COSTS_N_INSNS (3), /* HI. */
1767 COSTS_N_INSNS (3), /* SI. */
1768 COSTS_N_INSNS (3), /* DI. */
1769 COSTS_N_INSNS (3)}, /* other. */
1770 .mult_bit: 0, /* cost of multiply per each bit
1771 set. */
1772 .divide: {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
1773 COSTS_N_INSNS (10), /* HI. */
1774 COSTS_N_INSNS (12), /* SI. */
1775 COSTS_N_INSNS (17), /* DI. */
1776 COSTS_N_INSNS (17)}, /* other. */
1777 COSTS_N_INSNS (1), /* cost of movsx. */
1778 COSTS_N_INSNS (1), /* cost of movzx. */
1779 .large_insn: 8, /* "large" insn. */
1780 .move_ratio: 9, /* MOVE_RATIO. */
1781 .clear_ratio: 6, /* CLEAR_RATIO */
1782 .int_load: {6, 6, 6}, /* cost of loading integer registers
1783 in QImode, HImode and SImode.
1784 Relative to reg-reg move (2). */
1785 .int_store: {8, 8, 8}, /* cost of storing integer
1786 registers. */
1787 .sse_load: {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1788 in 32bit, 64bit, 128bit, 256bit and 512bit */
1789 .sse_store: {8, 8, 8, 8, 16}, /* cost of storing SSE register
1790 in 32bit, 64bit, 128bit, 256bit and 512bit */
1791 .sse_unaligned_load: {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1792 .sse_unaligned_store: {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1793 .xmm_move: 2, .ymm_move: 2, .zmm_move: 3, /* cost of moving XMM,YMM,ZMM
1794 register. */
1795 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
1796 /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1797 throughput 9. Approx 7 uops do not depend on vector size and every load
1798 is 4 uops. */
1799 .gather_static: 14, .gather_per_elt: 8, /* Gather load static, per_elt. */
1800 .scatter_static: 14, .scatter_per_elt: 10, /* Gather store static, per_elt. */
1801 .l1_cache_size: 32, /* size of l1 cache. */
1802 .l2_cache_size: 512, /* size of l2 cache. */
1803 .prefetch_block: 64, /* size of prefetch block. */
1804 /* New AMD processors never drop prefetches; if they cannot be performed
1805 immediately, they are queued. We set number of simultaneous prefetches
1806 to a large constant to reflect this (it probably is not a good idea not
1807 to limit number of prefetches at all, as their execution also takes some
1808 time). */
1809 .simultaneous_prefetches: 100, /* number of parallel prefetches. */
1810 .branch_cost: 3, /* Branch cost. */
1811 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1812 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1813 /* Latency of fdiv is 8-15. */
1814 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1815 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1816 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1817 /* Latency of fsqrt is 4-10. */
1818 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1819
1820 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1821 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1822 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1823 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1824 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1825 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1826 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1827 /* 9-13. */
1828 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1829 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1830 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1831 /* Zen can execute 4 integer operations per cycle. FP operations
1832 take 3 cycles and it can execute 2 integer additions and 2
1833 multiplications thus reassociation may make sense up to with of 6.
1834 SPEC2k6 bencharks suggests
1835 that 4 works better than 6 probably due to register pressure.
1836
1837 Integer vector operations are taken by FP unit and execute 3 vector
1838 plus/minus operations per cycle but only one multiply. This is adjusted
1839 in ix86_reassociation_width. */
1840 .reassoc_int: 4, .reassoc_fp: 4, .reassoc_vec_int: 3, .reassoc_vec_fp: 6, /* reassoc int, fp, vec_int, vec_fp. */
1841 .memcpy: znver2_memcpy,
1842 .memset: znver2_memset,
1843 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1844 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1845 .align_loop: "16", /* Loop alignment. */
1846 .align_jump: "16", /* Jump alignment. */
1847 .align_label: "0:0:8", /* Label alignment. */
1848 .align_func: "16", /* Func alignment. */
1849 .small_unroll_ninsns: 4, /* Small unroll limit. */
1850 .small_unroll_factor: 2, /* Small unroll factor. */
1851};
1852
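/* This table is modelled on the znver3_cost table, but it is not an exact
   copy: several entries (for example fp_load, fp_store, sse_load and divide)
   differ.  */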
1854struct processor_costs znver4_cost = {
1855 .hard_register: {
1856 /* Start of register allocator costs. integer->integer move cost is 2. */
1857
1858 /* reg-reg moves are done by renaming and thus they are even cheaper than
1859 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1860 to doubles of latencies, we do not model this correctly. It does not
1861 seem to make practical difference to bump prices up even more. */
1862 .movzbl_load: 6, /* cost for loading QImode using
1863 movzbl. */
1864 .int_load: {6, 6, 6}, /* cost of loading integer registers
1865 in QImode, HImode and SImode.
1866 Relative to reg-reg move (2). */
1867 .int_store: {8, 8, 8}, /* cost of storing integer
1868 registers. */
1869 .fp_move: 2, /* cost of reg,reg fld/fst. */
1870 .fp_load: {14, 14, 17}, /* cost of loading fp registers
1871 in SFmode, DFmode and XFmode. */
1872 .fp_store: {12, 12, 16}, /* cost of storing fp registers
1873 in SFmode, DFmode and XFmode. */
1874 .mmx_move: 2, /* cost of moving MMX register. */
1875 .mmx_load: {6, 6}, /* cost of loading MMX registers
1876 in SImode and DImode. */
1877 .mmx_store: {8, 8}, /* cost of storing MMX registers
1878 in SImode and DImode. */
1879 .xmm_move: 2, .ymm_move: 2, .zmm_move: 3, /* cost of moving XMM,YMM,ZMM
1880 register. */
1881 .sse_load: {6, 6, 10, 10, 12}, /* cost of loading SSE registers
1882 in 32,64,128,256 and 512-bit. */
1883 .sse_store: {8, 8, 8, 12, 12}, /* cost of storing SSE registers
1884 in 32,64,128,256 and 512-bit. */
1885 .sse_to_integer: 6, .integer_to_sse: 8, /* SSE->integer and integer->SSE
1886 moves. */
1887 .mask_to_integer: 8, .integer_to_mask: 8, /* mask->integer and integer->mask moves */
1888 .mask_load: {6, 6, 6}, /* cost of loading mask register
1889 in QImode, HImode, SImode. */
1890 .mask_store: {8, 8, 8}, /* cost if storing mask register
1891 in QImode, HImode, SImode. */
1892 .mask_move: 2, /* cost of moving mask register. */
1893 /* End of register allocator costs. */
1894 },
1895
1896 COSTS_N_INSNS (1), /* cost of an add instruction. */
1897 /* TODO: Lea with 3 components has cost 2. */
1898 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1899 COSTS_N_INSNS (1), /* variable shift costs. */
1900 COSTS_N_INSNS (1), /* constant shift costs. */
1901 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1902 COSTS_N_INSNS (3), /* HI. */
1903 COSTS_N_INSNS (3), /* SI. */
1904 COSTS_N_INSNS (3), /* DI. */
1905 COSTS_N_INSNS (3)}, /* other. */
1906 .mult_bit: 0, /* cost of multiply per each bit
1907 set. */
1908 .divide: {COSTS_N_INSNS (12), /* cost of a divide/mod for QI. */
1909 COSTS_N_INSNS (13), /* HI. */
1910 COSTS_N_INSNS (13), /* SI. */
1911 COSTS_N_INSNS (18), /* DI. */
1912 COSTS_N_INSNS (18)}, /* other. */
1913 COSTS_N_INSNS (1), /* cost of movsx. */
1914 COSTS_N_INSNS (1), /* cost of movzx. */
1915 .large_insn: 8, /* "large" insn. */
1916 .move_ratio: 9, /* MOVE_RATIO. */
1917 .clear_ratio: 6, /* CLEAR_RATIO */
1918 .int_load: {6, 6, 6}, /* cost of loading integer registers
1919 in QImode, HImode and SImode.
1920 Relative to reg-reg move (2). */
1921 .int_store: {8, 8, 8}, /* cost of storing integer
1922 registers. */
1923 .sse_load: {6, 6, 10, 10, 12}, /* cost of loading SSE registers
1924 in 32bit, 64bit, 128bit, 256bit and 512bit */
1925 .sse_store: {8, 8, 8, 12, 12}, /* cost of storing SSE register
1926 in 32bit, 64bit, 128bit, 256bit and 512bit */
1927 .sse_unaligned_load: {6, 6, 6, 6, 6}, /* cost of unaligned loads. */
1928 .sse_unaligned_store: {8, 8, 8, 8, 8}, /* cost of unaligned stores. */
1929 .xmm_move: 2, .ymm_move: 2, .zmm_move: 2, /* cost of moving XMM,YMM,ZMM
1930 register. */
1931 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
1932 /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
1933 throughput 5. Approx 7 uops do not depend on vector size and every load
1934 is 5 uops. */
1935 .gather_static: 14, .gather_per_elt: 10, /* Gather load static, per_elt. */
1936 .scatter_static: 14, .scatter_per_elt: 20, /* Gather store static, per_elt. */
1937 .l1_cache_size: 32, /* size of l1 cache. */
1938 .l2_cache_size: 1024, /* size of l2 cache. */
1939 .prefetch_block: 64, /* size of prefetch block. */
1940 /* New AMD processors never drop prefetches; if they cannot be performed
1941 immediately, they are queued. We set number of simultaneous prefetches
1942 to a large constant to reflect this (it probably is not a good idea not
1943 to limit number of prefetches at all, as their execution also takes some
1944 time). */
1945 .simultaneous_prefetches: 100, /* number of parallel prefetches. */
1946 .branch_cost: 3, /* Branch cost. */
1947 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
1948 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1949 /* Latency of fdiv is 8-15. */
1950 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1951 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1952 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1953 /* Latency of fsqrt is 4-10. */
1954 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
1955
1956 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1957 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1958 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1959 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1960 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1961 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1962 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1963 /* 9-13. */
1964 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1965 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1966 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
 /* Zen can execute 4 integer operations per cycle. FP operations
 take 3 cycles and it can execute 2 integer additions and 2
 multiplications, thus reassociation may make sense up to a width of 6.
 SPEC2k6 benchmarks suggest that 4 works better than 6, probably due to
 register pressure.

 Integer vector operations are taken by the FP unit and execute 3 vector
 plus/minus operations per cycle but only one multiply. This is adjusted
 in ix86_reassociation_width. */
1976 .reassoc_int: 4, .reassoc_fp: 4, .reassoc_vec_int: 3, .reassoc_vec_fp: 6, /* reassoc int, fp, vec_int, vec_fp. */
1977 .memcpy: znver2_memcpy,
1978 .memset: znver2_memset,
1979 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1980 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1981 .align_loop: "16", /* Loop alignment. */
1982 .align_jump: "16", /* Jump alignment. */
1983 .align_label: "0:0:8", /* Label alignment. */
1984 .align_func: "16", /* Func alignment. */
1985 .small_unroll_ninsns: 4, /* Small unroll limit. */
1986 .small_unroll_factor: 2, /* Small unroll factor. */
1987};
1988
/* skylake_cost should produce code tuned for the Skylake family of CPUs. */
1990static stringop_algs skylake_memcpy[2] = {
1991 {.unknown_size: libcall,
1992 .size: {{256, rep_prefix_1_byte, true},
1993 {256, loop, false},
1994 {-1, libcall, false}}},
1995 {.unknown_size: libcall,
1996 .size: {{256, rep_prefix_1_byte, true},
1997 {256, loop, false},
1998 {-1, libcall, false}}}};
1999
2000static stringop_algs skylake_memset[2] = {
2001 {.unknown_size: libcall,
2002 .size: {{256, rep_prefix_1_byte, true},
2003 {256, loop, false},
2004 {-1, libcall, false}}},
2005 {.unknown_size: libcall,
2006 .size: {{256, rep_prefix_1_byte, true},
2007 {256, loop, false},
2008 {-1, libcall, false}}}};
2009
2010static const
2011struct processor_costs skylake_cost = {
2012 .hard_register: {
2013 /* Start of register allocator costs. integer->integer move cost is 2. */
2014 .movzbl_load: 6, /* cost for loading QImode using movzbl */
2015 .int_load: {4, 4, 4}, /* cost of loading integer registers
2016 in QImode, HImode and SImode.
2017 Relative to reg-reg move (2). */
2018 .int_store: {6, 6, 6}, /* cost of storing integer registers */
2019 .fp_move: 2, /* cost of reg,reg fld/fst */
2020 .fp_load: {6, 6, 8}, /* cost of loading fp registers
2021 in SFmode, DFmode and XFmode */
2022 .fp_store: {6, 6, 10}, /* cost of storing fp registers
2023 in SFmode, DFmode and XFmode */
2024 .mmx_move: 2, /* cost of moving MMX register */
2025 .mmx_load: {6, 6}, /* cost of loading MMX registers
2026 in SImode and DImode */
2027 .mmx_store: {6, 6}, /* cost of storing MMX registers
2028 in SImode and DImode */
2029 .xmm_move: 2, .ymm_move: 2, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
2030 .sse_load: {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2031 in 32,64,128,256 and 512-bit */
2032 .sse_store: {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2033 in 32,64,128,256 and 512-bit */
2034 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
2035 .mask_to_integer: 6, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
2036 .mask_load: {8, 8, 8}, /* cost of loading mask register
2037 in QImode, HImode, SImode. */
2038 .mask_store: {6, 6, 6}, /* cost if storing mask register
2039 in QImode, HImode, SImode. */
2040 .mask_move: 3, /* cost of moving mask register. */
2041 /* End of register allocator costs. */
2042 },
2043
2044 COSTS_N_INSNS (1), /* cost of an add instruction */
2045 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
2046 COSTS_N_INSNS (1), /* variable shift costs */
2047 COSTS_N_INSNS (1), /* constant shift costs */
2048 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2049 COSTS_N_INSNS (4), /* HI */
2050 COSTS_N_INSNS (3), /* SI */
2051 COSTS_N_INSNS (3), /* DI */
2052 COSTS_N_INSNS (3)}, /* other */
2053 .mult_bit: 0, /* cost of multiply per each bit set */
2054 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2055 model is not realistic. We compensate by increasing the latencies a bit. */
2056 .divide: {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2057 COSTS_N_INSNS (11), /* HI */
2058 COSTS_N_INSNS (14), /* SI */
2059 COSTS_N_INSNS (76), /* DI */
2060 COSTS_N_INSNS (76)}, /* other */
2061 COSTS_N_INSNS (1), /* cost of movsx */
2062 COSTS_N_INSNS (0), /* cost of movzx */
2063 .large_insn: 8, /* "large" insn */
2064 .move_ratio: 17, /* MOVE_RATIO */
2065 .clear_ratio: 17, /* CLEAR_RATIO */
2066 .int_load: {6, 6, 6}, /* cost of loading integer registers
2067 in QImode, HImode and SImode.
2068 Relative to reg-reg move (2). */
2069 .int_store: {8, 8, 8}, /* cost of storing integer registers */
2070 .sse_load: {8, 8, 8, 8, 16}, /* cost of loading SSE register
2071 in 32bit, 64bit, 128bit, 256bit and 512bit */
2072 .sse_store: {8, 8, 8, 8, 16}, /* cost of storing SSE register
2073 in 32bit, 64bit, 128bit, 256bit and 512bit */
2074 .sse_unaligned_load: {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
2075 .sse_unaligned_store: {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2076 .xmm_move: 2, .ymm_move: 2, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
2077 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
2078 .gather_static: 20, .gather_per_elt: 8, /* Gather load static, per_elt. */
2079 .scatter_static: 22, .scatter_per_elt: 10, /* Gather store static, per_elt. */
2080 .l1_cache_size: 64, /* size of l1 cache. */
2081 .l2_cache_size: 512, /* size of l2 cache. */
2082 .prefetch_block: 64, /* size of prefetch block */
2083 .simultaneous_prefetches: 6, /* number of parallel prefetches */
2084 .branch_cost: 3, /* Branch cost */
2085 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2086 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2087 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2088 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2089 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2090 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2091
2092 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2093 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2094 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2095 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2096 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2097 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2098 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2099 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2100 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2101 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2102 .reassoc_int: 1, .reassoc_fp: 4, .reassoc_vec_int: 2, .reassoc_vec_fp: 2, /* reassoc int, fp, vec_int, vec_fp. */
2103 .memcpy: skylake_memcpy,
2104 .memset: skylake_memset,
2105 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2106 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2107 .align_loop: "16:11:8", /* Loop alignment. */
2108 .align_jump: "16:11:8", /* Jump alignment. */
2109 .align_label: "0:0:8", /* Label alignment. */
2110 .align_func: "16", /* Func alignment. */
2111 .small_unroll_ninsns: 4, /* Small unroll limit. */
2112 .small_unroll_factor: 2, /* Small unroll factor. */
2113};
2114
2115/* icelake_cost should produce code tuned for Icelake family of CPUs.
2116 NB: rep_prefix_1_byte is used only for known size. */
2117
2118static stringop_algs icelake_memcpy[2] = {
2119 {.unknown_size: libcall,
2120 .size: {{256, rep_prefix_1_byte, true},
2121 {256, loop, false},
2122 {-1, libcall, false}}},
2123 {.unknown_size: libcall,
2124 .size: {{256, rep_prefix_1_byte, true},
2125 {256, loop, false},
2126 {-1, libcall, false}}}};
2127
2128static stringop_algs icelake_memset[2] = {
2129 {.unknown_size: libcall,
2130 .size: {{256, rep_prefix_1_byte, true},
2131 {256, loop, false},
2132 {-1, libcall, false}}},
2133 {.unknown_size: libcall,
2134 .size: {{256, rep_prefix_1_byte, true},
2135 {256, loop, false},
2136 {-1, libcall, false}}}};
2137
2138static const
2139struct processor_costs icelake_cost = {
2140 .hard_register: {
2141 /* Start of register allocator costs. integer->integer move cost is 2. */
2142 .movzbl_load: 6, /* cost for loading QImode using movzbl */
2143 .int_load: {4, 4, 4}, /* cost of loading integer registers
2144 in QImode, HImode and SImode.
2145 Relative to reg-reg move (2). */
2146 .int_store: {6, 6, 6}, /* cost of storing integer registers */
2147 .fp_move: 2, /* cost of reg,reg fld/fst */
2148 .fp_load: {6, 6, 8}, /* cost of loading fp registers
2149 in SFmode, DFmode and XFmode */
2150 .fp_store: {6, 6, 10}, /* cost of storing fp registers
2151 in SFmode, DFmode and XFmode */
2152 .mmx_move: 2, /* cost of moving MMX register */
2153 .mmx_load: {6, 6}, /* cost of loading MMX registers
2154 in SImode and DImode */
2155 .mmx_store: {6, 6}, /* cost of storing MMX registers
2156 in SImode and DImode */
2157 .xmm_move: 2, .ymm_move: 2, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
2158 .sse_load: {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2159 in 32,64,128,256 and 512-bit */
2160 .sse_store: {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2161 in 32,64,128,256 and 512-bit */
2162 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
2163 .mask_to_integer: 6, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
2164 .mask_load: {8, 8, 8}, /* cost of loading mask register
2165 in QImode, HImode, SImode. */
2166 .mask_store: {6, 6, 6}, /* cost if storing mask register
2167 in QImode, HImode, SImode. */
2168 .mask_move: 3, /* cost of moving mask register. */
2169 /* End of register allocator costs. */
2170 },
2171
2172 COSTS_N_INSNS (1), /* cost of an add instruction */
2173 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
2174 COSTS_N_INSNS (1), /* variable shift costs */
2175 COSTS_N_INSNS (1), /* constant shift costs */
2176 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2177 COSTS_N_INSNS (4), /* HI */
2178 COSTS_N_INSNS (3), /* SI */
2179 COSTS_N_INSNS (3), /* DI */
2180 COSTS_N_INSNS (3)}, /* other */
2181 .mult_bit: 0, /* cost of multiply per each bit set */
2182 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2183 model is not realistic. We compensate by increasing the latencies a bit. */
2184 .divide: {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2185 COSTS_N_INSNS (11), /* HI */
2186 COSTS_N_INSNS (14), /* SI */
2187 COSTS_N_INSNS (76), /* DI */
2188 COSTS_N_INSNS (76)}, /* other */
2189 COSTS_N_INSNS (1), /* cost of movsx */
2190 COSTS_N_INSNS (0), /* cost of movzx */
2191 .large_insn: 8, /* "large" insn */
2192 .move_ratio: 17, /* MOVE_RATIO */
2193 .clear_ratio: 17, /* CLEAR_RATIO */
2194 .int_load: {6, 6, 6}, /* cost of loading integer registers
2195 in QImode, HImode and SImode.
2196 Relative to reg-reg move (2). */
2197 .int_store: {8, 8, 8}, /* cost of storing integer registers */
2198 .sse_load: {8, 8, 8, 8, 16}, /* cost of loading SSE register
2199 in 32bit, 64bit, 128bit, 256bit and 512bit */
2200 .sse_store: {8, 8, 8, 8, 16}, /* cost of storing SSE register
2201 in 32bit, 64bit, 128bit, 256bit and 512bit */
2202 .sse_unaligned_load: {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
2203 .sse_unaligned_store: {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2204 .xmm_move: 2, .ymm_move: 2, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
2205 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
2206 .gather_static: 20, .gather_per_elt: 8, /* Gather load static, per_elt. */
2207 .scatter_static: 22, .scatter_per_elt: 10, /* Gather store static, per_elt. */
2208 .l1_cache_size: 64, /* size of l1 cache. */
2209 .l2_cache_size: 512, /* size of l2 cache. */
2210 .prefetch_block: 64, /* size of prefetch block */
2211 .simultaneous_prefetches: 6, /* number of parallel prefetches */
2212 .branch_cost: 3, /* Branch cost */
2213 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2214 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2215 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2216 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2217 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2218 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2219
2220 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2221 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2222 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2223 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2224 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2225 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2226 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2227 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2228 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2229 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2230 .reassoc_int: 1, .reassoc_fp: 4, .reassoc_vec_int: 2, .reassoc_vec_fp: 2, /* reassoc int, fp, vec_int, vec_fp. */
2231 .memcpy: icelake_memcpy,
2232 .memset: icelake_memset,
2233 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2234 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2235 .align_loop: "16:11:8", /* Loop alignment. */
2236 .align_jump: "16:11:8", /* Jump alignment. */
2237 .align_label: "0:0:8", /* Label alignment. */
2238 .align_func: "16", /* Func alignment. */
2239 .small_unroll_ninsns: 4, /* Small unroll limit. */
2240 .small_unroll_factor: 2, /* Small unroll factor. */
2241};
2242
2243/* alderlake_cost should produce code tuned for alderlake family of CPUs. */
2244static stringop_algs alderlake_memcpy[2] = {
2245 {.unknown_size: libcall,
2246 .size: {{256, rep_prefix_1_byte, true},
2247 {256, loop, false},
2248 {-1, libcall, false}}},
2249 {.unknown_size: libcall,
2250 .size: {{256, rep_prefix_1_byte, true},
2251 {256, loop, false},
2252 {-1, libcall, false}}}};
2253static stringop_algs alderlake_memset[2] = {
2254 {.unknown_size: libcall,
2255 .size: {{256, rep_prefix_1_byte, true},
2256 {256, loop, false},
2257 {-1, libcall, false}}},
2258 {.unknown_size: libcall,
2259 .size: {{256, rep_prefix_1_byte, true},
2260 {256, loop, false},
2261 {-1, libcall, false}}}};
2262static const
2263struct processor_costs alderlake_cost = {
2264 .hard_register: {
2265 /* Start of register allocator costs. integer->integer move cost is 2. */
2266 .movzbl_load: 6, /* cost for loading QImode using movzbl */
2267 .int_load: {6, 6, 6}, /* cost of loading integer registers
2268 in QImode, HImode and SImode.
2269 Relative to reg-reg move (2). */
2270 .int_store: {6, 6, 6}, /* cost of storing integer registers */
2271 .fp_move: 4, /* cost of reg,reg fld/fst */
2272 .fp_load: {6, 6, 12}, /* cost of loading fp registers
2273 in SFmode, DFmode and XFmode */
2274 .fp_store: {6, 6, 12}, /* cost of storing fp registers
2275 in SFmode, DFmode and XFmode */
2276 .mmx_move: 2, /* cost of moving MMX register */
2277 .mmx_load: {6, 6}, /* cost of loading MMX registers
2278 in SImode and DImode */
2279 .mmx_store: {6, 6}, /* cost of storing MMX registers
2280 in SImode and DImode */
2281 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
2282 .sse_load: {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2283 in 32,64,128,256 and 512-bit */
2284 .sse_store: {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2285 in 32,64,128,256 and 512-bit */
2286 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
2287 .mask_to_integer: 6, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
2288 .mask_load: {6, 6, 6}, /* cost of loading mask register
2289 in QImode, HImode, SImode. */
2290 .mask_store: {6, 6, 6}, /* cost if storing mask register
2291 in QImode, HImode, SImode. */
2292 .mask_move: 2, /* cost of moving mask register. */
2293 /* End of register allocator costs. */
2294 },
2295
2296 COSTS_N_INSNS (1), /* cost of an add instruction */
2297 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2298 COSTS_N_INSNS (1), /* variable shift costs */
2299 COSTS_N_INSNS (1), /* constant shift costs */
2300 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2301 COSTS_N_INSNS (4), /* HI */
2302 COSTS_N_INSNS (3), /* SI */
2303 COSTS_N_INSNS (4), /* DI */
2304 COSTS_N_INSNS (4)}, /* other */
2305 .mult_bit: 0, /* cost of multiply per each bit set */
2306 .divide: {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2307 COSTS_N_INSNS (22), /* HI */
2308 COSTS_N_INSNS (30), /* SI */
2309 COSTS_N_INSNS (74), /* DI */
2310 COSTS_N_INSNS (74)}, /* other */
2311 COSTS_N_INSNS (1), /* cost of movsx */
2312 COSTS_N_INSNS (1), /* cost of movzx */
2313 .large_insn: 8, /* "large" insn */
2314 .move_ratio: 17, /* MOVE_RATIO */
2315 .clear_ratio: 17, /* CLEAR_RATIO */
2316 .int_load: {6, 6, 6}, /* cost of loading integer registers
2317 in QImode, HImode and SImode.
2318 Relative to reg-reg move (2). */
2319 .int_store: {8, 8, 8}, /* cost of storing integer registers */
2320 .sse_load: {8, 8, 8, 10, 15}, /* cost of loading SSE register
2321 in 32bit, 64bit, 128bit, 256bit and 512bit */
2322 .sse_store: {8, 8, 8, 10, 15}, /* cost of storing SSE register
2323 in 32bit, 64bit, 128bit, 256bit and 512bit */
2324 .sse_unaligned_load: {8, 8, 8, 10, 15}, /* cost of unaligned loads. */
2325 .sse_unaligned_store: {8, 8, 8, 10, 15}, /* cost of unaligned storess. */
2326 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
2327 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
2328 .gather_static: 18, .gather_per_elt: 6, /* Gather load static, per_elt. */
2329 .scatter_static: 18, .scatter_per_elt: 6, /* Gather store static, per_elt. */
2330 .l1_cache_size: 32, /* size of l1 cache. */
2331 .l2_cache_size: 512, /* size of l2 cache. */
2332 .prefetch_block: 64, /* size of prefetch block */
2333 .simultaneous_prefetches: 6, /* number of parallel prefetches */
2334 .branch_cost: 3, /* Branch cost */
2335 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2336 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2337 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2338 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2339 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2340 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2341
2342 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2343 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2344 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2345 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2346 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2347 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2348 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2349 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2350 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2351 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2352 .reassoc_int: 1, .reassoc_fp: 4, .reassoc_vec_int: 3, .reassoc_vec_fp: 3, /* reassoc int, fp, vec_int, vec_fp. */
2353 .memcpy: alderlake_memcpy,
2354 .memset: alderlake_memset,
2355 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2356 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2357 .align_loop: "16:11:8", /* Loop alignment. */
2358 .align_jump: "16:11:8", /* Jump alignment. */
2359 .align_label: "0:0:8", /* Label alignment. */
2360 .align_func: "16", /* Func alignment. */
2361 .small_unroll_ninsns: 4, /* Small unroll limit. */
2362 .small_unroll_factor: 2, /* Small unroll factor. */
2363};
2364
 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
 very small blocks it is better to use a loop. For large blocks, a libcall
 can do non-temporal accesses and beat inline code considerably. */
2368static stringop_algs btver1_memcpy[2] = {
2369 {.unknown_size: libcall, .size: {{6, loop, false}, {14, unrolled_loop, false},
2370 {-1, rep_prefix_4_byte, false}}},
2371 {.unknown_size: libcall, .size: {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2372 {-1, libcall, false}}}};
2373static stringop_algs btver1_memset[2] = {
2374 {.unknown_size: libcall, .size: {{8, loop, false}, {24, unrolled_loop, false},
2375 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2376 {.unknown_size: libcall, .size: {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2377 {-1, libcall, false}}}};
2378const struct processor_costs btver1_cost = {
2379 .hard_register: {
2380 /* Start of register allocator costs. integer->integer move cost is 2. */
2381 .movzbl_load: 8, /* cost for loading QImode using movzbl */
2382 .int_load: {6, 8, 6}, /* cost of loading integer registers
2383 in QImode, HImode and SImode.
2384 Relative to reg-reg move (2). */
2385 .int_store: {6, 8, 6}, /* cost of storing integer registers */
2386 .fp_move: 4, /* cost of reg,reg fld/fst */
2387 .fp_load: {12, 12, 28}, /* cost of loading fp registers
2388 in SFmode, DFmode and XFmode */
2389 .fp_store: {12, 12, 38}, /* cost of storing fp registers
2390 in SFmode, DFmode and XFmode */
2391 .mmx_move: 4, /* cost of moving MMX register */
2392 .mmx_load: {10, 10}, /* cost of loading MMX registers
2393 in SImode and DImode */
2394 .mmx_store: {12, 12}, /* cost of storing MMX registers
2395 in SImode and DImode */
2396 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
2397 .sse_load: {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2398 in 32,64,128,256 and 512-bit */
2399 .sse_store: {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2400 in 32,64,128,256 and 512-bit */
2401 .sse_to_integer: 14, .integer_to_sse: 14, /* SSE->integer and integer->SSE moves */
2402 .mask_to_integer: 14, .integer_to_mask: 14, /* mask->integer and integer->mask moves */
2403 .mask_load: {6, 8, 6}, /* cost of loading mask register
2404 in QImode, HImode, SImode. */
2405 .mask_store: {6, 8, 6}, /* cost if storing mask register
2406 in QImode, HImode, SImode. */
2407 .mask_move: 2, /* cost of moving mask register. */
2408 /* End of register allocator costs. */
2409 },
2410
2411 COSTS_N_INSNS (1), /* cost of an add instruction */
2412 COSTS_N_INSNS (2), /* cost of a lea instruction */
2413 COSTS_N_INSNS (1), /* variable shift costs */
2414 COSTS_N_INSNS (1), /* constant shift costs */
2415 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2416 COSTS_N_INSNS (4), /* HI */
2417 COSTS_N_INSNS (3), /* SI */
2418 COSTS_N_INSNS (4), /* DI */
2419 COSTS_N_INSNS (5)}, /* other */
2420 .mult_bit: 0, /* cost of multiply per each bit set */
2421 .divide: {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2422 COSTS_N_INSNS (35), /* HI */
2423 COSTS_N_INSNS (51), /* SI */
2424 COSTS_N_INSNS (83), /* DI */
2425 COSTS_N_INSNS (83)}, /* other */
2426 COSTS_N_INSNS (1), /* cost of movsx */
2427 COSTS_N_INSNS (1), /* cost of movzx */
2428 .large_insn: 8, /* "large" insn */
2429 .move_ratio: 9, /* MOVE_RATIO */
2430 .clear_ratio: 6, /* CLEAR_RATIO */
2431 .int_load: {6, 8, 6}, /* cost of loading integer registers
2432 in QImode, HImode and SImode.
2433 Relative to reg-reg move (2). */
2434 .int_store: {6, 8, 6}, /* cost of storing integer registers */
2435 .sse_load: {10, 10, 12, 48, 96}, /* cost of loading SSE register
2436 in 32bit, 64bit, 128bit, 256bit and 512bit */
2437 .sse_store: {10, 10, 12, 48, 96}, /* cost of storing SSE register
2438 in 32bit, 64bit, 128bit, 256bit and 512bit */
2439 .sse_unaligned_load: {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
2440 .sse_unaligned_store: {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2441 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
2442 .sse_to_integer: 14, /* cost of moving SSE register to integer. */
2443 .gather_static: 10, .gather_per_elt: 10, /* Gather load static, per_elt. */
2444 .scatter_static: 10, .scatter_per_elt: 10, /* Gather store static, per_elt. */
2445 .l1_cache_size: 32, /* size of l1 cache. */
2446 .l2_cache_size: 512, /* size of l2 cache. */
2447 .prefetch_block: 64, /* size of prefetch block */
2448 .simultaneous_prefetches: 100, /* number of parallel prefetches */
2449 .branch_cost: 2, /* Branch cost */
2450 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2451 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2452 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2453 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2454 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2455 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
2456
2457 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2458 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2459 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2460 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2461 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2462 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2463 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2464 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2465 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2466 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
2467 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
2468 .memcpy: btver1_memcpy,
2469 .memset: btver1_memset,
2470 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2471 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2472 .align_loop: "16:11:8", /* Loop alignment. */
2473 .align_jump: "16:8:8", /* Jump alignment. */
2474 .align_label: "0:0:8", /* Label alignment. */
2475 .align_func: "11", /* Func alignment. */
2476 .small_unroll_ninsns: 4, /* Small unroll limit. */
2477 .small_unroll_factor: 2, /* Small unroll factor. */
2478};
2479
2480static stringop_algs btver2_memcpy[2] = {
2481 {.unknown_size: libcall, .size: {{6, loop, false}, {14, unrolled_loop, false},
2482 {-1, rep_prefix_4_byte, false}}},
2483 {.unknown_size: libcall, .size: {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2484 {-1, libcall, false}}}};
2485static stringop_algs btver2_memset[2] = {
2486 {.unknown_size: libcall, .size: {{8, loop, false}, {24, unrolled_loop, false},
2487 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2488 {.unknown_size: libcall, .size: {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2489 {-1, libcall, false}}}};
2490const struct processor_costs btver2_cost = {
2491 .hard_register: {
2492 /* Start of register allocator costs. integer->integer move cost is 2. */
2493 .movzbl_load: 8, /* cost for loading QImode using movzbl */
2494 .int_load: {8, 8, 6}, /* cost of loading integer registers
2495 in QImode, HImode and SImode.
2496 Relative to reg-reg move (2). */
2497 .int_store: {8, 8, 6}, /* cost of storing integer registers */
2498 .fp_move: 4, /* cost of reg,reg fld/fst */
2499 .fp_load: {12, 12, 28}, /* cost of loading fp registers
2500 in SFmode, DFmode and XFmode */
2501 .fp_store: {12, 12, 38}, /* cost of storing fp registers
2502 in SFmode, DFmode and XFmode */
2503 .mmx_move: 4, /* cost of moving MMX register */
2504 .mmx_load: {10, 10}, /* cost of loading MMX registers
2505 in SImode and DImode */
2506 .mmx_store: {12, 12}, /* cost of storing MMX registers
2507 in SImode and DImode */
2508 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
2509 .sse_load: {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2510 in 32,64,128,256 and 512-bit */
2511 .sse_store: {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2512 in 32,64,128,256 and 512-bit */
2513 .sse_to_integer: 14, .integer_to_sse: 14, /* SSE->integer and integer->SSE moves */
2514 .mask_to_integer: 14, .integer_to_mask: 14, /* mask->integer and integer->mask moves */
2515 .mask_load: {8, 8, 6}, /* cost of loading mask register
2516 in QImode, HImode, SImode. */
2517 .mask_store: {8, 8, 6}, /* cost if storing mask register
2518 in QImode, HImode, SImode. */
2519 .mask_move: 2, /* cost of moving mask register. */
2520 /* End of register allocator costs. */
2521 },
2522
2523 COSTS_N_INSNS (1), /* cost of an add instruction */
2524 COSTS_N_INSNS (2), /* cost of a lea instruction */
2525 COSTS_N_INSNS (1), /* variable shift costs */
2526 COSTS_N_INSNS (1), /* constant shift costs */
2527 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2528 COSTS_N_INSNS (4), /* HI */
2529 COSTS_N_INSNS (3), /* SI */
2530 COSTS_N_INSNS (4), /* DI */
2531 COSTS_N_INSNS (5)}, /* other */
2532 .mult_bit: 0, /* cost of multiply per each bit set */
2533 .divide: {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2534 COSTS_N_INSNS (35), /* HI */
2535 COSTS_N_INSNS (51), /* SI */
2536 COSTS_N_INSNS (83), /* DI */
2537 COSTS_N_INSNS (83)}, /* other */
2538 COSTS_N_INSNS (1), /* cost of movsx */
2539 COSTS_N_INSNS (1), /* cost of movzx */
2540 .large_insn: 8, /* "large" insn */
2541 .move_ratio: 9, /* MOVE_RATIO */
2542 .clear_ratio: 6, /* CLEAR_RATIO */
2543 .int_load: {8, 8, 6}, /* cost of loading integer registers
2544 in QImode, HImode and SImode.
2545 Relative to reg-reg move (2). */
2546 .int_store: {8, 8, 6}, /* cost of storing integer registers */
2547 .sse_load: {10, 10, 12, 48, 96}, /* cost of loading SSE register
2548 in 32bit, 64bit, 128bit, 256bit and 512bit */
2549 .sse_store: {10, 10, 12, 48, 96}, /* cost of storing SSE register
2550 in 32bit, 64bit, 128bit, 256bit and 512bit */
2551 .sse_unaligned_load: {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
2552 .sse_unaligned_store: {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2553 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
2554 .sse_to_integer: 14, /* cost of moving SSE register to integer. */
2555 .gather_static: 10, .gather_per_elt: 10, /* Gather load static, per_elt. */
2556 .scatter_static: 10, .scatter_per_elt: 10, /* Gather store static, per_elt. */
2557 .l1_cache_size: 32, /* size of l1 cache. */
2558 .l2_cache_size: 2048, /* size of l2 cache. */
2559 .prefetch_block: 64, /* size of prefetch block */
2560 .simultaneous_prefetches: 100, /* number of parallel prefetches */
2561 .branch_cost: 2, /* Branch cost */
2562 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2563 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2564 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2565 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2566 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2567 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
2568
2569 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2570 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2571 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2572 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2573 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2574 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2575 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2576 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
2577 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
2578 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
2579 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
2580 .memcpy: btver2_memcpy,
2581 .memset: btver2_memset,
2582 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2583 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2584 .align_loop: "16:11:8", /* Loop alignment. */
2585 .align_jump: "16:8:8", /* Jump alignment. */
2586 .align_label: "0:0:8", /* Label alignment. */
2587 .align_func: "11", /* Func alignment. */
2588 .small_unroll_ninsns: 4, /* Small unroll limit. */
2589 .small_unroll_factor: 2, /* Small unroll factor. */
2590};
2591
2592static stringop_algs pentium4_memcpy[2] = { /* memcpy strategy used by pentium4_cost; each size entry is {max, alg, flag} and the list ends with max -1.  */
2593 {.unknown_size: libcall, .size: {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2594 DUMMY_STRINGOP_ALGS}; /* second slot is the libcall-only placeholder; NOTE(review): presumably the 64-bit slot -- confirm.  */
2595static stringop_algs pentium4_memset[2] = { /* memset strategy used by pentium4_cost; each size entry is {max, alg, flag} and the list ends with max -1.  */
2596 {.unknown_size: libcall, .size: {{6, loop_1_byte, false}, {48, loop, false},
2597 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2598 DUMMY_STRINGOP_ALGS}; /* second slot is the libcall-only placeholder; NOTE(review): presumably the 64-bit slot -- confirm.  */
2599
2600static const
2601struct processor_costs pentium4_cost = { /* Pentium 4 tuning costs: register-allocator move costs first, then insn costs, cache/prefetch data, string-op strategies and alignments.  */
2602 .hard_register: {
2603 /* Start of register allocator costs. integer->integer move cost is 2. */
2604 .movzbl_load: 5, /* cost for loading QImode using movzbl */
2605 .int_load: {4, 5, 4}, /* cost of loading integer registers
2606 in QImode, HImode and SImode.
2607 Relative to reg-reg move (2). */
2608 .int_store: {2, 3, 2}, /* cost of storing integer registers */
2609 .fp_move: 12, /* cost of reg,reg fld/fst */
2610 .fp_load: {14, 14, 14}, /* cost of loading fp registers
2611 in SFmode, DFmode and XFmode */
2612 .fp_store: {14, 14, 14}, /* cost of storing fp registers
2613 in SFmode, DFmode and XFmode */
2614 .mmx_move: 12, /* cost of moving MMX register */
2615 .mmx_load: {16, 16}, /* cost of loading MMX registers
2616 in SImode and DImode */
2617 .mmx_store: {16, 16}, /* cost of storing MMX registers
2618 in SImode and DImode */
2619 .xmm_move: 12, .ymm_move: 24, .zmm_move: 48, /* cost of moving XMM,YMM,ZMM register */
2620 .sse_load: {16, 16, 16, 32, 64}, /* cost of loading SSE registers
2621 in 32,64,128,256 and 512-bit */
2622 .sse_store: {16, 16, 16, 32, 64}, /* cost of storing SSE registers
2623 in 32,64,128,256 and 512-bit */
2624 .sse_to_integer: 20, .integer_to_sse: 12, /* SSE->integer and integer->SSE moves */
2625 .mask_to_integer: 20, .integer_to_mask: 12, /* mask->integer and integer->mask moves */
2626 .mask_load: {4, 5, 4}, /* cost of loading mask register
2627 in QImode, HImode, SImode. */
2628 .mask_store: {2, 3, 2}, /* cost of storing mask register
2629 in QImode, HImode, SImode. */
2630 .mask_move: 2, /* cost of moving mask register. */
2631 /* End of register allocator costs. */
2632 },
2633
2634 COSTS_N_INSNS (1), /* cost of an add instruction */
2635 COSTS_N_INSNS (3), /* cost of a lea instruction */
2636 COSTS_N_INSNS (4), /* variable shift costs */
2637 COSTS_N_INSNS (4), /* constant shift costs */
2638 .mult_init: {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
2639 COSTS_N_INSNS (15), /* HI */
2640 COSTS_N_INSNS (15), /* SI */
2641 COSTS_N_INSNS (15), /* DI */
2642 COSTS_N_INSNS (15)}, /* other */
2643 .mult_bit: 0, /* cost of multiply per each bit set */
2644 .divide: {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
2645 COSTS_N_INSNS (56), /* HI */
2646 COSTS_N_INSNS (56), /* SI */
2647 COSTS_N_INSNS (56), /* DI */
2648 COSTS_N_INSNS (56)}, /* other */
2649 COSTS_N_INSNS (1), /* cost of movsx */
2650 COSTS_N_INSNS (1), /* cost of movzx */
2651 .large_insn: 16, /* "large" insn */
2652 .move_ratio: 6, /* MOVE_RATIO */
2653 .clear_ratio: 6, /* CLEAR_RATIO */
2654 .int_load: {4, 5, 4}, /* cost of loading integer registers
2655 in QImode, HImode and SImode.
2656 Relative to reg-reg move (2). */
2657 .int_store: {2, 3, 2}, /* cost of storing integer registers */
2658 .sse_load: {16, 16, 16, 32, 64}, /* cost of loading SSE register
2659 in 32bit, 64bit, 128bit, 256bit and 512bit */
2660 .sse_store: {16, 16, 16, 32, 64}, /* cost of storing SSE register
2661 in 32bit, 64bit, 128bit, 256bit and 512bit */
2662 .sse_unaligned_load: {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
2663 .sse_unaligned_store: {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
2664 .xmm_move: 12, .ymm_move: 24, .zmm_move: 48, /* cost of moving XMM,YMM,ZMM register */
2665 .sse_to_integer: 20, /* cost of moving SSE register to integer. */
2666 .gather_static: 16, .gather_per_elt: 16, /* Gather load static, per_elt. */
2667 .scatter_static: 16, .scatter_per_elt: 16, /* Scatter store static, per_elt. */
2668 .l1_cache_size: 8, /* size of l1 cache. */
2669 .l2_cache_size: 256, /* size of l2 cache. */
2670 .prefetch_block: 64, /* size of prefetch block */
2671 .simultaneous_prefetches: 6, /* number of parallel prefetches */
2672 .branch_cost: 2, /* Branch cost */
2673 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
2674 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2675 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
2676 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2677 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2678 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
2679
2680 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2681 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2682 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
2683 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
2684 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2685 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2686 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
2687 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
2688 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
2689 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
2690 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
2691 .memcpy: pentium4_memcpy,
2692 .memset: pentium4_memset,
2693 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2694 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2695 NULL, /* Loop alignment. */
2696 NULL, /* Jump alignment. */
2697 NULL, /* Label alignment. */
2698 NULL, /* Func alignment. */
2699 .small_unroll_ninsns: 4, /* Small unroll limit. */
2700 .small_unroll_factor: 2, /* Small unroll factor. */
2701};
2702
2703static stringop_algs nocona_memcpy[2] = { /* memcpy strategy used by nocona_cost; each size entry is {max, alg, flag} and the list ends with max -1.  NOTE(review): presumably slot 0 is for 32-bit and slot 1 for 64-bit code -- confirm.  */
2704 {.unknown_size: libcall, .size: {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2705 {.unknown_size: libcall, .size: {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2706 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2707
2708static stringop_algs nocona_memset[2] = { /* memset strategy used by nocona_cost; each size entry is {max, alg, flag} and the list ends with max -1.  NOTE(review): presumably slot 0 is for 32-bit and slot 1 for 64-bit code -- confirm.  */
2709 {.unknown_size: libcall, .size: {{6, loop_1_byte, false}, {48, loop, false},
2710 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2711 {.unknown_size: libcall, .size: {{24, loop, false}, {64, unrolled_loop, false},
2712 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2713
2714static const
2715struct processor_costs nocona_cost = { /* nocona tuning costs: register-allocator move costs first, then insn costs, cache/prefetch data, string-op strategies and alignments.  */
2716 .hard_register: {
2717 /* Start of register allocator costs. integer->integer move cost is 2. */
2718 .movzbl_load: 4, /* cost for loading QImode using movzbl */
2719 .int_load: {4, 4, 4}, /* cost of loading integer registers
2720 in QImode, HImode and SImode.
2721 Relative to reg-reg move (2). */
2722 .int_store: {4, 4, 4}, /* cost of storing integer registers */
2723 .fp_move: 12, /* cost of reg,reg fld/fst */
2724 .fp_load: {14, 14, 14}, /* cost of loading fp registers
2725 in SFmode, DFmode and XFmode */
2726 .fp_store: {14, 14, 14}, /* cost of storing fp registers
2727 in SFmode, DFmode and XFmode */
2728 .mmx_move: 14, /* cost of moving MMX register */
2729 .mmx_load: {12, 12}, /* cost of loading MMX registers
2730 in SImode and DImode */
2731 .mmx_store: {12, 12}, /* cost of storing MMX registers
2732 in SImode and DImode */
2733 .xmm_move: 6, .ymm_move: 12, .zmm_move: 24, /* cost of moving XMM,YMM,ZMM register */
2734 .sse_load: {12, 12, 12, 24, 48}, /* cost of loading SSE registers
2735 in 32,64,128,256 and 512-bit */
2736 .sse_store: {12, 12, 12, 24, 48}, /* cost of storing SSE registers
2737 in 32,64,128,256 and 512-bit */
2738 .sse_to_integer: 20, .integer_to_sse: 12, /* SSE->integer and integer->SSE moves */
2739 .mask_to_integer: 20, .integer_to_mask: 12, /* mask->integer and integer->mask moves */
2740 .mask_load: {4, 4, 4}, /* cost of loading mask register
2741 in QImode, HImode, SImode. */
2742 .mask_store: {4, 4, 4}, /* cost of storing mask register
2743 in QImode, HImode, SImode. */
2744 .mask_move: 2, /* cost of moving mask register. */
2745 /* End of register allocator costs. */
2746 },
2747
2748 COSTS_N_INSNS (1), /* cost of an add instruction */
2749 COSTS_N_INSNS (1), /* cost of a lea instruction */
2750 COSTS_N_INSNS (1), /* variable shift costs */
2751 COSTS_N_INSNS (1), /* constant shift costs */
2752 .mult_init: {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
2753 COSTS_N_INSNS (10), /* HI */
2754 COSTS_N_INSNS (10), /* SI */
2755 COSTS_N_INSNS (10), /* DI */
2756 COSTS_N_INSNS (10)}, /* other */
2757 .mult_bit: 0, /* cost of multiply per each bit set */
2758 .divide: {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
2759 COSTS_N_INSNS (66), /* HI */
2760 COSTS_N_INSNS (66), /* SI */
2761 COSTS_N_INSNS (66), /* DI */
2762 COSTS_N_INSNS (66)}, /* other */
2763 COSTS_N_INSNS (1), /* cost of movsx */
2764 COSTS_N_INSNS (1), /* cost of movzx */
2765 .large_insn: 16, /* "large" insn */
2766 .move_ratio: 17, /* MOVE_RATIO */
2767 .clear_ratio: 6, /* CLEAR_RATIO */
2768 .int_load: {4, 4, 4}, /* cost of loading integer registers
2769 in QImode, HImode and SImode.
2770 Relative to reg-reg move (2). */
2771 .int_store: {4, 4, 4}, /* cost of storing integer registers */
2772 .sse_load: {12, 12, 12, 24, 48}, /* cost of loading SSE register
2773 in 32bit, 64bit, 128bit, 256bit and 512bit */
2774 .sse_store: {12, 12, 12, 24, 48}, /* cost of storing SSE register
2775 in 32bit, 64bit, 128bit, 256bit and 512bit */
2776 .sse_unaligned_load: {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
2777 .sse_unaligned_store: {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
2778 .xmm_move: 6, .ymm_move: 12, .zmm_move: 24, /* cost of moving XMM,YMM,ZMM register */
2779 .sse_to_integer: 20, /* cost of moving SSE register to integer. */
2780 .gather_static: 12, .gather_per_elt: 12, /* Gather load static, per_elt. */
2781 .scatter_static: 12, .scatter_per_elt: 12, /* Scatter store static, per_elt. */
2782 .l1_cache_size: 8, /* size of l1 cache. */
2783 .l2_cache_size: 1024, /* size of l2 cache. */
2784 .prefetch_block: 64, /* size of prefetch block */
2785 .simultaneous_prefetches: 8, /* number of parallel prefetches */
2786 .branch_cost: 1, /* Branch cost */
2787 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
2788 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2789 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
2790 COSTS_N_INSNS (3), /* cost of FABS instruction. */
2791 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
2792 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
2793
2794 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2795 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2796 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
2797 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
2798 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
2799 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
2800 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
2801 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
2802 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
2803 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
2804 .reassoc_int: 1, .reassoc_fp: 1, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
2805 .memcpy: nocona_memcpy,
2806 .memset: nocona_memset,
2807 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2808 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2809 NULL, /* Loop alignment. */
2810 NULL, /* Jump alignment. */
2811 NULL, /* Label alignment. */
2812 NULL, /* Func alignment. */
2813 .small_unroll_ninsns: 4, /* Small unroll limit. */
2814 .small_unroll_factor: 2, /* Small unroll factor. */
2815};
2816
2817static stringop_algs atom_memcpy[2] = { /* memcpy strategy used by atom_cost; each size entry is {max, alg, flag} and the list ends with max -1.  NOTE(review): presumably slot 0 is for 32-bit and slot 1 for 64-bit code -- confirm.  */
2818 {.unknown_size: libcall, .size: {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2819 {.unknown_size: libcall, .size: {{32, loop, false}, {64, rep_prefix_4_byte, false},
2820 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2821static stringop_algs atom_memset[2] = { /* memset strategy used by atom_cost; each size entry is {max, alg, flag} and the list ends with max -1.  NOTE(review): presumably slot 0 is for 32-bit and slot 1 for 64-bit code -- confirm.  */
2822 {.unknown_size: libcall, .size: {{8, loop, false}, {15, unrolled_loop, false},
2823 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2824 {.unknown_size: libcall, .size: {{24, loop, false}, {32, unrolled_loop, false},
2825 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2826static const
2827struct processor_costs atom_cost = { /* atom tuning costs: register-allocator move costs first, then insn costs, cache/prefetch data, string-op strategies and alignments.  */
2828 .hard_register: {
2829 /* Start of register allocator costs. integer->integer move cost is 2. */
2830 .movzbl_load: 6, /* cost for loading QImode using movzbl */
2831 .int_load: {6, 6, 6}, /* cost of loading integer registers
2832 in QImode, HImode and SImode.
2833 Relative to reg-reg move (2). */
2834 .int_store: {6, 6, 6}, /* cost of storing integer registers */
2835 .fp_move: 4, /* cost of reg,reg fld/fst */
2836 .fp_load: {6, 6, 18}, /* cost of loading fp registers
2837 in SFmode, DFmode and XFmode */
2838 .fp_store: {14, 14, 24}, /* cost of storing fp registers
2839 in SFmode, DFmode and XFmode */
2840 .mmx_move: 2, /* cost of moving MMX register */
2841 .mmx_load: {8, 8}, /* cost of loading MMX registers
2842 in SImode and DImode */
2843 .mmx_store: {10, 10}, /* cost of storing MMX registers
2844 in SImode and DImode */
2845 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
2846 .sse_load: {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2847 in 32,64,128,256 and 512-bit */
2848 .sse_store: {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2849 in 32,64,128,256 and 512-bit */
2850 .sse_to_integer: 8, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
2851 .mask_to_integer: 8, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
2852 .mask_load: {6, 6, 6}, /* cost of loading mask register
2853 in QImode, HImode, SImode. */
2854 .mask_store: {6, 6, 6}, /* cost of storing mask register
2855 in QImode, HImode, SImode. */
2856 .mask_move: 2, /* cost of moving mask register. */
2857 /* End of register allocator costs. */
2858 },
2859
2860 COSTS_N_INSNS (1), /* cost of an add instruction */
2861 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction (one unit above the add cost) */
2862 COSTS_N_INSNS (1), /* variable shift costs */
2863 COSTS_N_INSNS (1), /* constant shift costs */
2864 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2865 COSTS_N_INSNS (4), /* HI */
2866 COSTS_N_INSNS (3), /* SI */
2867 COSTS_N_INSNS (4), /* DI */
2868 COSTS_N_INSNS (2)}, /* other */
2869 .mult_bit: 0, /* cost of multiply per each bit set */
2870 .divide: {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2871 COSTS_N_INSNS (26), /* HI */
2872 COSTS_N_INSNS (42), /* SI */
2873 COSTS_N_INSNS (74), /* DI */
2874 COSTS_N_INSNS (74)}, /* other */
2875 COSTS_N_INSNS (1), /* cost of movsx */
2876 COSTS_N_INSNS (1), /* cost of movzx */
2877 .large_insn: 8, /* "large" insn */
2878 .move_ratio: 17, /* MOVE_RATIO */
2879 .clear_ratio: 6, /* CLEAR_RATIO */
2880 .int_load: {6, 6, 6}, /* cost of loading integer registers
2881 in QImode, HImode and SImode.
2882 Relative to reg-reg move (2). */
2883 .int_store: {6, 6, 6}, /* cost of storing integer registers */
2884 .sse_load: {8, 8, 8, 16, 32}, /* cost of loading SSE register
2885 in 32bit, 64bit, 128bit, 256bit and 512bit */
2886 .sse_store: {8, 8, 8, 16, 32}, /* cost of storing SSE register
2887 in 32bit, 64bit, 128bit, 256bit and 512bit */
2888 .sse_unaligned_load: {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2889 .sse_unaligned_store: {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2890 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
2891 .sse_to_integer: 8, /* cost of moving SSE register to integer. */
2892 .gather_static: 8, .gather_per_elt: 8, /* Gather load static, per_elt. */
2893 .scatter_static: 8, .scatter_per_elt: 8, /* Scatter store static, per_elt. */
2894 .l1_cache_size: 32, /* size of l1 cache. */
2895 .l2_cache_size: 256, /* size of l2 cache. */
2896 .prefetch_block: 64, /* size of prefetch block */
2897 .simultaneous_prefetches: 6, /* number of parallel prefetches */
2898 .branch_cost: 3, /* Branch cost */
2899 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2900 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2901 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2902 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2903 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2904 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2905
2906 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2907 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2908 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2909 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2910 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2911 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2912 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
2913 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
2914 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
2915 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
2916 .reassoc_int: 2, .reassoc_fp: 2, .reassoc_vec_int: 2, .reassoc_vec_fp: 2, /* reassoc int, fp, vec_int, vec_fp. */
2917 .memcpy: atom_memcpy,
2918 .memset: atom_memset,
2919 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2920 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2921 .align_loop: "16", /* Loop alignment. */
2922 .align_jump: "16:8:8", /* Jump alignment. */
2923 .align_label: "0:0:8", /* Label alignment. */
2924 .align_func: "16", /* Func alignment. */
2925 .small_unroll_ninsns: 4, /* Small unroll limit. */
2926 .small_unroll_factor: 2, /* Small unroll factor. */
2927};
2928
2929static stringop_algs slm_memcpy[2] = { /* memcpy strategy used by slm_cost; each size entry is {max, alg, flag} and the list ends with max -1.  NOTE(review): presumably slot 0 is for 32-bit and slot 1 for 64-bit code -- confirm.  */
2930 {.unknown_size: libcall, .size: {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2931 {.unknown_size: libcall, .size: {{32, loop, false}, {64, rep_prefix_4_byte, false},
2932 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2933static stringop_algs slm_memset[2] = { /* memset strategy used by slm_cost; each size entry is {max, alg, flag} and the list ends with max -1.  NOTE(review): presumably slot 0 is for 32-bit and slot 1 for 64-bit code -- confirm.  */
2934 {.unknown_size: libcall, .size: {{8, loop, false}, {15, unrolled_loop, false},
2935 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2936 {.unknown_size: libcall, .size: {{24, loop, false}, {32, unrolled_loop, false},
2937 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2938static const
2939struct processor_costs slm_cost = { /* slm tuning costs: register-allocator move costs first, then insn costs, cache/prefetch data, string-op strategies and alignments.  */
2940 .hard_register: {
2941 /* Start of register allocator costs. integer->integer move cost is 2. */
2942 .movzbl_load: 8, /* cost for loading QImode using movzbl */
2943 .int_load: {8, 8, 8}, /* cost of loading integer registers
2944 in QImode, HImode and SImode.
2945 Relative to reg-reg move (2). */
2946 .int_store: {6, 6, 6}, /* cost of storing integer registers */
2947 .fp_move: 2, /* cost of reg,reg fld/fst */
2948 .fp_load: {8, 8, 18}, /* cost of loading fp registers
2949 in SFmode, DFmode and XFmode */
2950 .fp_store: {6, 6, 18}, /* cost of storing fp registers
2951 in SFmode, DFmode and XFmode */
2952 .mmx_move: 2, /* cost of moving MMX register */
2953 .mmx_load: {8, 8}, /* cost of loading MMX registers
2954 in SImode and DImode */
2955 .mmx_store: {6, 6}, /* cost of storing MMX registers
2956 in SImode and DImode */
2957 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
2958 .sse_load: {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2959 in 32,64,128,256 and 512-bit */
2960 .sse_store: {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2961 in 32,64,128,256 and 512-bit */
2962 .sse_to_integer: 8, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
2963 .mask_to_integer: 8, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
2964 .mask_load: {8, 8, 8}, /* cost of loading mask register
2965 in QImode, HImode, SImode. */
2966 .mask_store: {6, 6, 6}, /* cost of storing mask register
2967 in QImode, HImode, SImode. */
2968 .mask_move: 2, /* cost of moving mask register. */
2969 /* End of register allocator costs. */
2970 },
2971
2972 COSTS_N_INSNS (1), /* cost of an add instruction */
2973 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction (one unit above the add cost) */
2974 COSTS_N_INSNS (1), /* variable shift costs */
2975 COSTS_N_INSNS (1), /* constant shift costs */
2976 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2977 COSTS_N_INSNS (3), /* HI */
2978 COSTS_N_INSNS (3), /* SI */
2979 COSTS_N_INSNS (4), /* DI */
2980 COSTS_N_INSNS (2)}, /* other */
2981 .mult_bit: 0, /* cost of multiply per each bit set */
2982 .divide: {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2983 COSTS_N_INSNS (26), /* HI */
2984 COSTS_N_INSNS (42), /* SI */
2985 COSTS_N_INSNS (74), /* DI */
2986 COSTS_N_INSNS (74)}, /* other */
2987 COSTS_N_INSNS (1), /* cost of movsx */
2988 COSTS_N_INSNS (1), /* cost of movzx */
2989 .large_insn: 8, /* "large" insn */
2990 .move_ratio: 17, /* MOVE_RATIO */
2991 .clear_ratio: 6, /* CLEAR_RATIO */
2992 .int_load: {8, 8, 8}, /* cost of loading integer registers
2993 in QImode, HImode and SImode.
2994 Relative to reg-reg move (2). */
2995 .int_store: {6, 6, 6}, /* cost of storing integer registers */
2996 .sse_load: {8, 8, 8, 16, 32}, /* cost of loading SSE register
2997 in 32bit, 64bit, 128bit, 256bit and 512bit */
2998 .sse_store: {8, 8, 8, 16, 32}, /* cost of storing SSE register
2999 in 32bit, 64bit, 128bit, 256bit and 512bit */
3000 .sse_unaligned_load: {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
3001 .sse_unaligned_store: {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
3002 .xmm_move: 2, .ymm_move: 4, .zmm_move: 8, /* cost of moving XMM,YMM,ZMM register */
3003 .sse_to_integer: 8, /* cost of moving SSE register to integer. */
3004 .gather_static: 8, .gather_per_elt: 8, /* Gather load static, per_elt. */
3005 .scatter_static: 8, .scatter_per_elt: 8, /* Scatter store static, per_elt. */
3006 .l1_cache_size: 32, /* size of l1 cache. */
3007 .l2_cache_size: 256, /* size of l2 cache. */
3008 .prefetch_block: 64, /* size of prefetch block */
3009 .simultaneous_prefetches: 6, /* number of parallel prefetches */
3010 .branch_cost: 3, /* Branch cost */
3011 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3012 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3013 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3014 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3015 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3016 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3017
3018 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3019 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3020 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3021 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3022 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3023 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3024 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
3025 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
3026 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3027 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
3028 .reassoc_int: 1, .reassoc_fp: 2, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
3029 .memcpy: slm_memcpy,
3030 .memset: slm_memset,
3031 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3032 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3033 .align_loop: "16", /* Loop alignment. */
3034 .align_jump: "16:8:8", /* Jump alignment. */
3035 .align_label: "0:0:8", /* Label alignment. */
3036 .align_func: "16", /* Func alignment. */
3037 .small_unroll_ninsns: 4, /* Small unroll limit. */
3038 .small_unroll_factor: 2, /* Small unroll factor. */
3039};
3040
3041static stringop_algs tremont_memcpy[2] = { /* memcpy strategy used by tremont_cost; both slots are identical and each size entry is {max, alg, flag}, ending with max -1.  */
3042 {.unknown_size: libcall,
3043 .size: {{256, rep_prefix_1_byte, true},
3044 {256, loop, false}, /* NOTE(review): same 256 bound as the entry above; presumably a fallback when rep_prefix_1_byte cannot be used -- confirm.  */
3045 {-1, libcall, false}}},
3046 {.unknown_size: libcall,
3047 .size: {{256, rep_prefix_1_byte, true},
3048 {256, loop, false},
3049 {-1, libcall, false}}}};
3050static stringop_algs tremont_memset[2] = { /* memset strategy used by tremont_cost; both slots are identical and each size entry is {max, alg, flag}, ending with max -1.  */
3051 {.unknown_size: libcall,
3052 .size: {{256, rep_prefix_1_byte, true},
3053 {256, loop, false}, /* NOTE(review): same 256 bound as the entry above; presumably a fallback when rep_prefix_1_byte cannot be used -- confirm.  */
3054 {-1, libcall, false}}},
3055 {.unknown_size: libcall,
3056 .size: {{256, rep_prefix_1_byte, true},
3057 {256, loop, false},
3058 {-1, libcall, false}}}};
3059static const
3060struct processor_costs tremont_cost = { /* tremont tuning costs: register-allocator move costs first, then insn costs, cache/prefetch data, string-op strategies and alignments.  */
3061 .hard_register: {
3062 /* Start of register allocator costs. integer->integer move cost is 2. */
3063 .movzbl_load: 6, /* cost for loading QImode using movzbl */
3064 .int_load: {6, 6, 6}, /* cost of loading integer registers
3065 in QImode, HImode and SImode.
3066 Relative to reg-reg move (2). */
3067 .int_store: {6, 6, 6}, /* cost of storing integer registers */
3068 .fp_move: 4, /* cost of reg,reg fld/fst */
3069 .fp_load: {6, 6, 12}, /* cost of loading fp registers
3070 in SFmode, DFmode and XFmode */
3071 .fp_store: {6, 6, 12}, /* cost of storing fp registers
3072 in SFmode, DFmode and XFmode */
3073 .mmx_move: 2, /* cost of moving MMX register */
3074 .mmx_load: {6, 6}, /* cost of loading MMX registers
3075 in SImode and DImode */
3076 .mmx_store: {6, 6}, /* cost of storing MMX registers
3077 in SImode and DImode */
3078 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
3079 .sse_load: {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3080 in 32,64,128,256 and 512-bit */
3081 .sse_store: {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3082 in 32,64,128,256 and 512-bit */
3083 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
3084 .mask_to_integer: 6, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
3085 .mask_load: {6, 6, 6}, /* cost of loading mask register
3086 in QImode, HImode, SImode. */
3087 .mask_store: {6, 6, 6}, /* cost of storing mask register
3088 in QImode, HImode, SImode. */
3089 .mask_move: 2, /* cost of moving mask register. */
3090 /* End of register allocator costs. */
3091 },
3092
3093 COSTS_N_INSNS (1), /* cost of an add instruction */
3094 /* Setting cost to 2 makes our current implementation of synth_mult result in
3095 use of unnecessary temporary registers causing regression on several
3096 SPECfp benchmarks. */
3097 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3098 COSTS_N_INSNS (1), /* variable shift costs */
3099 COSTS_N_INSNS (1), /* constant shift costs */
3100 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3101 COSTS_N_INSNS (4), /* HI */
3102 COSTS_N_INSNS (3), /* SI */
3103 COSTS_N_INSNS (4), /* DI */
3104 COSTS_N_INSNS (4)}, /* other */
3105 .mult_bit: 0, /* cost of multiply per each bit set */
3106 .divide: {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3107 COSTS_N_INSNS (22), /* HI */
3108 COSTS_N_INSNS (30), /* SI */
3109 COSTS_N_INSNS (74), /* DI */
3110 COSTS_N_INSNS (74)}, /* other */
3111 COSTS_N_INSNS (1), /* cost of movsx */
3112 COSTS_N_INSNS (1), /* cost of movzx */
3113 .large_insn: 8, /* "large" insn */
3114 .move_ratio: 17, /* MOVE_RATIO */
3115 .clear_ratio: 17, /* CLEAR_RATIO */
3116 .int_load: {6, 6, 6}, /* cost of loading integer registers
3117 in QImode, HImode and SImode.
3118 Relative to reg-reg move (2). */
3119 .int_store: {6, 6, 6}, /* cost of storing integer registers */
3120 .sse_load: {6, 6, 6, 10, 15}, /* cost of loading SSE register
3121 in 32bit, 64bit, 128bit, 256bit and 512bit */
3122 .sse_store: {6, 6, 6, 10, 15}, /* cost of storing SSE register
3123 in 32bit, 64bit, 128bit, 256bit and 512bit */
3124 .sse_unaligned_load: {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3125 .sse_unaligned_store: {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3126 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
3127 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
3128 .gather_static: 18, .gather_per_elt: 6, /* Gather load static, per_elt. */
3129 .scatter_static: 18, .scatter_per_elt: 6, /* Scatter store static, per_elt. */
3130 .l1_cache_size: 32, /* size of l1 cache. */
3131 .l2_cache_size: 512, /* size of l2 cache. */
3132 .prefetch_block: 64, /* size of prefetch block */
3133 .simultaneous_prefetches: 6, /* number of parallel prefetches */
3134 /* Benchmarks show large regressions on K8 sixtrack benchmark when this
3135 value is increased to perhaps more appropriate value of 5.  NOTE(review): this remark names K8 and looks inherited from another cost table -- confirm it applies here. */
3136 .branch_cost: 3, /* Branch cost */
3137 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3138 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3139 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
3140 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3141 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3142 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
3143
3144 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3145 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3146 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3147 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3148 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3149 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3150 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3151 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3152 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3153 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3154 .reassoc_int: 1, .reassoc_fp: 4, .reassoc_vec_int: 3, .reassoc_vec_fp: 3, /* reassoc int, fp, vec_int, vec_fp. */
3155 .memcpy: tremont_memcpy,
3156 .memset: tremont_memset,
3157 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3158 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3159 .align_loop: "16:11:8", /* Loop alignment. */
3160 .align_jump: "16:11:8", /* Jump alignment. */
3161 .align_label: "0:0:8", /* Label alignment. */
3162 .align_func: "16", /* Func alignment. */
3163 .small_unroll_ninsns: 4, /* Small unroll limit. */
3164 .small_unroll_factor: 2, /* Small unroll factor. */
3165};
3166
/* String-operation strategy table referenced by the .memcpy entry of
   intel_cost.  Each of the two elements names the algorithm used when the
   block size is unknown (.unknown_size), followed by a list of
   {size bound, algorithm, flag} triples (.size).
   NOTE(review): presumably element [0] applies to 32-bit and element [1] to
   64-bit code, bounds are byte counts, and a bound of -1 is the catch-all
   entry -- confirm against the stringop_algs definition.  */
3167 static stringop_algs intel_memcpy[2] = {
3168 {.unknown_size: libcall, .size: {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3169 {.unknown_size: libcall, .size: {{32, loop, false}, {64, rep_prefix_4_byte, false},
3170 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* String-operation strategy table referenced by the .memset entry of
   intel_cost.  Same layout as the memcpy tables: per element, the algorithm
   for unknown sizes (.unknown_size) and a list of
   {size bound, algorithm, flag} triples (.size).
   NOTE(review): presumably element [0] is the 32-bit and [1] the 64-bit
   variant, with -1 as the catch-all bound -- confirm.  */
3171 static stringop_algs intel_memset[2] = {
3172 {.unknown_size: libcall, .size: {{8, loop, false}, {15, unrolled_loop, false},
3173 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3174 {.unknown_size: libcall, .size: {{24, loop, false}, {32, unrolled_loop, false},
3175 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table intel_cost; its string operations use intel_memcpy and
   intel_memset.  Many entries are positional (no field name), so the order
   of the lines is significant; the trailing comment on each line names the
   cost being set.  Costs inside .hard_register are stated relative to an
   integer reg-reg move of 2.  */
3176 static const
3177 struct processor_costs intel_cost = {
3178 .hard_register: {
3179 /* Start of register allocator costs. integer->integer move cost is 2. */
3180 .movzbl_load: 6, /* cost for loading QImode using movzbl */
3181 .int_load: {4, 4, 4}, /* cost of loading integer registers
3182 in QImode, HImode and SImode.
3183 Relative to reg-reg move (2). */
3184 .int_store: {6, 6, 6}, /* cost of storing integer registers */
3185 .fp_move: 2, /* cost of reg,reg fld/fst */
3186 .fp_load: {6, 6, 8}, /* cost of loading fp registers
3187 in SFmode, DFmode and XFmode */
3188 .fp_store: {6, 6, 10}, /* cost of storing fp registers
3189 in SFmode, DFmode and XFmode */
3190 .mmx_move: 2, /* cost of moving MMX register */
3191 .mmx_load: {6, 6}, /* cost of loading MMX registers
3192 in SImode and DImode */
3193 .mmx_store: {6, 6}, /* cost of storing MMX registers
3194 in SImode and DImode */
3195 .xmm_move: 2, .ymm_move: 2, .zmm_move: 2, /* cost of moving XMM,YMM,ZMM register */
3196 .sse_load: {6, 6, 6, 6, 6}, /* cost of loading SSE registers
3197 in 32,64,128,256 and 512-bit */
3198 .sse_store: {6, 6, 6, 6, 6}, /* cost of storing SSE registers
3199 in 32,64,128,256 and 512-bit */
3200 .sse_to_integer: 4, .integer_to_sse: 4, /* SSE->integer and integer->SSE moves */
3201 .mask_to_integer: 4, .integer_to_mask: 4, /* mask->integer and integer->mask moves */
3202 .mask_load: {4, 4, 4}, /* cost of loading mask register
3203 in QImode, HImode, SImode. */
3204 .mask_store: {6, 6, 6}, /* cost of storing mask register
3205 in QImode, HImode, SImode. */
3206 .mask_move: 2, /* cost of moving mask register. */
3207 /* End of register allocator costs. */
3208 },
3209
3210 COSTS_N_INSNS (1), /* cost of an add instruction */
3211 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3212 COSTS_N_INSNS (1), /* variable shift costs */
3213 COSTS_N_INSNS (1), /* constant shift costs */
3214 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3215 COSTS_N_INSNS (3), /* HI */
3216 COSTS_N_INSNS (3), /* SI */
3217 COSTS_N_INSNS (4), /* DI */
3218 COSTS_N_INSNS (2)}, /* other */
3219 .mult_bit: 0, /* cost of multiply per each bit set */
3220 .divide: {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3221 COSTS_N_INSNS (26), /* HI */
3222 COSTS_N_INSNS (42), /* SI */
3223 COSTS_N_INSNS (74), /* DI */
3224 COSTS_N_INSNS (74)}, /* other */
3225 COSTS_N_INSNS (1), /* cost of movsx */
3226 COSTS_N_INSNS (1), /* cost of movzx */
3227 .large_insn: 8, /* "large" insn */
3228 .move_ratio: 17, /* MOVE_RATIO */
3229 .clear_ratio: 6, /* CLEAR_RATIO */
3230 .int_load: {4, 4, 4}, /* cost of loading integer registers
3231 in QImode, HImode and SImode.
3232 Relative to reg-reg move (2). */
3233 .int_store: {6, 6, 6}, /* cost of storing integer registers */
3234 .sse_load: {6, 6, 6, 6, 6}, /* cost of loading SSE register
3235 in 32bit, 64bit, 128bit, 256bit and 512bit */
3236 .sse_store: {6, 6, 6, 6, 6}, /* cost of storing SSE register
3237 in 32bit, 64bit, 128bit, 256bit and 512bit */
3238 .sse_unaligned_load: {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
3239 .sse_unaligned_store: {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
3240 .xmm_move: 2, .ymm_move: 2, .zmm_move: 2, /* cost of moving XMM,YMM,ZMM register */
3241 .sse_to_integer: 4, /* cost of moving SSE register to integer. */
3242 .gather_static: 6, .gather_per_elt: 6, /* Gather load static, per_elt. */
3243 .scatter_static: 6, .scatter_per_elt: 6, /* Scatter store static, per_elt. */
3244 .l1_cache_size: 32, /* size of l1 cache. */
3245 .l2_cache_size: 256, /* size of l2 cache. */
3246 .prefetch_block: 64, /* size of prefetch block */
3247 .simultaneous_prefetches: 6, /* number of parallel prefetches */
3248 .branch_cost: 3, /* Branch cost */
3249 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3250 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3251 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3252 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3253 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3254 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3255
3256 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3257 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
3258 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
3259 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
3260 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3261 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3262 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
3263 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
3264 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
3265 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
3266 .reassoc_int: 1, .reassoc_fp: 4, .reassoc_vec_int: 1, .reassoc_vec_fp: 1, /* reassoc int, fp, vec_int, vec_fp. */
3267 .memcpy: intel_memcpy,
3268 .memset: intel_memset,
3269 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3270 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3271 .align_loop: "16", /* Loop alignment. */
3272 .align_jump: "16:8:8", /* Jump alignment. */
3273 .align_label: "0:0:8", /* Label alignment. */
3274 .align_func: "16", /* Func alignment. */
3275 .small_unroll_ninsns: 4, /* Small unroll limit. */
3276 .small_unroll_factor: 2, /* Small unroll factor. */
3277 };
3278
3279/* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */
/* String-operation strategy table referenced by the .memcpy entry of
   lujiazui_cost.  Each of the two elements names the algorithm for unknown
   sizes (.unknown_size) and a list of {size bound, algorithm, flag}
   triples (.size).
   NOTE(review): presumably element [0] is the 32-bit and [1] the 64-bit
   variant, with -1 as the catch-all bound -- confirm.  */
3280 static stringop_algs lujiazui_memcpy[2] = {
3281 {.unknown_size: libcall, .size: {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3282 {-1, libcall, false}}},
3283 {.unknown_size: libcall, .size: {{12, unrolled_loop, true}, {32, loop, false},
3284 {6144, rep_prefix_8_byte, false},
3285 {-1, libcall, false}}}};
/* String-operation strategy table referenced by the .memset entry of
   lujiazui_cost; same layout as lujiazui_memcpy (.unknown_size algorithm
   plus {size bound, algorithm, flag} triples in .size).
   NOTE(review): presumably element [0] is the 32-bit and [1] the 64-bit
   variant, with -1 as the catch-all bound -- confirm.  */
3286 static stringop_algs lujiazui_memset[2] = {
3287 {.unknown_size: libcall, .size: {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3288 {-1, libcall, false}}},
3289 {.unknown_size: libcall, .size: {{12, loop, true}, {32, loop, false},
3290 {640, rep_prefix_8_byte, false},
3291 {-1, libcall, false}}}};
/* Cost table lujiazui_cost; its string operations use lujiazui_memcpy and
   lujiazui_memset.  Many entries are positional (no field name), so the
   order of the lines is significant; the trailing comment on each line
   names the cost being set.  Costs inside .hard_register are stated
   relative to an integer reg-reg move of 2.  */
3292 static const
3293 struct processor_costs lujiazui_cost = {
3294 .hard_register: {
3295 /* Start of register allocator costs. integer->integer move cost is 2. */
3296 .movzbl_load: 6, /* cost for loading QImode using movzbl. */
3297 .int_load: {6, 6, 6}, /* cost of loading integer registers
3298 in QImode, HImode and SImode.
3299 Relative to reg-reg move (2). */
3300 .int_store: {6, 6, 6}, /* cost of storing integer registers. */
3301 .fp_move: 2, /* cost of reg,reg fld/fst. */
3302 .fp_load: {6, 6, 8}, /* cost of loading fp registers
3303 in SFmode, DFmode and XFmode. */
3304 .fp_store: {6, 6, 8}, /* cost of storing fp registers
3305 in SFmode, DFmode and XFmode. */
3306 .mmx_move: 2, /* cost of moving MMX register. */
3307 .mmx_load: {6, 6}, /* cost of loading MMX registers
3308 in SImode and DImode. */
3309 .mmx_store: {6, 6}, /* cost of storing MMX registers
3310 in SImode and DImode. */
3311 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register. */
3312 .sse_load: {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3313 in 32,64,128,256 and 512-bit. */
3314 .sse_store: {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3315 in 32,64,128,256 and 512-bit. */
3316 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves. */
3317 .mask_to_integer: 6, .integer_to_mask: 6, /* mask->integer and integer->mask moves. */
3318 .mask_load: {6, 6, 6}, /* cost of loading mask register
3319 in QImode, HImode, SImode. */
3320 .mask_store: {6, 6, 6}, /* cost of storing mask register
3321 in QImode, HImode, SImode. */
3322 .mask_move: 2, /* cost of moving mask register. */
3323 /* End of register allocator costs. */
3324 },
3325
3326 COSTS_N_INSNS (1), /* cost of an add instruction. */
3327 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction. */
3328 COSTS_N_INSNS (1), /* variable shift costs. */
3329 COSTS_N_INSNS (1), /* constant shift costs. */
3330 .mult_init: {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3331 COSTS_N_INSNS (3), /* HI. */
3332 COSTS_N_INSNS (3), /* SI. */
3333 COSTS_N_INSNS (12), /* DI. */
3334 COSTS_N_INSNS (14)}, /* other. */
3335 .mult_bit: 0, /* cost of multiply per each bit set. */
3336 .divide: {COSTS_N_INSNS (22), /* cost of a divide/mod for QI. */
3337 COSTS_N_INSNS (24), /* HI. */
3338 COSTS_N_INSNS (24), /* SI. */
3339 COSTS_N_INSNS (150), /* DI. */
3340 COSTS_N_INSNS (152)}, /* other. */
3341 COSTS_N_INSNS (1), /* cost of movsx. */
3342 COSTS_N_INSNS (1), /* cost of movzx. */
3343 .large_insn: 8, /* "large" insn. */
3344 .move_ratio: 17, /* MOVE_RATIO. */
3345 .clear_ratio: 6, /* CLEAR_RATIO. */
3346 .int_load: {6, 6, 6}, /* cost of loading integer registers
3347 in QImode, HImode and SImode.
3348 Relative to reg-reg move (2). */
3349 .int_store: {6, 6, 6}, /* cost of storing integer registers. */
3350 .sse_load: {6, 6, 6, 10, 15}, /* cost of loading SSE register
3351 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3352 .sse_store: {6, 6, 6, 10, 15}, /* cost of storing SSE register
3353 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3354 .sse_unaligned_load: {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3355 .sse_unaligned_store: {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3356 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register. */
3357 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
3358 .gather_static: 18, .gather_per_elt: 6, /* Gather load static, per_elt. */
3359 .scatter_static: 18, .scatter_per_elt: 6, /* Scatter store static, per_elt. */
3360 .l1_cache_size: 32, /* size of l1 cache. */
3361 .l2_cache_size: 4096, /* size of l2 cache. */
3362 .prefetch_block: 64, /* size of prefetch block. */
3363 /* Lujiazui processors never drop prefetches, like AMD processors. */
3364 .simultaneous_prefetches: 100, /* number of parallel prefetches. */
3365 .branch_cost: 3, /* Branch cost. */
3366 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3367 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
3368 COSTS_N_INSNS (22), /* cost of FDIV instruction. */
3369 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3370 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3371 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
3372
3373 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3374 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3375 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3376 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
3377 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3378 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3379 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3380 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3381 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
3382 COSTS_N_INSNS (60), /* cost of SQRTSD instruction. */
3383 .reassoc_int: 1, .reassoc_fp: 4, .reassoc_vec_int: 3, .reassoc_vec_fp: 3, /* reassoc int, fp, vec_int, vec_fp. */
3384 .memcpy: lujiazui_memcpy,
3385 .memset: lujiazui_memset,
3386 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3387 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3388 .align_loop: "16:11:8", /* Loop alignment. */
3389 .align_jump: "16:11:8", /* Jump alignment. */
3390 .align_label: "0:0:8", /* Label alignment. */
3391 .align_func: "16", /* Func alignment. */
3392 .small_unroll_ninsns: 4, /* Small unroll limit. */
3393 .small_unroll_factor: 2, /* Small unroll factor. */
3394 };
3395
3396/* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU. */
/* String-operation strategy table referenced by the .memcpy entry of
   yongfeng_cost.  Each of the two elements names the algorithm for unknown
   sizes (.unknown_size) and a list of {size bound, algorithm, flag}
   triples (.size).
   NOTE(review): presumably element [0] is the 32-bit and [1] the 64-bit
   variant, with -1 as the catch-all bound -- confirm.  */
3397 static stringop_algs yongfeng_memcpy[2] = {
3398 {.unknown_size: libcall, .size: {{6, unrolled_loop, true}, {256, unrolled_loop, false},
3399 {-1, libcall, false}}},
3400 {.unknown_size: libcall, .size: {{8, loop, false}, {512, unrolled_loop, false},
3401 {-1, libcall, false}}}};
/* String-operation strategy table referenced by the .memset entry of
   yongfeng_cost; same layout as yongfeng_memcpy (.unknown_size algorithm
   plus {size bound, algorithm, flag} triples in .size).
   NOTE(review): presumably element [0] is the 32-bit and [1] the 64-bit
   variant, with -1 as the catch-all bound -- confirm.  */
3402 static stringop_algs yongfeng_memset[2] = {
3403 {.unknown_size: libcall, .size: {{6, loop_1_byte, false}, {128, loop, false},
3404 {-1, libcall, false}}},
3405 {.unknown_size: libcall, .size: {{2, rep_prefix_4_byte, false}, {64, loop, false},
3406 {1024, vector_loop, false},
3407 {-1, libcall, false}}}};
/* Cost table yongfeng_cost; its string operations use yongfeng_memcpy and
   yongfeng_memset.  Many entries are positional (no field name), so the
   order of the lines is significant; the trailing comment on each line
   names the cost being set.  Costs inside .hard_register are stated
   relative to an integer reg-reg move of 2.  */
3408 static const
3409 struct processor_costs yongfeng_cost = {
3410 .hard_register: {
3411 /* Start of register allocator costs. integer->integer move cost is 2. */
3412 .movzbl_load: 8, /* cost for loading QImode using movzbl. */
3413 .int_load: {8, 8, 8}, /* cost of loading integer registers
3414 in QImode, HImode and SImode.
3415 Relative to reg-reg move (2). */
3416 .int_store: {8, 8, 8}, /* cost of storing integer registers. */
3417 .fp_move: 2, /* cost of reg,reg fld/fst. */
3418 .fp_load: {8, 8, 8}, /* cost of loading fp registers
3419 in SFmode, DFmode and XFmode. */
3420 .fp_store: {8, 8, 8}, /* cost of storing fp registers
3421 in SFmode, DFmode and XFmode. */
3422 .mmx_move: 2, /* cost of moving MMX register. */
3423 .mmx_load: {8, 8}, /* cost of loading MMX registers
3424 in SImode and DImode. */
3425 .mmx_store: {8, 8}, /* cost of storing MMX registers
3426 in SImode and DImode. */
3427 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register. */
3428 .sse_load: {8, 8, 8, 10, 15}, /* cost of loading SSE registers
3429 in 32,64,128,256 and 512-bit. */
3430 .sse_store: {8, 8, 8, 10, 15}, /* cost of storing SSE registers
3431 in 32,64,128,256 and 512-bit. */
3432 .sse_to_integer: 8, .integer_to_sse: 8, /* SSE->integer and integer->SSE moves. */
3433 .mask_to_integer: 8, .integer_to_mask: 8, /* mask->integer and integer->mask moves. */
3434 .mask_load: {8, 8, 8}, /* cost of loading mask register
3435 in QImode, HImode, SImode. */
3436 .mask_store: {8, 8, 8}, /* cost of storing mask register
3437 in QImode, HImode, SImode. */
3438 .mask_move: 2, /* cost of moving mask register. */
3439 /* End of register allocator costs. */
3440 },
3441
3442 COSTS_N_INSNS (1), /* cost of an add instruction. */
3443 COSTS_N_INSNS (1), /* cost of a lea instruction. */
3444 COSTS_N_INSNS (1), /* variable shift costs. */
3445 COSTS_N_INSNS (1), /* constant shift costs. */
3446 .mult_init: {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3447 COSTS_N_INSNS (3), /* HI. */
3448 COSTS_N_INSNS (2), /* SI. */
3449 COSTS_N_INSNS (2), /* DI. */
3450 COSTS_N_INSNS (3)}, /* other. */
3451 .mult_bit: 0, /* cost of multiply per each bit set. */
3452 .divide: {COSTS_N_INSNS (8), /* cost of a divide/mod for QI. */
3453 COSTS_N_INSNS (9), /* HI. */
3454 COSTS_N_INSNS (8), /* SI. */
3455 COSTS_N_INSNS (41), /* DI. */
3456 COSTS_N_INSNS (41)}, /* other. */
3457 COSTS_N_INSNS (1), /* cost of movsx. */
3458 COSTS_N_INSNS (1), /* cost of movzx. */
3459 .large_insn: 8, /* "large" insn. */
3460 .move_ratio: 17, /* MOVE_RATIO. */
3461 .clear_ratio: 6, /* CLEAR_RATIO. */
3462 .int_load: {8, 8, 8}, /* cost of loading integer registers
3463 in QImode, HImode and SImode.
3464 Relative to reg-reg move (2). */
3465 .int_store: {8, 8, 8}, /* cost of storing integer registers. */
3466 .sse_load: {8, 8, 8, 12, 15}, /* cost of loading SSE register
3467 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3468 .sse_store: {8, 8, 8, 12, 15}, /* cost of storing SSE register
3469 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3470 .sse_unaligned_load: {8, 8, 8, 12, 15}, /* cost of unaligned loads. */
3471 .sse_unaligned_store: {8, 8, 8, 12, 15}, /* cost of unaligned stores. */
3472 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register. */
3473 .sse_to_integer: 8, /* cost of moving SSE register to integer. */
3474 .gather_static: 18, .gather_per_elt: 6, /* Gather load static, per_elt. */
3475 .scatter_static: 18, .scatter_per_elt: 6, /* Scatter store static, per_elt. */
3476 .l1_cache_size: 32, /* size of l1 cache. */
3477 .l2_cache_size: 256, /* size of l2 cache. */
3478 .prefetch_block: 64, /* size of prefetch block. */
3479 .simultaneous_prefetches: 12, /* number of parallel prefetches. */
3480 .branch_cost: 3, /* Branch cost. */
3481 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3482 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
3483 COSTS_N_INSNS (14), /* cost of FDIV instruction. */
3484 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3485 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3486 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3487
3488 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3489 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3490 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3491 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
3492 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3493 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3494 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
3495 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
3496 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3497 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
3498 .reassoc_int: 4, .reassoc_fp: 4, .reassoc_vec_int: 4, .reassoc_vec_fp: 4, /* reassoc int, fp, vec_int, vec_fp. */
3499 .memcpy: yongfeng_memcpy,
3500 .memset: yongfeng_memset,
3501 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3502 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3503 .align_loop: "16:11:8", /* Loop alignment. */
3504 .align_jump: "16:11:8", /* Jump alignment. */
3505 .align_label: "0:0:8", /* Label alignment. */
3506 .align_func: "16", /* Func alignment. */
3507 .small_unroll_ninsns: 4, /* Small unroll limit. */
3508 .small_unroll_factor: 2, /* Small unroll factor. */
3509 };
3510
3511
3512/* Generic should produce code tuned for Core-i7 (and newer chips)
3513 and btver1 (and newer chips). */
3514
/* String-operation strategy table referenced by the .memcpy entry of
   generic_cost.  Each of the two elements names the algorithm for unknown
   sizes (.unknown_size) and a list of {size bound, algorithm, flag}
   triples (.size).
   NOTE(review): presumably element [0] is the 32-bit and [1] the 64-bit
   variant, with -1 as the catch-all bound -- confirm.  */
3515 static stringop_algs generic_memcpy[2] = {
3516 {.unknown_size: libcall, .size: {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3517 {-1, libcall, false}}},
3518 {.unknown_size: libcall, .size: {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3519 {-1, libcall, false}}}};
/* String-operation strategy table referenced by the .memset entry of
   generic_cost; its contents are identical to generic_memcpy.
   NOTE(review): presumably element [0] is the 32-bit and [1] the 64-bit
   variant, with -1 as the catch-all bound -- confirm.  */
3520 static stringop_algs generic_memset[2] = {
3521 {.unknown_size: libcall, .size: {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3522 {-1, libcall, false}}},
3523 {.unknown_size: libcall, .size: {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3524 {-1, libcall, false}}}};
/* Cost table generic_cost; its string operations use generic_memcpy and
   generic_memset.  Many entries are positional (no field name), so the
   order of the lines is significant; the trailing comment on each line
   names the cost being set.  Costs inside .hard_register are stated
   relative to an integer reg-reg move of 2.  */
3525 static const
3526 struct processor_costs generic_cost = {
3527 .hard_register: {
3528 /* Start of register allocator costs. integer->integer move cost is 2. */
3529 .movzbl_load: 6, /* cost for loading QImode using movzbl */
3530 .int_load: {6, 6, 6}, /* cost of loading integer registers
3531 in QImode, HImode and SImode.
3532 Relative to reg-reg move (2). */
3533 .int_store: {6, 6, 6}, /* cost of storing integer registers */
3534 .fp_move: 4, /* cost of reg,reg fld/fst */
3535 .fp_load: {6, 6, 12}, /* cost of loading fp registers
3536 in SFmode, DFmode and XFmode */
3537 .fp_store: {6, 6, 12}, /* cost of storing fp registers
3538 in SFmode, DFmode and XFmode */
3539 .mmx_move: 2, /* cost of moving MMX register */
3540 .mmx_load: {6, 6}, /* cost of loading MMX registers
3541 in SImode and DImode */
3542 .mmx_store: {6, 6}, /* cost of storing MMX registers
3543 in SImode and DImode */
3544 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
3545 .sse_load: {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3546 in 32,64,128,256 and 512-bit */
3547 .sse_store: {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3548 in 32,64,128,256 and 512-bit */
3549 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
3550 .mask_to_integer: 6, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
3551 .mask_load: {6, 6, 6}, /* cost of loading mask register
3552 in QImode, HImode, SImode. */
3553 .mask_store: {6, 6, 6}, /* cost of storing mask register
3554 in QImode, HImode, SImode. */
3555 .mask_move: 2, /* cost of moving mask register. */
3556 /* End of register allocator costs. */
3557 },
3558
3559 COSTS_N_INSNS (1), /* cost of an add instruction */
3560 /* Setting cost to 2 makes our current implementation of synth_mult result in
3561 use of unnecessary temporary registers causing regression on several
3562 SPECfp benchmarks. */
3563 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3564 COSTS_N_INSNS (1), /* variable shift costs */
3565 COSTS_N_INSNS (1), /* constant shift costs */
3566 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3567 COSTS_N_INSNS (4), /* HI */
3568 COSTS_N_INSNS (3), /* SI */
3569 COSTS_N_INSNS (4), /* DI */
3570 COSTS_N_INSNS (4)}, /* other */
3571 .mult_bit: 0, /* cost of multiply per each bit set */
3572 .divide: {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3573 COSTS_N_INSNS (22), /* HI */
3574 COSTS_N_INSNS (30), /* SI */
3575 COSTS_N_INSNS (74), /* DI */
3576 COSTS_N_INSNS (74)}, /* other */
3577 COSTS_N_INSNS (1), /* cost of movsx */
3578 COSTS_N_INSNS (1), /* cost of movzx */
3579 .large_insn: 8, /* "large" insn */
3580 .move_ratio: 17, /* MOVE_RATIO */
3581 .clear_ratio: 6, /* CLEAR_RATIO */
3582 .int_load: {6, 6, 6}, /* cost of loading integer registers
3583 in QImode, HImode and SImode.
3584 Relative to reg-reg move (2). */
3585 .int_store: {6, 6, 6}, /* cost of storing integer registers */
3586 .sse_load: {6, 6, 6, 10, 15}, /* cost of loading SSE register
3587 in 32bit, 64bit, 128bit, 256bit and 512bit */
3588 .sse_store: {6, 6, 6, 10, 15}, /* cost of storing SSE register
3589 in 32bit, 64bit, 128bit, 256bit and 512bit */
3590 .sse_unaligned_load: {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3591 .sse_unaligned_store: {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3592 .xmm_move: 2, .ymm_move: 3, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
3593 .sse_to_integer: 6, /* cost of moving SSE register to integer. */
3594 .gather_static: 18, .gather_per_elt: 6, /* Gather load static, per_elt. */
3595 .scatter_static: 18, .scatter_per_elt: 6, /* Scatter store static, per_elt. */
3596 .l1_cache_size: 32, /* size of l1 cache. */
3597 .l2_cache_size: 512, /* size of l2 cache. */
3598 .prefetch_block: 64, /* size of prefetch block */
3599 .simultaneous_prefetches: 6, /* number of parallel prefetches */
3600 /* Benchmarks show large regressions on K8 sixtrack benchmark when this
3601 value is increased to perhaps more appropriate value of 5. */
3602 .branch_cost: 3, /* Branch cost */
3603 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3604 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3605 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
3606 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3607 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3608 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
3609
3610 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3611 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3612 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3613 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3614 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3615 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3616 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3617 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3618 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3619 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3620 .reassoc_int: 1, .reassoc_fp: 4, .reassoc_vec_int: 3, .reassoc_vec_fp: 3, /* reassoc int, fp, vec_int, vec_fp. */
3621 .memcpy: generic_memcpy,
3622 .memset: generic_memset,
3623 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3624 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3625 .align_loop: "16:11:8", /* Loop alignment. */
3626 .align_jump: "16:11:8", /* Jump alignment. */
3627 .align_label: "0:0:8", /* Label alignment. */
3628 .align_func: "16", /* Func alignment. */
3629 .small_unroll_ninsns: 4, /* Small unroll limit. */
3630 .small_unroll_factor: 2, /* Small unroll factor. */
3631 };
3632
3633 /* core_cost should produce code tuned for the Core family of CPUs. */
/* String-operation strategy table referenced by the .memcpy entry of
   core_cost.  Each of the two elements names the algorithm for unknown
   sizes (.unknown_size) and a list of {size bound, algorithm, flag}
   triples (.size).
   NOTE(review): presumably element [0] is the 32-bit and [1] the 64-bit
   variant, with -1 as the catch-all bound -- confirm.  */
3634 static stringop_algs core_memcpy[2] = {
3635 {.unknown_size: libcall, .size: {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
3636 {.unknown_size: libcall, .size: {{24, loop, true}, {128, rep_prefix_8_byte, true},
3637 {-1, libcall, false}}}};
/* String-operation strategy table referenced by the .memset entry of
   core_cost; same layout as core_memcpy (.unknown_size algorithm plus
   {size bound, algorithm, flag} triples in .size).
   NOTE(review): presumably element [0] is the 32-bit and [1] the 64-bit
   variant, with -1 as the catch-all bound -- confirm.  */
3638 static stringop_algs core_memset[2] = {
3639 {.unknown_size: libcall, .size: {{6, loop_1_byte, true},
3640 {24, loop, true},
3641 {8192, rep_prefix_4_byte, true},
3642 {-1, libcall, false}}},
3643 {.unknown_size: libcall, .size: {{24, loop, true}, {512, rep_prefix_8_byte, true},
3644 {-1, libcall, false}}}};
3645
/* Cost table core_cost; its string operations use core_memcpy and
   core_memset.  Many entries are positional (no field name), so the order
   of the lines is significant; the trailing comment on each line names the
   cost being set.  Costs inside .hard_register are stated relative to an
   integer reg-reg move of 2.  */
3646 static const
3647 struct processor_costs core_cost = {
3648 .hard_register: {
3649 /* Start of register allocator costs. integer->integer move cost is 2. */
3650 .movzbl_load: 6, /* cost for loading QImode using movzbl */
3651 .int_load: {4, 4, 4}, /* cost of loading integer registers
3652 in QImode, HImode and SImode.
3653 Relative to reg-reg move (2). */
3654 .int_store: {6, 6, 6}, /* cost of storing integer registers */
3655 .fp_move: 2, /* cost of reg,reg fld/fst */
3656 .fp_load: {6, 6, 8}, /* cost of loading fp registers
3657 in SFmode, DFmode and XFmode */
3658 .fp_store: {6, 6, 10}, /* cost of storing fp registers
3659 in SFmode, DFmode and XFmode */
3660 .mmx_move: 2, /* cost of moving MMX register */
3661 .mmx_load: {6, 6}, /* cost of loading MMX registers
3662 in SImode and DImode */
3663 .mmx_store: {6, 6}, /* cost of storing MMX registers
3664 in SImode and DImode */
3665 .xmm_move: 2, .ymm_move: 2, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
3666 .sse_load: {6, 6, 6, 6, 12}, /* cost of loading SSE registers
3667 in 32,64,128,256 and 512-bit */
3668 .sse_store: {6, 6, 6, 6, 12}, /* cost of storing SSE registers
3669 in 32,64,128,256 and 512-bit */
3670 .sse_to_integer: 6, .integer_to_sse: 6, /* SSE->integer and integer->SSE moves */
3671 .mask_to_integer: 6, .integer_to_mask: 6, /* mask->integer and integer->mask moves */
3672 .mask_load: {4, 4, 4}, /* cost of loading mask register
3673 in QImode, HImode, SImode. */
3674 .mask_store: {6, 6, 6}, /* cost of storing mask register
3675 in QImode, HImode, SImode. */
3676 .mask_move: 2, /* cost of moving mask register. */
3677 /* End of register allocator costs. */
3678 },
3679
3680 COSTS_N_INSNS (1), /* cost of an add instruction */
3681 /* On all chips taken into consideration lea is 2 cycles and more. With
3682 this cost however our current implementation of synth_mult results in
3683 use of unnecessary temporary registers causing regression on several
3684 SPECfp benchmarks. */
3685 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3686 COSTS_N_INSNS (1), /* variable shift costs */
3687 COSTS_N_INSNS (1), /* constant shift costs */
3688 .mult_init: {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3689 COSTS_N_INSNS (4), /* HI */
3690 COSTS_N_INSNS (3), /* SI */
3691 /* Here we tune for Sandybridge or newer. */
3692 COSTS_N_INSNS (3), /* DI */
3693 COSTS_N_INSNS (3)}, /* other */
3694 .mult_bit: 0, /* cost of multiply per each bit set */
3695 /* Expanding div/mod currently doesn't consider parallelism. So the cost
3696 model is not realistic. We compensate by increasing the latencies a bit. */
3697 .divide: {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
3698 COSTS_N_INSNS (11), /* HI */
3699 COSTS_N_INSNS (14), /* SI */
3700 COSTS_N_INSNS (81), /* DI */
3701 COSTS_N_INSNS (81)}, /* other */
3702 COSTS_N_INSNS (1), /* cost of movsx */
3703 COSTS_N_INSNS (1), /* cost of movzx */
3704 .large_insn: 8, /* "large" insn */
3705 .move_ratio: 17, /* MOVE_RATIO */
3706 .clear_ratio: 6, /* CLEAR_RATIO */
3707 .int_load: {4, 4, 4}, /* cost of loading integer registers
3708 in QImode, HImode and SImode.
3709 Relative to reg-reg move (2). */
3710 .int_store: {6, 6, 6}, /* cost of storing integer registers */
3711 .sse_load: {6, 6, 6, 6, 12}, /* cost of loading SSE register
3712 in 32bit, 64bit, 128bit, 256bit and 512bit */
3713 .sse_store: {6, 6, 6, 6, 12}, /* cost of storing SSE register
3714 in 32bit, 64bit, 128bit, 256bit and 512bit */
3715 .sse_unaligned_load: {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
3716 .sse_unaligned_store: {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
3717 .xmm_move: 2, .ymm_move: 2, .zmm_move: 4, /* cost of moving XMM,YMM,ZMM register */
3718 .sse_to_integer: 2, /* cost of moving SSE register to integer. */
3719 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
3720 rec. throughput 6.
3721 So 5 uops statically and one uops per load.
   NOTE(review): the same mnemonic is named twice above with different
   figures; one occurrence presumably refers to a different gather
   variant -- confirm which.  */
3722 .gather_static: 10, .gather_per_elt: 6, /* Gather load static, per_elt. */
3723 .scatter_static: 10, .scatter_per_elt: 6, /* Scatter store static, per_elt. */
3724 .l1_cache_size: 64, /* size of l1 cache. */
3725 .l2_cache_size: 512, /* size of l2 cache. */
3726 .prefetch_block: 64, /* size of prefetch block */
3727 .simultaneous_prefetches: 6, /* number of parallel prefetches */
3728 /* FIXME perhaps more appropriate value is 5. */
3729 .branch_cost: 3, /* Branch cost */
3730 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3731 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3732 /* 10-24 */
3733 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
3734 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3735 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3736 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
3737
3738 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3739 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3740 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3741 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3742 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3743 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3744 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
3745 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
3746 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
3747 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
3748 .reassoc_int: 1, .reassoc_fp: 4, .reassoc_vec_int: 2, .reassoc_vec_fp: 2, /* reassoc int, fp, vec_int, vec_fp. */
3749 .memcpy: core_memcpy,
3750 .memset: core_memset,
3751 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3752 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3753 .align_loop: "16:11:8", /* Loop alignment. */
3754 .align_jump: "16:11:8", /* Jump alignment. */
3755 .align_label: "0:0:8", /* Label alignment. */
3756 .align_func: "16", /* Func alignment. */
3757 .small_unroll_ninsns: 4, /* Small unroll limit. */
3758 .small_unroll_factor: 2, /* Small unroll factor. */
3759 };
3760
3761

source code of gcc/config/i386/x86-tune-costs.h