1/* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 3, or (at your option)
9any later version.
10
11GCC is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16Under Section 7 of GPL version 3, you are granted additional
17permissions described in the GCC Runtime Library Exception, version
183.1, as published by the Free Software Foundation.
19
20You should have received a copy of the GNU General Public License and
21a copy of the GCC Runtime Library Exception along with this program;
22see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23<http://www.gnu.org/licenses/>. */
24/* Processor costs (relative to an add) */
/* COSTS_N_BYTES doubles a byte count.  This assumes COSTS_N_INSNS is
   defined as (N)*4 and that an addition is 2 bytes.  */
#define COSTS_N_BYTES(N) (2 * (N))

/* Placeholder stringop_algs initializer that names libcall in both of
   its slots.  */
#define DUMMY_STRINGOP_ALGS \
  {libcall, {{-1, libcall, false}}}
29
30static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
33static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36
37const
38struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39 COSTS_N_BYTES (2), /* cost of an add instruction */
40 COSTS_N_BYTES (3), /* cost of a lea instruction */
41 COSTS_N_BYTES (2), /* variable shift costs */
42 COSTS_N_BYTES (3), /* constant shift costs */
43 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
44 COSTS_N_BYTES (3), /* HI */
45 COSTS_N_BYTES (3), /* SI */
46 COSTS_N_BYTES (3), /* DI */
47 COSTS_N_BYTES (5)}, /* other */
48 0, /* cost of multiply per each bit set */
49 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
50 COSTS_N_BYTES (3), /* HI */
51 COSTS_N_BYTES (3), /* SI */
52 COSTS_N_BYTES (3), /* DI */
53 COSTS_N_BYTES (5)}, /* other */
54 COSTS_N_BYTES (3), /* cost of movsx */
55 COSTS_N_BYTES (3), /* cost of movzx */
56 0, /* "large" insn */
57 2, /* MOVE_RATIO */
58
59 /* All move costs are relative to integer->integer move times 2. */
60 2, /* cost for loading QImode using movzbl */
61 {2, 2, 2}, /* cost of loading integer registers
62 in QImode, HImode and SImode.
63 Relative to reg-reg move (2). */
64 {2, 2, 2}, /* cost of storing integer registers */
65 2, /* cost of reg,reg fld/fst */
66 {2, 2, 2}, /* cost of loading fp registers
67 in SFmode, DFmode and XFmode */
68 {2, 2, 2}, /* cost of storing fp registers
69 in SFmode, DFmode and XFmode */
70 3, /* cost of moving MMX register */
71 {3, 3}, /* cost of loading MMX registers
72 in SImode and DImode */
73 {3, 3}, /* cost of storing MMX registers
74 in SImode and DImode */
75 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
76 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
77 in 32,64,128,256 and 512-bit */
78 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load
79 in 128bit, 256bit and 512bit */
80 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
81 in 32,64,128,256 and 512-bit */
82 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store
83 in 128bit, 256bit and 512bit */
84 3, 3, /* SSE->integer and integer->SSE moves */
85 5, 0, /* Gather load static, per_elt. */
86 5, 0, /* Gather store static, per_elt. */
87 0, /* size of l1 cache */
88 0, /* size of l2 cache */
89 0, /* size of prefetch block */
90 0, /* number of parallel prefetches */
91 2, /* Branch cost */
92 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
93 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
94 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
95 COSTS_N_BYTES (2), /* cost of FABS instruction. */
96 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
97 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
98
99 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
100 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
101 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
102 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
103 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
104 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
105 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
106 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
107 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
108 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
109 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
110 ix86_size_memcpy,
111 ix86_size_memset,
112 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
113 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
114};
115
116/* Processor costs (relative to an add) */
117static stringop_algs i386_memcpy[2] = {
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
119 DUMMY_STRINGOP_ALGS};
120static stringop_algs i386_memset[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 DUMMY_STRINGOP_ALGS};
123
124static const
125struct processor_costs i386_cost = { /* 386 specific costs */
126 COSTS_N_INSNS (1), /* cost of an add instruction */
127 COSTS_N_INSNS (1), /* cost of a lea instruction */
128 COSTS_N_INSNS (3), /* variable shift costs */
129 COSTS_N_INSNS (2), /* constant shift costs */
130 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
131 COSTS_N_INSNS (6), /* HI */
132 COSTS_N_INSNS (6), /* SI */
133 COSTS_N_INSNS (6), /* DI */
134 COSTS_N_INSNS (6)}, /* other */
135 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
136 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
137 COSTS_N_INSNS (23), /* HI */
138 COSTS_N_INSNS (23), /* SI */
139 COSTS_N_INSNS (23), /* DI */
140 COSTS_N_INSNS (23)}, /* other */
141 COSTS_N_INSNS (3), /* cost of movsx */
142 COSTS_N_INSNS (2), /* cost of movzx */
143 15, /* "large" insn */
144 3, /* MOVE_RATIO */
145
146 /* All move costs are relative to integer->integer move times 2 and thus
147 they are latency*2. */
148 4, /* cost for loading QImode using movzbl */
149 {2, 4, 2}, /* cost of loading integer registers
150 in QImode, HImode and SImode.
151 Relative to reg-reg move (2). */
152 {2, 4, 2}, /* cost of storing integer registers */
153 2, /* cost of reg,reg fld/fst */
154 {8, 8, 8}, /* cost of loading fp registers
155 in SFmode, DFmode and XFmode */
156 {8, 8, 8}, /* cost of storing fp registers
157 in SFmode, DFmode and XFmode */
158 2, /* cost of moving MMX register */
159 {4, 8}, /* cost of loading MMX registers
160 in SImode and DImode */
161 {4, 8}, /* cost of storing MMX registers
162 in SImode and DImode */
163 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
164 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
165 in 32,64,128,256 and 512-bit */
166 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
167 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
168 in 32,64,128,256 and 512-bit */
169 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
170 3, 3, /* SSE->integer and integer->SSE moves */
171 4, 4, /* Gather load static, per_elt. */
172 4, 4, /* Gather store static, per_elt. */
173 0, /* size of l1 cache */
174 0, /* size of l2 cache */
175 0, /* size of prefetch block */
176 0, /* number of parallel prefetches */
177 1, /* Branch cost */
178 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
179 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
180 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
181 COSTS_N_INSNS (22), /* cost of FABS instruction. */
182 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
183 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
184
185 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
186 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
187 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
188 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
189 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
190 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
191 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
192 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
193 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
194 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
195 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
196 i386_memcpy,
197 i386_memset,
198 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
199 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
200};
201
202static stringop_algs i486_memcpy[2] = {
203 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
204 DUMMY_STRINGOP_ALGS};
205static stringop_algs i486_memset[2] = {
206 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
207 DUMMY_STRINGOP_ALGS};
208
209static const
210struct processor_costs i486_cost = { /* 486 specific costs */
211 COSTS_N_INSNS (1), /* cost of an add instruction */
212 COSTS_N_INSNS (1), /* cost of a lea instruction */
213 COSTS_N_INSNS (3), /* variable shift costs */
214 COSTS_N_INSNS (2), /* constant shift costs */
215 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
216 COSTS_N_INSNS (12), /* HI */
217 COSTS_N_INSNS (12), /* SI */
218 COSTS_N_INSNS (12), /* DI */
219 COSTS_N_INSNS (12)}, /* other */
220 1, /* cost of multiply per each bit set */
221 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
222 COSTS_N_INSNS (40), /* HI */
223 COSTS_N_INSNS (40), /* SI */
224 COSTS_N_INSNS (40), /* DI */
225 COSTS_N_INSNS (40)}, /* other */
226 COSTS_N_INSNS (3), /* cost of movsx */
227 COSTS_N_INSNS (2), /* cost of movzx */
228 15, /* "large" insn */
229 3, /* MOVE_RATIO */
230
231 /* All move costs are relative to integer->integer move times 2 and thus
232 they are latency*2. */
233 4, /* cost for loading QImode using movzbl */
234 {2, 4, 2}, /* cost of loading integer registers
235 in QImode, HImode and SImode.
236 Relative to reg-reg move (2). */
237 {2, 4, 2}, /* cost of storing integer registers */
238 2, /* cost of reg,reg fld/fst */
239 {8, 8, 8}, /* cost of loading fp registers
240 in SFmode, DFmode and XFmode */
241 {8, 8, 8}, /* cost of storing fp registers
242 in SFmode, DFmode and XFmode */
243 2, /* cost of moving MMX register */
244 {4, 8}, /* cost of loading MMX registers
245 in SImode and DImode */
246 {4, 8}, /* cost of storing MMX registers
247 in SImode and DImode */
248 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
249 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
250 in 32,64,128,256 and 512-bit */
251 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
252 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
253 in 32,64,128,256 and 512-bit */
254 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
255 3, 3, /* SSE->integer and integer->SSE moves */
256 4, 4, /* Gather load static, per_elt. */
257 4, 4, /* Gather store static, per_elt. */
258 4, /* size of l1 cache. 486 has 8kB cache
259 shared for code and data, so 4kB is
260 not really precise. */
261 4, /* size of l2 cache */
262 0, /* size of prefetch block */
263 0, /* number of parallel prefetches */
264 1, /* Branch cost */
265 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
266 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
267 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
268 COSTS_N_INSNS (3), /* cost of FABS instruction. */
269 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
270 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
271
272 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
273 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
274 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
275 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
276 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
277 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
278 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
279 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
280 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
281 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
282 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
283 i486_memcpy,
284 i486_memset,
285 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
286 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
287};
288
289static stringop_algs pentium_memcpy[2] = {
290 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
291 DUMMY_STRINGOP_ALGS};
292static stringop_algs pentium_memset[2] = {
293 {libcall, {{-1, rep_prefix_4_byte, false}}},
294 DUMMY_STRINGOP_ALGS};
295
296static const
297struct processor_costs pentium_cost = {
298 COSTS_N_INSNS (1), /* cost of an add instruction */
299 COSTS_N_INSNS (1), /* cost of a lea instruction */
300 COSTS_N_INSNS (4), /* variable shift costs */
301 COSTS_N_INSNS (1), /* constant shift costs */
302 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
303 COSTS_N_INSNS (11), /* HI */
304 COSTS_N_INSNS (11), /* SI */
305 COSTS_N_INSNS (11), /* DI */
306 COSTS_N_INSNS (11)}, /* other */
307 0, /* cost of multiply per each bit set */
308 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
309 COSTS_N_INSNS (25), /* HI */
310 COSTS_N_INSNS (25), /* SI */
311 COSTS_N_INSNS (25), /* DI */
312 COSTS_N_INSNS (25)}, /* other */
313 COSTS_N_INSNS (3), /* cost of movsx */
314 COSTS_N_INSNS (2), /* cost of movzx */
315 8, /* "large" insn */
316 6, /* MOVE_RATIO */
317
318 /* All move costs are relative to integer->integer move times 2 and thus
319 they are latency*2. */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
336 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
337 in 32,64,128,256 and 512-bit */
338 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
339 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
340 in 32,64,128,256 and 512-bit */
341 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
342 3, 3, /* SSE->integer and integer->SSE moves */
343 4, 4, /* Gather load static, per_elt. */
344 4, 4, /* Gather store static, per_elt. */
345 8, /* size of l1 cache. */
346 8, /* size of l2 cache */
347 0, /* size of prefetch block */
348 0, /* number of parallel prefetches */
349 2, /* Branch cost */
350 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
351 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
352 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
353 COSTS_N_INSNS (1), /* cost of FABS instruction. */
354 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
355 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
356
357 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
358 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
359 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
360 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
361 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
362 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
363 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
364 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
365 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
366 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
367 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
368 pentium_memcpy,
369 pentium_memset,
370 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
371 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
372};
373
374static const
375struct processor_costs lakemont_cost = {
376 COSTS_N_INSNS (1), /* cost of an add instruction */
377 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
378 COSTS_N_INSNS (1), /* variable shift costs */
379 COSTS_N_INSNS (1), /* constant shift costs */
380 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
381 COSTS_N_INSNS (11), /* HI */
382 COSTS_N_INSNS (11), /* SI */
383 COSTS_N_INSNS (11), /* DI */
384 COSTS_N_INSNS (11)}, /* other */
385 0, /* cost of multiply per each bit set */
386 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
387 COSTS_N_INSNS (25), /* HI */
388 COSTS_N_INSNS (25), /* SI */
389 COSTS_N_INSNS (25), /* DI */
390 COSTS_N_INSNS (25)}, /* other */
391 COSTS_N_INSNS (3), /* cost of movsx */
392 COSTS_N_INSNS (2), /* cost of movzx */
393 8, /* "large" insn */
394 17, /* MOVE_RATIO */
395
396 /* All move costs are relative to integer->integer move times 2 and thus
397 they are latency*2. */
398 6, /* cost for loading QImode using movzbl */
399 {2, 4, 2}, /* cost of loading integer registers
400 in QImode, HImode and SImode.
401 Relative to reg-reg move (2). */
402 {2, 4, 2}, /* cost of storing integer registers */
403 2, /* cost of reg,reg fld/fst */
404 {2, 2, 6}, /* cost of loading fp registers
405 in SFmode, DFmode and XFmode */
406 {4, 4, 6}, /* cost of storing fp registers
407 in SFmode, DFmode and XFmode */
408 8, /* cost of moving MMX register */
409 {8, 8}, /* cost of loading MMX registers
410 in SImode and DImode */
411 {8, 8}, /* cost of storing MMX registers
412 in SImode and DImode */
413 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
414 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
415 in 32,64,128,256 and 512-bit */
416 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
417 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
418 in 32,64,128,256 and 512-bit */
419 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
420 3, 3, /* SSE->integer and integer->SSE moves */
421 4, 4, /* Gather load static, per_elt. */
422 4, 4, /* Gather store static, per_elt. */
423 8, /* size of l1 cache. */
424 8, /* size of l2 cache */
425 0, /* size of prefetch block */
426 0, /* number of parallel prefetches */
427 2, /* Branch cost */
428 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
429 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
430 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
431 COSTS_N_INSNS (1), /* cost of FABS instruction. */
432 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
433 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
434
435 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
436 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
437 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
438 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
439 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
440 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
441 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
442 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
443 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
444 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
445 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
446 pentium_memcpy,
447 pentium_memset,
448 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
449 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
450};
451
452/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
453 (we ensure the alignment). For small blocks inline loop is still a
454 noticeable win, for bigger blocks either rep movsl or rep movsb is
455 way to go. Rep movsb has apparently more expensive startup time in CPU,
456 but after 4K the difference is down in the noise. */
457static stringop_algs pentiumpro_memcpy[2] = {
458 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
459 {8192, rep_prefix_4_byte, false},
460 {-1, rep_prefix_1_byte, false}}},
461 DUMMY_STRINGOP_ALGS};
462static stringop_algs pentiumpro_memset[2] = {
463 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
464 {8192, rep_prefix_4_byte, false},
465 {-1, libcall, false}}},
466 DUMMY_STRINGOP_ALGS};
467static const
468struct processor_costs pentiumpro_cost = {
469 COSTS_N_INSNS (1), /* cost of an add instruction */
470 COSTS_N_INSNS (1), /* cost of a lea instruction */
471 COSTS_N_INSNS (1), /* variable shift costs */
472 COSTS_N_INSNS (1), /* constant shift costs */
473 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
474 COSTS_N_INSNS (4), /* HI */
475 COSTS_N_INSNS (4), /* SI */
476 COSTS_N_INSNS (4), /* DI */
477 COSTS_N_INSNS (4)}, /* other */
478 0, /* cost of multiply per each bit set */
479 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
480 COSTS_N_INSNS (17), /* HI */
481 COSTS_N_INSNS (17), /* SI */
482 COSTS_N_INSNS (17), /* DI */
483 COSTS_N_INSNS (17)}, /* other */
484 COSTS_N_INSNS (1), /* cost of movsx */
485 COSTS_N_INSNS (1), /* cost of movzx */
486 8, /* "large" insn */
487 6, /* MOVE_RATIO */
488
489 /* All move costs are relative to integer->integer move times 2 and thus
490 they are latency*2. */
491 2, /* cost for loading QImode using movzbl */
492 {4, 4, 4}, /* cost of loading integer registers
493 in QImode, HImode and SImode.
494 Relative to reg-reg move (2). */
495 {2, 2, 2}, /* cost of storing integer registers */
496 2, /* cost of reg,reg fld/fst */
497 {2, 2, 6}, /* cost of loading fp registers
498 in SFmode, DFmode and XFmode */
499 {4, 4, 6}, /* cost of storing fp registers
500 in SFmode, DFmode and XFmode */
501 2, /* cost of moving MMX register */
502 {2, 2}, /* cost of loading MMX registers
503 in SImode and DImode */
504 {2, 2}, /* cost of storing MMX registers
505 in SImode and DImode */
506 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
507 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
508 in 32,64,128,256 and 512-bit */
509 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
510 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
511 in 32,64,128,256 and 512-bit */
512 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
513 3, 3, /* SSE->integer and integer->SSE moves */
514 4, 4, /* Gather load static, per_elt. */
515 4, 4, /* Gather store static, per_elt. */
516 8, /* size of l1 cache. */
517 256, /* size of l2 cache */
518 32, /* size of prefetch block */
519 6, /* number of parallel prefetches */
520 2, /* Branch cost */
521 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
522 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
523 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
524 COSTS_N_INSNS (2), /* cost of FABS instruction. */
525 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
526 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
527
528 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
529 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
530 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
531 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
532 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
533 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
534 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
535 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
536 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
537 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
538 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
539 pentiumpro_memcpy,
540 pentiumpro_memset,
541 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
542 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
543};
544
545static stringop_algs geode_memcpy[2] = {
546 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
547 DUMMY_STRINGOP_ALGS};
548static stringop_algs geode_memset[2] = {
549 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
550 DUMMY_STRINGOP_ALGS};
551static const
552struct processor_costs geode_cost = {
553 COSTS_N_INSNS (1), /* cost of an add instruction */
554 COSTS_N_INSNS (1), /* cost of a lea instruction */
555 COSTS_N_INSNS (2), /* variable shift costs */
556 COSTS_N_INSNS (1), /* constant shift costs */
557 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
558 COSTS_N_INSNS (4), /* HI */
559 COSTS_N_INSNS (7), /* SI */
560 COSTS_N_INSNS (7), /* DI */
561 COSTS_N_INSNS (7)}, /* other */
562 0, /* cost of multiply per each bit set */
563 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
564 COSTS_N_INSNS (23), /* HI */
565 COSTS_N_INSNS (39), /* SI */
566 COSTS_N_INSNS (39), /* DI */
567 COSTS_N_INSNS (39)}, /* other */
568 COSTS_N_INSNS (1), /* cost of movsx */
569 COSTS_N_INSNS (1), /* cost of movzx */
570 8, /* "large" insn */
571 4, /* MOVE_RATIO */
572
573 /* All move costs are relative to integer->integer move times 2 and thus
574 they are latency*2. */
575 2, /* cost for loading QImode using movzbl */
576 {2, 2, 2}, /* cost of loading integer registers
577 in QImode, HImode and SImode.
578 Relative to reg-reg move (2). */
579 {2, 2, 2}, /* cost of storing integer registers */
580 2, /* cost of reg,reg fld/fst */
581 {2, 2, 2}, /* cost of loading fp registers
582 in SFmode, DFmode and XFmode */
583 {4, 6, 6}, /* cost of storing fp registers
584 in SFmode, DFmode and XFmode */
585
586 2, /* cost of moving MMX register */
587 {2, 2}, /* cost of loading MMX registers
588 in SImode and DImode */
589 {2, 2}, /* cost of storing MMX registers
590 in SImode and DImode */
591 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
592 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
593 in 32,64,128,256 and 512-bit */
594 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
595 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
596 in 32,64,128,256 and 512-bit */
597 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
598 6, 6, /* SSE->integer and integer->SSE moves */
599 2, 2, /* Gather load static, per_elt. */
600 2, 2, /* Gather store static, per_elt. */
601 64, /* size of l1 cache. */
602 128, /* size of l2 cache. */
603 32, /* size of prefetch block */
604 1, /* number of parallel prefetches */
605 1, /* Branch cost */
606 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
607 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
608 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
609 COSTS_N_INSNS (1), /* cost of FABS instruction. */
610 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
611 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
612
613 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
614 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
615 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
616 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
617 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
618 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
619 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
620 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
621 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
622 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
623 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
624 geode_memcpy,
625 geode_memset,
626 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
627 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
628};
629
630static stringop_algs k6_memcpy[2] = {
631 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
632 DUMMY_STRINGOP_ALGS};
633static stringop_algs k6_memset[2] = {
634 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
635 DUMMY_STRINGOP_ALGS};
636static const
637struct processor_costs k6_cost = {
638 COSTS_N_INSNS (1), /* cost of an add instruction */
639 COSTS_N_INSNS (2), /* cost of a lea instruction */
640 COSTS_N_INSNS (1), /* variable shift costs */
641 COSTS_N_INSNS (1), /* constant shift costs */
642 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
643 COSTS_N_INSNS (3), /* HI */
644 COSTS_N_INSNS (3), /* SI */
645 COSTS_N_INSNS (3), /* DI */
646 COSTS_N_INSNS (3)}, /* other */
647 0, /* cost of multiply per each bit set */
648 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
649 COSTS_N_INSNS (18), /* HI */
650 COSTS_N_INSNS (18), /* SI */
651 COSTS_N_INSNS (18), /* DI */
652 COSTS_N_INSNS (18)}, /* other */
653 COSTS_N_INSNS (2), /* cost of movsx */
654 COSTS_N_INSNS (2), /* cost of movzx */
655 8, /* "large" insn */
656 4, /* MOVE_RATIO */
657
658 /* All move costs are relative to integer->integer move times 2 and thus
659 they are latency*2. */
660 3, /* cost for loading QImode using movzbl */
661 {4, 5, 4}, /* cost of loading integer registers
662 in QImode, HImode and SImode.
663 Relative to reg-reg move (2). */
664 {2, 3, 2}, /* cost of storing integer registers */
665 4, /* cost of reg,reg fld/fst */
666 {6, 6, 6}, /* cost of loading fp registers
667 in SFmode, DFmode and XFmode */
668 {4, 4, 4}, /* cost of storing fp registers
669 in SFmode, DFmode and XFmode */
670 2, /* cost of moving MMX register */
671 {2, 2}, /* cost of loading MMX registers
672 in SImode and DImode */
673 {2, 2}, /* cost of storing MMX registers
674 in SImode and DImode */
675 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
676 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
677 in 32,64,128,256 and 512-bit */
678 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
679 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
680 in 32,64,128,256 and 512-bit */
681 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
682 6, 6, /* SSE->integer and integer->SSE moves */
683 2, 2, /* Gather load static, per_elt. */
684 2, 2, /* Gather store static, per_elt. */
685 32, /* size of l1 cache. */
686 32, /* size of l2 cache. Some models
687 have integrated l2 cache, but
688 optimizing for k6 is not important
689 enough to worry about that. */
690 32, /* size of prefetch block */
691 1, /* number of parallel prefetches */
692 1, /* Branch cost */
693 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
694 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
695 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
696 COSTS_N_INSNS (2), /* cost of FABS instruction. */
697 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
698 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
699
700 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
701 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
702 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
703 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
704 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
705 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
706 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
707 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
708 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
709 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
710 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
711 k6_memcpy,
712 k6_memset,
713 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
714 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
715};
716
717/* For some reason, Athlon deals better with REP prefix (relative to loops)
718 compared to K8. Alignment becomes important after 8 bytes for memcpy and
719 128 bytes for memset. */
720static stringop_algs athlon_memcpy[2] = {
721 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
722 DUMMY_STRINGOP_ALGS};
723static stringop_algs athlon_memset[2] = {
724 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
725 DUMMY_STRINGOP_ALGS};
726static const
727struct processor_costs athlon_cost = {
728 COSTS_N_INSNS (1), /* cost of an add instruction */
729 COSTS_N_INSNS (2), /* cost of a lea instruction */
730 COSTS_N_INSNS (1), /* variable shift costs */
731 COSTS_N_INSNS (1), /* constant shift costs */
732 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
733 COSTS_N_INSNS (5), /* HI */
734 COSTS_N_INSNS (5), /* SI */
735 COSTS_N_INSNS (5), /* DI */
736 COSTS_N_INSNS (5)}, /* other */
737 0, /* cost of multiply per each bit set */
738 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
739 COSTS_N_INSNS (26), /* HI */
740 COSTS_N_INSNS (42), /* SI */
741 COSTS_N_INSNS (74), /* DI */
742 COSTS_N_INSNS (74)}, /* other */
743 COSTS_N_INSNS (1), /* cost of movsx */
744 COSTS_N_INSNS (1), /* cost of movzx */
745 8, /* "large" insn */
746 9, /* MOVE_RATIO */
747
748 /* All move costs are relative to integer->integer move times 2 and thus
749 they are latency*2. */
750 4, /* cost for loading QImode using movzbl */
751 {3, 4, 3}, /* cost of loading integer registers
752 in QImode, HImode and SImode.
753 Relative to reg-reg move (2). */
754 {3, 4, 3}, /* cost of storing integer registers */
755 4, /* cost of reg,reg fld/fst */
756 {4, 4, 12}, /* cost of loading fp registers
757 in SFmode, DFmode and XFmode */
758 {6, 6, 8}, /* cost of storing fp registers
759 in SFmode, DFmode and XFmode */
760 2, /* cost of moving MMX register */
761 {4, 4}, /* cost of loading MMX registers
762 in SImode and DImode */
763 {4, 4}, /* cost of storing MMX registers
764 in SImode and DImode */
765 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
766 {4, 4, 6, 12, 24}, /* cost of loading SSE registers
767 in 32,64,128,256 and 512-bit */
768 {4, 4, 6, 12, 24}, /* cost of unaligned loads. */
769 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
770 in 32,64,128,256 and 512-bit */
771 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
772 5, 5, /* SSE->integer and integer->SSE moves */
773 4, 4, /* Gather load static, per_elt. */
774 4, 4, /* Gather store static, per_elt. */
775 64, /* size of l1 cache. */
776 256, /* size of l2 cache. */
777 64, /* size of prefetch block */
778 6, /* number of parallel prefetches */
779 5, /* Branch cost */
780 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
781 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
782 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
783 COSTS_N_INSNS (2), /* cost of FABS instruction. */
784 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
785 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
786
787 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
788 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
789 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
790 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
791 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
792 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
793 /* 11-16 */
794 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
795 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
796 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
797 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
798 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
799 athlon_memcpy,
800 athlon_memset,
801 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
802 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
803};
804
/* K8 has optimized REP instruction for medium sized blocks, but for very
   small blocks it is better to use loop.  For large blocks, libcall can
   do non-temporal accesses and beat inline considerably.  */
808static stringop_algs k8_memcpy[2] = {
809 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
810 {-1, rep_prefix_4_byte, false}}},
811 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
812 {-1, libcall, false}}}};
813static stringop_algs k8_memset[2] = {
814 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
815 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
816 {libcall, {{48, unrolled_loop, false},
817 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
818static const
819struct processor_costs k8_cost = {
820 COSTS_N_INSNS (1), /* cost of an add instruction */
821 COSTS_N_INSNS (2), /* cost of a lea instruction */
822 COSTS_N_INSNS (1), /* variable shift costs */
823 COSTS_N_INSNS (1), /* constant shift costs */
824 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
825 COSTS_N_INSNS (4), /* HI */
826 COSTS_N_INSNS (3), /* SI */
827 COSTS_N_INSNS (4), /* DI */
828 COSTS_N_INSNS (5)}, /* other */
829 0, /* cost of multiply per each bit set */
830 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
831 COSTS_N_INSNS (26), /* HI */
832 COSTS_N_INSNS (42), /* SI */
833 COSTS_N_INSNS (74), /* DI */
834 COSTS_N_INSNS (74)}, /* other */
835 COSTS_N_INSNS (1), /* cost of movsx */
836 COSTS_N_INSNS (1), /* cost of movzx */
837 8, /* "large" insn */
838 9, /* MOVE_RATIO */
839
840 /* All move costs are relative to integer->integer move times 2 and thus
841 they are latency*2. */
842 4, /* cost for loading QImode using movzbl */
843 {3, 4, 3}, /* cost of loading integer registers
844 in QImode, HImode and SImode.
845 Relative to reg-reg move (2). */
846 {3, 4, 3}, /* cost of storing integer registers */
847 4, /* cost of reg,reg fld/fst */
848 {4, 4, 12}, /* cost of loading fp registers
849 in SFmode, DFmode and XFmode */
850 {6, 6, 8}, /* cost of storing fp registers
851 in SFmode, DFmode and XFmode */
852 2, /* cost of moving MMX register */
853 {3, 3}, /* cost of loading MMX registers
854 in SImode and DImode */
855 {4, 4}, /* cost of storing MMX registers
856 in SImode and DImode */
857 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
858 {4, 3, 6, 12, 24}, /* cost of loading SSE registers
859 in 32,64,128,256 and 512-bit */
860 {4, 3, 6, 12, 24}, /* cost of unaligned loads. */
861 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
862 in 32,64,128,256 and 512-bit */
863 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
864 5, 5, /* SSE->integer and integer->SSE moves */
865 4, 4, /* Gather load static, per_elt. */
866 4, 4, /* Gather store static, per_elt. */
867 64, /* size of l1 cache. */
868 512, /* size of l2 cache. */
869 64, /* size of prefetch block */
870 /* New AMD processors never drop prefetches; if they cannot be performed
871 immediately, they are queued. We set number of simultaneous prefetches
872 to a large constant to reflect this (it probably is not a good idea not
873 to limit number of prefetches at all, as their execution also takes some
874 time). */
875 100, /* number of parallel prefetches */
876 3, /* Branch cost */
877 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
878 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
879 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
880 COSTS_N_INSNS (2), /* cost of FABS instruction. */
881 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
882 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
883
884 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
885 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
886 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
887 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
888 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
889 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
890 /* 11-16 */
891 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
892 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
893 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
894 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
895 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
896 k8_memcpy,
897 k8_memset,
898 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
899 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
900};
901
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
   very small blocks it is better to use loop.  For large blocks, libcall can
   do non-temporal accesses and beat inline considerably.  */
905static stringop_algs amdfam10_memcpy[2] = {
906 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}};
910static stringop_algs amdfam10_memset[2] = {
911 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
912 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
913 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
914 {-1, libcall, false}}}};
915struct processor_costs amdfam10_cost = {
916 COSTS_N_INSNS (1), /* cost of an add instruction */
917 COSTS_N_INSNS (2), /* cost of a lea instruction */
918 COSTS_N_INSNS (1), /* variable shift costs */
919 COSTS_N_INSNS (1), /* constant shift costs */
920 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
921 COSTS_N_INSNS (4), /* HI */
922 COSTS_N_INSNS (3), /* SI */
923 COSTS_N_INSNS (4), /* DI */
924 COSTS_N_INSNS (5)}, /* other */
925 0, /* cost of multiply per each bit set */
926 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
927 COSTS_N_INSNS (35), /* HI */
928 COSTS_N_INSNS (51), /* SI */
929 COSTS_N_INSNS (83), /* DI */
930 COSTS_N_INSNS (83)}, /* other */
931 COSTS_N_INSNS (1), /* cost of movsx */
932 COSTS_N_INSNS (1), /* cost of movzx */
933 8, /* "large" insn */
934 9, /* MOVE_RATIO */
935
936 /* All move costs are relative to integer->integer move times 2 and thus
937 they are latency*2. */
938 4, /* cost for loading QImode using movzbl */
939 {3, 4, 3}, /* cost of loading integer registers
940 in QImode, HImode and SImode.
941 Relative to reg-reg move (2). */
942 {3, 4, 3}, /* cost of storing integer registers */
943 4, /* cost of reg,reg fld/fst */
944 {4, 4, 12}, /* cost of loading fp registers
945 in SFmode, DFmode and XFmode */
946 {6, 6, 8}, /* cost of storing fp registers
947 in SFmode, DFmode and XFmode */
948 2, /* cost of moving MMX register */
949 {3, 3}, /* cost of loading MMX registers
950 in SImode and DImode */
951 {4, 4}, /* cost of storing MMX registers
952 in SImode and DImode */
953 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
954 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
955 in 32,64,128,256 and 512-bit */
956 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
957 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
958 in 32,64,128,256 and 512-bit */
959 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
960 3, 3, /* SSE->integer and integer->SSE moves */
961 /* On K8:
962 MOVD reg64, xmmreg Double FSTORE 4
963 MOVD reg32, xmmreg Double FSTORE 4
964 On AMDFAM10:
965 MOVD reg64, xmmreg Double FADD 3
966 1/1 1/1
967 MOVD reg32, xmmreg Double FADD 3
968 1/1 1/1 */
969 4, 4, /* Gather load static, per_elt. */
970 4, 4, /* Gather store static, per_elt. */
971 64, /* size of l1 cache. */
972 512, /* size of l2 cache. */
973 64, /* size of prefetch block */
974 /* New AMD processors never drop prefetches; if they cannot be performed
975 immediately, they are queued. We set number of simultaneous prefetches
976 to a large constant to reflect this (it probably is not a good idea not
977 to limit number of prefetches at all, as their execution also takes some
978 time). */
979 100, /* number of parallel prefetches */
980 2, /* Branch cost */
981 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
982 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
983 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
984 COSTS_N_INSNS (2), /* cost of FABS instruction. */
985 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
986 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
987
988 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
989 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
990 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
991 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
992 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
993 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
994 /* 11-16 */
995 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
996 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
997 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
998 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
999 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1000 amdfam10_memcpy,
1001 amdfam10_memset,
1002 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1003 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1004};
1005
/* BDVER1 has optimized REP instruction for medium sized blocks, but for
   very small blocks it is better to use loop.  For large blocks, libcall
   can do non-temporal accesses and beat inline considerably.  */
1009static stringop_algs bdver1_memcpy[2] = {
1010 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1011 {-1, rep_prefix_4_byte, false}}},
1012 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1013 {-1, libcall, false}}}};
1014static stringop_algs bdver1_memset[2] = {
1015 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1016 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1017 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1018 {-1, libcall, false}}}};
1019
1020const struct processor_costs bdver1_cost = {
1021 COSTS_N_INSNS (1), /* cost of an add instruction */
1022 COSTS_N_INSNS (1), /* cost of a lea instruction */
1023 COSTS_N_INSNS (1), /* variable shift costs */
1024 COSTS_N_INSNS (1), /* constant shift costs */
1025 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1026 COSTS_N_INSNS (4), /* HI */
1027 COSTS_N_INSNS (4), /* SI */
1028 COSTS_N_INSNS (6), /* DI */
1029 COSTS_N_INSNS (6)}, /* other */
1030 0, /* cost of multiply per each bit set */
1031 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1032 COSTS_N_INSNS (35), /* HI */
1033 COSTS_N_INSNS (51), /* SI */
1034 COSTS_N_INSNS (83), /* DI */
1035 COSTS_N_INSNS (83)}, /* other */
1036 COSTS_N_INSNS (1), /* cost of movsx */
1037 COSTS_N_INSNS (1), /* cost of movzx */
1038 8, /* "large" insn */
1039 9, /* MOVE_RATIO */
1040
1041 /* All move costs are relative to integer->integer move times 2 and thus
1042 they are latency*2. */
1043 8, /* cost for loading QImode using movzbl */
1044 {8, 8, 8}, /* cost of loading integer registers
1045 in QImode, HImode and SImode.
1046 Relative to reg-reg move (2). */
1047 {8, 8, 8}, /* cost of storing integer registers */
1048 4, /* cost of reg,reg fld/fst */
1049 {12, 12, 28}, /* cost of loading fp registers
1050 in SFmode, DFmode and XFmode */
1051 {10, 10, 18}, /* cost of storing fp registers
1052 in SFmode, DFmode and XFmode */
1053 4, /* cost of moving MMX register */
1054 {12, 12}, /* cost of loading MMX registers
1055 in SImode and DImode */
1056 {10, 10}, /* cost of storing MMX registers
1057 in SImode and DImode */
1058 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1059 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1060 in 32,64,128,256 and 512-bit */
1061 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1062 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1063 in 32,64,128,256 and 512-bit */
1064 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1065 16, 20, /* SSE->integer and integer->SSE moves */
1066 12, 12, /* Gather load static, per_elt. */
1067 10, 10, /* Gather store static, per_elt. */
1068 16, /* size of l1 cache. */
1069 2048, /* size of l2 cache. */
1070 64, /* size of prefetch block */
1071 /* New AMD processors never drop prefetches; if they cannot be performed
1072 immediately, they are queued. We set number of simultaneous prefetches
1073 to a large constant to reflect this (it probably is not a good idea not
1074 to limit number of prefetches at all, as their execution also takes some
1075 time). */
1076 100, /* number of parallel prefetches */
1077 2, /* Branch cost */
1078 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1079 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1080 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1081 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1082 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1083 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1084
1085 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1086 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1087 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1088 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1089 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1090 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1091 /* 9-24 */
1092 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1093 /* 9-27 */
1094 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1095 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1096 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1097 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1098 bdver1_memcpy,
1099 bdver1_memset,
1100 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1101 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1102};
1103
/* BDVER2 has optimized REP instruction for medium sized blocks, but for
   very small blocks it is better to use loop.  For large blocks, libcall
   can do non-temporal accesses and beat inline considerably.  */
1107
1108static stringop_algs bdver2_memcpy[2] = {
1109 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1110 {-1, rep_prefix_4_byte, false}}},
1111 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1112 {-1, libcall, false}}}};
1113static stringop_algs bdver2_memset[2] = {
1114 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1115 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1116 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1117 {-1, libcall, false}}}};
1118
1119const struct processor_costs bdver2_cost = {
1120 COSTS_N_INSNS (1), /* cost of an add instruction */
1121 COSTS_N_INSNS (1), /* cost of a lea instruction */
1122 COSTS_N_INSNS (1), /* variable shift costs */
1123 COSTS_N_INSNS (1), /* constant shift costs */
1124 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1125 COSTS_N_INSNS (4), /* HI */
1126 COSTS_N_INSNS (4), /* SI */
1127 COSTS_N_INSNS (6), /* DI */
1128 COSTS_N_INSNS (6)}, /* other */
1129 0, /* cost of multiply per each bit set */
1130 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1131 COSTS_N_INSNS (35), /* HI */
1132 COSTS_N_INSNS (51), /* SI */
1133 COSTS_N_INSNS (83), /* DI */
1134 COSTS_N_INSNS (83)}, /* other */
1135 COSTS_N_INSNS (1), /* cost of movsx */
1136 COSTS_N_INSNS (1), /* cost of movzx */
1137 8, /* "large" insn */
1138 9, /* MOVE_RATIO */
1139
1140 /* All move costs are relative to integer->integer move times 2 and thus
1141 they are latency*2. */
1142 8, /* cost for loading QImode using movzbl */
1143 {8, 8, 8}, /* cost of loading integer registers
1144 in QImode, HImode and SImode.
1145 Relative to reg-reg move (2). */
1146 {8, 8, 8}, /* cost of storing integer registers */
1147 4, /* cost of reg,reg fld/fst */
1148 {12, 12, 28}, /* cost of loading fp registers
1149 in SFmode, DFmode and XFmode */
1150 {10, 10, 18}, /* cost of storing fp registers
1151 in SFmode, DFmode and XFmode */
1152 4, /* cost of moving MMX register */
1153 {12, 12}, /* cost of loading MMX registers
1154 in SImode and DImode */
1155 {10, 10}, /* cost of storing MMX registers
1156 in SImode and DImode */
1157 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1158 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1159 in 32,64,128,256 and 512-bit */
1160 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1161 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1162 in 32,64,128,256 and 512-bit */
1163 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1164 16, 20, /* SSE->integer and integer->SSE moves */
1165 12, 12, /* Gather load static, per_elt. */
1166 10, 10, /* Gather store static, per_elt. */
1167 16, /* size of l1 cache. */
1168 2048, /* size of l2 cache. */
1169 64, /* size of prefetch block */
1170 /* New AMD processors never drop prefetches; if they cannot be performed
1171 immediately, they are queued. We set number of simultaneous prefetches
1172 to a large constant to reflect this (it probably is not a good idea not
1173 to limit number of prefetches at all, as their execution also takes some
1174 time). */
1175 100, /* number of parallel prefetches */
1176 2, /* Branch cost */
1177 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1178 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1179 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1180 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1181 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1182 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1183
1184 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1185 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1186 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1187 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1188 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1189 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1190 /* 9-24 */
1191 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1192 /* 9-27 */
1193 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1194 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1195 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1196 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1197 bdver2_memcpy,
1198 bdver2_memset,
1199 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1200 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1201};
1202
1203
/* BDVER3 has optimized REP instruction for medium sized blocks, but for
   very small blocks it is better to use loop.  For large blocks, libcall
   can do non-temporal accesses and beat inline considerably.  */
1207static stringop_algs bdver3_memcpy[2] = {
1208 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1209 {-1, rep_prefix_4_byte, false}}},
1210 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1211 {-1, libcall, false}}}};
1212static stringop_algs bdver3_memset[2] = {
1213 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1214 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1215 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1216 {-1, libcall, false}}}};
1217struct processor_costs bdver3_cost = {
1218 COSTS_N_INSNS (1), /* cost of an add instruction */
1219 COSTS_N_INSNS (1), /* cost of a lea instruction */
1220 COSTS_N_INSNS (1), /* variable shift costs */
1221 COSTS_N_INSNS (1), /* constant shift costs */
1222 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1223 COSTS_N_INSNS (4), /* HI */
1224 COSTS_N_INSNS (4), /* SI */
1225 COSTS_N_INSNS (6), /* DI */
1226 COSTS_N_INSNS (6)}, /* other */
1227 0, /* cost of multiply per each bit set */
1228 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1229 COSTS_N_INSNS (35), /* HI */
1230 COSTS_N_INSNS (51), /* SI */
1231 COSTS_N_INSNS (83), /* DI */
1232 COSTS_N_INSNS (83)}, /* other */
1233 COSTS_N_INSNS (1), /* cost of movsx */
1234 COSTS_N_INSNS (1), /* cost of movzx */
1235 8, /* "large" insn */
1236 9, /* MOVE_RATIO */
1237
1238 /* All move costs are relative to integer->integer move times 2 and thus
1239 they are latency*2. */
1240 8, /* cost for loading QImode using movzbl */
1241 {8, 8, 8}, /* cost of loading integer registers
1242 in QImode, HImode and SImode.
1243 Relative to reg-reg move (2). */
1244 {8, 8, 8}, /* cost of storing integer registers */
1245 4, /* cost of reg,reg fld/fst */
1246 {12, 12, 28}, /* cost of loading fp registers
1247 in SFmode, DFmode and XFmode */
1248 {10, 10, 18}, /* cost of storing fp registers
1249 in SFmode, DFmode and XFmode */
1250 4, /* cost of moving MMX register */
1251 {12, 12}, /* cost of loading MMX registers
1252 in SImode and DImode */
1253 {10, 10}, /* cost of storing MMX registers
1254 in SImode and DImode */
1255 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1256 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1257 in 32,64,128,256 and 512-bit */
1258 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1259 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1260 in 32,64,128,256 and 512-bit */
1261 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1262 16, 20, /* SSE->integer and integer->SSE moves */
1263 12, 12, /* Gather load static, per_elt. */
1264 10, 10, /* Gather store static, per_elt. */
1265 16, /* size of l1 cache. */
1266 2048, /* size of l2 cache. */
1267 64, /* size of prefetch block */
1268 /* New AMD processors never drop prefetches; if they cannot be performed
1269 immediately, they are queued. We set number of simultaneous prefetches
1270 to a large constant to reflect this (it probably is not a good idea not
1271 to limit number of prefetches at all, as their execution also takes some
1272 time). */
1273 100, /* number of parallel prefetches */
1274 2, /* Branch cost */
1275 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1276 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1277 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1278 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1279 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1280 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1281
1282 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1283 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1284 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1285 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1286 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1287 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1288 /* 9-24 */
1289 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1290 /* 9-27 */
1291 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1292 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1293 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1294 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1295 bdver3_memcpy,
1296 bdver3_memset,
1297 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1298 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1299};
1300
/* BDVER4 has optimized REP instruction for medium sized blocks, but for
   very small blocks it is better to use loop.  For large blocks, libcall
   can do non-temporal accesses and beat inline considerably.  */
1304static stringop_algs bdver4_memcpy[2] = {
1305 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1306 {-1, rep_prefix_4_byte, false}}},
1307 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1308 {-1, libcall, false}}}};
1309static stringop_algs bdver4_memset[2] = {
1310 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1311 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1312 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1313 {-1, libcall, false}}}};
1314struct processor_costs bdver4_cost = {
1315 COSTS_N_INSNS (1), /* cost of an add instruction */
1316 COSTS_N_INSNS (1), /* cost of a lea instruction */
1317 COSTS_N_INSNS (1), /* variable shift costs */
1318 COSTS_N_INSNS (1), /* constant shift costs */
1319 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1320 COSTS_N_INSNS (4), /* HI */
1321 COSTS_N_INSNS (4), /* SI */
1322 COSTS_N_INSNS (6), /* DI */
1323 COSTS_N_INSNS (6)}, /* other */
1324 0, /* cost of multiply per each bit set */
1325 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1326 COSTS_N_INSNS (35), /* HI */
1327 COSTS_N_INSNS (51), /* SI */
1328 COSTS_N_INSNS (83), /* DI */
1329 COSTS_N_INSNS (83)}, /* other */
1330 COSTS_N_INSNS (1), /* cost of movsx */
1331 COSTS_N_INSNS (1), /* cost of movzx */
1332 8, /* "large" insn */
1333 9, /* MOVE_RATIO */
1334
1335 /* All move costs are relative to integer->integer move times 2 and thus
1336 they are latency*2. */
1337 8, /* cost for loading QImode using movzbl */
1338 {8, 8, 8}, /* cost of loading integer registers
1339 in QImode, HImode and SImode.
1340 Relative to reg-reg move (2). */
1341 {8, 8, 8}, /* cost of storing integer registers */
1342 4, /* cost of reg,reg fld/fst */
1343 {12, 12, 28}, /* cost of loading fp registers
1344 in SFmode, DFmode and XFmode */
1345 {10, 10, 18}, /* cost of storing fp registers
1346 in SFmode, DFmode and XFmode */
1347 4, /* cost of moving MMX register */
1348 {12, 12}, /* cost of loading MMX registers
1349 in SImode and DImode */
1350 {10, 10}, /* cost of storing MMX registers
1351 in SImode and DImode */
1352 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1353 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1354 in 32,64,128,256 and 512-bit */
1355 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1356 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1357 in 32,64,128,256 and 512-bit */
1358 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1359 16, 20, /* SSE->integer and integer->SSE moves */
1360 12, 12, /* Gather load static, per_elt. */
1361 10, 10, /* Gather store static, per_elt. */
1362 16, /* size of l1 cache. */
1363 2048, /* size of l2 cache. */
1364 64, /* size of prefetch block */
1365 /* New AMD processors never drop prefetches; if they cannot be performed
1366 immediately, they are queued. We set number of simultaneous prefetches
1367 to a large constant to reflect this (it probably is not a good idea not
1368 to limit number of prefetches at all, as their execution also takes some
1369 time). */
1370 100, /* number of parallel prefetches */
1371 2, /* Branch cost */
1372 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1373 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1374 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1375 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1376 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1377 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1378
1379 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1380 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1381 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1382 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1383 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1384 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1385 /* 9-24 */
1386 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1387 /* 9-27 */
1388 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1389 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1390 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1391 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1392 bdver4_memcpy,
1393 bdver4_memset,
1394 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1395 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1396};
1397
1398
/* ZNVER1 has optimized REP instruction for medium sized blocks, but for
   very small blocks it is better to use loop.  For large blocks, libcall
   can do non-temporal accesses and beat inline considerably.  */
1402static stringop_algs znver1_memcpy[2] = {
1403 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1404 {-1, rep_prefix_4_byte, false}}},
1405 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1406 {-1, libcall, false}}}};
1407static stringop_algs znver1_memset[2] = {
1408 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1409 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1410 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1411 {-1, libcall, false}}}};
1412struct processor_costs znver1_cost = {
1413 COSTS_N_INSNS (1), /* cost of an add instruction. */
1414 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1415 COSTS_N_INSNS (1), /* variable shift costs. */
1416 COSTS_N_INSNS (1), /* constant shift costs. */
1417 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1418 COSTS_N_INSNS (3), /* HI. */
1419 COSTS_N_INSNS (3), /* SI. */
1420 COSTS_N_INSNS (3), /* DI. */
1421 COSTS_N_INSNS (3)}, /* other. */
1422 0, /* cost of multiply per each bit
1423 set. */
1424 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1425 bound. */
1426 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1427 COSTS_N_INSNS (22), /* HI. */
1428 COSTS_N_INSNS (30), /* SI. */
1429 COSTS_N_INSNS (45), /* DI. */
1430 COSTS_N_INSNS (45)}, /* other. */
1431 COSTS_N_INSNS (1), /* cost of movsx. */
1432 COSTS_N_INSNS (1), /* cost of movzx. */
1433 8, /* "large" insn. */
1434 9, /* MOVE_RATIO. */
1435
1436 /* All move costs are relative to integer->integer move times 2 and thus
1437 they are latency*2. */
1438
1439 /* reg-reg moves are done by renaming and thus they are even cheaper than
1440 1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
1441 to doubles of latencies, we do not model this correctly. It does not
1442 seem to make practical difference to bump prices up even more. */
1443 6, /* cost for loading QImode using
1444 movzbl. */
1445 {6, 6, 6}, /* cost of loading integer registers
1446 in QImode, HImode and SImode.
1447 Relative to reg-reg move (2). */
1448 {8, 8, 8}, /* cost of storing integer
1449 registers. */
1450 2, /* cost of reg,reg fld/fst. */
1451 {6, 6, 16}, /* cost of loading fp registers
1452 in SFmode, DFmode and XFmode. */
1453 {8, 8, 16}, /* cost of storing fp registers
1454 in SFmode, DFmode and XFmode. */
1455 2, /* cost of moving MMX register. */
1456 {6, 6}, /* cost of loading MMX registers
1457 in SImode and DImode. */
1458 {8, 8}, /* cost of storing MMX registers
1459 in SImode and DImode. */
1460 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1461 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1462 in 32,64,128,256 and 512-bit. */
1463 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1464 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1465 in 32,64,128,256 and 512-bit. */
1466 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1467 6, 6, /* SSE->integer and integer->SSE moves. */
1468 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1469 throughput 12. Approx 9 uops do not depend on vector size and every load
1470 is 7 uops. */
1471 18, 8, /* Gather load static, per_elt. */
1472 18, 10, /* Gather store static, per_elt. */
1473 32, /* size of l1 cache. */
1474 512, /* size of l2 cache. */
1475 64, /* size of prefetch block. */
1476 /* New AMD processors never drop prefetches; if they cannot be performed
1477 immediately, they are queued. We set number of simultaneous prefetches
1478 to a large constant to reflect this (it probably is not a good idea not
1479 to limit number of prefetches at all, as their execution also takes some
1480 time). */
1481 100, /* number of parallel prefetches. */
1482 3, /* Branch cost. */
1483 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1485 /* Latency of fdiv is 8-15. */
1486 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1487 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1488 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1489 /* Latency of fsqrt is 4-10. */
1490 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1491
1492 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1493 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1494 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1495 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1496 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1497 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1498 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1499 /* 9-13 */
1500 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1501 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1502 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1503 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1504 and it can execute 2 integer additions and 2 multiplications thus
1505 reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
1506 that 4 works better than 6 probably due to register pressure.
1507
1508 Integer vector operations are taken by FP unit and execute 3 vector
1509 plus/minus operations per cycle but only one multiply. This is adjusted
1510 in ix86_reassociation_width. */
1511 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1512 znver1_memcpy,
1513 znver1_memset,
1514 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1515 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1516};
1517
1518/* skylake_cost should produce code tuned for Skylake familly of CPUs. */
1519static stringop_algs skylake_memcpy[2] = {
1520 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1521 {libcall, {{16, loop, false}, {512, rep_prefix_8_byte, false},
1522 {-1, libcall, false}}}};
1523
1524static stringop_algs skylake_memset[2] = {
1525 {libcall, {{6, loop_1_byte, true},
1526 {24, loop, true},
1527 {8192, rep_prefix_4_byte, true},
1528 {-1, libcall, false}}},
1529 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, false},
1530 {-1, libcall, false}}}};
1531
1532static const
1533struct processor_costs skylake_cost = {
1534 COSTS_N_INSNS (1), /* cost of an add instruction */
1535 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
1536 COSTS_N_INSNS (1), /* variable shift costs */
1537 COSTS_N_INSNS (1), /* constant shift costs */
1538 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1539 COSTS_N_INSNS (4), /* HI */
1540 COSTS_N_INSNS (3), /* SI */
1541 COSTS_N_INSNS (4), /* DI */
1542 COSTS_N_INSNS (4)}, /* other */
1543 0, /* cost of multiply per each bit set */
1544 {COSTS_N_INSNS (8), /* cost of a divide/mod for QI */
1545 COSTS_N_INSNS (8), /* HI */
1546 COSTS_N_INSNS (11), /* SI */
1547 COSTS_N_INSNS (76), /* DI */
1548 COSTS_N_INSNS (76)}, /* other */
1549 COSTS_N_INSNS (1), /* cost of movsx */
1550 COSTS_N_INSNS (0), /* cost of movzx */
1551 8, /* "large" insn */
1552 17, /* MOVE_RATIO */
1553
1554 6, /* cost for loading QImode using movzbl */
1555 {4, 4, 4}, /* cost of loading integer registers
1556 in QImode, HImode and SImode.
1557 Relative to reg-reg move (2). */
1558 {6, 6, 6}, /* cost of storing integer registers */
1559 2, /* cost of reg,reg fld/fst */
1560 {6, 6, 8}, /* cost of loading fp registers
1561 in SFmode, DFmode and XFmode */
1562 {6, 6, 10}, /* cost of storing fp registers
1563 in SFmode, DFmode and XFmode */
1564 2, /* cost of moving MMX register */
1565 {6, 6}, /* cost of loading MMX registers
1566 in SImode and DImode */
1567 {6, 6}, /* cost of storing MMX registers
1568 in SImode and DImode */
1569 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1570 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1571 in 32,64,128,256 and 512-bit */
1572 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1573 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1574 in 32,64,128,256 and 512-bit */
1575 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1576 2, 2, /* SSE->integer and integer->SSE moves */
1577 20, 8, /* Gather load static, per_elt. */
1578 22, 10, /* Gather store static, per_elt. */
1579 64, /* size of l1 cache. */
1580 512, /* size of l2 cache. */
1581 64, /* size of prefetch block */
1582 6, /* number of parallel prefetches */
1583 3, /* Branch cost */
1584 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1585 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1586 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1587 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1588 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1589 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
1590
1591 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1592 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1593 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1594 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1595 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1596 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1597 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
1598 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
1599 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
1600 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1601 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1602 skylake_memcpy,
1603 skylake_memset,
1604 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1605 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1606};
1607 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1608 very small blocks it is better to use loop. For large blocks, libcall can
1609 do nontemporary accesses and beat inline considerably. */
1610static stringop_algs btver1_memcpy[2] = {
1611 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1612 {-1, rep_prefix_4_byte, false}}},
1613 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1614 {-1, libcall, false}}}};
1615static stringop_algs btver1_memset[2] = {
1616 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1617 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1618 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1619 {-1, libcall, false}}}};
1620const struct processor_costs btver1_cost = {
1621 COSTS_N_INSNS (1), /* cost of an add instruction */
1622 COSTS_N_INSNS (2), /* cost of a lea instruction */
1623 COSTS_N_INSNS (1), /* variable shift costs */
1624 COSTS_N_INSNS (1), /* constant shift costs */
1625 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1626 COSTS_N_INSNS (4), /* HI */
1627 COSTS_N_INSNS (3), /* SI */
1628 COSTS_N_INSNS (4), /* DI */
1629 COSTS_N_INSNS (5)}, /* other */
1630 0, /* cost of multiply per each bit set */
1631 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1632 COSTS_N_INSNS (35), /* HI */
1633 COSTS_N_INSNS (51), /* SI */
1634 COSTS_N_INSNS (83), /* DI */
1635 COSTS_N_INSNS (83)}, /* other */
1636 COSTS_N_INSNS (1), /* cost of movsx */
1637 COSTS_N_INSNS (1), /* cost of movzx */
1638 8, /* "large" insn */
1639 9, /* MOVE_RATIO */
1640
1641 /* All move costs are relative to integer->integer move times 2 and thus
1642 they are latency*2. */
1643 8, /* cost for loading QImode using movzbl */
1644 {6, 8, 6}, /* cost of loading integer registers
1645 in QImode, HImode and SImode.
1646 Relative to reg-reg move (2). */
1647 {6, 8, 6}, /* cost of storing integer registers */
1648 4, /* cost of reg,reg fld/fst */
1649 {12, 12, 28}, /* cost of loading fp registers
1650 in SFmode, DFmode and XFmode */
1651 {12, 12, 38}, /* cost of storing fp registers
1652 in SFmode, DFmode and XFmode */
1653 4, /* cost of moving MMX register */
1654 {10, 10}, /* cost of loading MMX registers
1655 in SImode and DImode */
1656 {12, 12}, /* cost of storing MMX registers
1657 in SImode and DImode */
1658 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1659 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1660 in 32,64,128,256 and 512-bit */
1661 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1662 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1663 in 32,64,128,256 and 512-bit */
1664 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1665 14, 14, /* SSE->integer and integer->SSE moves */
1666 10, 10, /* Gather load static, per_elt. */
1667 10, 10, /* Gather store static, per_elt. */
1668 32, /* size of l1 cache. */
1669 512, /* size of l2 cache. */
1670 64, /* size of prefetch block */
1671 100, /* number of parallel prefetches */
1672 2, /* Branch cost */
1673 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1674 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1675 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1676 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1677 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1678 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1679
1680 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1681 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1682 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1683 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1684 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1685 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1686 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1687 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1688 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1689 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1690 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1691 btver1_memcpy,
1692 btver1_memset,
1693 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1694 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1695};
1696
1697static stringop_algs btver2_memcpy[2] = {
1698 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1699 {-1, rep_prefix_4_byte, false}}},
1700 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1701 {-1, libcall, false}}}};
1702static stringop_algs btver2_memset[2] = {
1703 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1704 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1705 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1706 {-1, libcall, false}}}};
1707const struct processor_costs btver2_cost = {
1708 COSTS_N_INSNS (1), /* cost of an add instruction */
1709 COSTS_N_INSNS (2), /* cost of a lea instruction */
1710 COSTS_N_INSNS (1), /* variable shift costs */
1711 COSTS_N_INSNS (1), /* constant shift costs */
1712 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1713 COSTS_N_INSNS (4), /* HI */
1714 COSTS_N_INSNS (3), /* SI */
1715 COSTS_N_INSNS (4), /* DI */
1716 COSTS_N_INSNS (5)}, /* other */
1717 0, /* cost of multiply per each bit set */
1718 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1719 COSTS_N_INSNS (35), /* HI */
1720 COSTS_N_INSNS (51), /* SI */
1721 COSTS_N_INSNS (83), /* DI */
1722 COSTS_N_INSNS (83)}, /* other */
1723 COSTS_N_INSNS (1), /* cost of movsx */
1724 COSTS_N_INSNS (1), /* cost of movzx */
1725 8, /* "large" insn */
1726 9, /* MOVE_RATIO */
1727
1728 /* All move costs are relative to integer->integer move times 2 and thus
1729 they are latency*2. */
1730 8, /* cost for loading QImode using movzbl */
1731 {8, 8, 6}, /* cost of loading integer registers
1732 in QImode, HImode and SImode.
1733 Relative to reg-reg move (2). */
1734 {8, 8, 6}, /* cost of storing integer registers */
1735 4, /* cost of reg,reg fld/fst */
1736 {12, 12, 28}, /* cost of loading fp registers
1737 in SFmode, DFmode and XFmode */
1738 {12, 12, 38}, /* cost of storing fp registers
1739 in SFmode, DFmode and XFmode */
1740 4, /* cost of moving MMX register */
1741 {10, 10}, /* cost of loading MMX registers
1742 in SImode and DImode */
1743 {12, 12}, /* cost of storing MMX registers
1744 in SImode and DImode */
1745 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1746 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1747 in 32,64,128,256 and 512-bit */
1748 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1749 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1750 in 32,64,128,256 and 512-bit */
1751 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1752 14, 14, /* SSE->integer and integer->SSE moves */
1753 10, 10, /* Gather load static, per_elt. */
1754 10, 10, /* Gather store static, per_elt. */
1755 32, /* size of l1 cache. */
1756 2048, /* size of l2 cache. */
1757 64, /* size of prefetch block */
1758 100, /* number of parallel prefetches */
1759 2, /* Branch cost */
1760 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1761 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1762 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1763 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1764 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1765 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1766
1767 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1768 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1769 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1770 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1771 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1772 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1773 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1774 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1775 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1776 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1777 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1778 btver2_memcpy,
1779 btver2_memset,
1780 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1781 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1782};
1783
1784static stringop_algs pentium4_memcpy[2] = {
1785 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1786 DUMMY_STRINGOP_ALGS};
1787static stringop_algs pentium4_memset[2] = {
1788 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1789 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1790 DUMMY_STRINGOP_ALGS};
1791
1792static const
1793struct processor_costs pentium4_cost = {
1794 COSTS_N_INSNS (1), /* cost of an add instruction */
1795 COSTS_N_INSNS (3), /* cost of a lea instruction */
1796 COSTS_N_INSNS (4), /* variable shift costs */
1797 COSTS_N_INSNS (4), /* constant shift costs */
1798 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1799 COSTS_N_INSNS (15), /* HI */
1800 COSTS_N_INSNS (15), /* SI */
1801 COSTS_N_INSNS (15), /* DI */
1802 COSTS_N_INSNS (15)}, /* other */
1803 0, /* cost of multiply per each bit set */
1804 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1805 COSTS_N_INSNS (56), /* HI */
1806 COSTS_N_INSNS (56), /* SI */
1807 COSTS_N_INSNS (56), /* DI */
1808 COSTS_N_INSNS (56)}, /* other */
1809 COSTS_N_INSNS (1), /* cost of movsx */
1810 COSTS_N_INSNS (1), /* cost of movzx */
1811 16, /* "large" insn */
1812 6, /* MOVE_RATIO */
1813
1814 /* All move costs are relative to integer->integer move times 2 and thus
1815 they are latency*2. */
1816 5, /* cost for loading QImode using movzbl */
1817 {4, 5, 4}, /* cost of loading integer registers
1818 in QImode, HImode and SImode.
1819 Relative to reg-reg move (2). */
1820 {2, 3, 2}, /* cost of storing integer registers */
1821 12, /* cost of reg,reg fld/fst */
1822 {14, 14, 14}, /* cost of loading fp registers
1823 in SFmode, DFmode and XFmode */
1824 {14, 14, 14}, /* cost of storing fp registers
1825 in SFmode, DFmode and XFmode */
1826 12, /* cost of moving MMX register */
1827 {16, 16}, /* cost of loading MMX registers
1828 in SImode and DImode */
1829 {16, 16}, /* cost of storing MMX registers
1830 in SImode and DImode */
1831 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1832 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
1833 in 32,64,128,256 and 512-bit */
1834 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
1835 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
1836 in 32,64,128,256 and 512-bit */
1837 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
1838 20, 12, /* SSE->integer and integer->SSE moves */
1839 16, 16, /* Gather load static, per_elt. */
1840 16, 16, /* Gather store static, per_elt. */
1841 8, /* size of l1 cache. */
1842 256, /* size of l2 cache. */
1843 64, /* size of prefetch block */
1844 6, /* number of parallel prefetches */
1845 2, /* Branch cost */
1846 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1847 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1848 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1849 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1850 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1851 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1852
1853 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1854 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1855 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1856 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1857 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1858 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1859 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1860 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
1861 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
1862 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
1863 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1864 pentium4_memcpy,
1865 pentium4_memset,
1866 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1867 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1868};
1869
1870static stringop_algs nocona_memcpy[2] = {
1871 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1872 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1873 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1874
1875static stringop_algs nocona_memset[2] = {
1876 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1877 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1878 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1879 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1880
1881static const
1882struct processor_costs nocona_cost = {
1883 COSTS_N_INSNS (1), /* cost of an add instruction */
1884 COSTS_N_INSNS (1), /* cost of a lea instruction */
1885 COSTS_N_INSNS (1), /* variable shift costs */
1886 COSTS_N_INSNS (1), /* constant shift costs */
1887 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1888 COSTS_N_INSNS (10), /* HI */
1889 COSTS_N_INSNS (10), /* SI */
1890 COSTS_N_INSNS (10), /* DI */
1891 COSTS_N_INSNS (10)}, /* other */
1892 0, /* cost of multiply per each bit set */
1893 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1894 COSTS_N_INSNS (66), /* HI */
1895 COSTS_N_INSNS (66), /* SI */
1896 COSTS_N_INSNS (66), /* DI */
1897 COSTS_N_INSNS (66)}, /* other */
1898 COSTS_N_INSNS (1), /* cost of movsx */
1899 COSTS_N_INSNS (1), /* cost of movzx */
1900 16, /* "large" insn */
1901 17, /* MOVE_RATIO */
1902
1903 /* All move costs are relative to integer->integer move times 2 and thus
1904 they are latency*2. */
1905 4, /* cost for loading QImode using movzbl */
1906 {4, 4, 4}, /* cost of loading integer registers
1907 in QImode, HImode and SImode.
1908 Relative to reg-reg move (2). */
1909 {4, 4, 4}, /* cost of storing integer registers */
1910 12, /* cost of reg,reg fld/fst */
1911 {14, 14, 14}, /* cost of loading fp registers
1912 in SFmode, DFmode and XFmode */
1913 {14, 14, 14}, /* cost of storing fp registers
1914 in SFmode, DFmode and XFmode */
1915 14, /* cost of moving MMX register */
1916 {12, 12}, /* cost of loading MMX registers
1917 in SImode and DImode */
1918 {12, 12}, /* cost of storing MMX registers
1919 in SImode and DImode */
1920 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
1921 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
1922 in 32,64,128,256 and 512-bit */
1923 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
1924 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
1925 in 32,64,128,256 and 512-bit */
1926 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
1927 20, 12, /* SSE->integer and integer->SSE moves */
1928 12, 12, /* Gather load static, per_elt. */
1929 12, 12, /* Gather store static, per_elt. */
1930 8, /* size of l1 cache. */
1931 1024, /* size of l2 cache. */
1932 64, /* size of prefetch block */
1933 8, /* number of parallel prefetches */
1934 1, /* Branch cost */
1935 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1936 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1937 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1938 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1939 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1940 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1941
1942 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1943 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1944 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
1945 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
1946 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
1947 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
1948 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
1949 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
1950 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
1951 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
1952 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1953 nocona_memcpy,
1954 nocona_memset,
1955 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1956 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1957};
1958
1959static stringop_algs atom_memcpy[2] = {
1960 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1961 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1962 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1963static stringop_algs atom_memset[2] = {
1964 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1965 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1966 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1967 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1968static const
1969struct processor_costs atom_cost = {
1970 COSTS_N_INSNS (1), /* cost of an add instruction */
1971 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1972 COSTS_N_INSNS (1), /* variable shift costs */
1973 COSTS_N_INSNS (1), /* constant shift costs */
1974 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1975 COSTS_N_INSNS (4), /* HI */
1976 COSTS_N_INSNS (3), /* SI */
1977 COSTS_N_INSNS (4), /* DI */
1978 COSTS_N_INSNS (2)}, /* other */
1979 0, /* cost of multiply per each bit set */
1980 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1981 COSTS_N_INSNS (26), /* HI */
1982 COSTS_N_INSNS (42), /* SI */
1983 COSTS_N_INSNS (74), /* DI */
1984 COSTS_N_INSNS (74)}, /* other */
1985 COSTS_N_INSNS (1), /* cost of movsx */
1986 COSTS_N_INSNS (1), /* cost of movzx */
1987 8, /* "large" insn */
1988 17, /* MOVE_RATIO */
1989
1990 /* All move costs are relative to integer->integer move times 2 and thus
1991 they are latency*2. */
1992 6, /* cost for loading QImode using movzbl */
1993 {6, 6, 6}, /* cost of loading integer registers
1994 in QImode, HImode and SImode.
1995 Relative to reg-reg move (2). */
1996 {6, 6, 6}, /* cost of storing integer registers */
1997 4, /* cost of reg,reg fld/fst */
1998 {6, 6, 18}, /* cost of loading fp registers
1999 in SFmode, DFmode and XFmode */
2000 {14, 14, 24}, /* cost of storing fp registers
2001 in SFmode, DFmode and XFmode */
2002 2, /* cost of moving MMX register */
2003 {8, 8}, /* cost of loading MMX registers
2004 in SImode and DImode */
2005 {10, 10}, /* cost of storing MMX registers
2006 in SImode and DImode */
2007 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2008 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2009 in 32,64,128,256 and 512-bit */
2010 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2011 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2012 in 32,64,128,256 and 512-bit */
2013 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2014 8, 6, /* SSE->integer and integer->SSE moves */
2015 8, 8, /* Gather load static, per_elt. */
2016 8, 8, /* Gather store static, per_elt. */
2017 32, /* size of l1 cache. */
2018 256, /* size of l2 cache. */
2019 64, /* size of prefetch block */
2020 6, /* number of parallel prefetches */
2021 3, /* Branch cost */
2022 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2023 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2024 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2025 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2026 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2027 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2028
2029 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2030 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2031 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2032 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2033 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2034 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2035 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
2036 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
2037 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
2038 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
2039 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2040 atom_memcpy,
2041 atom_memset,
2042 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2043 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2044};
2045
2046static stringop_algs slm_memcpy[2] = {
2047 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2048 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2049 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2050static stringop_algs slm_memset[2] = {
2051 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2052 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2053 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2054 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2055static const
2056struct processor_costs slm_cost = {
2057 COSTS_N_INSNS (1), /* cost of an add instruction */
2058 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2059 COSTS_N_INSNS (1), /* variable shift costs */
2060 COSTS_N_INSNS (1), /* constant shift costs */
2061 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2062 COSTS_N_INSNS (3), /* HI */
2063 COSTS_N_INSNS (3), /* SI */
2064 COSTS_N_INSNS (4), /* DI */
2065 COSTS_N_INSNS (2)}, /* other */
2066 0, /* cost of multiply per each bit set */
2067 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2068 COSTS_N_INSNS (26), /* HI */
2069 COSTS_N_INSNS (42), /* SI */
2070 COSTS_N_INSNS (74), /* DI */
2071 COSTS_N_INSNS (74)}, /* other */
2072 COSTS_N_INSNS (1), /* cost of movsx */
2073 COSTS_N_INSNS (1), /* cost of movzx */
2074 8, /* "large" insn */
2075 17, /* MOVE_RATIO */
2076
2077 /* All move costs are relative to integer->integer move times 2 and thus
2078 they are latency*2. */
2079 8, /* cost for loading QImode using movzbl */
2080 {8, 8, 8}, /* cost of loading integer registers
2081 in QImode, HImode and SImode.
2082 Relative to reg-reg move (2). */
2083 {6, 6, 6}, /* cost of storing integer registers */
2084 2, /* cost of reg,reg fld/fst */
2085 {8, 8, 18}, /* cost of loading fp registers
2086 in SFmode, DFmode and XFmode */
2087 {6, 6, 18}, /* cost of storing fp registers
2088 in SFmode, DFmode and XFmode */
2089 2, /* cost of moving MMX register */
2090 {8, 8}, /* cost of loading MMX registers
2091 in SImode and DImode */
2092 {6, 6}, /* cost of storing MMX registers
2093 in SImode and DImode */
2094 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2095 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2096 in 32,64,128,256 and 512-bit */
2097 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2098 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2099 in 32,64,128,256 and 512-bit */
2100 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2101 8, 6, /* SSE->integer and integer->SSE moves */
2102 8, 8, /* Gather load static, per_elt. */
2103 8, 8, /* Gather store static, per_elt. */
2104 32, /* size of l1 cache. */
2105 256, /* size of l2 cache. */
2106 64, /* size of prefetch block */
2107 6, /* number of parallel prefetches */
2108 3, /* Branch cost */
2109 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2110 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2111 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2112 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2113 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2114 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2115
2116 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2117 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2118 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2119 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2120 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2121 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2122 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2123 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2124 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2125 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
2126 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2127 slm_memcpy,
2128 slm_memset,
2129 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2130 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2131};
2132
2133static stringop_algs intel_memcpy[2] = {
2134 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2135 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2136 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2137static stringop_algs intel_memset[2] = {
2138 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2139 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2140 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2141 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2142static const
2143struct processor_costs intel_cost = {
2144 COSTS_N_INSNS (1), /* cost of an add instruction */
2145 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2146 COSTS_N_INSNS (1), /* variable shift costs */
2147 COSTS_N_INSNS (1), /* constant shift costs */
2148 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2149 COSTS_N_INSNS (3), /* HI */
2150 COSTS_N_INSNS (3), /* SI */
2151 COSTS_N_INSNS (4), /* DI */
2152 COSTS_N_INSNS (2)}, /* other */
2153 0, /* cost of multiply per each bit set */
2154 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2155 COSTS_N_INSNS (26), /* HI */
2156 COSTS_N_INSNS (42), /* SI */
2157 COSTS_N_INSNS (74), /* DI */
2158 COSTS_N_INSNS (74)}, /* other */
2159 COSTS_N_INSNS (1), /* cost of movsx */
2160 COSTS_N_INSNS (1), /* cost of movzx */
2161 8, /* "large" insn */
2162 17, /* MOVE_RATIO */
2163
2164 /* All move costs are relative to integer->integer move times 2 and thus
2165 they are latency*2. */
2166 6, /* cost for loading QImode using movzbl */
2167 {4, 4, 4}, /* cost of loading integer registers
2168 in QImode, HImode and SImode.
2169 Relative to reg-reg move (2). */
2170 {6, 6, 6}, /* cost of storing integer registers */
2171 2, /* cost of reg,reg fld/fst */
2172 {6, 6, 8}, /* cost of loading fp registers
2173 in SFmode, DFmode and XFmode */
2174 {6, 6, 10}, /* cost of storing fp registers
2175 in SFmode, DFmode and XFmode */
2176 2, /* cost of moving MMX register */
2177 {6, 6}, /* cost of loading MMX registers
2178 in SImode and DImode */
2179 {6, 6}, /* cost of storing MMX registers
2180 in SImode and DImode */
2181 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2182 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2183 in 32,64,128,256 and 512-bit */
2184 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2185 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2186 in 32,64,128,256 and 512-bit */
2187 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2188 4, 4, /* SSE->integer and integer->SSE moves */
2189 6, 6, /* Gather load static, per_elt. */
2190 6, 6, /* Gather store static, per_elt. */
2191 32, /* size of l1 cache. */
2192 256, /* size of l2 cache. */
2193 64, /* size of prefetch block */
2194 6, /* number of parallel prefetches */
2195 3, /* Branch cost */
2196 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2197 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2198 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2199 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2200 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2201 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2202
2203 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */
2204 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2205 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2206 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2207 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2208 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2209 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2210 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2211 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2212 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2213 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2214 intel_memcpy,
2215 intel_memset,
2216 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2217 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2218};
2219
2220/* Generic should produce code tuned for Core-i7 (and newer chips)
2221 and btver1 (and newer chips). */
2222
2223static stringop_algs generic_memcpy[2] = {
2224 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2225 {-1, libcall, false}}},
2226 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2227 {-1, libcall, false}}}};
2228static stringop_algs generic_memset[2] = {
2229 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2230 {-1, libcall, false}}},
2231 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2232 {-1, libcall, false}}}};
2233static const
2234struct processor_costs generic_cost = {
2235 COSTS_N_INSNS (1), /* cost of an add instruction */
2236 /* Setting cost to 2 makes our current implementation of synth_mult result in
2237 use of unnecessary temporary registers causing regression on several
2238 SPECfp benchmarks. */
2239 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2240 COSTS_N_INSNS (1), /* variable shift costs */
2241 COSTS_N_INSNS (1), /* constant shift costs */
2242 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2243 COSTS_N_INSNS (4), /* HI */
2244 COSTS_N_INSNS (3), /* SI */
2245 COSTS_N_INSNS (4), /* DI */
2246 COSTS_N_INSNS (4)}, /* other */
2247 0, /* cost of multiply per each bit set */
2248 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2249 COSTS_N_INSNS (22), /* HI */
2250 COSTS_N_INSNS (30), /* SI */
2251 COSTS_N_INSNS (74), /* DI */
2252 COSTS_N_INSNS (74)}, /* other */
2253 COSTS_N_INSNS (1), /* cost of movsx */
2254 COSTS_N_INSNS (1), /* cost of movzx */
2255 8, /* "large" insn */
2256 17, /* MOVE_RATIO */
2257
2258 /* All move costs are relative to integer->integer move times 2 and thus
2259 they are latency*2. */
2260 4, /* cost for loading QImode using movzbl */
2261 {4, 4, 4}, /* cost of loading integer registers
2262 in QImode, HImode and SImode.
2263 Relative to reg-reg move (2). */
2264 {6, 6, 6}, /* cost of storing integer registers */
2265 4, /* cost of reg,reg fld/fst */
2266 {6, 6, 12}, /* cost of loading fp registers
2267 in SFmode, DFmode and XFmode */
2268 {6, 6, 12}, /* cost of storing fp registers
2269 in SFmode, DFmode and XFmode */
2270 2, /* cost of moving MMX register */
2271 {6, 6}, /* cost of loading MMX registers
2272 in SImode and DImode */
2273 {6, 6}, /* cost of storing MMX registers
2274 in SImode and DImode */
2275 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2276 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2277 in 32,64,128,256 and 512-bit */
2278 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
2279 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2280 in 32,64,128,256 and 512-bit */
2281 {6, 6, 6, 10, 15}, /* cost of unaligned storess. */
2282 6, 6, /* SSE->integer and integer->SSE moves */
2283 18, 6, /* Gather load static, per_elt. */
2284 18, 6, /* Gather store static, per_elt. */
2285 32, /* size of l1 cache. */
2286 512, /* size of l2 cache. */
2287 64, /* size of prefetch block */
2288 6, /* number of parallel prefetches */
2289 /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
2290 value is increased to perhaps more appropriate value of 5. */
2291 3, /* Branch cost */
2292 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2293 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2294 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2295 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2296 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2297 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2298
2299 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2300 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2301 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2302 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2303 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2304 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2305 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2306 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2307 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2308 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2309 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2310 generic_memcpy,
2311 generic_memset,
2312 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2313 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2314};
2315
/* core_cost should produce code tuned for the Core family of CPUs.  */
2317static stringop_algs core_memcpy[2] = {
2318 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2319 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2320 {-1, libcall, false}}}};
2321static stringop_algs core_memset[2] = {
2322 {libcall, {{6, loop_1_byte, true},
2323 {24, loop, true},
2324 {8192, rep_prefix_4_byte, true},
2325 {-1, libcall, false}}},
2326 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2327 {-1, libcall, false}}}};
2328
2329static const
2330struct processor_costs core_cost = {
2331 COSTS_N_INSNS (1), /* cost of an add instruction */
2332 /* On all chips taken into consideration lea is 2 cycles and more. With
2333 this cost however our current implementation of synth_mult results in
2334 use of unnecessary temporary registers causing regression on several
2335 SPECfp benchmarks. */
2336 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2337 COSTS_N_INSNS (1), /* variable shift costs */
2338 COSTS_N_INSNS (1), /* constant shift costs */
2339 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2340 COSTS_N_INSNS (4), /* HI */
2341 COSTS_N_INSNS (3), /* SI */
2342 COSTS_N_INSNS (4), /* DI */
2343 COSTS_N_INSNS (4)}, /* other */
2344 0, /* cost of multiply per each bit set */
2345 {COSTS_N_INSNS (8), /* cost of a divide/mod for QI */
2346 COSTS_N_INSNS (8), /* HI */
2347 /* 8-11 */
2348 COSTS_N_INSNS (11), /* SI */
2349 /* 24-81 */
2350 COSTS_N_INSNS (81), /* DI */
2351 COSTS_N_INSNS (81)}, /* other */
2352 COSTS_N_INSNS (1), /* cost of movsx */
2353 COSTS_N_INSNS (1), /* cost of movzx */
2354 8, /* "large" insn */
2355 17, /* MOVE_RATIO */
2356
2357 /* All move costs are relative to integer->integer move times 2 and thus
2358 they are latency*2. */
2359 6, /* cost for loading QImode using movzbl */
2360 {4, 4, 4}, /* cost of loading integer registers
2361 in QImode, HImode and SImode.
2362 Relative to reg-reg move (2). */
2363 {6, 6, 6}, /* cost of storing integer registers */
2364 2, /* cost of reg,reg fld/fst */
2365 {6, 6, 8}, /* cost of loading fp registers
2366 in SFmode, DFmode and XFmode */
2367 {6, 6, 10}, /* cost of storing fp registers
2368 in SFmode, DFmode and XFmode */
2369 2, /* cost of moving MMX register */
2370 {6, 6}, /* cost of loading MMX registers
2371 in SImode and DImode */
2372 {6, 6}, /* cost of storing MMX registers
2373 in SImode and DImode */
2374 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2375 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2376 in 32,64,128,256 and 512-bit */
2377 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2378 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2379 in 32,64,128,256 and 512-bit */
2380 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2381 2, 2, /* SSE->integer and integer->SSE moves */
2382 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2383 rec. throughput 6.
2384 So 5 uops statically and one uops per load. */
2385 10, 6, /* Gather load static, per_elt. */
2386 10, 6, /* Gather store static, per_elt. */
2387 64, /* size of l1 cache. */
2388 512, /* size of l2 cache. */
2389 64, /* size of prefetch block */
2390 6, /* number of parallel prefetches */
2391 /* FIXME perhaps more appropriate value is 5. */
2392 3, /* Branch cost */
2393 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2394 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2395 /* 10-24 */
2396 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2397 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2398 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2399 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
2400
2401 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2402 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2403 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2404 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2405 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2406 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2407 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2408 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2409 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2410 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2411 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2412 core_memcpy,
2413 core_memset,
2414 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2415 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2416};
2417
2418