1 | /* Costs of operations of individual x86 CPUs. |
2 | Copyright (C) 1988-2017 Free Software Foundation, Inc. |
3 | |
4 | This file is part of GCC. |
5 | |
6 | GCC is free software; you can redistribute it and/or modify |
7 | it under the terms of the GNU General Public License as published by |
8 | the Free Software Foundation; either version 3, or (at your option) |
9 | any later version. |
10 | |
11 | GCC is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | GNU General Public License for more details. |
15 | |
16 | Under Section 7 of GPL version 3, you are granted additional |
17 | permissions described in the GCC Runtime Library Exception, version |
18 | 3.1, as published by the Free Software Foundation. |
19 | |
20 | You should have received a copy of the GNU General Public License and |
21 | a copy of the GCC Runtime Library Exception along with this program; |
22 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | <http://www.gnu.org/licenses/>. */ |
24 | /* Processor costs (relative to an add) */ |
25 | /* COSTS_N_BYTES (N) expresses a size cost of N bytes. We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes, so a 2-byte add costs the same as COSTS_N_INSNS (1). */ |
26 | #define COSTS_N_BYTES(N) ((N) * 2) |
27 | |
28 | #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} /* Placeholder stringop_algs initializer that names only libcall; used below as the second array element of the per-CPU tables. */ |
29 | |
30 | static stringop_algs ix86_size_memcpy[2] = { /* memcpy strategy when tuning for size: both entries name only rep_prefix_1_byte. */ |
31 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, |
32 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; |
33 | static stringop_algs ix86_size_memset[2] = { /* memset strategy when tuning for size: both entries name only rep_prefix_1_byte. */ |
34 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, |
35 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; |
36 | |
37 | const |
38 | struct processor_costs ix86_size_cost = {/* costs for tuning for size; entries written with COSTS_N_BYTES are sizes in bytes (see the macro above), not latencies */ |
39 | COSTS_N_BYTES (2), /* cost of an add instruction */ |
40 | COSTS_N_BYTES (3), /* cost of a lea instruction */ |
41 | COSTS_N_BYTES (2), /* variable shift costs */ |
42 | COSTS_N_BYTES (3), /* constant shift costs */ |
43 | {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ |
44 | COSTS_N_BYTES (3), /* HI */ |
45 | COSTS_N_BYTES (3), /* SI */ |
46 | COSTS_N_BYTES (3), /* DI */ |
47 | COSTS_N_BYTES (5)}, /* other */ |
48 | 0, /* cost of multiply per each bit set */ |
49 | {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ |
50 | COSTS_N_BYTES (3), /* HI */ |
51 | COSTS_N_BYTES (3), /* SI */ |
52 | COSTS_N_BYTES (3), /* DI */ |
53 | COSTS_N_BYTES (5)}, /* other */ |
54 | COSTS_N_BYTES (3), /* cost of movsx */ |
55 | COSTS_N_BYTES (3), /* cost of movzx */ |
56 | 0, /* "large" insn */ |
57 | 2, /* MOVE_RATIO */ |
58 | |
59 | /* All move costs are relative to integer->integer move times 2. */ |
60 | 2, /* cost for loading QImode using movzbl */ |
61 | {2, 2, 2}, /* cost of loading integer registers |
62 | in QImode, HImode and SImode. |
63 | Relative to reg-reg move (2). */ |
64 | {2, 2, 2}, /* cost of storing integer registers */ |
65 | 2, /* cost of reg,reg fld/fst */ |
66 | {2, 2, 2}, /* cost of loading fp registers |
67 | in SFmode, DFmode and XFmode */ |
68 | {2, 2, 2}, /* cost of storing fp registers |
69 | in SFmode, DFmode and XFmode */ |
70 | 3, /* cost of moving MMX register */ |
71 | {3, 3}, /* cost of loading MMX registers |
72 | in SImode and DImode */ |
73 | {3, 3}, /* cost of storing MMX registers |
74 | in SImode and DImode */ |
75 | 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ |
76 | {3, 3, 3, 3, 3}, /* cost of loading SSE registers |
77 | in 32,64,128,256 and 512-bit */ |
78 | {3, 3, 3, 3, 3}, /* cost of unaligned SSE load; five entries, |
79 | presumably the same 32,64,128,256 and 512-bit widths as above - TODO confirm */ |
80 | {3, 3, 3, 3, 3}, /* cost of storing SSE registers |
81 | in 32,64,128,256 and 512-bit */ |
82 | {3, 3, 3, 3, 3}, /* cost of unaligned SSE store; five entries, |
83 | presumably the same 32,64,128,256 and 512-bit widths as above - TODO confirm */ |
84 | 3, 3, /* SSE->integer and integer->SSE moves */ |
85 | 5, 0, /* Gather load static, per_elt. */ |
86 | 5, 0, /* Gather store static, per_elt. */ |
87 | 0, /* size of l1 cache */ |
88 | 0, /* size of l2 cache */ |
89 | 0, /* size of prefetch block */ |
90 | 0, /* number of parallel prefetches */ |
91 | 2, /* Branch cost */ |
92 | COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ |
93 | COSTS_N_BYTES (2), /* cost of FMUL instruction. */ |
94 | COSTS_N_BYTES (2), /* cost of FDIV instruction. */ |
95 | COSTS_N_BYTES (2), /* cost of FABS instruction. */ |
96 | COSTS_N_BYTES (2), /* cost of FCHS instruction. */ |
97 | COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ |
98 | |
99 | COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */ |
100 | COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */ |
101 | COSTS_N_BYTES (2), /* cost of MULSS instruction. */ |
102 | COSTS_N_BYTES (2), /* cost of MULSD instruction. */ |
103 | COSTS_N_BYTES (2), /* cost of FMA SS instruction. */ |
104 | COSTS_N_BYTES (2), /* cost of FMA SD instruction. */ |
105 | COSTS_N_BYTES (2), /* cost of DIVSS instruction. */ |
106 | COSTS_N_BYTES (2), /* cost of DIVSD instruction. */ |
107 | COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */ |
108 | COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */ |
109 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
110 | ix86_size_memcpy, /* memcpy strategy table defined above */ |
111 | ix86_size_memset, /* memset strategy table defined above */ |
112 | COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ |
113 | COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */ |
114 | }; |
115 | |
116 | /* Processor costs (relative to an add) */ |
117 | static stringop_algs i386_memcpy[2] = { /* 386 memcpy strategy: first entry names only rep_prefix_1_byte; second entry is the libcall placeholder. */ |
118 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, |
119 | DUMMY_STRINGOP_ALGS}; |
120 | static stringop_algs i386_memset[2] = { /* 386 memset strategy: first entry names only rep_prefix_1_byte; second entry is the libcall placeholder. */ |
121 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, |
122 | DUMMY_STRINGOP_ALGS}; |
123 | |
124 | static const |
125 | struct processor_costs i386_cost = { /* 386 specific costs; COSTS_N_INSNS entries are relative to an add */ |
126 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
127 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
128 | COSTS_N_INSNS (3), /* variable shift costs */ |
129 | COSTS_N_INSNS (2), /* constant shift costs */ |
130 | {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ |
131 | COSTS_N_INSNS (6), /* HI */ |
132 | COSTS_N_INSNS (6), /* SI */ |
133 | COSTS_N_INSNS (6), /* DI */ |
134 | COSTS_N_INSNS (6)}, /* other */ |
135 | COSTS_N_INSNS (1), /* cost of multiply per each bit set */ |
136 | {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ |
137 | COSTS_N_INSNS (23), /* HI */ |
138 | COSTS_N_INSNS (23), /* SI */ |
139 | COSTS_N_INSNS (23), /* DI */ |
140 | COSTS_N_INSNS (23)}, /* other */ |
141 | COSTS_N_INSNS (3), /* cost of movsx */ |
142 | COSTS_N_INSNS (2), /* cost of movzx */ |
143 | 15, /* "large" insn */ |
144 | 3, /* MOVE_RATIO */ |
145 | |
146 | /* All move costs are relative to integer->integer move times 2 and thus |
147 | they are latency*2. */ |
148 | 4, /* cost for loading QImode using movzbl */ |
149 | {2, 4, 2}, /* cost of loading integer registers |
150 | in QImode, HImode and SImode. |
151 | Relative to reg-reg move (2). */ |
152 | {2, 4, 2}, /* cost of storing integer registers */ |
153 | 2, /* cost of reg,reg fld/fst */ |
154 | {8, 8, 8}, /* cost of loading fp registers |
155 | in SFmode, DFmode and XFmode */ |
156 | {8, 8, 8}, /* cost of storing fp registers |
157 | in SFmode, DFmode and XFmode */ |
158 | 2, /* cost of moving MMX register */ |
159 | {4, 8}, /* cost of loading MMX registers |
160 | in SImode and DImode */ |
161 | {4, 8}, /* cost of storing MMX registers |
162 | in SImode and DImode */ |
163 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
164 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers |
165 | in 32,64,128,256 and 512-bit */ |
166 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; same values as the aligned loads above. */ |
167 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers |
168 | in 32,64,128,256 and 512-bit */ |
169 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; same values as the aligned stores above. */ |
170 | 3, 3, /* SSE->integer and integer->SSE moves */ |
171 | 4, 4, /* Gather load static, per_elt. */ |
172 | 4, 4, /* Gather store static, per_elt. */ |
173 | 0, /* size of l1 cache */ |
174 | 0, /* size of l2 cache */ |
175 | 0, /* size of prefetch block */ |
176 | 0, /* number of parallel prefetches */ |
177 | 1, /* Branch cost */ |
178 | COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ |
179 | COSTS_N_INSNS (27), /* cost of FMUL instruction. */ |
180 | COSTS_N_INSNS (88), /* cost of FDIV instruction. */ |
181 | COSTS_N_INSNS (22), /* cost of FABS instruction. */ |
182 | COSTS_N_INSNS (24), /* cost of FCHS instruction. */ |
183 | COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ |
184 | |
185 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
186 | COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */ |
187 | COSTS_N_INSNS (27), /* cost of MULSS instruction. */ |
188 | COSTS_N_INSNS (27), /* cost of MULSD instruction. */ |
189 | COSTS_N_INSNS (27), /* cost of FMA SS instruction. */ |
190 | COSTS_N_INSNS (27), /* cost of FMA SD instruction. */ |
191 | COSTS_N_INSNS (88), /* cost of DIVSS instruction. */ |
192 | COSTS_N_INSNS (88), /* cost of DIVSD instruction. */ |
193 | COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */ |
194 | COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */ |
195 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
196 | i386_memcpy, /* memcpy strategy table defined above */ |
197 | i386_memset, /* memset strategy table defined above */ |
198 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
199 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
200 | }; |
201 | |
202 | static stringop_algs i486_memcpy[2] = { /* 486 memcpy strategy: first entry names only rep_prefix_4_byte; second entry is the libcall placeholder. */ |
203 | {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, |
204 | DUMMY_STRINGOP_ALGS}; |
205 | static stringop_algs i486_memset[2] = { /* 486 memset strategy: first entry names only rep_prefix_4_byte; second entry is the libcall placeholder. */ |
206 | {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, |
207 | DUMMY_STRINGOP_ALGS}; |
208 | |
209 | static const |
210 | struct processor_costs i486_cost = { /* 486 specific costs; COSTS_N_INSNS entries are relative to an add */ |
211 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
212 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
213 | COSTS_N_INSNS (3), /* variable shift costs */ |
214 | COSTS_N_INSNS (2), /* constant shift costs */ |
215 | {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ |
216 | COSTS_N_INSNS (12), /* HI */ |
217 | COSTS_N_INSNS (12), /* SI */ |
218 | COSTS_N_INSNS (12), /* DI */ |
219 | COSTS_N_INSNS (12)}, /* other */ |
220 | 1, /* cost of multiply per each bit set. NOTE(review): raw 1 here, while the 386 table uses COSTS_N_INSNS (1) - confirm intended. */ |
221 | {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ |
222 | COSTS_N_INSNS (40), /* HI */ |
223 | COSTS_N_INSNS (40), /* SI */ |
224 | COSTS_N_INSNS (40), /* DI */ |
225 | COSTS_N_INSNS (40)}, /* other */ |
226 | COSTS_N_INSNS (3), /* cost of movsx */ |
227 | COSTS_N_INSNS (2), /* cost of movzx */ |
228 | 15, /* "large" insn */ |
229 | 3, /* MOVE_RATIO */ |
230 | |
231 | /* All move costs are relative to integer->integer move times 2 and thus |
232 | they are latency*2. */ |
233 | 4, /* cost for loading QImode using movzbl */ |
234 | {2, 4, 2}, /* cost of loading integer registers |
235 | in QImode, HImode and SImode. |
236 | Relative to reg-reg move (2). */ |
237 | {2, 4, 2}, /* cost of storing integer registers */ |
238 | 2, /* cost of reg,reg fld/fst */ |
239 | {8, 8, 8}, /* cost of loading fp registers |
240 | in SFmode, DFmode and XFmode */ |
241 | {8, 8, 8}, /* cost of storing fp registers |
242 | in SFmode, DFmode and XFmode */ |
243 | 2, /* cost of moving MMX register */ |
244 | {4, 8}, /* cost of loading MMX registers |
245 | in SImode and DImode */ |
246 | {4, 8}, /* cost of storing MMX registers |
247 | in SImode and DImode */ |
248 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
249 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers |
250 | in 32,64,128,256 and 512-bit */ |
251 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; same values as the aligned loads above. */ |
252 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers |
253 | in 32,64,128,256 and 512-bit */ |
254 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; same values as the aligned stores above. */ |
255 | 3, 3, /* SSE->integer and integer->SSE moves */ |
256 | 4, 4, /* Gather load static, per_elt. */ |
257 | 4, 4, /* Gather store static, per_elt. */ |
258 | 4, /* size of l1 cache. 486 has 8kB cache |
259 | shared for code and data, so 4kB is |
260 | not really precise. */ |
261 | 4, /* size of l2 cache; same value as the l1 entry above */ |
262 | 0, /* size of prefetch block */ |
263 | 0, /* number of parallel prefetches */ |
264 | 1, /* Branch cost */ |
265 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ |
266 | COSTS_N_INSNS (16), /* cost of FMUL instruction. */ |
267 | COSTS_N_INSNS (73), /* cost of FDIV instruction. */ |
268 | COSTS_N_INSNS (3), /* cost of FABS instruction. */ |
269 | COSTS_N_INSNS (3), /* cost of FCHS instruction. */ |
270 | COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ |
271 | |
272 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
273 | COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ |
274 | COSTS_N_INSNS (16), /* cost of MULSS instruction. */ |
275 | COSTS_N_INSNS (16), /* cost of MULSD instruction. */ |
276 | COSTS_N_INSNS (16), /* cost of FMA SS instruction. */ |
277 | COSTS_N_INSNS (16), /* cost of FMA SD instruction. */ |
278 | COSTS_N_INSNS (73), /* cost of DIVSS instruction. */ |
279 | COSTS_N_INSNS (74), /* cost of DIVSD instruction. NOTE(review): 74 here while FDIV and DIVSS are 73 - confirm intended. */ |
280 | COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */ |
281 | COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */ |
282 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
283 | i486_memcpy, /* memcpy strategy table defined above */ |
284 | i486_memset, /* memset strategy table defined above */ |
285 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
286 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
287 | }; |
288 | |
289 | static stringop_algs pentium_memcpy[2] = { /* Pentium memcpy strategy: a 256 entry naming rep_prefix_4_byte, then a -1 entry naming libcall (presumably -1 means no upper size bound - TODO confirm); second element is the libcall placeholder. */ |
290 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
291 | DUMMY_STRINGOP_ALGS}; |
292 | static stringop_algs pentium_memset[2] = { /* Pentium memset strategy: a single -1 entry naming rep_prefix_4_byte; second element is the libcall placeholder. */ |
293 | {libcall, {{-1, rep_prefix_4_byte, false}}}, |
294 | DUMMY_STRINGOP_ALGS}; |
295 | |
296 | static const |
297 | struct processor_costs pentium_cost = { /* Pentium specific costs; COSTS_N_INSNS entries are relative to an add */ |
298 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
299 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
300 | COSTS_N_INSNS (4), /* variable shift costs */ |
301 | COSTS_N_INSNS (1), /* constant shift costs */ |
302 | {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ |
303 | COSTS_N_INSNS (11), /* HI */ |
304 | COSTS_N_INSNS (11), /* SI */ |
305 | COSTS_N_INSNS (11), /* DI */ |
306 | COSTS_N_INSNS (11)}, /* other */ |
307 | 0, /* cost of multiply per each bit set */ |
308 | {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ |
309 | COSTS_N_INSNS (25), /* HI */ |
310 | COSTS_N_INSNS (25), /* SI */ |
311 | COSTS_N_INSNS (25), /* DI */ |
312 | COSTS_N_INSNS (25)}, /* other */ |
313 | COSTS_N_INSNS (3), /* cost of movsx */ |
314 | COSTS_N_INSNS (2), /* cost of movzx */ |
315 | 8, /* "large" insn */ |
316 | 6, /* MOVE_RATIO */ |
317 | |
318 | /* All move costs are relative to integer->integer move times 2 and thus |
319 | they are latency*2. */ |
320 | 6, /* cost for loading QImode using movzbl */ |
321 | {2, 4, 2}, /* cost of loading integer registers |
322 | in QImode, HImode and SImode. |
323 | Relative to reg-reg move (2). */ |
324 | {2, 4, 2}, /* cost of storing integer registers */ |
325 | 2, /* cost of reg,reg fld/fst */ |
326 | {2, 2, 6}, /* cost of loading fp registers |
327 | in SFmode, DFmode and XFmode */ |
328 | {4, 4, 6}, /* cost of storing fp registers |
329 | in SFmode, DFmode and XFmode */ |
330 | 8, /* cost of moving MMX register */ |
331 | {8, 8}, /* cost of loading MMX registers |
332 | in SImode and DImode */ |
333 | {8, 8}, /* cost of storing MMX registers |
334 | in SImode and DImode */ |
335 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
336 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers |
337 | in 32,64,128,256 and 512-bit */ |
338 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; same values as the aligned loads above. */ |
339 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers |
340 | in 32,64,128,256 and 512-bit */ |
341 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; same values as the aligned stores above. */ |
342 | 3, 3, /* SSE->integer and integer->SSE moves */ |
343 | 4, 4, /* Gather load static, per_elt. */ |
344 | 4, 4, /* Gather store static, per_elt. */ |
345 | 8, /* size of l1 cache. */ |
346 | 8, /* size of l2 cache; same value as the l1 entry above */ |
347 | 0, /* size of prefetch block */ |
348 | 0, /* number of parallel prefetches */ |
349 | 2, /* Branch cost */ |
350 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
351 | COSTS_N_INSNS (3), /* cost of FMUL instruction. */ |
352 | COSTS_N_INSNS (39), /* cost of FDIV instruction. */ |
353 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ |
354 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ |
355 | COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ |
356 | |
357 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
358 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
359 | COSTS_N_INSNS (3), /* cost of MULSS instruction. */ |
360 | COSTS_N_INSNS (3), /* cost of MULSD instruction. */ |
361 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
362 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ |
363 | COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ |
364 | COSTS_N_INSNS (39), /* cost of DIVSD instruction. */ |
365 | COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */ |
366 | COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */ |
367 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
368 | pentium_memcpy, /* memcpy strategy table defined above */ |
369 | pentium_memset, /* memset strategy table defined above */ |
370 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
371 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
372 | }; |
373 | |
374 | static const |
375 | struct processor_costs lakemont_cost = { /* Lakemont specific costs; COSTS_N_INSNS entries are relative to an add */ |
376 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
377 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; one unit more than the add above */ |
378 | COSTS_N_INSNS (1), /* variable shift costs */ |
379 | COSTS_N_INSNS (1), /* constant shift costs */ |
380 | {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ |
381 | COSTS_N_INSNS (11), /* HI */ |
382 | COSTS_N_INSNS (11), /* SI */ |
383 | COSTS_N_INSNS (11), /* DI */ |
384 | COSTS_N_INSNS (11)}, /* other */ |
385 | 0, /* cost of multiply per each bit set */ |
386 | {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ |
387 | COSTS_N_INSNS (25), /* HI */ |
388 | COSTS_N_INSNS (25), /* SI */ |
389 | COSTS_N_INSNS (25), /* DI */ |
390 | COSTS_N_INSNS (25)}, /* other */ |
391 | COSTS_N_INSNS (3), /* cost of movsx */ |
392 | COSTS_N_INSNS (2), /* cost of movzx */ |
393 | 8, /* "large" insn */ |
394 | 17, /* MOVE_RATIO */ |
395 | |
396 | /* All move costs are relative to integer->integer move times 2 and thus |
397 | they are latency*2. */ |
398 | 6, /* cost for loading QImode using movzbl */ |
399 | {2, 4, 2}, /* cost of loading integer registers |
400 | in QImode, HImode and SImode. |
401 | Relative to reg-reg move (2). */ |
402 | {2, 4, 2}, /* cost of storing integer registers */ |
403 | 2, /* cost of reg,reg fld/fst */ |
404 | {2, 2, 6}, /* cost of loading fp registers |
405 | in SFmode, DFmode and XFmode */ |
406 | {4, 4, 6}, /* cost of storing fp registers |
407 | in SFmode, DFmode and XFmode */ |
408 | 8, /* cost of moving MMX register */ |
409 | {8, 8}, /* cost of loading MMX registers |
410 | in SImode and DImode */ |
411 | {8, 8}, /* cost of storing MMX registers |
412 | in SImode and DImode */ |
413 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
414 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers |
415 | in 32,64,128,256 and 512-bit */ |
416 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; same values as the aligned loads above. */ |
417 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers |
418 | in 32,64,128,256 and 512-bit */ |
419 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; same values as the aligned stores above. */ |
420 | 3, 3, /* SSE->integer and integer->SSE moves */ |
421 | 4, 4, /* Gather load static, per_elt. */ |
422 | 4, 4, /* Gather store static, per_elt. */ |
423 | 8, /* size of l1 cache. */ |
424 | 8, /* size of l2 cache; same value as the l1 entry above */ |
425 | 0, /* size of prefetch block */ |
426 | 0, /* number of parallel prefetches */ |
427 | 2, /* Branch cost */ |
428 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
429 | COSTS_N_INSNS (3), /* cost of FMUL instruction. */ |
430 | COSTS_N_INSNS (39), /* cost of FDIV instruction. */ |
431 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ |
432 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ |
433 | COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ |
434 | |
435 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
436 | COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ |
437 | COSTS_N_INSNS (5), /* cost of MULSS instruction. */ |
438 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ |
439 | COSTS_N_INSNS (10), /* cost of FMA SS instruction. */ |
440 | COSTS_N_INSNS (10), /* cost of FMA SD instruction. */ |
441 | COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ |
442 | COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ |
443 | COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ |
444 | COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ |
445 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
446 | pentium_memcpy, /* reuses the Pentium memcpy strategy table; no Lakemont-specific table is defined here */ |
447 | pentium_memset, /* reuses the Pentium memset strategy table */ |
448 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
449 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
450 | }; |
451 | |
452 | /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes |
453 | (we ensure the alignment). For small blocks an inline loop is still a |
454 | noticeable win; for bigger blocks either rep movsl or rep movsb is the |
455 | way to go. Rep movsb apparently has a more expensive startup time in the CPU, |
456 | but after 4K the difference is down in the noise. */ |
457 | static stringop_algs pentiumpro_memcpy[2] = { /* PentiumPro memcpy strategy: entries 128, 1024 and 8192 name loop, unrolled_loop and rep_prefix_4_byte; the -1 entry names rep_prefix_1_byte (presumably -1 means no upper size bound - TODO confirm). */ |
458 | {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, |
459 | {8192, rep_prefix_4_byte, false}, |
460 | {-1, rep_prefix_1_byte, false}}}, |
461 | DUMMY_STRINGOP_ALGS}; |
462 | static stringop_algs pentiumpro_memset[2] = { /* PentiumPro memset strategy: entries 1024 and 8192 name unrolled_loop and rep_prefix_4_byte; the -1 entry names libcall; second element is the libcall placeholder. */ |
463 | {rep_prefix_4_byte, {{1024, unrolled_loop, false}, |
464 | {8192, rep_prefix_4_byte, false}, |
465 | {-1, libcall, false}}}, |
466 | DUMMY_STRINGOP_ALGS}; |
467 | static const |
468 | struct processor_costs pentiumpro_cost = { /* PentiumPro specific costs; COSTS_N_INSNS entries are relative to an add */ |
469 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
470 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
471 | COSTS_N_INSNS (1), /* variable shift costs */ |
472 | COSTS_N_INSNS (1), /* constant shift costs */ |
473 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ |
474 | COSTS_N_INSNS (4), /* HI */ |
475 | COSTS_N_INSNS (4), /* SI */ |
476 | COSTS_N_INSNS (4), /* DI */ |
477 | COSTS_N_INSNS (4)}, /* other */ |
478 | 0, /* cost of multiply per each bit set */ |
479 | {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ |
480 | COSTS_N_INSNS (17), /* HI */ |
481 | COSTS_N_INSNS (17), /* SI */ |
482 | COSTS_N_INSNS (17), /* DI */ |
483 | COSTS_N_INSNS (17)}, /* other */ |
484 | COSTS_N_INSNS (1), /* cost of movsx */ |
485 | COSTS_N_INSNS (1), /* cost of movzx */ |
486 | 8, /* "large" insn */ |
487 | 6, /* MOVE_RATIO */ |
488 | |
489 | /* All move costs are relative to integer->integer move times 2 and thus |
490 | they are latency*2. */ |
491 | 2, /* cost for loading QImode using movzbl */ |
492 | {4, 4, 4}, /* cost of loading integer registers |
493 | in QImode, HImode and SImode. |
494 | Relative to reg-reg move (2). */ |
495 | {2, 2, 2}, /* cost of storing integer registers */ |
496 | 2, /* cost of reg,reg fld/fst */ |
497 | {2, 2, 6}, /* cost of loading fp registers |
498 | in SFmode, DFmode and XFmode */ |
499 | {4, 4, 6}, /* cost of storing fp registers |
500 | in SFmode, DFmode and XFmode */ |
501 | 2, /* cost of moving MMX register */ |
502 | {2, 2}, /* cost of loading MMX registers |
503 | in SImode and DImode */ |
504 | {2, 2}, /* cost of storing MMX registers |
505 | in SImode and DImode */ |
506 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
507 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers |
508 | in 32,64,128,256 and 512-bit */ |
509 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; same values as the aligned loads above. */ |
510 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers |
511 | in 32,64,128,256 and 512-bit */ |
512 | {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; same values as the aligned stores above. */ |
513 | 3, 3, /* SSE->integer and integer->SSE moves */ |
514 | 4, 4, /* Gather load static, per_elt. */ |
515 | 4, 4, /* Gather store static, per_elt. */ |
516 | 8, /* size of l1 cache. */ |
517 | 256, /* size of l2 cache */ |
518 | 32, /* size of prefetch block */ |
519 | 6, /* number of parallel prefetches */ |
520 | 2, /* Branch cost */ |
521 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
522 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ |
523 | COSTS_N_INSNS (56), /* cost of FDIV instruction. */ |
524 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
525 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
526 | COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ |
527 | |
528 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
529 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
530 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ |
531 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ |
532 | COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ |
533 | COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ |
534 | COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ |
535 | COSTS_N_INSNS (18), /* cost of DIVSD instruction. */ |
536 | COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ |
537 | COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */ |
538 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
539 | pentiumpro_memcpy, /* memcpy strategy table defined above */ |
540 | pentiumpro_memset, /* memset strategy table defined above */ |
541 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
542 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
543 | }; |
544 | |
545 | static stringop_algs geode_memcpy[2] = { /* Geode memcpy strategy: a 256 entry naming rep_prefix_4_byte, then a -1 entry naming libcall; second element is the libcall placeholder. */ |
546 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
547 | DUMMY_STRINGOP_ALGS}; |
548 | static stringop_algs geode_memset[2] = { /* Geode memset strategy: same initializer as geode_memcpy. */ |
549 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
550 | DUMMY_STRINGOP_ALGS}; |
551 | static const |
552 | struct processor_costs geode_cost = { /* Geode specific costs; COSTS_N_INSNS entries are relative to an add */ |
553 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
554 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
555 | COSTS_N_INSNS (2), /* variable shift costs */ |
556 | COSTS_N_INSNS (1), /* constant shift costs */ |
557 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
558 | COSTS_N_INSNS (4), /* HI */ |
559 | COSTS_N_INSNS (7), /* SI */ |
560 | COSTS_N_INSNS (7), /* DI */ |
561 | COSTS_N_INSNS (7)}, /* other */ |
562 | 0, /* cost of multiply per each bit set */ |
563 | {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ |
564 | COSTS_N_INSNS (23), /* HI */ |
565 | COSTS_N_INSNS (39), /* SI */ |
566 | COSTS_N_INSNS (39), /* DI */ |
567 | COSTS_N_INSNS (39)}, /* other */ |
568 | COSTS_N_INSNS (1), /* cost of movsx */ |
569 | COSTS_N_INSNS (1), /* cost of movzx */ |
570 | 8, /* "large" insn */ |
571 | 4, /* MOVE_RATIO */ |
572 | |
573 | /* All move costs are relative to integer->integer move times 2 and thus |
574 | they are latency*2. */ |
575 | 2, /* cost for loading QImode using movzbl */ |
576 | {2, 2, 2}, /* cost of loading integer registers |
577 | in QImode, HImode and SImode. |
578 | Relative to reg-reg move (2). */ |
579 | {2, 2, 2}, /* cost of storing integer registers */ |
580 | 2, /* cost of reg,reg fld/fst */ |
581 | {2, 2, 2}, /* cost of loading fp registers |
582 | in SFmode, DFmode and XFmode */ |
583 | {4, 6, 6}, /* cost of storing fp registers |
584 | in SFmode, DFmode and XFmode */ |
585 | |
586 | 2, /* cost of moving MMX register */ |
587 | {2, 2}, /* cost of loading MMX registers |
588 | in SImode and DImode */ |
589 | {2, 2}, /* cost of storing MMX registers |
590 | in SImode and DImode */ |
591 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
592 | {2, 2, 8, 16, 32}, /* cost of loading SSE registers |
593 | in 32,64,128,256 and 512-bit */ |
594 | {2, 2, 8, 16, 32}, /* cost of unaligned SSE loads; same values as the aligned loads above. */ |
595 | {2, 2, 8, 16, 32}, /* cost of storing SSE registers |
596 | in 32,64,128,256 and 512-bit */ |
597 | {2, 2, 8, 16, 32}, /* cost of unaligned SSE stores; same values as the aligned stores above. */ |
598 | 6, 6, /* SSE->integer and integer->SSE moves */ |
599 | 2, 2, /* Gather load static, per_elt. */ |
600 | 2, 2, /* Gather store static, per_elt. */ |
601 | 64, /* size of l1 cache. */ |
602 | 128, /* size of l2 cache. */ |
603 | 32, /* size of prefetch block */ |
604 | 1, /* number of parallel prefetches */ |
605 | 1, /* Branch cost */ |
606 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ |
607 | COSTS_N_INSNS (11), /* cost of FMUL instruction. */ |
608 | COSTS_N_INSNS (47), /* cost of FDIV instruction. */ |
609 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ |
610 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ |
611 | COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ |
612 | |
613 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
614 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
615 | COSTS_N_INSNS (11), /* cost of MULSS instruction. */ |
616 | COSTS_N_INSNS (11), /* cost of MULSD instruction. */ |
617 | COSTS_N_INSNS (17), /* cost of FMA SS instruction. */ |
618 | COSTS_N_INSNS (17), /* cost of FMA SD instruction. */ |
619 | COSTS_N_INSNS (47), /* cost of DIVSS instruction. */ |
620 | COSTS_N_INSNS (47), /* cost of DIVSD instruction. */ |
621 | COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */ |
622 | COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */ |
623 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
624 | geode_memcpy, /* memcpy strategy table defined above */ |
625 | geode_memset, /* memset strategy table defined above */ |
626 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
627 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
628 | }; |
629 | |
630 | static stringop_algs k6_memcpy[2] = { /* K6 memcpy strategy: a 256 entry naming rep_prefix_4_byte, then a -1 entry naming libcall; second element is the libcall placeholder. */ |
631 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
632 | DUMMY_STRINGOP_ALGS}; |
633 | static stringop_algs k6_memset[2] = { /* K6 memset strategy: same initializer as k6_memcpy. */ |
634 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
635 | DUMMY_STRINGOP_ALGS}; |
636 | static const |
637 | struct processor_costs k6_cost = { /* K6 specific costs; COSTS_N_INSNS entries are relative to an add */ |
638 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
639 | COSTS_N_INSNS (2), /* cost of a lea instruction */ |
640 | COSTS_N_INSNS (1), /* variable shift costs */ |
641 | COSTS_N_INSNS (1), /* constant shift costs */ |
642 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
643 | COSTS_N_INSNS (3), /* HI */ |
644 | COSTS_N_INSNS (3), /* SI */ |
645 | COSTS_N_INSNS (3), /* DI */ |
646 | COSTS_N_INSNS (3)}, /* other */ |
647 | 0, /* cost of multiply per each bit set */ |
648 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ |
649 | COSTS_N_INSNS (18), /* HI */ |
650 | COSTS_N_INSNS (18), /* SI */ |
651 | COSTS_N_INSNS (18), /* DI */ |
652 | COSTS_N_INSNS (18)}, /* other */ |
653 | COSTS_N_INSNS (2), /* cost of movsx */ |
654 | COSTS_N_INSNS (2), /* cost of movzx */ |
655 | 8, /* "large" insn */ |
656 | 4, /* MOVE_RATIO */ |
657 | |
658 | /* All move costs are relative to integer->integer move times 2 and thus |
659 | they are latency*2. */ |
660 | 3, /* cost for loading QImode using movzbl */ |
661 | {4, 5, 4}, /* cost of loading integer registers |
662 | in QImode, HImode and SImode. |
663 | Relative to reg-reg move (2). */ |
664 | {2, 3, 2}, /* cost of storing integer registers */ |
665 | 4, /* cost of reg,reg fld/fst */ |
666 | {6, 6, 6}, /* cost of loading fp registers |
667 | in SFmode, DFmode and XFmode */ |
668 | {4, 4, 4}, /* cost of storing fp registers |
669 | in SFmode, DFmode and XFmode */ |
670 | 2, /* cost of moving MMX register */ |
671 | {2, 2}, /* cost of loading MMX registers |
672 | in SImode and DImode */ |
673 | {2, 2}, /* cost of storing MMX registers |
674 | in SImode and DImode */ |
675 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
676 | {2, 2, 8, 16, 32}, /* cost of loading SSE registers |
677 | in 32,64,128,256 and 512-bit */ |
678 | {2, 2, 8, 16, 32}, /* cost of unaligned SSE loads; same values as the aligned loads above. */ |
679 | {2, 2, 8, 16, 32}, /* cost of storing SSE registers |
680 | in 32,64,128,256 and 512-bit */ |
681 | {2, 2, 8, 16, 32}, /* cost of unaligned SSE stores; same values as the aligned stores above. */ |
682 | 6, 6, /* SSE->integer and integer->SSE moves */ |
683 | 2, 2, /* Gather load static, per_elt. */ |
684 | 2, 2, /* Gather store static, per_elt. */ |
685 | 32, /* size of l1 cache. */ |
686 | 32, /* size of l2 cache. Some models |
687 | have integrated l2 cache, but |
688 | optimizing for k6 is not important |
689 | enough to worry about that. */ |
690 | 32, /* size of prefetch block */ |
691 | 1, /* number of parallel prefetches */ |
692 | 1, /* Branch cost */ |
693 | COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ |
694 | COSTS_N_INSNS (2), /* cost of FMUL instruction. */ |
695 | COSTS_N_INSNS (56), /* cost of FDIV instruction. */ |
696 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
697 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
698 | COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ |
699 | |
700 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
701 | COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ |
702 | COSTS_N_INSNS (2), /* cost of MULSS instruction. */ |
703 | COSTS_N_INSNS (2), /* cost of MULSD instruction. */ |
704 | COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ |
705 | COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ |
706 | COSTS_N_INSNS (56), /* cost of DIVSS instruction. */ |
707 | COSTS_N_INSNS (56), /* cost of DIVSD instruction. */ |
708 | COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */ |
709 | COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */ |
710 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
711 | k6_memcpy, /* memcpy strategy table defined above */ |
712 | k6_memset, /* memset strategy table defined above */ |
713 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
714 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
715 | }; |
716 | |
717 | /* For some reason, Athlon deals better with REP prefix (relative to loops) |
718 | compared to K8. Alignment becomes important after 8 bytes for memcpy and |
719 | 128 bytes for memset. */ |
720 | static stringop_algs athlon_memcpy[2] = { |
721 | {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
722 | DUMMY_STRINGOP_ALGS}; |
723 | static stringop_algs athlon_memset[2] = { |
724 | {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
725 | DUMMY_STRINGOP_ALGS}; |
726 | static const |
727 | struct processor_costs athlon_cost = { |
728 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
729 | COSTS_N_INSNS (2), /* cost of a lea instruction */ |
730 | COSTS_N_INSNS (1), /* variable shift costs */ |
731 | COSTS_N_INSNS (1), /* constant shift costs */ |
732 | {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ |
733 | COSTS_N_INSNS (5), /* HI */ |
734 | COSTS_N_INSNS (5), /* SI */ |
735 | COSTS_N_INSNS (5), /* DI */ |
736 | COSTS_N_INSNS (5)}, /* other */ |
737 | 0, /* cost of multiply per each bit set */ |
738 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ |
739 | COSTS_N_INSNS (26), /* HI */ |
740 | COSTS_N_INSNS (42), /* SI */ |
741 | COSTS_N_INSNS (74), /* DI */ |
742 | COSTS_N_INSNS (74)}, /* other */ |
743 | COSTS_N_INSNS (1), /* cost of movsx */ |
744 | COSTS_N_INSNS (1), /* cost of movzx */ |
745 | 8, /* "large" insn */ |
746 | 9, /* MOVE_RATIO */ |
747 | |
748 | /* All move costs are relative to integer->integer move times 2 and thus |
749 | they are latency*2. */ |
750 | 4, /* cost for loading QImode using movzbl */ |
751 | {3, 4, 3}, /* cost of loading integer registers |
752 | in QImode, HImode and SImode. |
753 | Relative to reg-reg move (2). */ |
754 | {3, 4, 3}, /* cost of storing integer registers */ |
755 | 4, /* cost of reg,reg fld/fst */ |
756 | {4, 4, 12}, /* cost of loading fp registers |
757 | in SFmode, DFmode and XFmode */ |
758 | {6, 6, 8}, /* cost of storing fp registers |
759 | in SFmode, DFmode and XFmode */ |
760 | 2, /* cost of moving MMX register */ |
761 | {4, 4}, /* cost of loading MMX registers |
762 | in SImode and DImode */ |
763 | {4, 4}, /* cost of storing MMX registers |
764 | in SImode and DImode */ |
765 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
766 | {4, 4, 6, 12, 24}, /* cost of loading SSE registers |
767 | in 32,64,128,256 and 512-bit */ |
768 | {4, 4, 6, 12, 24}, /* cost of unaligned loads. */ |
769 | {4, 4, 5, 10, 20}, /* cost of storing SSE registers |
770 | in 32,64,128,256 and 512-bit */ |
771 | {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ |
772 | 5, 5, /* SSE->integer and integer->SSE moves */ |
773 | 4, 4, /* Gather load static, per_elt. */ |
774 | 4, 4, /* Gather store static, per_elt. */ |
775 | 64, /* size of l1 cache. */ |
776 | 256, /* size of l2 cache. */ |
777 | 64, /* size of prefetch block */ |
778 | 6, /* number of parallel prefetches */ |
779 | 5, /* Branch cost */ |
780 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ |
781 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ |
782 | COSTS_N_INSNS (24), /* cost of FDIV instruction. */ |
783 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
784 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
785 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ |
786 | |
787 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
788 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
789 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ |
790 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ |
791 | COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ |
792 | COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ |
793 | /* 11-16 */ |
794 | COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ |
795 | COSTS_N_INSNS (24), /* cost of DIVSD instruction. */ |
796 | COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ |
797 | COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */ |
798 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
799 | athlon_memcpy, |
800 | athlon_memset, |
801 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
802 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
803 | }; |
804 | |
/* K8 has an optimized REP instruction for medium-sized blocks, but for very
   small blocks it is better to use a loop.  For large blocks, a libcall can
   do non-temporal accesses and beat inline code considerably.  */
808 | static stringop_algs k8_memcpy[2] = { |
809 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, |
810 | {-1, rep_prefix_4_byte, false}}}, |
811 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, |
812 | {-1, libcall, false}}}}; |
813 | static stringop_algs k8_memset[2] = { |
814 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, |
815 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
816 | {libcall, {{48, unrolled_loop, false}, |
817 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; |
818 | static const |
819 | struct processor_costs k8_cost = { |
820 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
821 | COSTS_N_INSNS (2), /* cost of a lea instruction */ |
822 | COSTS_N_INSNS (1), /* variable shift costs */ |
823 | COSTS_N_INSNS (1), /* constant shift costs */ |
824 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
825 | COSTS_N_INSNS (4), /* HI */ |
826 | COSTS_N_INSNS (3), /* SI */ |
827 | COSTS_N_INSNS (4), /* DI */ |
828 | COSTS_N_INSNS (5)}, /* other */ |
829 | 0, /* cost of multiply per each bit set */ |
830 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ |
831 | COSTS_N_INSNS (26), /* HI */ |
832 | COSTS_N_INSNS (42), /* SI */ |
833 | COSTS_N_INSNS (74), /* DI */ |
834 | COSTS_N_INSNS (74)}, /* other */ |
835 | COSTS_N_INSNS (1), /* cost of movsx */ |
836 | COSTS_N_INSNS (1), /* cost of movzx */ |
837 | 8, /* "large" insn */ |
838 | 9, /* MOVE_RATIO */ |
839 | |
840 | /* All move costs are relative to integer->integer move times 2 and thus |
841 | they are latency*2. */ |
842 | 4, /* cost for loading QImode using movzbl */ |
843 | {3, 4, 3}, /* cost of loading integer registers |
844 | in QImode, HImode and SImode. |
845 | Relative to reg-reg move (2). */ |
846 | {3, 4, 3}, /* cost of storing integer registers */ |
847 | 4, /* cost of reg,reg fld/fst */ |
848 | {4, 4, 12}, /* cost of loading fp registers |
849 | in SFmode, DFmode and XFmode */ |
850 | {6, 6, 8}, /* cost of storing fp registers |
851 | in SFmode, DFmode and XFmode */ |
852 | 2, /* cost of moving MMX register */ |
853 | {3, 3}, /* cost of loading MMX registers |
854 | in SImode and DImode */ |
855 | {4, 4}, /* cost of storing MMX registers |
856 | in SImode and DImode */ |
857 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
858 | {4, 3, 6, 12, 24}, /* cost of loading SSE registers |
859 | in 32,64,128,256 and 512-bit */ |
860 | {4, 3, 6, 12, 24}, /* cost of unaligned loads. */ |
861 | {4, 4, 5, 10, 20}, /* cost of storing SSE registers |
862 | in 32,64,128,256 and 512-bit */ |
863 | {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ |
864 | 5, 5, /* SSE->integer and integer->SSE moves */ |
865 | 4, 4, /* Gather load static, per_elt. */ |
866 | 4, 4, /* Gather store static, per_elt. */ |
867 | 64, /* size of l1 cache. */ |
868 | 512, /* size of l2 cache. */ |
869 | 64, /* size of prefetch block */ |
870 | /* New AMD processors never drop prefetches; if they cannot be performed |
871 | immediately, they are queued. We set number of simultaneous prefetches |
872 | to a large constant to reflect this (it probably is not a good idea not |
873 | to limit number of prefetches at all, as their execution also takes some |
874 | time). */ |
875 | 100, /* number of parallel prefetches */ |
876 | 3, /* Branch cost */ |
877 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ |
878 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ |
879 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ |
880 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
881 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
882 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ |
883 | |
884 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
885 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
886 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ |
887 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ |
888 | COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ |
889 | COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ |
890 | /* 11-16 */ |
891 | COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ |
892 | COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ |
893 | COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ |
894 | COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ |
895 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
896 | k8_memcpy, |
897 | k8_memset, |
898 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
899 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ |
900 | }; |
901 | |
/* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably.  */
905 | static stringop_algs amdfam10_memcpy[2] = { |
906 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, |
907 | {-1, rep_prefix_4_byte, false}}}, |
908 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, |
909 | {-1, libcall, false}}}}; |
910 | static stringop_algs amdfam10_memset[2] = { |
911 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, |
912 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
913 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, |
914 | {-1, libcall, false}}}}; |
915 | struct processor_costs amdfam10_cost = { |
916 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
917 | COSTS_N_INSNS (2), /* cost of a lea instruction */ |
918 | COSTS_N_INSNS (1), /* variable shift costs */ |
919 | COSTS_N_INSNS (1), /* constant shift costs */ |
920 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
921 | COSTS_N_INSNS (4), /* HI */ |
922 | COSTS_N_INSNS (3), /* SI */ |
923 | COSTS_N_INSNS (4), /* DI */ |
924 | COSTS_N_INSNS (5)}, /* other */ |
925 | 0, /* cost of multiply per each bit set */ |
926 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ |
927 | COSTS_N_INSNS (35), /* HI */ |
928 | COSTS_N_INSNS (51), /* SI */ |
929 | COSTS_N_INSNS (83), /* DI */ |
930 | COSTS_N_INSNS (83)}, /* other */ |
931 | COSTS_N_INSNS (1), /* cost of movsx */ |
932 | COSTS_N_INSNS (1), /* cost of movzx */ |
933 | 8, /* "large" insn */ |
934 | 9, /* MOVE_RATIO */ |
935 | |
936 | /* All move costs are relative to integer->integer move times 2 and thus |
937 | they are latency*2. */ |
938 | 4, /* cost for loading QImode using movzbl */ |
939 | {3, 4, 3}, /* cost of loading integer registers |
940 | in QImode, HImode and SImode. |
941 | Relative to reg-reg move (2). */ |
942 | {3, 4, 3}, /* cost of storing integer registers */ |
943 | 4, /* cost of reg,reg fld/fst */ |
944 | {4, 4, 12}, /* cost of loading fp registers |
945 | in SFmode, DFmode and XFmode */ |
946 | {6, 6, 8}, /* cost of storing fp registers |
947 | in SFmode, DFmode and XFmode */ |
948 | 2, /* cost of moving MMX register */ |
949 | {3, 3}, /* cost of loading MMX registers |
950 | in SImode and DImode */ |
951 | {4, 4}, /* cost of storing MMX registers |
952 | in SImode and DImode */ |
953 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
954 | {4, 4, 3, 6, 12}, /* cost of loading SSE registers |
955 | in 32,64,128,256 and 512-bit */ |
956 | {4, 4, 3, 7, 12}, /* cost of unaligned loads. */ |
957 | {4, 4, 5, 10, 20}, /* cost of storing SSE registers |
958 | in 32,64,128,256 and 512-bit */ |
959 | {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ |
960 | 3, 3, /* SSE->integer and integer->SSE moves */ |
961 | /* On K8: |
962 | MOVD reg64, xmmreg Double FSTORE 4 |
963 | MOVD reg32, xmmreg Double FSTORE 4 |
964 | On AMDFAM10: |
965 | MOVD reg64, xmmreg Double FADD 3 |
966 | 1/1 1/1 |
967 | MOVD reg32, xmmreg Double FADD 3 |
968 | 1/1 1/1 */ |
969 | 4, 4, /* Gather load static, per_elt. */ |
970 | 4, 4, /* Gather store static, per_elt. */ |
971 | 64, /* size of l1 cache. */ |
972 | 512, /* size of l2 cache. */ |
973 | 64, /* size of prefetch block */ |
974 | /* New AMD processors never drop prefetches; if they cannot be performed |
975 | immediately, they are queued. We set number of simultaneous prefetches |
976 | to a large constant to reflect this (it probably is not a good idea not |
977 | to limit number of prefetches at all, as their execution also takes some |
978 | time). */ |
979 | 100, /* number of parallel prefetches */ |
980 | 2, /* Branch cost */ |
981 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ |
982 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ |
983 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ |
984 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
985 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
986 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ |
987 | |
988 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
989 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
990 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ |
991 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ |
992 | COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ |
993 | COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ |
994 | /* 11-16 */ |
995 | COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ |
996 | COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ |
997 | COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ |
998 | COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ |
999 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1000 | amdfam10_memcpy, |
1001 | amdfam10_memset, |
1002 | COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ |
1003 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
1004 | }; |
1005 | |
/* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably.  */
1009 | static stringop_algs bdver1_memcpy[2] = { |
1010 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, |
1011 | {-1, rep_prefix_4_byte, false}}}, |
1012 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, |
1013 | {-1, libcall, false}}}}; |
1014 | static stringop_algs bdver1_memset[2] = { |
1015 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, |
1016 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
1017 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, |
1018 | {-1, libcall, false}}}}; |
1019 | |
1020 | const struct processor_costs bdver1_cost = { |
1021 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1022 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
1023 | COSTS_N_INSNS (1), /* variable shift costs */ |
1024 | COSTS_N_INSNS (1), /* constant shift costs */ |
1025 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ |
1026 | COSTS_N_INSNS (4), /* HI */ |
1027 | COSTS_N_INSNS (4), /* SI */ |
1028 | COSTS_N_INSNS (6), /* DI */ |
1029 | COSTS_N_INSNS (6)}, /* other */ |
1030 | 0, /* cost of multiply per each bit set */ |
1031 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ |
1032 | COSTS_N_INSNS (35), /* HI */ |
1033 | COSTS_N_INSNS (51), /* SI */ |
1034 | COSTS_N_INSNS (83), /* DI */ |
1035 | COSTS_N_INSNS (83)}, /* other */ |
1036 | COSTS_N_INSNS (1), /* cost of movsx */ |
1037 | COSTS_N_INSNS (1), /* cost of movzx */ |
1038 | 8, /* "large" insn */ |
1039 | 9, /* MOVE_RATIO */ |
1040 | |
1041 | /* All move costs are relative to integer->integer move times 2 and thus |
1042 | they are latency*2. */ |
1043 | 8, /* cost for loading QImode using movzbl */ |
1044 | {8, 8, 8}, /* cost of loading integer registers |
1045 | in QImode, HImode and SImode. |
1046 | Relative to reg-reg move (2). */ |
1047 | {8, 8, 8}, /* cost of storing integer registers */ |
1048 | 4, /* cost of reg,reg fld/fst */ |
1049 | {12, 12, 28}, /* cost of loading fp registers |
1050 | in SFmode, DFmode and XFmode */ |
1051 | {10, 10, 18}, /* cost of storing fp registers |
1052 | in SFmode, DFmode and XFmode */ |
1053 | 4, /* cost of moving MMX register */ |
1054 | {12, 12}, /* cost of loading MMX registers |
1055 | in SImode and DImode */ |
1056 | {10, 10}, /* cost of storing MMX registers |
1057 | in SImode and DImode */ |
1058 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1059 | {12, 12, 10, 20, 30}, /* cost of loading SSE registers |
1060 | in 32,64,128,256 and 512-bit */ |
1061 | {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ |
1062 | {10, 10, 10, 20, 30}, /* cost of storing SSE registers |
1063 | in 32,64,128,256 and 512-bit */ |
1064 | {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ |
1065 | 16, 20, /* SSE->integer and integer->SSE moves */ |
1066 | 12, 12, /* Gather load static, per_elt. */ |
1067 | 10, 10, /* Gather store static, per_elt. */ |
1068 | 16, /* size of l1 cache. */ |
1069 | 2048, /* size of l2 cache. */ |
1070 | 64, /* size of prefetch block */ |
1071 | /* New AMD processors never drop prefetches; if they cannot be performed |
1072 | immediately, they are queued. We set number of simultaneous prefetches |
1073 | to a large constant to reflect this (it probably is not a good idea not |
1074 | to limit number of prefetches at all, as their execution also takes some |
1075 | time). */ |
1076 | 100, /* number of parallel prefetches */ |
1077 | 2, /* Branch cost */ |
1078 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ |
1079 | COSTS_N_INSNS (6), /* cost of FMUL instruction. */ |
1080 | COSTS_N_INSNS (42), /* cost of FDIV instruction. */ |
1081 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
1082 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
1083 | COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ |
1084 | |
1085 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
1086 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1087 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ |
1088 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ |
1089 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1090 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ |
1091 | /* 9-24 */ |
1092 | COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ |
1093 | /* 9-27 */ |
1094 | COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ |
1095 | COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ |
1096 | COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ |
1097 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1098 | bdver1_memcpy, |
1099 | bdver1_memset, |
1100 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1101 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ |
1102 | }; |
1103 | |
/* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably.  */
1107 | |
1108 | static stringop_algs bdver2_memcpy[2] = { |
1109 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, |
1110 | {-1, rep_prefix_4_byte, false}}}, |
1111 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, |
1112 | {-1, libcall, false}}}}; |
1113 | static stringop_algs bdver2_memset[2] = { |
1114 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, |
1115 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
1116 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, |
1117 | {-1, libcall, false}}}}; |
1118 | |
1119 | const struct processor_costs bdver2_cost = { |
1120 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1121 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
1122 | COSTS_N_INSNS (1), /* variable shift costs */ |
1123 | COSTS_N_INSNS (1), /* constant shift costs */ |
1124 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ |
1125 | COSTS_N_INSNS (4), /* HI */ |
1126 | COSTS_N_INSNS (4), /* SI */ |
1127 | COSTS_N_INSNS (6), /* DI */ |
1128 | COSTS_N_INSNS (6)}, /* other */ |
1129 | 0, /* cost of multiply per each bit set */ |
1130 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ |
1131 | COSTS_N_INSNS (35), /* HI */ |
1132 | COSTS_N_INSNS (51), /* SI */ |
1133 | COSTS_N_INSNS (83), /* DI */ |
1134 | COSTS_N_INSNS (83)}, /* other */ |
1135 | COSTS_N_INSNS (1), /* cost of movsx */ |
1136 | COSTS_N_INSNS (1), /* cost of movzx */ |
1137 | 8, /* "large" insn */ |
1138 | 9, /* MOVE_RATIO */ |
1139 | |
1140 | /* All move costs are relative to integer->integer move times 2 and thus |
1141 | they are latency*2. */ |
1142 | 8, /* cost for loading QImode using movzbl */ |
1143 | {8, 8, 8}, /* cost of loading integer registers |
1144 | in QImode, HImode and SImode. |
1145 | Relative to reg-reg move (2). */ |
1146 | {8, 8, 8}, /* cost of storing integer registers */ |
1147 | 4, /* cost of reg,reg fld/fst */ |
1148 | {12, 12, 28}, /* cost of loading fp registers |
1149 | in SFmode, DFmode and XFmode */ |
1150 | {10, 10, 18}, /* cost of storing fp registers |
1151 | in SFmode, DFmode and XFmode */ |
1152 | 4, /* cost of moving MMX register */ |
1153 | {12, 12}, /* cost of loading MMX registers |
1154 | in SImode and DImode */ |
1155 | {10, 10}, /* cost of storing MMX registers |
1156 | in SImode and DImode */ |
1157 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1158 | {12, 12, 10, 20, 30}, /* cost of loading SSE registers |
1159 | in 32,64,128,256 and 512-bit */ |
1160 | {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ |
1161 | {10, 10, 10, 20, 30}, /* cost of storing SSE registers |
1162 | in 32,64,128,256 and 512-bit */ |
1163 | {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ |
1164 | 16, 20, /* SSE->integer and integer->SSE moves */ |
1165 | 12, 12, /* Gather load static, per_elt. */ |
1166 | 10, 10, /* Gather store static, per_elt. */ |
1167 | 16, /* size of l1 cache. */ |
1168 | 2048, /* size of l2 cache. */ |
1169 | 64, /* size of prefetch block */ |
1170 | /* New AMD processors never drop prefetches; if they cannot be performed |
1171 | immediately, they are queued. We set number of simultaneous prefetches |
1172 | to a large constant to reflect this (it probably is not a good idea not |
1173 | to limit number of prefetches at all, as their execution also takes some |
1174 | time). */ |
1175 | 100, /* number of parallel prefetches */ |
1176 | 2, /* Branch cost */ |
1177 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ |
1178 | COSTS_N_INSNS (6), /* cost of FMUL instruction. */ |
1179 | COSTS_N_INSNS (42), /* cost of FDIV instruction. */ |
1180 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
1181 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
1182 | COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ |
1183 | |
1184 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
1185 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1186 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ |
1187 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ |
1188 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1189 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ |
1190 | /* 9-24 */ |
1191 | COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ |
1192 | /* 9-27 */ |
1193 | COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ |
1194 | COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ |
1195 | COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ |
1196 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1197 | bdver2_memcpy, |
1198 | bdver2_memset, |
1199 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1200 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ |
1201 | }; |
1202 | |
1203 | |
/* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably.  */
1207 | static stringop_algs bdver3_memcpy[2] = { |
1208 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, |
1209 | {-1, rep_prefix_4_byte, false}}}, |
1210 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, |
1211 | {-1, libcall, false}}}}; |
1212 | static stringop_algs bdver3_memset[2] = { |
1213 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, |
1214 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
1215 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, |
1216 | {-1, libcall, false}}}}; |
1217 | struct processor_costs bdver3_cost = { |
1218 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1219 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
1220 | COSTS_N_INSNS (1), /* variable shift costs */ |
1221 | COSTS_N_INSNS (1), /* constant shift costs */ |
1222 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ |
1223 | COSTS_N_INSNS (4), /* HI */ |
1224 | COSTS_N_INSNS (4), /* SI */ |
1225 | COSTS_N_INSNS (6), /* DI */ |
1226 | COSTS_N_INSNS (6)}, /* other */ |
1227 | 0, /* cost of multiply per each bit set */ |
1228 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ |
1229 | COSTS_N_INSNS (35), /* HI */ |
1230 | COSTS_N_INSNS (51), /* SI */ |
1231 | COSTS_N_INSNS (83), /* DI */ |
1232 | COSTS_N_INSNS (83)}, /* other */ |
1233 | COSTS_N_INSNS (1), /* cost of movsx */ |
1234 | COSTS_N_INSNS (1), /* cost of movzx */ |
1235 | 8, /* "large" insn */ |
1236 | 9, /* MOVE_RATIO */ |
1237 | |
1238 | /* All move costs are relative to integer->integer move times 2 and thus |
1239 | they are latency*2. */ |
1240 | 8, /* cost for loading QImode using movzbl */ |
1241 | {8, 8, 8}, /* cost of loading integer registers |
1242 | in QImode, HImode and SImode. |
1243 | Relative to reg-reg move (2). */ |
1244 | {8, 8, 8}, /* cost of storing integer registers */ |
1245 | 4, /* cost of reg,reg fld/fst */ |
1246 | {12, 12, 28}, /* cost of loading fp registers |
1247 | in SFmode, DFmode and XFmode */ |
1248 | {10, 10, 18}, /* cost of storing fp registers |
1249 | in SFmode, DFmode and XFmode */ |
1250 | 4, /* cost of moving MMX register */ |
1251 | {12, 12}, /* cost of loading MMX registers |
1252 | in SImode and DImode */ |
1253 | {10, 10}, /* cost of storing MMX registers |
1254 | in SImode and DImode */ |
1255 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1256 | {12, 12, 10, 20, 30}, /* cost of loading SSE registers |
1257 | in 32,64,128,256 and 512-bit */ |
1258 | {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ |
1259 | {10, 10, 10, 20, 30}, /* cost of storing SSE registers |
1260 | in 32,64,128,256 and 512-bit */ |
1261 | {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ |
1262 | 16, 20, /* SSE->integer and integer->SSE moves */ |
1263 | 12, 12, /* Gather load static, per_elt. */ |
1264 | 10, 10, /* Gather store static, per_elt. */ |
1265 | 16, /* size of l1 cache. */ |
1266 | 2048, /* size of l2 cache. */ |
1267 | 64, /* size of prefetch block */ |
1268 | /* New AMD processors never drop prefetches; if they cannot be performed |
1269 | immediately, they are queued. We set number of simultaneous prefetches |
1270 | to a large constant to reflect this (it probably is not a good idea not |
1271 | to limit number of prefetches at all, as their execution also takes some |
1272 | time). */ |
1273 | 100, /* number of parallel prefetches */ |
1274 | 2, /* Branch cost */ |
1275 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ |
1276 | COSTS_N_INSNS (6), /* cost of FMUL instruction. */ |
1277 | COSTS_N_INSNS (42), /* cost of FDIV instruction. */ |
1278 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
1279 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
1280 | COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ |
1281 | |
1282 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
1283 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1284 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ |
1285 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ |
1286 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1287 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ |
1288 | /* 9-24 */ |
1289 | COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ |
1290 | /* 9-27 */ |
1291 | COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ |
1292 | COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ |
1293 | COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ |
1294 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1295 | bdver3_memcpy, |
1296 | bdver3_memset, |
1297 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1298 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ |
1299 | }; |
1300 | |
/* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably.  */
1304 | static stringop_algs bdver4_memcpy[2] = { |
1305 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, |
1306 | {-1, rep_prefix_4_byte, false}}}, |
1307 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, |
1308 | {-1, libcall, false}}}}; |
1309 | static stringop_algs bdver4_memset[2] = { |
1310 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, |
1311 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
1312 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, |
1313 | {-1, libcall, false}}}}; |
1314 | struct processor_costs bdver4_cost = { /* Operation costs for BDVER4; initializers are positional, so order matters. */ |
1315 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1316 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
1317 | COSTS_N_INSNS (1), /* variable shift costs */ |
1318 | COSTS_N_INSNS (1), /* constant shift costs */ |
1319 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ |
1320 | COSTS_N_INSNS (4), /* cost of starting multiply for HI */ |
1321 | COSTS_N_INSNS (4), /* cost of starting multiply for SI */ |
1322 | COSTS_N_INSNS (6), /* cost of starting multiply for DI */ |
1323 | COSTS_N_INSNS (6)}, /* cost of starting multiply for other modes */ |
1324 | 0, /* cost of multiply per each bit set */ |
1325 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ |
1326 | COSTS_N_INSNS (35), /* cost of a divide/mod for HI */ |
1327 | COSTS_N_INSNS (51), /* cost of a divide/mod for SI */ |
1328 | COSTS_N_INSNS (83), /* cost of a divide/mod for DI */ |
1329 | COSTS_N_INSNS (83)}, /* cost of a divide/mod for other modes */ |
1330 | COSTS_N_INSNS (1), /* cost of movsx */ |
1331 | COSTS_N_INSNS (1), /* cost of movzx */ |
1332 | 8, /* "large" insn */ |
1333 | 9, /* MOVE_RATIO */ |
1334 | |
1335 | /* All move costs are relative to an integer->integer move, whose cost is 2, and thus |
1336 | they are latency*2. */ |
1337 | 8, /* cost for loading QImode using movzbl */ |
1338 | {8, 8, 8}, /* cost of loading integer registers |
1339 | in QImode, HImode and SImode. |
1340 | Relative to reg-reg move (2). */ |
1341 | {8, 8, 8}, /* cost of storing integer registers */ |
1342 | 4, /* cost of reg,reg fld/fst */ |
1343 | {12, 12, 28}, /* cost of loading fp registers |
1344 | in SFmode, DFmode and XFmode */ |
1345 | {10, 10, 18}, /* cost of storing fp registers |
1346 | in SFmode, DFmode and XFmode */ |
1347 | 4, /* cost of moving MMX register */ |
1348 | {12, 12}, /* cost of loading MMX registers |
1349 | in SImode and DImode */ |
1350 | {10, 10}, /* cost of storing MMX registers |
1351 | in SImode and DImode */ |
1352 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1353 | {12, 12, 10, 20, 30}, /* cost of loading SSE registers |
1354 | in 32,64,128,256 and 512-bit */ |
1355 | {12, 12, 10, 20, 30}, /* cost of unaligned loads; five entries, presumably the same widths. */ |
1356 | {10, 10, 10, 20, 30}, /* cost of storing SSE registers |
1357 | in 32,64,128,256 and 512-bit */ |
1358 | {10, 10, 10, 20, 30}, /* cost of unaligned stores; five entries, presumably the same widths. */ |
1359 | 16, 20, /* SSE->integer and integer->SSE moves */ |
1360 | 12, 12, /* Gather load static, per_elt. */ |
1361 | 10, 10, /* Gather store static, per_elt. */ |
1362 | 16, /* size of l1 cache. */ |
1363 | 2048, /* size of l2 cache. */ |
1364 | 64, /* size of prefetch block */ |
1365 | /* New AMD processors never drop prefetches; if they cannot be performed |
1366 | immediately, they are queued. We set number of simultaneous prefetches |
1367 | to a large constant to reflect this (it probably is not a good idea not |
1368 | to limit number of prefetches at all, as their execution also takes some |
1369 | time). */ |
1370 | 100, /* number of parallel prefetches */ |
1371 | 2, /* Branch cost */ |
1372 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ |
1373 | COSTS_N_INSNS (6), /* cost of FMUL instruction. */ |
1374 | COSTS_N_INSNS (42), /* cost of FDIV instruction. */ |
1375 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
1376 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
1377 | COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ |
1378 | |
1379 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
1380 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1381 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ |
1382 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ |
1383 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1384 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ |
1385 | /* Range 9-24 (presumably latency); the upper bound is used. */ |
1386 | COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ |
1387 | /* Range 9-27 (presumably latency); the upper bound is used. */ |
1388 | COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ |
1389 | COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ |
1390 | COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ |
1391 | 1, 2, 1, 1, /* reassociation width for int, fp, vec_int, vec_fp. */ |
1392 | bdver4_memcpy, /* memcpy strategy table (defined above). */ |
1393 | bdver4_memset, /* memset strategy table (defined above). */ |
1394 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1395 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ |
1396 | }; |
1397 | |
1398 | |
1399 | /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for |
1400 | very small blocks it is better to use a loop. For large blocks, a libcall |
1401 | can do non-temporal accesses and beat inline code considerably. */ |
1402 | static stringop_algs znver1_memcpy[2] = { /* ZNVER1 memcpy strategies; each inner triple pairs what appears to be a block-size limit with an algorithm (confirm against the stringop_algs declaration). */ |
1403 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, /* [0]: loop, then unrolled loop, ... */ |
1404 | {-1, rep_prefix_4_byte, false}}}, /* ... then 4-byte REP; -1 presumably means "no upper limit". */ |
1405 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* [1]: loop, then 8-byte REP, ... */ |
1406 | {-1, libcall, false}}}}; /* ... then libcall for everything larger. */ |
1407 | static stringop_algs znver1_memset[2] = { /* ZNVER1 memset strategies; same layout as the memcpy table above. */ |
1408 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, /* [0]: loop, then unrolled loop, ... */ |
1409 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, /* ... then 4-byte REP, then libcall. */ |
1410 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* [1]: unrolled loop, then 8-byte REP, ... */ |
1411 | {-1, libcall, false}}}}; /* ... then libcall. */ |
1412 | struct processor_costs znver1_cost = { /* Operation costs for ZNVER1; initializers are positional, so order matters. */ |
1413 | COSTS_N_INSNS (1), /* cost of an add instruction. */ |
1414 | COSTS_N_INSNS (1), /* cost of a lea instruction. */ |
1415 | COSTS_N_INSNS (1), /* variable shift costs. */ |
1416 | COSTS_N_INSNS (1), /* constant shift costs. */ |
1417 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ |
1418 | COSTS_N_INSNS (3), /* cost of starting multiply for HI. */ |
1419 | COSTS_N_INSNS (3), /* cost of starting multiply for SI. */ |
1420 | COSTS_N_INSNS (3), /* cost of starting multiply for DI. */ |
1421 | COSTS_N_INSNS (3)}, /* cost of starting multiply for other modes. */ |
1422 | 0, /* cost of multiply per each bit |
1423 | set. */ |
1424 | /* Depending on parameters, idiv can get faster on ryzen. This is an upper |
1425 | bound. */ |
1426 | {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ |
1427 | COSTS_N_INSNS (22), /* cost of a divide/mod for HI. */ |
1428 | COSTS_N_INSNS (30), /* cost of a divide/mod for SI. */ |
1429 | COSTS_N_INSNS (45), /* cost of a divide/mod for DI. */ |
1430 | COSTS_N_INSNS (45)}, /* cost of a divide/mod for other modes. */ |
1431 | COSTS_N_INSNS (1), /* cost of movsx. */ |
1432 | COSTS_N_INSNS (1), /* cost of movzx. */ |
1433 | 8, /* "large" insn. */ |
1434 | 9, /* MOVE_RATIO. */ |
1435 | |
1436 | /* All move costs are relative to an integer->integer move, whose cost is 2, and thus |
1437 | they are latency*2. */ |
1438 | |
1439 | /* reg-reg moves are done by renaming and thus they are even cheaper than |
1440 | 1 cycle. Because reg-reg move cost is 2 and the following tables correspond |
1441 | to doubles of latencies, we do not model this correctly. It does not |
1442 | seem to make practical difference to bump prices up even more. */ |
1443 | 6, /* cost for loading QImode using |
1444 | movzbl. */ |
1445 | {6, 6, 6}, /* cost of loading integer registers |
1446 | in QImode, HImode and SImode. |
1447 | Relative to reg-reg move (2). */ |
1448 | {8, 8, 8}, /* cost of storing integer |
1449 | registers. */ |
1450 | 2, /* cost of reg,reg fld/fst. */ |
1451 | {6, 6, 16}, /* cost of loading fp registers |
1452 | in SFmode, DFmode and XFmode. */ |
1453 | {8, 8, 16}, /* cost of storing fp registers |
1454 | in SFmode, DFmode and XFmode. */ |
1455 | 2, /* cost of moving MMX register. */ |
1456 | {6, 6}, /* cost of loading MMX registers |
1457 | in SImode and DImode. */ |
1458 | {8, 8}, /* cost of storing MMX registers |
1459 | in SImode and DImode. */ |
1460 | 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ |
1461 | {6, 6, 6, 10, 20}, /* cost of loading SSE registers |
1462 | in 32,64,128,256 and 512-bit. */ |
1463 | {6, 6, 6, 10, 20}, /* cost of unaligned loads; five entries, presumably the same widths. */ |
1464 | {8, 8, 8, 8, 16}, /* cost of storing SSE registers |
1465 | in 32,64,128,256 and 512-bit. */ |
1466 | {8, 8, 8, 8, 16}, /* cost of unaligned stores; five entries, presumably the same widths. */ |
1467 | 6, 6, /* SSE->integer and integer->SSE moves. */ |
1468 | /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, |
1469 | throughput 12. Approx 9 uops do not depend on vector size and every load |
1470 | is 7 uops. NOTE(review): VGATHERDPD is named twice above; one of the two figures presumably belongs to a different gather variant -- confirm. */ |
1471 | 18, 8, /* Gather load static, per_elt. */ |
1472 | 18, 10, /* Gather store static, per_elt. */ |
1473 | 32, /* size of l1 cache. */ |
1474 | 512, /* size of l2 cache. */ |
1475 | 64, /* size of prefetch block. */ |
1476 | /* New AMD processors never drop prefetches; if they cannot be performed |
1477 | immediately, they are queued. We set number of simultaneous prefetches |
1478 | to a large constant to reflect this (it probably is not a good idea not |
1479 | to limit number of prefetches at all, as their execution also takes some |
1480 | time). */ |
1481 | 100, /* number of parallel prefetches. */ |
1482 | 3, /* Branch cost. */ |
1483 | COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ |
1484 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ |
1485 | /* Latency of fdiv is 8-15; the upper bound is used. */ |
1486 | COSTS_N_INSNS (15), /* cost of FDIV instruction. */ |
1487 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ |
1488 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ |
1489 | /* Latency of fsqrt is 4-10; the upper bound is used. */ |
1490 | COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ |
1491 | |
1492 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
1493 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1494 | COSTS_N_INSNS (3), /* cost of MULSS instruction. */ |
1495 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ |
1496 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
1497 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ |
1498 | COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ |
1499 | /* Range 9-13 (presumably latency); the upper bound is used. */ |
1500 | COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ |
1501 | COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ |
1502 | COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ |
1503 | /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles |
1504 | and it can execute 2 integer additions and 2 multiplications thus |
1505 | reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest |
1506 | that 4 works better than 6 probably due to register pressure. |
1507 | |
1508 | Integer vector operations are taken by FP unit and execute 3 vector |
1509 | plus/minus operations per cycle but only one multiply. This is adjusted |
1510 | in ix86_reassociation_width. */ |
1511 | 4, 4, 3, 6, /* reassociation width for int, fp, vec_int, vec_fp. */ |
1512 | znver1_memcpy, /* memcpy strategy table (defined above). */ |
1513 | znver1_memset, /* memset strategy table (defined above). */ |
1514 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1515 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ |
1516 | }; |
1517 | |
1518 | /* skylake_cost should produce code tuned for the Skylake family of CPUs. */ |
1519 | static stringop_algs skylake_memcpy[2] = { /* Skylake memcpy strategies; each inner triple pairs what appears to be a block-size limit with an algorithm (confirm against the stringop_algs declaration). */ |
1520 | {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, /* [0]: 4-byte REP, then libcall; -1 presumably means "no upper limit". */ |
1521 | {libcall, {{16, loop, false}, {512, rep_prefix_8_byte, false}, /* [1]: loop, then 8-byte REP, ... */ |
1522 | {-1, libcall, false}}}}; /* ... then libcall. */ |
1523 | |
1524 | static stringop_algs skylake_memset[2] = { /* Skylake memset strategies; same layout as skylake_memcpy above. */ |
1525 | {libcall, {{6, loop_1_byte, true}, /* [0]: byte loop, ... */ |
1526 | {24, loop, true}, /* ... then loop, ... */ |
1527 | {8192, rep_prefix_4_byte, true}, /* ... then 4-byte REP, ... */ |
1528 | {-1, libcall, false}}}, /* ... then libcall. */ |
1529 | {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, false}, /* [1]: loop, then 8-byte REP, ... */ |
1530 | {-1, libcall, false}}}}; /* ... then libcall. */ |
1531 | |
1532 | static const |
1533 | struct processor_costs skylake_cost = { /* Operation costs for Skylake; initializers are positional, so order matters. */ |
1534 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1535 | COSTS_N_INSNS (1)+1, /* cost of a lea instruction; one unit above the add cost */ |
1536 | COSTS_N_INSNS (1), /* variable shift costs */ |
1537 | COSTS_N_INSNS (1), /* constant shift costs */ |
1538 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
1539 | COSTS_N_INSNS (4), /* cost of starting multiply for HI */ |
1540 | COSTS_N_INSNS (3), /* cost of starting multiply for SI */ |
1541 | COSTS_N_INSNS (4), /* cost of starting multiply for DI */ |
1542 | COSTS_N_INSNS (4)}, /* cost of starting multiply for other modes */ |
1543 | 0, /* cost of multiply per each bit set */ |
1544 | {COSTS_N_INSNS (8), /* cost of a divide/mod for QI */ |
1545 | COSTS_N_INSNS (8), /* cost of a divide/mod for HI */ |
1546 | COSTS_N_INSNS (11), /* cost of a divide/mod for SI */ |
1547 | COSTS_N_INSNS (76), /* cost of a divide/mod for DI */ |
1548 | COSTS_N_INSNS (76)}, /* cost of a divide/mod for other modes */ |
1549 | COSTS_N_INSNS (1), /* cost of movsx */ |
1550 | COSTS_N_INSNS (0), /* cost of movzx */ |
1551 | 8, /* "large" insn */ |
1552 | 17, /* MOVE_RATIO */ |
1553 | |
1554 | 6, /* cost for loading QImode using movzbl */ |
1555 | {4, 4, 4}, /* cost of loading integer registers |
1556 | in QImode, HImode and SImode. |
1557 | Relative to reg-reg move (2). */ |
1558 | {6, 6, 6}, /* cost of storing integer registers */ |
1559 | 2, /* cost of reg,reg fld/fst */ |
1560 | {6, 6, 8}, /* cost of loading fp registers |
1561 | in SFmode, DFmode and XFmode */ |
1562 | {6, 6, 10}, /* cost of storing fp registers |
1563 | in SFmode, DFmode and XFmode */ |
1564 | 2, /* cost of moving MMX register */ |
1565 | {6, 6}, /* cost of loading MMX registers |
1566 | in SImode and DImode */ |
1567 | {6, 6}, /* cost of storing MMX registers |
1568 | in SImode and DImode */ |
1569 | 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ |
1570 | {6, 6, 6, 10, 20}, /* cost of loading SSE registers |
1571 | in 32,64,128,256 and 512-bit */ |
1572 | {6, 6, 6, 10, 20}, /* cost of unaligned loads; five entries, presumably the same widths. */ |
1573 | {8, 8, 8, 8, 16}, /* cost of storing SSE registers |
1574 | in 32,64,128,256 and 512-bit */ |
1575 | {8, 8, 8, 8, 16}, /* cost of unaligned stores; five entries, presumably the same widths. */ |
1576 | 2, 2, /* SSE->integer and integer->SSE moves */ |
1577 | 20, 8, /* Gather load static, per_elt. */ |
1578 | 22, 10, /* Gather store static, per_elt. */ |
1579 | 64, /* size of l1 cache. */ |
1580 | 512, /* size of l2 cache. */ |
1581 | 64, /* size of prefetch block */ |
1582 | 6, /* number of parallel prefetches */ |
1583 | 3, /* Branch cost */ |
1584 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
1585 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ |
1586 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ |
1587 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ |
1588 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ |
1589 | COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ |
1590 | |
1591 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
1592 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1593 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ |
1594 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ |
1595 | COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ |
1596 | COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ |
1597 | COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ |
1598 | COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ |
1599 | COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ |
1600 | COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ |
1601 | 1, 4, 2, 2, /* reassociation width for int, fp, vec_int, vec_fp. */ |
1602 | skylake_memcpy, /* memcpy strategy table (defined above). */ |
1603 | skylake_memset, /* memset strategy table (defined above). */ |
1604 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
1605 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
1606 | }; |
1607 | /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for |
1608 | very small blocks it is better to use a loop. For large blocks, a libcall can |
1609 | do non-temporal accesses and beat inline code considerably. */ |
1610 | static stringop_algs btver1_memcpy[2] = { /* BTVER1 memcpy strategies; each inner triple pairs what appears to be a block-size limit with an algorithm (confirm against the stringop_algs declaration). */ |
1611 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, /* [0]: loop, then unrolled loop, ... */ |
1612 | {-1, rep_prefix_4_byte, false}}}, /* ... then 4-byte REP; -1 presumably means "no upper limit". */ |
1613 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* [1]: loop, then 8-byte REP, ... */ |
1614 | {-1, libcall, false}}}}; /* ... then libcall for everything larger. */ |
1615 | static stringop_algs btver1_memset[2] = { /* BTVER1 memset strategies; same layout as the memcpy table above. */ |
1616 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, /* [0]: loop, then unrolled loop, ... */ |
1617 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, /* ... then 4-byte REP, then libcall. */ |
1618 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* [1]: unrolled loop, then 8-byte REP, ... */ |
1619 | {-1, libcall, false}}}}; /* ... then libcall. */ |
1620 | const struct processor_costs btver1_cost = { /* Operation costs for BTVER1; initializers are positional, so order matters. */ |
1621 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1622 | COSTS_N_INSNS (2), /* cost of a lea instruction */ |
1623 | COSTS_N_INSNS (1), /* variable shift costs */ |
1624 | COSTS_N_INSNS (1), /* constant shift costs */ |
1625 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
1626 | COSTS_N_INSNS (4), /* cost of starting multiply for HI */ |
1627 | COSTS_N_INSNS (3), /* cost of starting multiply for SI */ |
1628 | COSTS_N_INSNS (4), /* cost of starting multiply for DI */ |
1629 | COSTS_N_INSNS (5)}, /* cost of starting multiply for other modes */ |
1630 | 0, /* cost of multiply per each bit set */ |
1631 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ |
1632 | COSTS_N_INSNS (35), /* cost of a divide/mod for HI */ |
1633 | COSTS_N_INSNS (51), /* cost of a divide/mod for SI */ |
1634 | COSTS_N_INSNS (83), /* cost of a divide/mod for DI */ |
1635 | COSTS_N_INSNS (83)}, /* cost of a divide/mod for other modes */ |
1636 | COSTS_N_INSNS (1), /* cost of movsx */ |
1637 | COSTS_N_INSNS (1), /* cost of movzx */ |
1638 | 8, /* "large" insn */ |
1639 | 9, /* MOVE_RATIO */ |
1640 | |
1641 | /* All move costs are relative to an integer->integer move, whose cost is 2, and thus |
1642 | they are latency*2. */ |
1643 | 8, /* cost for loading QImode using movzbl */ |
1644 | {6, 8, 6}, /* cost of loading integer registers |
1645 | in QImode, HImode and SImode. |
1646 | Relative to reg-reg move (2). */ |
1647 | {6, 8, 6}, /* cost of storing integer registers */ |
1648 | 4, /* cost of reg,reg fld/fst */ |
1649 | {12, 12, 28}, /* cost of loading fp registers |
1650 | in SFmode, DFmode and XFmode */ |
1651 | {12, 12, 38}, /* cost of storing fp registers |
1652 | in SFmode, DFmode and XFmode */ |
1653 | 4, /* cost of moving MMX register */ |
1654 | {10, 10}, /* cost of loading MMX registers |
1655 | in SImode and DImode */ |
1656 | {12, 12}, /* cost of storing MMX registers |
1657 | in SImode and DImode */ |
1658 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1659 | {10, 10, 12, 24, 48}, /* cost of loading SSE registers |
1660 | in 32,64,128,256 and 512-bit */ |
1661 | {10, 10, 12, 24, 48}, /* cost of unaligned loads; five entries, presumably the same widths. */ |
1662 | {10, 10, 12, 24, 48}, /* cost of storing SSE registers |
1663 | in 32,64,128,256 and 512-bit */ |
1664 | {10, 10, 12, 24, 48}, /* cost of unaligned stores; five entries, presumably the same widths. */ |
1665 | 14, 14, /* SSE->integer and integer->SSE moves */ |
1666 | 10, 10, /* Gather load static, per_elt. */ |
1667 | 10, 10, /* Gather store static, per_elt. */ |
1668 | 32, /* size of l1 cache. */ |
1669 | 512, /* size of l2 cache. */ |
1670 | 64, /* size of prefetch block */ |
1671 | 100, /* number of parallel prefetches */ |
1672 | 2, /* Branch cost */ |
1673 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ |
1674 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ |
1675 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ |
1676 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
1677 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
1678 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ |
1679 | |
1680 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
1681 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1682 | COSTS_N_INSNS (2), /* cost of MULSS instruction. */ |
1683 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ |
1684 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
1685 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ |
1686 | COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ |
1687 | COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ |
1688 | COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ |
1689 | COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */ |
1690 | 1, 1, 1, 1, /* reassociation width for int, fp, vec_int, vec_fp. */ |
1691 | btver1_memcpy, /* memcpy strategy table (defined above). */ |
1692 | btver1_memset, /* memset strategy table (defined above). */ |
1693 | COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ |
1694 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
1695 | }; |
1696 | |
1697 | static stringop_algs btver2_memcpy[2] = { /* BTVER2 memcpy strategies; each inner triple pairs what appears to be a block-size limit with an algorithm (confirm against the stringop_algs declaration). */ |
1698 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, /* [0]: loop, then unrolled loop, ... */ |
1699 | {-1, rep_prefix_4_byte, false}}}, /* ... then 4-byte REP; -1 presumably means "no upper limit". */ |
1700 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* [1]: loop, then 8-byte REP, ... */ |
1701 | {-1, libcall, false}}}}; /* ... then libcall for everything larger. */ |
1702 | static stringop_algs btver2_memset[2] = { /* BTVER2 memset strategies; same layout as the memcpy table above. */ |
1703 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, /* [0]: loop, then unrolled loop, ... */ |
1704 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, /* ... then 4-byte REP, then libcall. */ |
1705 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* [1]: unrolled loop, then 8-byte REP, ... */ |
1706 | {-1, libcall, false}}}}; /* ... then libcall. */ |
1707 | const struct processor_costs btver2_cost = { /* Operation costs for BTVER2; initializers are positional, so order matters. */ |
1708 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1709 | COSTS_N_INSNS (2), /* cost of a lea instruction */ |
1710 | COSTS_N_INSNS (1), /* variable shift costs */ |
1711 | COSTS_N_INSNS (1), /* constant shift costs */ |
1712 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
1713 | COSTS_N_INSNS (4), /* cost of starting multiply for HI */ |
1714 | COSTS_N_INSNS (3), /* cost of starting multiply for SI */ |
1715 | COSTS_N_INSNS (4), /* cost of starting multiply for DI */ |
1716 | COSTS_N_INSNS (5)}, /* cost of starting multiply for other modes */ |
1717 | 0, /* cost of multiply per each bit set */ |
1718 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ |
1719 | COSTS_N_INSNS (35), /* cost of a divide/mod for HI */ |
1720 | COSTS_N_INSNS (51), /* cost of a divide/mod for SI */ |
1721 | COSTS_N_INSNS (83), /* cost of a divide/mod for DI */ |
1722 | COSTS_N_INSNS (83)}, /* cost of a divide/mod for other modes */ |
1723 | COSTS_N_INSNS (1), /* cost of movsx */ |
1724 | COSTS_N_INSNS (1), /* cost of movzx */ |
1725 | 8, /* "large" insn */ |
1726 | 9, /* MOVE_RATIO */ |
1727 | |
1728 | /* All move costs are relative to an integer->integer move, whose cost is 2, and thus |
1729 | they are latency*2. */ |
1730 | 8, /* cost for loading QImode using movzbl */ |
1731 | {8, 8, 6}, /* cost of loading integer registers |
1732 | in QImode, HImode and SImode. |
1733 | Relative to reg-reg move (2). */ |
1734 | {8, 8, 6}, /* cost of storing integer registers */ |
1735 | 4, /* cost of reg,reg fld/fst */ |
1736 | {12, 12, 28}, /* cost of loading fp registers |
1737 | in SFmode, DFmode and XFmode */ |
1738 | {12, 12, 38}, /* cost of storing fp registers |
1739 | in SFmode, DFmode and XFmode */ |
1740 | 4, /* cost of moving MMX register */ |
1741 | {10, 10}, /* cost of loading MMX registers |
1742 | in SImode and DImode */ |
1743 | {12, 12}, /* cost of storing MMX registers |
1744 | in SImode and DImode */ |
1745 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1746 | {10, 10, 12, 24, 48}, /* cost of loading SSE registers |
1747 | in 32,64,128,256 and 512-bit */ |
1748 | {10, 10, 12, 24, 48}, /* cost of unaligned loads; five entries, presumably the same widths. */ |
1749 | {10, 10, 12, 24, 48}, /* cost of storing SSE registers |
1750 | in 32,64,128,256 and 512-bit */ |
1751 | {10, 10, 12, 24, 48}, /* cost of unaligned stores; five entries, presumably the same widths. */ |
1752 | 14, 14, /* SSE->integer and integer->SSE moves */ |
1753 | 10, 10, /* Gather load static, per_elt. */ |
1754 | 10, 10, /* Gather store static, per_elt. */ |
1755 | 32, /* size of l1 cache. */ |
1756 | 2048, /* size of l2 cache. */ |
1757 | 64, /* size of prefetch block */ |
1758 | 100, /* number of parallel prefetches */ |
1759 | 2, /* Branch cost */ |
1760 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ |
1761 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ |
1762 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ |
1763 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
1764 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
1765 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ |
1766 | |
1767 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
1768 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1769 | COSTS_N_INSNS (2), /* cost of MULSS instruction. */ |
1770 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ |
1771 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
1772 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ |
1773 | COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ |
1774 | COSTS_N_INSNS (19), /* cost of DIVSD instruction. */ |
1775 | COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */ |
1776 | COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ |
1777 | 1, 1, 1, 1, /* reassociation width for int, fp, vec_int, vec_fp. */ |
1778 | btver2_memcpy, /* memcpy strategy table (defined above). */ |
1779 | btver2_memset, /* memset strategy table (defined above). */ |
1780 | COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ |
1781 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
1782 | }; |
1783 | |
1784 | static stringop_algs pentium4_memcpy[2] = { /* Pentium 4 memcpy strategies; each inner triple pairs what appears to be a block-size limit with an algorithm (confirm against the stringop_algs declaration). */ |
1785 | {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, /* [0]: byte loop, then 4-byte REP; -1 presumably means "no upper limit". */ |
1786 | DUMMY_STRINGOP_ALGS}; /* [1]: placeholder; the macro expands to a libcall-only table. */ |
1787 | static stringop_algs pentium4_memset[2] = { /* Pentium 4 memset strategies; same layout as pentium4_memcpy above. */ |
1788 | {libcall, {{6, loop_1_byte, false}, {48, loop, false}, /* [0]: byte loop, then loop, ... */ |
1789 | {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, /* ... then 4-byte REP, then libcall. */ |
1790 | DUMMY_STRINGOP_ALGS}; /* [1]: placeholder; the macro expands to a libcall-only table. */ |
1791 | |
1792 | static const |
1793 | struct processor_costs pentium4_cost = { /* Operation costs for Pentium 4; initializers are positional, so order matters. */ |
1794 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1795 | COSTS_N_INSNS (3), /* cost of a lea instruction */ |
1796 | COSTS_N_INSNS (4), /* variable shift costs */ |
1797 | COSTS_N_INSNS (4), /* constant shift costs */ |
1798 | {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ |
1799 | COSTS_N_INSNS (15), /* cost of starting multiply for HI */ |
1800 | COSTS_N_INSNS (15), /* cost of starting multiply for SI */ |
1801 | COSTS_N_INSNS (15), /* cost of starting multiply for DI */ |
1802 | COSTS_N_INSNS (15)}, /* cost of starting multiply for other modes */ |
1803 | 0, /* cost of multiply per each bit set */ |
1804 | {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ |
1805 | COSTS_N_INSNS (56), /* cost of a divide/mod for HI */ |
1806 | COSTS_N_INSNS (56), /* cost of a divide/mod for SI */ |
1807 | COSTS_N_INSNS (56), /* cost of a divide/mod for DI */ |
1808 | COSTS_N_INSNS (56)}, /* cost of a divide/mod for other modes */ |
1809 | COSTS_N_INSNS (1), /* cost of movsx */ |
1810 | COSTS_N_INSNS (1), /* cost of movzx */ |
1811 | 16, /* "large" insn */ |
1812 | 6, /* MOVE_RATIO */ |
1813 | |
1814 | /* All move costs are relative to an integer->integer move, whose cost is 2, and thus |
1815 | they are latency*2. */ |
1816 | 5, /* cost for loading QImode using movzbl */ |
1817 | {4, 5, 4}, /* cost of loading integer registers |
1818 | in QImode, HImode and SImode. |
1819 | Relative to reg-reg move (2). */ |
1820 | {2, 3, 2}, /* cost of storing integer registers */ |
1821 | 12, /* cost of reg,reg fld/fst */ |
1822 | {14, 14, 14}, /* cost of loading fp registers |
1823 | in SFmode, DFmode and XFmode */ |
1824 | {14, 14, 14}, /* cost of storing fp registers |
1825 | in SFmode, DFmode and XFmode */ |
1826 | 12, /* cost of moving MMX register */ |
1827 | {16, 16}, /* cost of loading MMX registers |
1828 | in SImode and DImode */ |
1829 | {16, 16}, /* cost of storing MMX registers |
1830 | in SImode and DImode */ |
1831 | 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ |
1832 | {16, 16, 16, 32, 64}, /* cost of loading SSE registers |
1833 | in 32,64,128,256 and 512-bit */ |
1834 | {32, 32, 32, 64, 128}, /* cost of unaligned loads; five entries, presumably the same widths. */ |
1835 | {16, 16, 16, 32, 64}, /* cost of storing SSE registers |
1836 | in 32,64,128,256 and 512-bit */ |
1837 | {32, 32, 32, 64, 128}, /* cost of unaligned stores; five entries, presumably the same widths. */ |
1838 | 20, 12, /* SSE->integer and integer->SSE moves */ |
1839 | 16, 16, /* Gather load static, per_elt. */ |
1840 | 16, 16, /* Gather store static, per_elt. */ |
1841 | 8, /* size of l1 cache. */ |
1842 | 256, /* size of l2 cache. */ |
1843 | 64, /* size of prefetch block */ |
1844 | 6, /* number of parallel prefetches */ |
1845 | 2, /* Branch cost */ |
1846 | COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ |
1847 | COSTS_N_INSNS (7), /* cost of FMUL instruction. */ |
1848 | COSTS_N_INSNS (43), /* cost of FDIV instruction. */ |
1849 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ |
1850 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ |
1851 | COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ |
1852 | |
1853 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
1854 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1855 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ |
1856 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ |
1857 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1858 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ |
1859 | COSTS_N_INSNS (23), /* cost of DIVSS instruction. */ |
1860 | COSTS_N_INSNS (38), /* cost of DIVSD instruction. */ |
1861 | COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */ |
1862 | COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */ |
1863 | 1, 1, 1, 1, /* reassociation width for int, fp, vec_int, vec_fp. */ |
1864 | pentium4_memcpy, /* memcpy strategy table (defined above). */ |
1865 | pentium4_memset, /* memset strategy table (defined above). */ |
1866 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
1867 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
1868 | }; |
1869 | |
1870 | static stringop_algs nocona_memcpy[2] = { /* Nocona memcpy strategies; each inner triple pairs what appears to be a block-size limit with an algorithm (confirm against the stringop_algs declaration). */ |
1871 | {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, /* [0]: byte loop, then 4-byte REP; -1 presumably means "no upper limit". */ |
1872 | {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, /* [1]: loop, then 8-byte REP, ... */ |
1873 | {100000, unrolled_loop, false}, {-1, libcall, false}}}}; /* ... then unrolled loop, then libcall. */ |
1874 | |
1875 | static stringop_algs nocona_memset[2] = { /* Nocona memset strategies; same layout as nocona_memcpy above. */ |
1876 | {libcall, {{6, loop_1_byte, false}, {48, loop, false}, /* [0]: byte loop, then loop, ... */ |
1877 | {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, /* ... then 4-byte REP, then libcall. */ |
1878 | {libcall, {{24, loop, false}, {64, unrolled_loop, false}, /* [1]: loop, then unrolled loop, ... */ |
1879 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; /* ... then 8-byte REP, then libcall. */ |
1880 | |
1881 | static const |
1882 | struct processor_costs nocona_cost = { /* Operation costs for Nocona; initializers are positional, so order matters. */ |
1883 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1884 | COSTS_N_INSNS (1), /* cost of a lea instruction */ |
1885 | COSTS_N_INSNS (1), /* variable shift costs */ |
1886 | COSTS_N_INSNS (1), /* constant shift costs */ |
1887 | {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ |
1888 | COSTS_N_INSNS (10), /* cost of starting multiply for HI */ |
1889 | COSTS_N_INSNS (10), /* cost of starting multiply for SI */ |
1890 | COSTS_N_INSNS (10), /* cost of starting multiply for DI */ |
1891 | COSTS_N_INSNS (10)}, /* cost of starting multiply for other modes */ |
1892 | 0, /* cost of multiply per each bit set */ |
1893 | {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ |
1894 | COSTS_N_INSNS (66), /* cost of a divide/mod for HI */ |
1895 | COSTS_N_INSNS (66), /* cost of a divide/mod for SI */ |
1896 | COSTS_N_INSNS (66), /* cost of a divide/mod for DI */ |
1897 | COSTS_N_INSNS (66)}, /* cost of a divide/mod for other modes */ |
1898 | COSTS_N_INSNS (1), /* cost of movsx */ |
1899 | COSTS_N_INSNS (1), /* cost of movzx */ |
1900 | 16, /* "large" insn */ |
1901 | 17, /* MOVE_RATIO */ |
1902 | |
1903 | /* All move costs are relative to an integer->integer move, whose cost is 2, and thus |
1904 | they are latency*2. */ |
1905 | 4, /* cost for loading QImode using movzbl */ |
1906 | {4, 4, 4}, /* cost of loading integer registers |
1907 | in QImode, HImode and SImode. |
1908 | Relative to reg-reg move (2). */ |
1909 | {4, 4, 4}, /* cost of storing integer registers */ |
1910 | 12, /* cost of reg,reg fld/fst */ |
1911 | {14, 14, 14}, /* cost of loading fp registers |
1912 | in SFmode, DFmode and XFmode */ |
1913 | {14, 14, 14}, /* cost of storing fp registers |
1914 | in SFmode, DFmode and XFmode */ |
1915 | 14, /* cost of moving MMX register */ |
1916 | {12, 12}, /* cost of loading MMX registers |
1917 | in SImode and DImode */ |
1918 | {12, 12}, /* cost of storing MMX registers |
1919 | in SImode and DImode */ |
1920 | 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ |
1921 | {12, 12, 12, 24, 48}, /* cost of loading SSE registers |
1922 | in 32,64,128,256 and 512-bit */ |
1923 | {24, 24, 24, 48, 96}, /* cost of unaligned loads; five entries, presumably the same widths. */ |
1924 | {12, 12, 12, 24, 48}, /* cost of storing SSE registers |
1925 | in 32,64,128,256 and 512-bit */ |
1926 | {24, 24, 24, 48, 96}, /* cost of unaligned stores; five entries, presumably the same widths. */ |
1927 | 20, 12, /* SSE->integer and integer->SSE moves */ |
1928 | 12, 12, /* Gather load static, per_elt. */ |
1929 | 12, 12, /* Gather store static, per_elt. */ |
1930 | 8, /* size of l1 cache. */ |
1931 | 1024, /* size of l2 cache. */ |
1932 | 64, /* size of prefetch block */ |
1933 | 8, /* number of parallel prefetches */ |
1934 | 1, /* Branch cost */ |
1935 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ |
1936 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ |
1937 | COSTS_N_INSNS (40), /* cost of FDIV instruction. */ |
1938 | COSTS_N_INSNS (3), /* cost of FABS instruction. */ |
1939 | COSTS_N_INSNS (3), /* cost of FCHS instruction. */ |
1940 | COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ |
1941 | |
1942 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
1943 | COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1944 | COSTS_N_INSNS (7), /* cost of MULSS instruction. */ |
1945 | COSTS_N_INSNS (7), /* cost of MULSD instruction. */ |
1946 | COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ |
1947 | COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ |
1948 | COSTS_N_INSNS (32), /* cost of DIVSS instruction. */ |
1949 | COSTS_N_INSNS (40), /* cost of DIVSD instruction. */ |
1950 | COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ |
1951 | COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */ |
1952 | 1, 1, 1, 1, /* reassociation width for int, fp, vec_int, vec_fp. */ |
1953 | nocona_memcpy, /* memcpy strategy table (defined above). */ |
1954 | nocona_memset, /* memset strategy table (defined above). */ |
1955 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
1956 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
1957 | }; |
1958 | |
1959 | static stringop_algs atom_memcpy[2] = { /* Atom memcpy strategies; each inner triple pairs what appears to be a block-size limit with an algorithm (confirm against the stringop_algs declaration). */ |
1960 | {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, /* [0]: loop, then 4-byte REP; -1 presumably means "no upper limit". */ |
1961 | {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, /* [1]: loop, then 4-byte REP, ... */ |
1962 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; /* ... then 8-byte REP, then libcall. */ |
1963 | static stringop_algs atom_memset[2] = { /* Atom memset strategies; same layout as atom_memcpy above. */ |
1964 | {libcall, {{8, loop, false}, {15, unrolled_loop, false}, /* [0]: loop, then unrolled loop, ... */ |
1965 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, /* ... then 4-byte REP, then libcall. */ |
1966 | {libcall, {{24, loop, false}, {32, unrolled_loop, false}, /* [1]: loop, then unrolled loop, ... */ |
1967 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; /* ... then 8-byte REP, then libcall. */ |
/* Cost table atom_cost.  The initializer is positional, so the comment on
   each line is the only label for the value it sits next to; keep values and
   comments in step when editing.  Costs are expressed relative to an add
   (see the note at the top of the file).  The string-operation strategies
   are taken from atom_memcpy and atom_memset.
   NOTE(review): presumably this is the table used when tuning for Atom --
   confirm where the table is registered.  */
1968 | static const |
1969 | struct processor_costs atom_cost = { |
1970 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1971 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction (one unit above an add) */ |
1972 | COSTS_N_INSNS (1), /* variable shift costs */ |
1973 | COSTS_N_INSNS (1), /* constant shift costs */ |
1974 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
1975 | COSTS_N_INSNS (4), /* HI */ |
1976 | COSTS_N_INSNS (3), /* SI */ |
1977 | COSTS_N_INSNS (4), /* DI */ |
1978 | COSTS_N_INSNS (2)}, /* other */ |
1979 | 0, /* cost of multiply per each bit set */ |
1980 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ |
1981 | COSTS_N_INSNS (26), /* HI */ |
1982 | COSTS_N_INSNS (42), /* SI */ |
1983 | COSTS_N_INSNS (74), /* DI */ |
1984 | COSTS_N_INSNS (74)}, /* other */ |
1985 | COSTS_N_INSNS (1), /* cost of movsx */ |
1986 | COSTS_N_INSNS (1), /* cost of movzx */ |
1987 | 8, /* "large" insn */ |
1988 | 17, /* MOVE_RATIO */ |
1989 | |
1990 | /* All move costs are relative to integer->integer move times 2 and thus |
1991 | they are latency*2. */ |
1992 | 6, /* cost for loading QImode using movzbl */ |
1993 | {6, 6, 6}, /* cost of loading integer registers |
1994 | in QImode, HImode and SImode. |
1995 | Relative to reg-reg move (2). */ |
1996 | {6, 6, 6}, /* cost of storing integer registers */ |
1997 | 4, /* cost of reg,reg fld/fst */ |
1998 | {6, 6, 18}, /* cost of loading fp registers |
1999 | in SFmode, DFmode and XFmode */ |
2000 | {14, 14, 24}, /* cost of storing fp registers |
2001 | in SFmode, DFmode and XFmode */ |
2002 | 2, /* cost of moving MMX register */ |
2003 | {8, 8}, /* cost of loading MMX registers |
2004 | in SImode and DImode */ |
2005 | {10, 10}, /* cost of storing MMX registers |
2006 | in SImode and DImode */ |
2007 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
2008 | {8, 8, 8, 16, 32}, /* cost of loading SSE registers |
2009 | in 32,64,128,256 and 512-bit */ |
2010 | {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ |
2011 | {8, 8, 8, 16, 32}, /* cost of storing SSE registers |
2012 | in 32,64,128,256 and 512-bit */ |
2013 | {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ |
2014 | 8, 6, /* SSE->integer and integer->SSE moves */ |
2015 | 8, 8, /* Gather load static, per_elt. */ |
2016 | 8, 8, /* Gather store static, per_elt. */ |
2017 | 32, /* size of l1 cache. */ |
2018 | 256, /* size of l2 cache. */ |
2019 | 64, /* size of prefetch block */ |
2020 | 6, /* number of parallel prefetches */ |
2021 | 3, /* Branch cost */ |
2022 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ |
2023 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ |
2024 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ |
2025 | COSTS_N_INSNS (8), /* cost of FABS instruction. */ |
2026 | COSTS_N_INSNS (8), /* cost of FCHS instruction. */ |
2027 | COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ |
2028 | |
2029 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
2030 | COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2031 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ |
2032 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ |
2033 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
2034 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ |
2035 | COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ |
2036 | COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ |
2037 | COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ |
2038 | COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ |
2039 | 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ |
2040 | atom_memcpy, |
2041 | atom_memset, |
2042 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2043 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
2044 | }; |
2045 | |
/* memcpy expansion strategies referenced by slm_cost; the values currently
   match atom_memcpy.  Each of the two entries is a leading algorithm
   followed by {size bound, algorithm, flag} triples ending with a bound
   of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2046 | static stringop_algs slm_memcpy[2] = { |
2047 | {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, |
2048 | {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, |
2049 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; |
/* memset expansion strategies referenced by slm_cost; the values currently
   match atom_memset.  Each of the two entries is a leading algorithm
   followed by {size bound, algorithm, flag} triples ending with a bound
   of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2050 | static stringop_algs slm_memset[2] = { |
2051 | {libcall, {{8, loop, false}, {15, unrolled_loop, false}, |
2052 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
2053 | {libcall, {{24, loop, false}, {32, unrolled_loop, false}, |
2054 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; |
/* Cost table slm_cost.  The initializer is positional, so the comment on
   each line is the only label for the value it sits next to; keep values and
   comments in step when editing.  Costs are expressed relative to an add
   (see the note at the top of the file).  The string-operation strategies
   are taken from slm_memcpy and slm_memset.
   NOTE(review): "slm" presumably stands for Silvermont -- confirm where the
   table is registered.  */
2055 | static const |
2056 | struct processor_costs slm_cost = { |
2057 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
2058 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction (one unit above an add) */ |
2059 | COSTS_N_INSNS (1), /* variable shift costs */ |
2060 | COSTS_N_INSNS (1), /* constant shift costs */ |
2061 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
2062 | COSTS_N_INSNS (3), /* HI */ |
2063 | COSTS_N_INSNS (3), /* SI */ |
2064 | COSTS_N_INSNS (4), /* DI */ |
2065 | COSTS_N_INSNS (2)}, /* other */ |
2066 | 0, /* cost of multiply per each bit set */ |
2067 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ |
2068 | COSTS_N_INSNS (26), /* HI */ |
2069 | COSTS_N_INSNS (42), /* SI */ |
2070 | COSTS_N_INSNS (74), /* DI */ |
2071 | COSTS_N_INSNS (74)}, /* other */ |
2072 | COSTS_N_INSNS (1), /* cost of movsx */ |
2073 | COSTS_N_INSNS (1), /* cost of movzx */ |
2074 | 8, /* "large" insn */ |
2075 | 17, /* MOVE_RATIO */ |
2076 | |
2077 | /* All move costs are relative to integer->integer move times 2 and thus |
2078 | they are latency*2. */ |
2079 | 8, /* cost for loading QImode using movzbl */ |
2080 | {8, 8, 8}, /* cost of loading integer registers |
2081 | in QImode, HImode and SImode. |
2082 | Relative to reg-reg move (2). */ |
2083 | {6, 6, 6}, /* cost of storing integer registers */ |
2084 | 2, /* cost of reg,reg fld/fst */ |
2085 | {8, 8, 18}, /* cost of loading fp registers |
2086 | in SFmode, DFmode and XFmode */ |
2087 | {6, 6, 18}, /* cost of storing fp registers |
2088 | in SFmode, DFmode and XFmode */ |
2089 | 2, /* cost of moving MMX register */ |
2090 | {8, 8}, /* cost of loading MMX registers |
2091 | in SImode and DImode */ |
2092 | {6, 6}, /* cost of storing MMX registers |
2093 | in SImode and DImode */ |
2094 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
2095 | {8, 8, 8, 16, 32}, /* cost of loading SSE registers |
2096 | in 32,64,128,256 and 512-bit */ |
2097 | {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ |
2098 | {8, 8, 8, 16, 32}, /* cost of storing SSE registers |
2099 | in 32,64,128,256 and 512-bit */ |
2100 | {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ |
2101 | 8, 6, /* SSE->integer and integer->SSE moves */ |
2102 | 8, 8, /* Gather load static, per_elt. */ |
2103 | 8, 8, /* Gather store static, per_elt. */ |
2104 | 32, /* size of l1 cache. */ |
2105 | 256, /* size of l2 cache. */ |
2106 | 64, /* size of prefetch block */ |
2107 | 6, /* number of parallel prefetches */ |
2108 | 3, /* Branch cost */ |
2109 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ |
2110 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ |
2111 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ |
2112 | COSTS_N_INSNS (8), /* cost of FABS instruction. */ |
2113 | COSTS_N_INSNS (8), /* cost of FCHS instruction. */ |
2114 | COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ |
2115 | |
2116 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
2117 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2118 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ |
2119 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ |
2120 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
2121 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ |
2122 | COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ |
2123 | COSTS_N_INSNS (69), /* cost of DIVSD instruction. */ |
2124 | COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */ |
2125 | COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */ |
2126 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
2127 | slm_memcpy, |
2128 | slm_memset, |
2129 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2130 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
2131 | }; |
2132 | |
/* memcpy expansion strategies referenced by intel_cost; the values currently
   match atom_memcpy.  Each of the two entries is a leading algorithm
   followed by {size bound, algorithm, flag} triples ending with a bound
   of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2133 | static stringop_algs intel_memcpy[2] = { |
2134 | {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, |
2135 | {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, |
2136 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; |
/* memset expansion strategies referenced by intel_cost; the values currently
   match atom_memset.  Each of the two entries is a leading algorithm
   followed by {size bound, algorithm, flag} triples ending with a bound
   of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2137 | static stringop_algs intel_memset[2] = { |
2138 | {libcall, {{8, loop, false}, {15, unrolled_loop, false}, |
2139 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, |
2140 | {libcall, {{24, loop, false}, {32, unrolled_loop, false}, |
2141 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; |
/* Cost table intel_cost.  The initializer is positional, so the comment on
   each line is the only label for the value it sits next to; keep values and
   comments in step when editing.  Costs are expressed relative to an add
   (see the note at the top of the file).  The string-operation strategies
   are taken from intel_memcpy and intel_memset.
   NOTE(review): presumably the table for generic Intel tuning -- confirm
   where the table is registered.  */
2142 | static const |
2143 | struct processor_costs intel_cost = { |
2144 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
2145 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction (one unit above an add) */ |
2146 | COSTS_N_INSNS (1), /* variable shift costs */ |
2147 | COSTS_N_INSNS (1), /* constant shift costs */ |
2148 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
2149 | COSTS_N_INSNS (3), /* HI */ |
2150 | COSTS_N_INSNS (3), /* SI */ |
2151 | COSTS_N_INSNS (4), /* DI */ |
2152 | COSTS_N_INSNS (2)}, /* other */ |
2153 | 0, /* cost of multiply per each bit set */ |
2154 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ |
2155 | COSTS_N_INSNS (26), /* HI */ |
2156 | COSTS_N_INSNS (42), /* SI */ |
2157 | COSTS_N_INSNS (74), /* DI */ |
2158 | COSTS_N_INSNS (74)}, /* other */ |
2159 | COSTS_N_INSNS (1), /* cost of movsx */ |
2160 | COSTS_N_INSNS (1), /* cost of movzx */ |
2161 | 8, /* "large" insn */ |
2162 | 17, /* MOVE_RATIO */ |
2163 | |
2164 | /* All move costs are relative to integer->integer move times 2 and thus |
2165 | they are latency*2. */ |
2166 | 6, /* cost for loading QImode using movzbl */ |
2167 | {4, 4, 4}, /* cost of loading integer registers |
2168 | in QImode, HImode and SImode. |
2169 | Relative to reg-reg move (2). */ |
2170 | {6, 6, 6}, /* cost of storing integer registers */ |
2171 | 2, /* cost of reg,reg fld/fst */ |
2172 | {6, 6, 8}, /* cost of loading fp registers |
2173 | in SFmode, DFmode and XFmode */ |
2174 | {6, 6, 10}, /* cost of storing fp registers |
2175 | in SFmode, DFmode and XFmode */ |
2176 | 2, /* cost of moving MMX register */ |
2177 | {6, 6}, /* cost of loading MMX registers |
2178 | in SImode and DImode */ |
2179 | {6, 6}, /* cost of storing MMX registers |
2180 | in SImode and DImode */ |
2181 | 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ |
2182 | {6, 6, 6, 6, 6}, /* cost of loading SSE registers |
2183 | in 32,64,128,256 and 512-bit */ |
2184 | {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ |
2185 | {6, 6, 6, 6, 6}, /* cost of storing SSE registers |
2186 | in 32,64,128,256 and 512-bit */ |
2187 | {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ |
2188 | 4, 4, /* SSE->integer and integer->SSE moves */ |
2189 | 6, 6, /* Gather load static, per_elt. */ |
2190 | 6, 6, /* Gather store static, per_elt. */ |
2191 | 32, /* size of l1 cache. */ |
2192 | 256, /* size of l2 cache. */ |
2193 | 64, /* size of prefetch block */ |
2194 | 6, /* number of parallel prefetches */ |
2195 | 3, /* Branch cost */ |
2196 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ |
2197 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ |
2198 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ |
2199 | COSTS_N_INSNS (8), /* cost of FABS instruction. */ |
2200 | COSTS_N_INSNS (8), /* cost of FCHS instruction. */ |
2201 | COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ |
2202 | |
2203 | COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */ |
2204 | COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2205 | COSTS_N_INSNS (8), /* cost of MULSS instruction. */ |
2206 | COSTS_N_INSNS (8), /* cost of MULSD instruction. */ |
2207 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
2208 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ |
2209 | COSTS_N_INSNS (20), /* cost of DIVSS instruction. */ |
2210 | COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ |
2211 | COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */ |
2212 | COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */ |
2213 | 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
2214 | intel_memcpy, |
2215 | intel_memset, |
2216 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2217 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
2218 | }; |
2219 | |
2220 | /* Generic should produce code tuned for Core-i7 (and newer chips) |
2221 | and btver1 (and newer chips). */ |
2222 | |
/* memcpy expansion strategies referenced by generic_cost; the values
   currently match generic_memset.  Each of the two entries is a leading
   algorithm followed by {size bound, algorithm, flag} triples ending with a
   bound of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code (only [1] uses rep_prefix_8_byte) -- confirm against the
   stringop_algs declaration.  */
2223 | static stringop_algs generic_memcpy[2] = { |
2224 | {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, |
2225 | {-1, libcall, false}}}, |
2226 | {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, |
2227 | {-1, libcall, false}}}}; |
/* memset expansion strategies referenced by generic_cost; the values
   currently match generic_memcpy.  Each of the two entries is a leading
   algorithm followed by {size bound, algorithm, flag} triples ending with a
   bound of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code (only [1] uses rep_prefix_8_byte) -- confirm against the
   stringop_algs declaration.  */
2228 | static stringop_algs generic_memset[2] = { |
2229 | {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, |
2230 | {-1, libcall, false}}}, |
2231 | {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, |
2232 | {-1, libcall, false}}}}; |
/* Cost table generic_cost.  The initializer is positional, so the comment on
   each line is the only label for the value it sits next to; keep values and
   comments in step when editing.  Costs are expressed relative to an add
   (see the note at the top of the file).  The string-operation strategies
   are taken from generic_memcpy and generic_memset.  */
2233 | static const |
2234 | struct processor_costs generic_cost = { |
2235 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
2236 | /* Setting cost to 2 makes our current implementation of synth_mult result in |
2237 | use of unnecessary temporary registers causing regression on several |
2238 | SPECfp benchmarks. */ |
2239 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ |
2240 | COSTS_N_INSNS (1), /* variable shift costs */ |
2241 | COSTS_N_INSNS (1), /* constant shift costs */ |
2242 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
2243 | COSTS_N_INSNS (4), /* HI */ |
2244 | COSTS_N_INSNS (3), /* SI */ |
2245 | COSTS_N_INSNS (4), /* DI */ |
2246 | COSTS_N_INSNS (4)}, /* other */ |
2247 | 0, /* cost of multiply per each bit set */ |
2248 | {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ |
2249 | COSTS_N_INSNS (22), /* HI */ |
2250 | COSTS_N_INSNS (30), /* SI */ |
2251 | COSTS_N_INSNS (74), /* DI */ |
2252 | COSTS_N_INSNS (74)}, /* other */ |
2253 | COSTS_N_INSNS (1), /* cost of movsx */ |
2254 | COSTS_N_INSNS (1), /* cost of movzx */ |
2255 | 8, /* "large" insn */ |
2256 | 17, /* MOVE_RATIO */ |
2257 | |
2258 | /* All move costs are relative to integer->integer move times 2 and thus |
2259 | they are latency*2. */ |
2260 | 4, /* cost for loading QImode using movzbl */ |
2261 | {4, 4, 4}, /* cost of loading integer registers |
2262 | in QImode, HImode and SImode. |
2263 | Relative to reg-reg move (2). */ |
2264 | {6, 6, 6}, /* cost of storing integer registers */ |
2265 | 4, /* cost of reg,reg fld/fst */ |
2266 | {6, 6, 12}, /* cost of loading fp registers |
2267 | in SFmode, DFmode and XFmode */ |
2268 | {6, 6, 12}, /* cost of storing fp registers |
2269 | in SFmode, DFmode and XFmode */ |
2270 | 2, /* cost of moving MMX register */ |
2271 | {6, 6}, /* cost of loading MMX registers |
2272 | in SImode and DImode */ |
2273 | {6, 6}, /* cost of storing MMX registers |
2274 | in SImode and DImode */ |
2275 | 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ |
2276 | {6, 6, 6, 10, 15}, /* cost of loading SSE registers |
2277 | in 32,64,128,256 and 512-bit */ |
2278 | {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ |
2279 | {6, 6, 6, 10, 15}, /* cost of storing SSE registers |
2280 | in 32,64,128,256 and 512-bit */ |
2281 | {6, 6, 6, 10, 15}, /* cost of unaligned stores. */ |
2282 | 6, 6, /* SSE->integer and integer->SSE moves */ |
2283 | 18, 6, /* Gather load static, per_elt. */ |
2284 | 18, 6, /* Gather store static, per_elt. */ |
2285 | 32, /* size of l1 cache. */ |
2286 | 512, /* size of l2 cache. */ |
2287 | 64, /* size of prefetch block */ |
2288 | 6, /* number of parallel prefetches */ |
2289 | /* Benchmarks show large regressions on the K8 sixtrack benchmark when this |
2290 | value is increased to the perhaps more appropriate value of 5. */ |
2291 | 3, /* Branch cost */ |
2292 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
2293 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ |
2294 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ |
2295 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ |
2296 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ |
2297 | COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ |
2298 | |
2299 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
2300 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2301 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ |
2302 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ |
2303 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
2304 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ |
2305 | COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ |
2306 | COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ |
2307 | COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ |
2308 | COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ |
2309 | 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ |
2310 | generic_memcpy, |
2311 | generic_memset, |
2312 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2313 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
2314 | }; |
2315 | |
2316 | /* core_cost should produce code tuned for Core family of CPUs. */ |
/* memcpy expansion strategies referenced by core_cost.  Each of the two
   entries is a leading algorithm followed by {size bound, algorithm, flag}
   triples ending with a bound of -1.  Unlike the other tables in this part
   of the file, every triple except the final libcall one sets the flag to
   true.
   NOTE(review): the flag is presumably the "noalign" marker and entries
   [0]/[1] presumably the 32-bit/64-bit variants -- confirm against the
   stringop_algs declaration.  */
2317 | static stringop_algs core_memcpy[2] = { |
2318 | {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, |
2319 | {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, |
2320 | {-1, libcall, false}}}}; |
/* memset expansion strategies referenced by core_cost.  Each of the two
   entries is a leading algorithm followed by {size bound, algorithm, flag}
   triples ending with a bound of -1.  Every triple except the final libcall
   one sets the flag to true, and the first entry starts with a loop_1_byte
   step for the smallest bound.
   NOTE(review): the flag is presumably the "noalign" marker and entries
   [0]/[1] presumably the 32-bit/64-bit variants -- confirm against the
   stringop_algs declaration.  */
2321 | static stringop_algs core_memset[2] = { |
2322 | {libcall, {{6, loop_1_byte, true}, |
2323 | {24, loop, true}, |
2324 | {8192, rep_prefix_4_byte, true}, |
2325 | {-1, libcall, false}}}, |
2326 | {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, |
2327 | {-1, libcall, false}}}}; |
2328 | |
/* Cost table core_cost.  The initializer is positional, so the comment on
   each line is the only label for the value it sits next to; keep values and
   comments in step when editing.  Costs are expressed relative to an add
   (see the note at the top of the file).  Where a bare range such as
   "8-11" precedes a value, the value chosen is the upper end of that range.
   The string-operation strategies are taken from core_memcpy and
   core_memset.  */
2329 | static const |
2330 | struct processor_costs core_cost = { |
2331 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
2332 | /* On all chips taken into consideration lea is 2 cycles and more. With |
2333 | this cost however our current implementation of synth_mult results in |
2334 | use of unnecessary temporary registers causing regression on several |
2335 | SPECfp benchmarks. */ |
2336 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ |
2337 | COSTS_N_INSNS (1), /* variable shift costs */ |
2338 | COSTS_N_INSNS (1), /* constant shift costs */ |
2339 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ |
2340 | COSTS_N_INSNS (4), /* HI */ |
2341 | COSTS_N_INSNS (3), /* SI */ |
2342 | COSTS_N_INSNS (4), /* DI */ |
2343 | COSTS_N_INSNS (4)}, /* other */ |
2344 | 0, /* cost of multiply per each bit set */ |
2345 | {COSTS_N_INSNS (8), /* cost of a divide/mod for QI */ |
2346 | COSTS_N_INSNS (8), /* HI */ |
2347 | /* 8-11 */ |
2348 | COSTS_N_INSNS (11), /* SI */ |
2349 | /* 24-81 */ |
2350 | COSTS_N_INSNS (81), /* DI */ |
2351 | COSTS_N_INSNS (81)}, /* other */ |
2352 | COSTS_N_INSNS (1), /* cost of movsx */ |
2353 | COSTS_N_INSNS (1), /* cost of movzx */ |
2354 | 8, /* "large" insn */ |
2355 | 17, /* MOVE_RATIO */ |
2356 | |
2357 | /* All move costs are relative to integer->integer move times 2 and thus |
2358 | they are latency*2. */ |
2359 | 6, /* cost for loading QImode using movzbl */ |
2360 | {4, 4, 4}, /* cost of loading integer registers |
2361 | in QImode, HImode and SImode. |
2362 | Relative to reg-reg move (2). */ |
2363 | {6, 6, 6}, /* cost of storing integer registers */ |
2364 | 2, /* cost of reg,reg fld/fst */ |
2365 | {6, 6, 8}, /* cost of loading fp registers |
2366 | in SFmode, DFmode and XFmode */ |
2367 | {6, 6, 10}, /* cost of storing fp registers |
2368 | in SFmode, DFmode and XFmode */ |
2369 | 2, /* cost of moving MMX register */ |
2370 | {6, 6}, /* cost of loading MMX registers |
2371 | in SImode and DImode */ |
2372 | {6, 6}, /* cost of storing MMX registers |
2373 | in SImode and DImode */ |
2374 | 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ |
2375 | {6, 6, 6, 6, 12}, /* cost of loading SSE registers |
2376 | in 32,64,128,256 and 512-bit */ |
2377 | {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ |
2378 | {6, 6, 6, 6, 12}, /* cost of storing SSE registers |
2379 | in 32,64,128,256 and 512-bit */ |
2380 | {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ |
2381 | 2, 2, /* SSE->integer and integer->SSE moves */ |
2382 | /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, |
2383 | rec. throughput 6. |
2384 | So 5 uops statically and one uop per load. NOTE(review): VGATHERDPD is named twice above; the second set of figures presumably belongs to a different gather variant -- confirm. */ |
2385 | 10, 6, /* Gather load static, per_elt. */ |
2386 | 10, 6, /* Gather store static, per_elt. */ |
2387 | 64, /* size of l1 cache. */ |
2388 | 512, /* size of l2 cache. */ |
2389 | 64, /* size of prefetch block */ |
2390 | 6, /* number of parallel prefetches */ |
2391 | /* FIXME perhaps more appropriate value is 5. */ |
2392 | 3, /* Branch cost */ |
2393 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
2394 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ |
2395 | /* 10-24 */ |
2396 | COSTS_N_INSNS (24), /* cost of FDIV instruction. */ |
2397 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ |
2398 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ |
2399 | COSTS_N_INSNS (23), /* cost of FSQRT instruction. */ |
2400 | |
2401 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
2402 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2403 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ |
2404 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ |
2405 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
2406 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ |
2407 | COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ |
2408 | COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ |
2409 | COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ |
2410 | COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ |
2411 | 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ |
2412 | core_memcpy, |
2413 | core_memset, |
2414 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2415 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ |
2416 | }; |
2417 | |
2418 | |