1 | /* From the Intel IA-64 Optimization Guide, choose the minimum latency |
2 | alternative. */ |
3 | |
4 | #include <sysdep.h> |
5 | #undef ret |
6 | |
7 | #include <shlib-compat.h> |
8 | |
9 | #if SHLIB_COMPAT(libc, GLIBC_2_2, GLIBC_2_2_6) |
10 | |
11 | /* __divtf3 |
12 | Compute an 80-bit IEEE double-extended quotient. |
13 | farg0 holds the dividend (a). farg1 holds the divisor (b). |
   | The quotient is returned in fret0. |
   | frcpa writes f10 and the predicate p6. When p6 is set, f10 is used as a |
   | starting approximation y0 of 1/b and is refined by the predicated fma and |
   | fnma steps below. When p6 is clear, every refinement step is skipped and |
   | f10 is returned unchanged by the (p7) mov, p7 being the complement of p6. |
   | The intermediate steps use status field s1. Only frcpa and the final fma |
   | use s0. The step comments assume f0 and f1 are the IA-64 constant |
   | registers 0.0 and 1.0. Registers written: f10-f14, p6, p7, fret0. */ |
14 | |
15 | ENTRY(___divtf3) |
16 | cmp.eq p7, p0 = r0, r0 /* p7 = 1, r0 == r0 always holds */ |
17 | frcpa.s0 f10, p6 = farg0, farg1 /* f10 = y0, p6 = refinement wanted */ |
18 | ;; |
19 | (p6) cmp.ne p7, p0 = r0, r0 /* p6 set: clear p7, so p7 = not p6 */ |
20 | .pred.rel.mutex p6, p7 /* assembler hint: p6 and p7 are never both set */ |
21 | (p6) fnma.s1 f11 = farg1, f10, f1 /* f11 = e = 1 - b*y0 */ |
22 | (p6) fma.s1 f12 = farg0, f10, f0 /* f12 = q0 = a*y0 */ |
23 | ;; |
24 | (p6) fma.s1 f13 = f11, f11, f0 /* f13 = e^2 */ |
25 | (p6) fma.s1 f14 = f11, f11, f11 /* f14 = e + e^2 */ |
26 | ;; |
27 | (p6) fma.s1 f11 = f13, f13, f11 /* f11 = e + e^4 */ |
28 | (p6) fma.s1 f13 = f14, f10, f10 /* f13 = y1 = y0 + y0*(e + e^2) */ |
29 | ;; |
30 | (p6) fma.s1 f10 = f13, f11, f10 /* f10 = y2 = y0 + y1*(e + e^4) */ |
31 | (p6) fnma.s1 f11 = farg1, f12, farg0 /* f11 = r0 = a - b*q0 */ |
32 | ;; |
33 | (p6) fma.s1 f11 = f11, f10, f12 /* f11 = q1 = q0 + r0*y2 */ |
34 | (p6) fnma.s1 f12 = farg1, f10, f1 /* f12 = e2 = 1 - b*y2 */ |
35 | ;; |
36 | (p6) fma.s1 f10 = f12, f10, f10 /* f10 = y3 = y2 + e2*y2 */ |
37 | (p6) fnma.s1 f12 = farg1, f11, farg0 /* f12 = r1 = a - b*q1 */ |
38 | ;; |
39 | (p6) fma.s0 fret0 = f12, f10, f11 /* result = q1 + r1*y3, final step under s0 */ |
40 | (p7) mov fret0 = f10 /* p6 clear: return f10 exactly as frcpa wrote it */ |
41 | br.ret.sptk rp /* return to the caller */ |
42 | END(___divtf3) |
43 | .symver ___divtf3, __divtf3@GLIBC_2.2 |
44 | |
45 | /* __divdf3 |
46 | Compute a 64-bit IEEE double quotient. |
47 | farg0 holds the dividend (a). farg1 holds the divisor (b). |
   | The quotient is returned in fret0. |
   | frcpa writes f10 and the predicate p6. When p6 is set, f10 is used as a |
   | starting approximation y0 of 1/b, and both the quotient estimate (f11) and |
   | the reciprocal estimate (f10) are refined with powers of the error term |
   | e = 1 - b*y0. When p6 is clear, the refinement is skipped and f10 is |
   | returned unchanged by the (p7) mov, p7 being the complement of p6. |
   | The step comments assume f1 is the IA-64 constant register 1.0. |
   | Registers written: f8, f10-f13, p6, p7, fret0. |
   | NOTE(review): f8 is written while farg0 is read in the same instruction. |
   | If farg0 is an alias of f8, as the IA-64 assembler register names |
   | suggest, this is the last use of the dividend - confirm. */ |
48 | |
49 | ENTRY(___divdf3) |
50 | cmp.eq p7, p0 = r0, r0 /* p7 = 1, r0 == r0 always holds */ |
51 | frcpa.s0 f10, p6 = farg0, farg1 /* f10 = y0, p6 = refinement wanted */ |
52 | ;; |
53 | (p6) cmp.ne p7, p0 = r0, r0 /* p6 set: clear p7, so p7 = not p6 */ |
54 | .pred.rel.mutex p6, p7 /* assembler hint: p6 and p7 are never both set */ |
55 | (p6) fmpy.s1 f11 = farg0, f10 /* f11 = q0 = a*y0 */ |
56 | (p6) fnma.s1 f12 = farg1, f10, f1 /* f12 = e = 1 - b*y0 */ |
57 | ;; |
58 | (p6) fma.s1 f11 = f12, f11, f11 /* f11 = q1 = q0 + e*q0 */ |
59 | (p6) fmpy.s1 f13 = f12, f12 /* f13 = e^2 */ |
60 | ;; |
61 | (p6) fma.s1 f10 = f12, f10, f10 /* f10 = y1 = y0 + e*y0 */ |
62 | (p6) fma.s1 f11 = f13, f11, f11 /* f11 = q2 = q1 + e^2*q1 */ |
63 | ;; |
64 | (p6) fmpy.s1 f12 = f13, f13 /* f12 = e^4 */ |
65 | (p6) fma.s1 f10 = f13, f10, f10 /* f10 = y2 = y1 + e^2*y1 */ |
66 | ;; |
67 | (p6) fma.d.s1 f11 = f12, f11, f11 /* f11 = q3 = q2 + e^4*q2, .d precision */ |
68 | (p6) fma.s1 f10 = f12, f10, f10 /* f10 = y3 = y2 + e^4*y2 */ |
69 | ;; |
70 | (p6) fnma.d.s1 f8 = farg1, f11, farg0 /* f8 = r = a - b*q3, .d precision */ |
71 | ;; |
72 | (p6) fma.d fret0 = f8, f10, f11 /* result = q3 + r*y3, .d precision, no .s1 completer on this final step */ |
73 | (p7) mov fret0 = f10 /* p6 clear: return f10 exactly as frcpa wrote it */ |
74 | br.ret.sptk rp /* return to the caller */ |
75 | ;; |
76 | END(___divdf3) |
77 | .symver ___divdf3, __divdf3@GLIBC_2.2 |
78 | |
79 | /* __divsf3 |
80 | Compute a 32-bit IEEE float quotient. |
81 | farg0 holds the dividend (a). farg1 holds the divisor (b). |
   | The quotient is returned in fret0. |
   | frcpa writes f10 and the predicate p6. When p6 is set, f10 is used as a |
   | starting approximation y0 of 1/b and the quotient estimate in f8 is |
   | refined with successive powers of the error term e = 1 - b*y0, then |
   | rounded to single precision by fnorm.s under status field s0. When p6 is |
   | clear, the refinement is skipped and f10 is returned unchanged by the |
   | (p7) mov, p7 being the complement of p6. |
   | The step comments assume f1 is the IA-64 constant register 1.0. |
   | Registers written: f8, f9, f10, p6, p7, fret0. |
   | NOTE(review): f8 and f9 are written by the same instructions that read |
   | farg0 and farg1, and the arguments are not read again afterwards. If |
   | farg0/farg1 are aliases of f8/f9, as the IA-64 assembler register names |
   | suggest, the inputs are overwritten in place - confirm. */ |
82 | |
83 | ENTRY(___divsf3) |
84 | cmp.eq p7, p0 = r0, r0 /* p7 = 1, r0 == r0 always holds */ |
85 | frcpa.s0 f10, p6 = farg0, farg1 /* f10 = y0, p6 = refinement wanted */ |
86 | ;; |
87 | (p6) cmp.ne p7, p0 = r0, r0 /* p6 set: clear p7, so p7 = not p6 */ |
88 | .pred.rel.mutex p6, p7 /* assembler hint: p6 and p7 are never both set */ |
89 | (p6) fmpy.s1 f8 = farg0, f10 /* f8 = q0 = a*y0 */ |
90 | (p6) fnma.s1 f9 = farg1, f10, f1 /* f9 = e = 1 - b*y0 */ |
91 | ;; |
92 | (p6) fma.s1 f8 = f9, f8, f8 /* f8 = q1 = q0 + e*q0 */ |
93 | (p6) fmpy.s1 f9 = f9, f9 /* f9 = e^2 */ |
94 | ;; |
95 | (p6) fma.s1 f8 = f9, f8, f8 /* f8 = q2 = q1 + e^2*q1 */ |
96 | (p6) fmpy.s1 f9 = f9, f9 /* f9 = e^4 */ |
97 | ;; |
98 | (p6) fma.d.s1 f10 = f9, f8, f8 /* f10 = q3 = q2 + e^4*q2, .d precision */ |
99 | ;; |
100 | (p6) fnorm.s.s0 fret0 = f10 /* result = q3 rounded to .s precision under s0 */ |
101 | (p7) mov fret0 = f10 /* p6 clear: return f10 exactly as frcpa wrote it */ |
102 | br.ret.sptk rp /* return to the caller */ |
103 | ;; |
104 | END(___divsf3) |
105 | .symver ___divsf3, __divsf3@GLIBC_2.2 |
106 | |
107 | /* __divdi3 |
108 | Compute a 64-bit integer quotient. |
109 | in0 holds the dividend (a). in1 holds the divisor (b). |
   | The quotient is returned in ret0. |
   | Both inputs are moved to FP registers and converted as signed integers. |
   | frcpa then writes f10 and the predicate p6. When p6 is set, f10 is used |
   | as a starting approximation y0 of 1/b and the quotient is refined in FP. |
   | When p6 is clear, the refinement is skipped and whatever frcpa left in |
   | f10 is truncated and returned as is. There is no explicit check for a |
   | zero divisor in this routine. |
   | The step comments assume f1 is the IA-64 constant register 1.0. |
   | Registers written: f8-f13, p6, ret0. */ |
110 | |
111 | ENTRY(___divdi3) |
112 | .regstk 2,0,0,0 /* register frame: 2 inputs, nothing else */ |
113 | /* Transfer inputs to FP registers. */ |
114 | setf.sig f8 = in0 |
115 | setf.sig f9 = in1 |
116 | ;; |
117 | /* Convert the inputs to FP, so that they won't be treated as |
118 | unsigned. */ |
119 | fcvt.xf f8 = f8 /* f8 = a as a signed FP value */ |
120 | fcvt.xf f9 = f9 /* f9 = b as a signed FP value */ |
121 | ;; |
122 | /* Compute the reciprocal approximation. */ |
123 | frcpa.s1 f10, p6 = f8, f9 /* f10 = y0, p6 = refinement wanted */ |
124 | ;; |
125 | /* 3 Newton-Raphson iterations, all skipped when p6 is clear. */ |
126 | (p6) fnma.s1 f11 = f9, f10, f1 /* f11 = e = 1 - b*y0 */ |
127 | (p6) fmpy.s1 f12 = f8, f10 /* f12 = q0 = a*y0 */ |
128 | ;; |
129 | (p6) fmpy.s1 f13 = f11, f11 /* f13 = e^2 */ |
130 | (p6) fma.s1 f12 = f11, f12, f12 /* f12 = q1 = q0 + e*q0 */ |
131 | ;; |
132 | (p6) fma.s1 f10 = f11, f10, f10 /* f10 = y1 = y0 + e*y0 */ |
133 | (p6) fma.s1 f11 = f13, f12, f12 /* f11 = q2 = q1 + e^2*q1 */ |
134 | ;; |
135 | (p6) fma.s1 f10 = f13, f10, f10 /* f10 = y2 = y1 + e^2*y1 */ |
136 | (p6) fnma.s1 f12 = f9, f11, f8 /* f12 = r = a - b*q2 */ |
137 | ;; |
138 | (p6) fma.s1 f10 = f12, f10, f11 /* f10 = q3 = q2 + r*y2 */ |
139 | ;; |
140 | /* Truncate the quotient toward zero to a signed integer. */ |
141 | fcvt.fx.trunc.s1 f10 = f10 |
142 | ;; |
143 | /* Transfer result to GP registers. */ |
144 | getf.sig ret0 = f10 |
145 | br.ret.sptk rp /* return to the caller */ |
146 | ;; |
147 | END(___divdi3) |
148 | .symver ___divdi3, __divdi3@GLIBC_2.2 |
149 | |
150 | /* __moddi3 |
151 | Compute a 64-bit integer modulus. |
152 | in0 holds the dividend (a). in1 holds the divisor (b). |
   | The remainder is returned in ret0. |
   | The truncated quotient q is computed in FP exactly as the predicated |
   | steps below show, then the remainder is formed with one integer |
   | multiply-add as a + q*(-b). f14 keeps the unconverted integer dividend |
   | for that last step. in1 is negated in place and f9 is reloaded with the |
   | integer -b once the FP copy of b is no longer read. |
   | When p6 is clear after frcpa, the refinement is skipped and whatever |
   | frcpa left in f10 is truncated and used as q. There is no explicit check |
   | for a zero divisor in this routine. |
   | The step comments assume f1 is the IA-64 constant register 1.0. |
   | Registers written: f8-f14, p6, in1, ret0. */ |
153 | |
154 | ENTRY(___moddi3) |
155 | .regstk 2,0,0,0 /* register frame: 2 inputs, nothing else */ |
156 | /* Transfer inputs to FP registers. */ |
157 | setf.sig f14 = in0 /* f14 = integer a, kept for the final xma */ |
158 | setf.sig f9 = in1 |
159 | ;; |
160 | /* Convert the inputs to FP, so that they won't be treated as |
161 | unsigned. */ |
162 | fcvt.xf f8 = f14 /* f8 = a as a signed FP value */ |
163 | fcvt.xf f9 = f9 /* f9 = b as a signed FP value */ |
164 | ;; |
165 | /* Compute the reciprocal approximation. */ |
166 | frcpa.s1 f10, p6 = f8, f9 /* f10 = y0, p6 = refinement wanted */ |
167 | ;; |
168 | /* 3 Newton-Raphson iterations, all skipped when p6 is clear. */ |
169 | (p6) fmpy.s1 f12 = f8, f10 /* f12 = q0 = a*y0 */ |
170 | (p6) fnma.s1 f11 = f9, f10, f1 /* f11 = e = 1 - b*y0 */ |
171 | ;; |
172 | (p6) fma.s1 f12 = f11, f12, f12 /* f12 = q1 = q0 + e*q0 */ |
173 | (p6) fmpy.s1 f13 = f11, f11 /* f13 = e^2 */ |
174 | ;; |
175 | (p6) fma.s1 f10 = f11, f10, f10 /* f10 = y1 = y0 + e*y0 */ |
176 | (p6) fma.s1 f11 = f13, f12, f12 /* f11 = q2 = q1 + e^2*q1 */ |
177 | ;; |
178 | sub in1 = r0, in1 /* in1 = -b, for the final multiply-add */ |
179 | (p6) fma.s1 f10 = f13, f10, f10 /* f10 = y2 = y1 + e^2*y1 */ |
180 | (p6) fnma.s1 f12 = f9, f11, f8 /* f12 = r = a - b*q2, last read of FP b in f9 */ |
181 | ;; |
182 | setf.sig f9 = in1 /* f9 = integer -b, replacing the FP copy of b */ |
183 | (p6) fma.s1 f10 = f12, f10, f11 /* f10 = q3 = q2 + r*y2 */ |
184 | ;; |
185 | fcvt.fx.trunc.s1 f10 = f10 /* f10 = q, truncated toward zero, signed */ |
186 | ;; |
187 | /* r = q * (-b) + a, low 64 bits of the integer multiply-add */ |
188 | xma.l f10 = f10, f9, f14 |
189 | ;; |
190 | /* Transfer result to GP registers. */ |
191 | getf.sig ret0 = f10 |
192 | br.ret.sptk rp /* return to the caller */ |
193 | ;; |
194 | END(___moddi3) |
195 | .symver ___moddi3, __moddi3@GLIBC_2.2 |
196 | |
197 | /* __udivdi3 |
198 | Compute a 64-bit unsigned integer quotient. |
199 | in0 holds the dividend (a). in1 holds the divisor (b). |
   | The quotient is returned in ret0. |
   | Both inputs are moved to FP registers and converted as unsigned |
   | integers. frcpa then writes f10 and the predicate p6. When p6 is set, |
   | f10 is used as a starting approximation y0 of 1/b and the quotient is |
   | refined in FP. When p6 is clear, the refinement is skipped and whatever |
   | frcpa left in f10 is truncated and returned as is. There is no explicit |
   | check for a zero divisor in this routine. |
   | The step comments assume f1 is the IA-64 constant register 1.0. |
   | Registers written: f8-f13, p6, ret0. */ |
200 | |
201 | ENTRY(___udivdi3) |
202 | .regstk 2,0,0,0 /* register frame: 2 inputs, nothing else */ |
203 | /* Transfer inputs to FP registers. */ |
204 | setf.sig f8 = in0 |
205 | setf.sig f9 = in1 |
206 | ;; |
207 | /* Convert the inputs to FP, to avoid FP software-assist faults. */ |
208 | fcvt.xuf.s1 f8 = f8 /* f8 = a as an unsigned FP value */ |
209 | fcvt.xuf.s1 f9 = f9 /* f9 = b as an unsigned FP value */ |
210 | ;; |
211 | /* Compute the reciprocal approximation. */ |
212 | frcpa.s1 f10, p6 = f8, f9 /* f10 = y0, p6 = refinement wanted */ |
213 | ;; |
214 | /* 3 Newton-Raphson iterations, all skipped when p6 is clear. */ |
215 | (p6) fnma.s1 f11 = f9, f10, f1 /* f11 = e = 1 - b*y0 */ |
216 | (p6) fmpy.s1 f12 = f8, f10 /* f12 = q0 = a*y0 */ |
217 | ;; |
218 | (p6) fmpy.s1 f13 = f11, f11 /* f13 = e^2 */ |
219 | (p6) fma.s1 f12 = f11, f12, f12 /* f12 = q1 = q0 + e*q0 */ |
220 | ;; |
221 | (p6) fma.s1 f10 = f11, f10, f10 /* f10 = y1 = y0 + e*y0 */ |
222 | (p6) fma.s1 f11 = f13, f12, f12 /* f11 = q2 = q1 + e^2*q1 */ |
223 | ;; |
224 | (p6) fma.s1 f10 = f13, f10, f10 /* f10 = y2 = y1 + e^2*y1 */ |
225 | (p6) fnma.s1 f12 = f9, f11, f8 /* f12 = r = a - b*q2 */ |
226 | ;; |
227 | (p6) fma.s1 f10 = f12, f10, f11 /* f10 = q3 = q2 + r*y2 */ |
228 | ;; |
229 | /* Truncate the quotient to an unsigned integer. */ |
230 | fcvt.fxu.trunc.s1 f10 = f10 |
231 | ;; |
232 | /* Transfer result to GP registers. */ |
233 | getf.sig ret0 = f10 |
234 | br.ret.sptk rp /* return to the caller */ |
235 | ;; |
236 | END(___udivdi3) |
237 | .symver ___udivdi3, __udivdi3@GLIBC_2.2 |
238 | |
239 | /* __umoddi3 |
240 | Compute a 64-bit unsigned integer modulus. |
241 | in0 holds the dividend (a). in1 holds the divisor (b). |
   | The remainder is returned in ret0. |
   | The truncated quotient q is computed in FP exactly as the predicated |
   | steps below show, then the remainder is formed with one integer |
   | multiply-add as a + q*(-b), keeping the low 64 bits. f14 keeps the |
   | unconverted integer dividend for that last step. in1 is negated in place |
   | and f9 is reloaded with the integer -b once the FP copy of b is no |
   | longer read. |
   | When p6 is clear after frcpa, the refinement is skipped and whatever |
   | frcpa left in f10 is truncated and used as q. There is no explicit check |
   | for a zero divisor in this routine. |
   | The step comments assume f1 is the IA-64 constant register 1.0. |
   | Registers written: f8-f14, p6, in1, ret0. */ |
242 | |
243 | ENTRY(___umoddi3) |
244 | .regstk 2,0,0,0 /* register frame: 2 inputs, nothing else */ |
245 | /* Transfer inputs to FP registers. */ |
246 | setf.sig f14 = in0 /* f14 = integer a, kept for the final xma */ |
247 | setf.sig f9 = in1 |
248 | ;; |
249 | /* Convert the inputs to FP, to avoid FP software assist faults. */ |
250 | fcvt.xuf.s1 f8 = f14 /* f8 = a as an unsigned FP value */ |
251 | fcvt.xuf.s1 f9 = f9 /* f9 = b as an unsigned FP value */ |
252 | ;; |
253 | /* Compute the reciprocal approximation. */ |
254 | frcpa.s1 f10, p6 = f8, f9 /* f10 = y0, p6 = refinement wanted */ |
255 | ;; |
256 | /* 3 Newton-Raphson iterations, all skipped when p6 is clear. */ |
257 | (p6) fmpy.s1 f12 = f8, f10 /* f12 = q0 = a*y0 */ |
258 | (p6) fnma.s1 f11 = f9, f10, f1 /* f11 = e = 1 - b*y0 */ |
259 | ;; |
260 | (p6) fma.s1 f12 = f11, f12, f12 /* f12 = q1 = q0 + e*q0 */ |
261 | (p6) fmpy.s1 f13 = f11, f11 /* f13 = e^2 */ |
262 | ;; |
263 | (p6) fma.s1 f10 = f11, f10, f10 /* f10 = y1 = y0 + e*y0 */ |
264 | (p6) fma.s1 f11 = f13, f12, f12 /* f11 = q2 = q1 + e^2*q1 */ |
265 | ;; |
266 | sub in1 = r0, in1 /* in1 = -b, for the final multiply-add */ |
267 | (p6) fma.s1 f10 = f13, f10, f10 /* f10 = y2 = y1 + e^2*y1 */ |
268 | (p6) fnma.s1 f12 = f9, f11, f8 /* f12 = r = a - b*q2, last read of FP b in f9 */ |
269 | ;; |
270 | setf.sig f9 = in1 /* f9 = integer -b, replacing the FP copy of b */ |
271 | (p6) fma.s1 f10 = f12, f10, f11 /* f10 = q3 = q2 + r*y2 */ |
272 | ;; |
273 | /* Truncate the quotient to an unsigned integer. */ |
274 | fcvt.fxu.trunc.s1 f10 = f10 |
275 | ;; |
276 | /* r = q * (-b) + a, low 64 bits of the integer multiply-add */ |
277 | xma.l f10 = f10, f9, f14 |
278 | ;; |
279 | /* Transfer result to GP registers. */ |
280 | getf.sig ret0 = f10 |
281 | br.ret.sptk rp /* return to the caller */ |
282 | ;; |
283 | END(___umoddi3) |
284 | .symver ___umoddi3, __umoddi3@GLIBC_2.2 |
285 | |
286 | /* __multi3 |
287 | Compute a 128-bit multiply of 128-bit multiplicands. |
288 | in0/in1 holds one multiplicand (a), in2/in3 holds the other one (b). |
   | As the code uses them, in0 and in2 are the low 64-bit halves (a_lo, b_lo) |
   | and in1 and in3 are the high halves (a_hi, b_hi). The low 64 bits of the |
   | product are returned in ret0 and the high 64 bits in ret1. |
   | The low halves are split into 32-bit limbs, a_lo = a1*2^32 + a0 and |
   | b_lo = b1*2^32 + b0, and a_lo*b_lo is assembled from the four limb |
   | products using xmpy.l/xma.l, which keep the low 64 bits of each result. |
   | With m = a0*b1 + (a0*b0 >> 32) + a1*b0 computed modulo 2^64: |
   | ret0 = ((m & 0xffffffff) << 32) + (a0*b0 & 0xffffffff) |
   | ret1 = a1*b1 + (m >> 32) + carry*2^32 + low64(a_lo*b_hi + a_hi*b_lo) |
   | where carry is 1 when the addition forming m wrapped around. |
   | Registers written: r14-r22, f6-f11, p6, p7, ret0, ret1. */ |
289 | |
290 | ENTRY(___multi3) |
291 | .regstk 4,0,0,0 /* register frame: 4 inputs, nothing else */ |
292 | setf.sig f6 = in1 /* f6 = a_hi */ |
293 | movl r19 = 0xffffffff /* r19 = mask for the low 32 bits */ |
294 | setf.sig f7 = in2 /* f7 = b_lo */ |
295 | ;; |
296 | and r14 = r19, in0 /* r14 = a0 */ |
297 | ;; |
298 | setf.sig f10 = r14 /* f10 = a0 */ |
299 | and r14 = r19, in2 /* r14 = b0 */ |
300 | xmpy.l f9 = f6, f7 /* f9 = low 64 bits of a_hi*b_lo */ |
301 | ;; |
302 | setf.sig f6 = r14 /* f6 = b0 */ |
303 | shr.u r14 = in0, 32 /* r14 = a1 */ |
304 | ;; |
305 | setf.sig f7 = r14 /* f7 = a1 */ |
306 | shr.u r14 = in2, 32 /* r14 = b1 */ |
307 | ;; |
308 | setf.sig f8 = r14 /* f8 = b1 */ |
309 | xmpy.l f11 = f10, f6 /* f11 = a0*b0 */ |
310 | xmpy.l f6 = f7, f6 /* f6 = a1*b0 */ |
311 | ;; |
312 | getf.sig r16 = f11 /* r16 = a0*b0 */ |
313 | xmpy.l f7 = f7, f8 /* f7 = a1*b1 */ |
314 | ;; |
315 | shr.u r14 = r16, 32 /* r14 = high 32 bits of a0*b0 */ |
316 | and r16 = r19, r16 /* r16 = low 32 bits of a0*b0 */ |
317 | getf.sig r17 = f6 /* r17 = a1*b0 */ |
318 | setf.sig f6 = in0 /* f6 = a_lo */ |
319 | ;; |
320 | setf.sig f11 = r14 /* f11 = a0*b0 >> 32 */ |
321 | getf.sig r21 = f7 /* r21 = a1*b1 */ |
322 | setf.sig f7 = in3 /* f7 = b_hi */ |
323 | ;; |
324 | xma.l f11 = f10, f8, f11 /* f11 = a0*b1 + (a0*b0 >> 32) */ |
325 | xma.l f6 = f6, f7, f9 /* f6 = low 64 bits of a_lo*b_hi + a_hi*b_lo */ |
326 | ;; |
327 | getf.sig r18 = f11 |
328 | ;; |
329 | add r18 = r18, r17 /* r18 = m, the middle column, may wrap modulo 2^64 */ |
330 | ;; |
331 | and r15 = r19, r18 /* r15 = low 32 bits of m */ |
332 | cmp.ltu p7, p6 = r18, r17 /* p7 = the add above wrapped (sum below its addend) */ |
333 | ;; |
334 | getf.sig r22 = f6 /* r22 = cross terms for the high word */ |
335 | (p7) adds r14 = 1, r19 /* r14 = 2^32, the weight of the lost carry in ret1 */ |
336 | ;; |
337 | (p7) add r21 = r21, r14 /* r21 = a1*b1 + carry*2^32, reads r14 before the shr.u below rewrites it */ |
338 | shr.u r14 = r18, 32 /* r14 = m >> 32 */ |
339 | shl r15 = r15, 32 /* r15 = (m & mask) << 32 */ |
340 | ;; |
341 | add r20 = r21, r14 /* r20 = high 64 bits of a_lo*b_lo */ |
342 | ;; |
343 | add ret0 = r15, r16 /* ret0 = low 64 bits of the product */ |
344 | add ret1 = r22, r20 /* ret1 = high 64 bits of the product */ |
345 | br.ret.sptk rp /* return to the caller */ |
346 | ;; |
347 | END(___multi3) |
348 | .symver ___multi3, __multi3@GLIBC_2.2 |
349 | |
350 | #endif |
351 | |