1//===----------------------Hexagon builtin routine ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9// Double Precision Multiply
10#define A r1:0
11#define AH r1
12#define AL r0
13#define B r3:2
14#define BH r3
15#define BL r2
16
17#define BTMP r5:4
18#define BTMPH r5
19#define BTMPL r4
20
21#define PP_ODD r7:6
22#define PP_ODD_H r7
23#define PP_ODD_L r6
24
25#define ONE r9:8
26#define S_ONE r8
27#define S_ZERO r9
28
29#define PP_HH r11:10
30#define PP_HH_H r11
31#define PP_HH_L r10
32
33#define ATMP r13:12
34#define ATMPH r13
35#define ATMPL r12
36
37#define PP_LL r15:14
38#define PP_LL_H r15
39#define PP_LL_L r14
40
41#define TMP r28
42
43#define MANTBITS 52
44#define HI_MANTBITS 20
45#define EXPBITS 11
46#define BIAS 1024
47#define MANTISSA_TO_INT_BIAS 52
48
49// Some constant to adjust normalization amount in error code
50// Amount to right shift the partial product to get to a denorm
51#define FUDGE 5
52
53#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
54#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
55#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
56#define END(TAG) .size TAG,.-TAG
57
58#define SR_ROUND_OFF 22
59 .text
60 .global __hexagon_muldf3
61 .type __hexagon_muldf3,@function
62 Q6_ALIAS(muldf3)
63 FAST_ALIAS(muldf3)
64 FAST2_ALIAS(muldf3)
65 .p2align 5
66__hexagon_muldf3:
67 {
68 p0 = dfclass(A,#2)
69 p0 = dfclass(B,#2)
70 ATMP = combine(##0x40000000,#0)
71 }
72 {
73 ATMP = insert(A,#MANTBITS,#EXPBITS-1)
74 BTMP = asl(B,#EXPBITS-1)
75 TMP = #-BIAS
76 ONE = #1
77 }
78 {
79 PP_ODD = mpyu(BTMPL,ATMPH)
80 BTMP = insert(ONE,#2,#62)
81 }
82 // since we know that the MSB of the H registers is zero, we should never carry
83 // H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1
84 // Adding 2 HLs, we get 2^64-3*2^32+2 maximum.
85 // Therefore, we can add 3 2^32-1 values safely without carry. We only need one.
86 {
87 PP_LL = mpyu(ATMPL,BTMPL)
88 PP_ODD += mpyu(ATMPL,BTMPH)
89 }
90 {
91 PP_ODD += lsr(PP_LL,#32)
92 PP_HH = mpyu(ATMPH,BTMPH)
93 BTMP = combine(##BIAS+BIAS-4,#0)
94 }
95 {
96 PP_HH += lsr(PP_ODD,#32)
97 if (!p0) jump .Lmul_abnormal
98 p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0?
99 p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0?
100 }
101
102 // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
103 // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
104
105#undef PP_ODD
106#undef PP_ODD_H
107#undef PP_ODD_L
108#define EXP10 r7:6
109#define EXP1 r7
110#define EXP0 r6
111 {
112 if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
113 EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
114 EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
115 }
116 {
117 PP_LL = neg(PP_HH)
118 EXP0 += add(TMP,EXP1)
119 TMP = xor(AH,BH)
120 }
121 {
122 if (!p2.new) PP_HH = PP_LL
123 p2 = cmp.gt(TMP,#-1)
124 p0 = !cmp.gt(EXP0,BTMPH)
125 p0 = cmp.gt(EXP0,BTMPL)
126 if (!p0.new) jump:nt .Lmul_ovf_unf
127 }
128 {
129 A = convert_d2df(PP_HH)
130 EXP0 = add(EXP0,#-BIAS-58)
131 }
132 {
133 AH += asl(EXP0,#HI_MANTBITS)
134 jumpr r31
135 }
136
137 .falign
138.Lpossible_unf:
139 // We end up with a positive exponent
140 // But we may have rounded up to an exponent of 1.
141 // If the exponent is 1, if we rounded up to it
142 // we need to also raise underflow
143 // Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
144 // And the PP should also have more than one bit set
145 //
146 // Note: ATMP should have abs(PP_HH)
147 // Note: BTMPL should have 0x7FEFFFFF
148 {
149 p0 = cmp.eq(AL,#0)
150 p0 = bitsclr(AH,BTMPL)
151 if (!p0.new) jumpr:t r31
152 BTMPH = #0x7fff
153 }
154 {
155 p0 = bitsset(ATMPH,BTMPH)
156 BTMPL = USR
157 BTMPH = #0x030
158 }
159 {
160 if (p0) BTMPL = or(BTMPL,BTMPH)
161 }
162 {
163 USR = BTMPL
164 }
165 {
166 p0 = dfcmp.eq(A,A)
167 jumpr r31
168 }
169 .falign
170.Lmul_ovf_unf:
171 {
172 A = convert_d2df(PP_HH)
173 ATMP = abs(PP_HH) // take absolute value
174 EXP1 = add(EXP0,#-BIAS-58)
175 }
176 {
177 AH += asl(EXP1,#HI_MANTBITS)
178 EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
179 BTMPL = ##0x7FEFFFFF
180 }
181 {
182 EXP1 += add(EXP0,##-BIAS-58)
183 //BTMPH = add(clb(ATMP),#-2)
184 BTMPH = #0
185 }
186 {
187 p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
188 if (p0.new) jump:nt .Lmul_ovf
189 }
190 {
191 p0 = cmp.gt(EXP1,#0)
192 if (p0.new) jump:nt .Lpossible_unf
193 BTMPH = sub(EXP0,BTMPH)
194 TMP = #63 // max amount to shift
195 }
196 // Underflow
197 //
198 // PP_HH has the partial product with sticky LSB.
199 // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
200 // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
201 // The exponent of PP_HH is in EXP1, which is non-positive (0 or negative)
202 // That's the exponent that happens after the normalization
203 //
204 // EXP0 has the exponent that, when added to the normalized value, is out of range.
205 //
206 // Strategy:
207 //
208 // * Shift down bits, with sticky bit, such that the bits are aligned according
209 // to the LZ count and appropriate exponent, but not all the way to mantissa
210 // field, keep around the last few bits.
211 // * Put a 1 near the MSB
212 // * Check the LSBs for inexact; if inexact also set underflow
213 // * Convert [u]d2df -- will correctly round according to rounding mode
214 // * Replace exponent field with zero
215
216 {
217 BTMPL = #0 // offset for extract
218 BTMPH = sub(#FUDGE,BTMPH) // amount to right shift
219 }
220 {
221 p3 = cmp.gt(PP_HH_H,#-1) // is it positive?
222 BTMPH = min(BTMPH,TMP) // Don't shift more than 63
223 PP_HH = ATMP
224 }
225 {
226 TMP = USR
227 PP_LL = extractu(PP_HH,BTMP)
228 }
229 {
230 PP_HH = asr(PP_HH,BTMPH)
231 BTMPL = #0x0030 // underflow flag
232 AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
233 }
234 {
235 p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros?
236 if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit
237 PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction
238 }
239 {
240 PP_LL = neg(PP_HH)
241 p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear?
242 if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow
243 }
244 {
245 if (!p3) PP_HH = PP_LL
246 USR = TMP
247 }
248 {
249 A = convert_d2df(PP_HH) // Do rounding
250 p0 = dfcmp.eq(A,A) // realize exception
251 }
252 {
253 AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent
254 jumpr r31
255 }
256 .falign
257.Lmul_ovf:
258 // We get either max finite value or infinity. Either way, overflow+inexact
259 {
260 TMP = USR
261 ATMP = combine(##0x7fefffff,#-1) // positive max finite
262 A = PP_HH
263 }
264 {
265 PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
266 TMP = or(TMP,#0x28) // inexact + overflow
267 BTMP = combine(##0x7ff00000,#0) // positive infinity
268 }
269 {
270 USR = TMP
271 PP_LL_L ^= lsr(AH,#31) // Does sign match rounding?
272 TMP = PP_LL_L // unmodified rounding mode
273 }
274 {
275 p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
276 p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way,
277 if (p0.new) ATMP = BTMP // we should get infinity
278 p0 = dfcmp.eq(A,A) // Realize FP exception if enabled
279 }
280 {
281 A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
282 jumpr r31
283 }
284
285.Lmul_abnormal:
286 {
287 ATMP = extractu(A,#63,#0) // strip off sign
288 BTMP = extractu(B,#63,#0) // strip off sign
289 }
290 {
291 p3 = cmp.gtu(ATMP,BTMP)
292 if (!p3.new) A = B // sort values
293 if (!p3.new) B = A // sort values
294 }
295 {
296 // Any NaN --> NaN, possibly raise invalid if sNaN
297 p0 = dfclass(A,#0x0f) // A not NaN?
298 if (!p0.new) jump:nt .Linvalid_nan
299 if (!p3) ATMP = BTMP
300 if (!p3) BTMP = ATMP
301 }
302 {
303 // Infinity * nonzero number is infinity
304 p1 = dfclass(A,#0x08) // A is infinity
305 p1 = dfclass(B,#0x0e) // B is nonzero
306 }
307 {
308 // Infinity * zero --> NaN, raise invalid
309 // Other zeros return zero
310 p0 = dfclass(A,#0x08) // A is infinity
311 p0 = dfclass(B,#0x01) // B is zero
312 }
313 {
314 if (p1) jump .Ltrue_inf
315 p2 = dfclass(B,#0x01)
316 }
317 {
318 if (p0) jump .Linvalid_zeroinf
319 if (p2) jump .Ltrue_zero // so return zero
320 TMP = ##0x7c000000
321 }
322 // We are left with a normal or subnormal times a subnormal. A > B
323 // If A and B are both very small (exp(a) < BIAS-MANTBITS),
324 // we go to a single sticky bit, which we can round easily.
325 // If A and B might multiply to something bigger, decrease A exponent and increase
326 // B exponent and try again
327 {
328 p0 = bitsclr(AH,TMP)
329 if (p0.new) jump:nt .Lmul_tiny
330 }
331 {
332 TMP = cl0(BTMP)
333 }
334 {
335 TMP = add(TMP,#-EXPBITS)
336 }
337 {
338 BTMP = asl(BTMP,TMP)
339 }
340 {
341 B = insert(BTMP,#63,#0)
342 AH -= asl(TMP,#HI_MANTBITS)
343 }
344 jump __hexagon_muldf3
345.Lmul_tiny:
346 {
347 TMP = USR
348 A = xor(A,B) // get sign bit
349 }
350 {
351 TMP = or(TMP,#0x30) // Inexact + Underflow
352 A = insert(ONE,#63,#0) // put in rounded up value
353 BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode
354 }
355 {
356 USR = TMP
357 p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf?
358 if (!p0.new) AL = #0 // If not, zero
359 BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB
360 }
361 {
362 p0 = cmp.eq(BTMPH,#3) // if rounding towards right inf
363 if (!p0.new) AL = #0 // don't go to zero
364 jumpr r31
365 }
366.Linvalid_zeroinf:
367 {
368 TMP = USR
369 }
370 {
371 A = #-1
372 TMP = or(TMP,#2)
373 }
374 {
375 USR = TMP
376 }
377 {
378 p0 = dfcmp.uo(A,A) // force exception if enabled
379 jumpr r31
380 }
381.Linvalid_nan:
382 {
383 p0 = dfclass(B,#0x0f) // if B is not NaN
384 TMP = convert_df2sf(A) // will generate invalid if sNaN
385 if (p0.new) B = A // make it whatever A is
386 }
387 {
388 BL = convert_df2sf(B) // will generate invalid if sNaN
389 A = #-1
390 jumpr r31
391 }
392 .falign
393.Ltrue_zero:
394 {
395 A = B
396 B = A
397 }
398.Ltrue_inf:
399 {
400 BH = extract(BH,#1,#31)
401 }
402 {
403 AH ^= asl(BH,#31)
404 jumpr r31
405 }
406END(__hexagon_muldf3)
407
408#undef ATMP
409#undef ATMPL
410#undef ATMPH
411#undef BTMP
412#undef BTMPL
413#undef BTMPH
414

source code of compiler-rt/lib/builtins/hexagon/dfmul.S