1//===----------------------Hexagon builtin routine ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9// Double Precision square root
10
11#define EXP r28
12
13#define A r1:0
14#define AH r1
15#define AL r0
16
17#define SFSH r3:2
18#define SF_S r3
19#define SF_H r2
20
21#define SFHALF_SONE r5:4
22#define S_ONE r4
23#define SFHALF r5
24#define SF_D r6
25#define SF_E r7
26#define RECIPEST r8
27#define SFRAD r9
28
29#define FRACRAD r11:10
30#define FRACRADH r11
31#define FRACRADL r10
32
33#define ROOT r13:12
34#define ROOTHI r13
35#define ROOTLO r12
36
37#define PROD r15:14
38#define PRODHI r15
39#define PRODLO r14
40
41#define P_TMP p0
42#define P_EXP1 p1
43#define NORMAL p2
44
45#define SF_EXPBITS 8
46#define SF_MANTBITS 23
47
48#define DF_EXPBITS 11
49#define DF_MANTBITS 52
50
51#define DF_BIAS 0x3ff
52
53#define DFCLASS_ZERO 0x01
54#define DFCLASS_NORMAL 0x02
55#define DFCLASS_DENORMAL 0x02
56#define DFCLASS_INFINITE 0x08
57#define DFCLASS_NAN 0x10
58
59#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
60#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
61#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
62#define END(TAG) .size TAG,.-TAG
63
64 .text
65 .global __hexagon_sqrtdf2
66 .type __hexagon_sqrtdf2,@function
67 .global __hexagon_sqrt
68 .type __hexagon_sqrt,@function
69 Q6_ALIAS(sqrtdf2)
70 Q6_ALIAS(sqrt)
71 FAST_ALIAS(sqrtdf2)
72 FAST_ALIAS(sqrt)
73 FAST2_ALIAS(sqrtdf2)
74 FAST2_ALIAS(sqrt)
75 .type sqrt,@function
76 .p2align 5
77__hexagon_sqrtdf2:
78__hexagon_sqrt:
79 {
80 PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
81 EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
82 SFHALF_SONE = combine(##0x3f000004,#1)
83 }
84 {
85 NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal
86 NORMAL = cmp.gt(AH,#-1) // and positive?
87 if (!NORMAL.new) jump:nt .Lsqrt_abnormal
88 SFRAD = or(SFHALF,PRODLO)
89 }
90#undef NORMAL
91.Ldenormal_restart:
92 {
93 FRACRAD = A
94 SF_E,P_TMP = sfinvsqrta(SFRAD)
95 SFHALF = and(SFHALF,#-16)
96 SFSH = #0
97 }
98#undef A
99#undef AH
100#undef AL
101#define ERROR r1:0
102#define ERRORHI r1
103#define ERRORLO r0
104 // SF_E : reciprocal square root
105 // SF_H : half rsqrt
106 // sf_S : square root
107 // SF_D : error term
108 // SFHALF: 0.5
109 {
110 SF_S += sfmpy(SF_E,SFRAD):lib // s0: root
111 SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent...
112 SF_D = SFHALF
113#undef SFRAD
114#define SHIFTAMT r9
115 SHIFTAMT = and(EXP,#1)
116 }
117 {
118 SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1
119 FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden
120 P_EXP1 = cmp.gtu(SHIFTAMT,#0)
121 }
122 {
123 SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt
124 SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip
125 SF_D = SFHALF
126 SHIFTAMT = mux(P_EXP1,#8,#9)
127 }
128 {
129 SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term
130 FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place
131 SHIFTAMT = mux(P_EXP1,#3,#2)
132 }
133 {
134 SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt
135 // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x).
136 PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1)
137 }
138 {
139 SF_H = and(SF_H,##0x007fffff)
140 }
141 {
142 SF_H = add(SF_H,##0x00800000 - 3)
143 SHIFTAMT = mux(P_EXP1,#7,#8)
144 }
145 {
146 RECIPEST = asl(SF_H,SHIFTAMT)
147 SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
148 }
149 {
150 ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
151 }
152
153#undef SFSH // r3:2
154#undef SF_H // r2
155#undef SF_S // r3
156#undef S_ONE // r4
157#undef SFHALF // r5
158#undef SFHALF_SONE // r5:4
159#undef SF_D // r6
160#undef SF_E // r7
161
162#define HL r3:2
163#define LL r5:4
164#define HH r7:6
165
166#undef P_EXP1
167#define P_CARRY0 p1
168#define P_CARRY1 p2
169#define P_CARRY2 p3
170
171 // Iteration 0
172 // Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply
173 // We can shift and subtract instead of shift and add?
174 {
175 ERROR = asl(FRACRAD,#15)
176 PROD = mpyu(ROOTHI,ROOTHI)
177 P_CARRY0 = cmp.eq(r0,r0)
178 }
179 {
180 ERROR -= asl(PROD,#15)
181 PROD = mpyu(ROOTHI,ROOTLO)
182 P_CARRY1 = cmp.eq(r0,r0)
183 }
184 {
185 ERROR -= lsr(PROD,#16)
186 P_CARRY2 = cmp.eq(r0,r0)
187 }
188 {
189 ERROR = mpyu(ERRORHI,RECIPEST)
190 }
191 {
192 ROOT += lsr(ERROR,SHIFTAMT)
193 SHIFTAMT = add(SHIFTAMT,#16)
194 ERROR = asl(FRACRAD,#31) // for next iter
195 }
196 // Iteration 1
197 {
198 PROD = mpyu(ROOTHI,ROOTHI)
199 ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed
200 }
201 {
202 ERROR -= asl(PROD,#31)
203 PROD = mpyu(ROOTLO,ROOTLO)
204 }
205 {
206 ERROR -= lsr(PROD,#33)
207 }
208 {
209 ERROR = mpyu(ERRORHI,RECIPEST)
210 }
211 {
212 ROOT += lsr(ERROR,SHIFTAMT)
213 SHIFTAMT = add(SHIFTAMT,#16)
214 ERROR = asl(FRACRAD,#47) // for next iter
215 }
216 // Iteration 2
217 {
218 PROD = mpyu(ROOTHI,ROOTHI)
219 }
220 {
221 ERROR -= asl(PROD,#47)
222 PROD = mpyu(ROOTHI,ROOTLO)
223 }
224 {
225 ERROR -= asl(PROD,#16) // bidir shr 31-47
226 PROD = mpyu(ROOTLO,ROOTLO)
227 }
228 {
229 ERROR -= lsr(PROD,#17) // 64-47
230 }
231 {
232 ERROR = mpyu(ERRORHI,RECIPEST)
233 }
234 {
235 ROOT += lsr(ERROR,SHIFTAMT)
236 }
237#undef ERROR
238#undef PROD
239#undef PRODHI
240#undef PRODLO
241#define REM_HI r15:14
242#define REM_HI_HI r15
243#define REM_LO r1:0
244#undef RECIPEST
245#undef SHIFTAMT
246#define TWOROOT_LO r9:8
247 // Adjust Root
248 {
249 HL = mpyu(ROOTHI,ROOTLO)
250 LL = mpyu(ROOTLO,ROOTLO)
251 REM_HI = #0
252 REM_LO = #0
253 }
254 {
255 HL += lsr(LL,#33)
256 LL += asl(HL,#33)
257 P_CARRY0 = cmp.eq(r0,r0)
258 }
259 {
260 HH = mpyu(ROOTHI,ROOTHI)
261 REM_LO = sub(REM_LO,LL,P_CARRY0):carry
262 TWOROOT_LO = #1
263 }
264 {
265 HH += lsr(HL,#31)
266 TWOROOT_LO += asl(ROOT,#1)
267 }
268#undef HL
269#undef LL
270#define REM_HI_TMP r3:2
271#define REM_HI_TMP_HI r3
272#define REM_LO_TMP r5:4
273 {
274 REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
275 REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
276#undef FRACRAD
277#undef HH
278#define ZERO r11:10
279#define ONE r7:6
280 ONE = #1
281 ZERO = #0
282 }
283 {
284 REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
285 ONE = add(ROOT,ONE)
286 EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp
287 }
288 {
289 // If carry set, no borrow: result was still positive
290 if (P_CARRY1) ROOT = ONE
291 if (P_CARRY1) REM_LO = REM_LO_TMP
292 if (P_CARRY1) REM_HI = REM_HI_TMP
293 }
294 {
295 REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
296 ONE = #1
297 EXP = asr(EXP,#1) // divide signed exp by 2
298 }
299 {
300 REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
301 ONE = add(ROOT,ONE)
302 }
303 {
304 if (P_CARRY2) ROOT = ONE
305 if (P_CARRY2) REM_LO = REM_LO_TMP
306 // since tworoot <= 2^32, remhi must be zero
307#undef REM_HI_TMP
308#undef REM_HI_TMP_HI
309#define S_ONE r2
310#define ADJ r3
311 S_ONE = #1
312 }
313 {
314 P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero
315 if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully
316 ADJ = cl0(ROOT)
317 EXP = add(EXP,#-63)
318 }
319#undef REM_LO
320#define RET r1:0
321#define RETHI r1
322 {
323 RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag
324 EXP = add(EXP,ADJ) // add back bias
325 }
326 {
327 RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust
328 jumpr r31
329 }
330#undef REM_LO_TMP
331#undef REM_HI_TMP
332#undef REM_HI_TMP_HI
333#undef REM_LO
334#undef REM_HI
335#undef TWOROOT_LO
336
337#undef RET
338#define A r1:0
339#define AH r1
340#define AL r1
341#undef S_ONE
342#define TMP r3:2
343#define TMPHI r3
344#define TMPLO r2
345#undef P_CARRY0
346#define P_NEG p1
347
348
349#define SFHALF r5
350#define SFRAD r9
351.Lsqrt_abnormal:
352 {
353 P_TMP = dfclass(A,#DFCLASS_ZERO) // zero?
354 if (P_TMP.new) jumpr:t r31
355 }
356 {
357 P_TMP = dfclass(A,#DFCLASS_NAN)
358 if (P_TMP.new) jump:nt .Lsqrt_nan
359 }
360 {
361 P_TMP = cmp.gt(AH,#-1)
362 if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
363 if (!P_TMP.new) EXP = ##0x7F800001 // sNaN
364 }
365 {
366 P_TMP = dfclass(A,#DFCLASS_INFINITE)
367 if (P_TMP.new) jumpr:nt r31
368 }
369 // If we got here, we're denormal
370 // prepare to restart
371 {
372 A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa
373 }
374 {
375 EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize?
376 }
377 {
378 A = asl(A,EXP) // Shift mantissa
379 EXP = sub(#1,EXP) // Form exponent
380 }
381 {
382 AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent
383 }
384 {
385 TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1)
386 SFHALF = ##0x3f000004 // form half constant
387 }
388 {
389 SFRAD = or(SFHALF,TMPLO) // form sf value
390 SFHALF = and(SFHALF,#-16)
391 jump .Ldenormal_restart // restart
392 }
393.Lsqrt_nan:
394 {
395 EXP = convert_df2sf(A) // if sNaN, get invalid
396 A = #-1 // qNaN
397 jumpr r31
398 }
399.Lsqrt_invalid_neg:
400 {
401 A = convert_sf2df(EXP) // Invalid,NaNval
402 jumpr r31
403 }
404END(__hexagon_sqrt)
405END(__hexagon_sqrtdf2)
406

source code of compiler-rt/lib/builtins/hexagon/dfsqrt.S