dfmul.S source code [compiler-rt/lib/builtins/hexagon/dfmul.S]

1	//===----------------------Hexagon builtin routine ------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	// Double Precision Multiply
10	#define A r1:0
11	#define AH r1
12	#define AL r0
13	#define B r3:2
14	#define BH r3
15	#define BL r2
16
17	#define BTMP r5:4
18	#define BTMPH r5
19	#define BTMPL r4
20
21	#define PP_ODD r7:6
22	#define PP_ODD_H r7
23	#define PP_ODD_L r6
24
25	#define ONE r9:8
26	#define S_ONE r8
27	#define S_ZERO r9
28
29	#define PP_HH r11:10
30	#define PP_HH_H r11
31	#define PP_HH_L r10
32
33	#define ATMP r13:12
34	#define ATMPH r13
35	#define ATMPL r12
36
37	#define PP_LL r15:14
38	#define PP_LL_H r15
39	#define PP_LL_L r14
40
41	#define TMP r28
42
43	#define MANTBITS 52
44	#define HI_MANTBITS 20
45	#define EXPBITS 11
46	#define BIAS 1024
47	#define MANTISSA_TO_INT_BIAS 52
48
49	// Some constant to adjust normalization amount in error code
50	// Amount to right shift the partial product to get to a denorm
51	#define FUDGE 5
52
53	#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
54	#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
55	#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
56	#define END(TAG) .size TAG,.-TAG
57
58	#define SR_ROUND_OFF 22
59	.text
60	.global __hexagon_muldf3
61	.type __hexagon_muldf3,@function
62	Q6_ALIAS(muldf3)
63	FAST_ALIAS(muldf3)
64	FAST2_ALIAS(muldf3)
65	.p2align `5`
66	__hexagon_muldf3:
67	{
68	p0 = dfclass(A,#`2`)
69	p0 = dfclass(B,#`2`)
70	ATMP = combine(##`0x40000000`,#`0`)
71	}
72	{
73	ATMP = insert(A,#MANTBITS,#EXPBITS-`1`)
74	BTMP = asl(B,#EXPBITS-`1`)
75	TMP = #-BIAS
76	ONE = #`1`
77	}
78	{
79	PP_ODD = mpyu(BTMPL,ATMPH)
80	BTMP = insert(ONE,#`2`,#`62`)
81	}
82	// since we know that the MSB of the H registers is zero, we should never carry
83	// H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1
84	// Adding 2 HLs, we get 2^64-32^32+2 maximum.*
85	// Therefore, we can add 3 2^32-1 values safely without carry. We only need one.
86	{
87	PP_LL = mpyu(ATMPL,BTMPL)
88	PP_ODD += mpyu(ATMPL,BTMPH)
89	}
90	{
91	PP_ODD += lsr(PP_LL,#`32`)
92	PP_HH = mpyu(ATMPH,BTMPH)
93	BTMP = combine(##BIAS+BIAS-`4`,#`0`)
94	}
95	{
96	PP_HH += lsr(PP_ODD,#`32`)
97	if (!p0) jump .Lmul_abnormal
98	p1 = cmp.eq(PP_LL_L,#`0`) // 64 lsb's 0?
99	p1 = cmp.eq(PP_ODD_L,#`0`) // 64 lsb's 0?
100	}
101
102	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
103	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
104
105	#undef PP_ODD
106	#undef PP_ODD_H
107	#undef PP_ODD_L
108	#define EXP10 r7:6
109	#define EXP1 r7
110	#define EXP0 r6
111	{
112	if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
113	EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
114	EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
115	}
116	{
117	PP_LL = neg(PP_HH)
118	EXP0 += add(TMP,EXP1)
119	TMP = xor(AH,BH)
120	}
121	{
122	if (!p2.new) PP_HH = PP_LL
123	p2 = cmp.gt(TMP,#-`1`)
124	p0 = !cmp.gt(EXP0,BTMPH)
125	p0 = cmp.gt(EXP0,BTMPL)
126	if (!p0.new) jump:nt .Lmul_ovf_unf
127	}
128	{
129	A = convert_d2df(PP_HH)
130	EXP0 = add(EXP0,#-BIAS-`58`)
131	}
132	{
133	AH += asl(EXP0,#HI_MANTBITS)
134	jumpr r31
135	}
136
137	.falign
138	.Lpossible_unf:
139	// We end up with a positive exponent
140	// But we may have rounded up to an exponent of 1.
141	// If the exponent is 1, if we rounded up to it
142	// we need to also raise underflow
143	// Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
144	// And the PP should also have more than one bit set
145	//
146	// Note: ATMP should have abs(PP_HH)
147	// Note: BTMPL should have 0x7FEFFFFF
148	{
149	p0 = cmp.eq(AL,#`0`)
150	p0 = bitsclr(AH,BTMPL)
151	if (!p0.new) jumpr:t r31
152	BTMPH = #`0x7fff`
153	}
154	{
155	p0 = bitsset(ATMPH,BTMPH)
156	BTMPL = USR
157	BTMPH = #`0x030`
158	}
159	{
160	if (p0) BTMPL = or(BTMPL,BTMPH)
161	}
162	{
163	USR = BTMPL
164	}
165	{
166	p0 = dfcmp.eq(A,A)
167	jumpr r31
168	}
169	.falign
170	.Lmul_ovf_unf:
171	{
172	A = convert_d2df(PP_HH)
173	ATMP = abs(PP_HH) // take absolute value
174	EXP1 = add(EXP0,#-BIAS-`58`)
175	}
176	{
177	AH += asl(EXP1,#HI_MANTBITS)
178	EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
179	BTMPL = ##`0x7FEFFFFF`
180	}
181	{
182	EXP1 += add(EXP0,##-BIAS-`58`)
183	//BTMPH = add(clb(ATMP),#-2)
184	BTMPH = #`0`
185	}
186	{
187	p0 = cmp.gt(EXP1,##BIAS+BIAS-`2`) // overflow
188	if (p0.new) jump:nt .Lmul_ovf
189	}
190	{
191	p0 = cmp.gt(EXP1,#`0`)
192	if (p0.new) jump:nt .Lpossible_unf
193	BTMPH = sub(EXP0,BTMPH)
194	TMP = #`63` // max amount to shift
195	}
196	// Underflow
197	//
198	// PP_HH has the partial product with sticky LSB.
199	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
200	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
201	// The exponent of PP_HH is in EXP1, which is non-positive (0 or negative)
202	// That's the exponent that happens after the normalization
203	//
204	// EXP0 has the exponent that, when added to the normalized value, is out of range.
205	//
206	// Strategy:
207	//
208	// Shift down bits, with sticky bit, such that the bits are aligned according*
209	// to the LZ count and appropriate exponent, but not all the way to mantissa
210	// field, keep around the last few bits.
211	// Put a 1 near the MSB*
212	// Check the LSBs for inexact; if inexact also set underflow*
213	// Convert [u]d2df -- will correctly round according to rounding mode*
214	// Replace exponent field with zero*
215
216	{
217	BTMPL = #`0` // offset for extract
218	BTMPH = sub(#FUDGE,BTMPH) // amount to right shift
219	}
220	{
221	p3 = cmp.gt(PP_HH_H,#-`1`) // is it positive?
222	BTMPH = min(BTMPH,TMP) // Don't shift more than 63
223	PP_HH = ATMP
224	}
225	{
226	TMP = USR
227	PP_LL = extractu(PP_HH,BTMP)
228	}
229	{
230	PP_HH = asr(PP_HH,BTMPH)
231	BTMPL = #`0x0030` // underflow flag
232	AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
233	}
234	{
235	p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros?
236	if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit
237	PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+`3`) // Add back in a bit so we can use convert instruction
238	}
239	{
240	PP_LL = neg(PP_HH)
241	p1 = bitsclr(PP_HH_L,#`0x7`) // Are the LSB's clear?
242	if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow
243	}
244	{
245	if (!p3) PP_HH = PP_LL
246	USR = TMP
247	}
248	{
249	A = convert_d2df(PP_HH) // Do rounding
250	p0 = dfcmp.eq(A,A) // realize exception
251	}
252	{
253	AH = insert(S_ZERO,#EXPBITS-`1`,#HI_MANTBITS+`1`) // Insert correct exponent
254	jumpr r31
255	}
256	.falign
257	.Lmul_ovf:
258	// We get either max finite value or infinity. Either way, overflow+inexact
259	{
260	TMP = USR
261	ATMP = combine(##`0x7fefffff`,#-`1`) // positive max finite
262	A = PP_HH
263	}
264	{
265	PP_LL_L = extractu(TMP,#`2`,#SR_ROUND_OFF) // rounding bits
266	TMP = or(TMP,#`0x28`) // inexact + overflow
267	BTMP = combine(##`0x7ff00000`,#`0`) // positive infinity
268	}
269	{
270	USR = TMP
271	PP_LL_L ^= lsr(AH,#`31`) // Does sign match rounding?
272	TMP = PP_LL_L // unmodified rounding mode
273	}
274	{
275	p0 = !cmp.eq(TMP,#`1`) // If not round-to-zero and
276	p0 = !cmp.eq(PP_LL_L,#`2`) // Not rounding the other way,
277	if (p0.new) ATMP = BTMP // we should get infinity
278	p0 = dfcmp.eq(A,A) // Realize FP exception if enabled
279	}
280	{
281	A = insert(ATMP,#`63`,#`0`) // insert inf/maxfinite, leave sign
282	jumpr r31
283	}
284
285	.Lmul_abnormal:
286	{
287	ATMP = extractu(A,#`63`,#`0`) // strip off sign
288	BTMP = extractu(B,#`63`,#`0`) // strip off sign
289	}
290	{
291	p3 = cmp.gtu(ATMP,BTMP)
292	if (!p3.new) A = B // sort values
293	if (!p3.new) B = A // sort values
294	}
295	{
296	// Any NaN --> NaN, possibly raise invalid if sNaN
297	p0 = dfclass(A,#`0x0f`) // A not NaN?
298	if (!p0.new) jump:nt .Linvalid_nan
299	if (!p3) ATMP = BTMP
300	if (!p3) BTMP = ATMP
301	}
302	{
303	// Infinity nonzero number is infinity*
304	p1 = dfclass(A,#`0x08`) // A is infinity
305	p1 = dfclass(B,#`0x0e`) // B is nonzero
306	}
307	{
308	// Infinity zero --> NaN, raise invalid*
309	// Other zeros return zero
310	p0 = dfclass(A,#`0x08`) // A is infinity
311	p0 = dfclass(B,#`0x01`) // B is zero
312	}
313	{
314	if (p1) jump .Ltrue_inf
315	p2 = dfclass(B,#`0x01`)
316	}
317	{
318	if (p0) jump .Linvalid_zeroinf
319	if (p2) jump .Ltrue_zero // so return zero
320	TMP = ##`0x7c000000`
321	}
322	// We are left with a normal or subnormal times a subnormal. A > B
323	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
324	// we go to a single sticky bit, which we can round easily.
325	// If A and B might multiply to something bigger, decrease A exponent and increase
326	// B exponent and try again
327	{
328	p0 = bitsclr(AH,TMP)
329	if (p0.new) jump:nt .Lmul_tiny
330	}
331	{
332	TMP = cl0(BTMP)
333	}
334	{
335	TMP = add(TMP,#-EXPBITS)
336	}
337	{
338	BTMP = asl(BTMP,TMP)
339	}
340	{
341	B = insert(BTMP,#`63`,#`0`)
342	AH -= asl(TMP,#HI_MANTBITS)
343	}
344	jump __hexagon_muldf3
345	.Lmul_tiny:
346	{
347	TMP = USR
348	A = xor(A,B) // get sign bit
349	}
350	{
351	TMP = or(TMP,#`0x30`) // Inexact + Underflow
352	A = insert(ONE,#`63`,#`0`) // put in rounded up value
353	BTMPH = extractu(TMP,#`2`,#SR_ROUND_OFF) // get rounding mode
354	}
355	{
356	USR = TMP
357	p0 = cmp.gt(BTMPH,#`1`) // Round towards pos/neg inf?
358	if (!p0.new) AL = #`0` // If not, zero
359	BTMPH ^= lsr(AH,#`31`) // rounding my way --> set LSB
360	}
361	{
362	p0 = cmp.eq(BTMPH,#`3`) // if rounding towards right inf
363	if (!p0.new) AL = #`0` // don't go to zero
364	jumpr r31
365	}
366	.Linvalid_zeroinf:
367	{
368	TMP = USR
369	}
370	{
371	A = #-`1`
372	TMP = or(TMP,#`2`)
373	}
374	{
375	USR = TMP
376	}
377	{
378	p0 = dfcmp.uo(A,A) // force exception if enabled
379	jumpr r31
380	}
381	.Linvalid_nan:
382	{
383	p0 = dfclass(B,#`0x0f`) // if B is not NaN
384	TMP = convert_df2sf(A) // will generate invalid if sNaN
385	if (p0.new) B = A // make it whatever A is
386	}
387	{
388	BL = convert_df2sf(B) // will generate invalid if sNaN
389	A = #-`1`
390	jumpr r31
391	}
392	.falign
393	.Ltrue_zero:
394	{
395	A = B
396	B = A
397	}
398	.Ltrue_inf:
399	{
400	BH = extract(BH,#`1`,#`31`)
401	}
402	{
403	AH ^= asl(BH,#`31`)
404	jumpr r31
405	}
406	END(__hexagon_muldf3)
407
408	#undef ATMP
409	#undef ATMPL
410	#undef ATMPH
411	#undef BTMP
412	#undef BTMPL
413	#undef BTMPH
414

source code of compiler-rt/lib/builtins/hexagon/dfmul.S