svml_s_powf4_core_sse4.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S]

/* Function powf vectorized with SSE4.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19	#include <sysdep.h>
20	#include "svml_s_powf_data.h"
21
.section .text.sse4, "ax", @progbits
ENTRY (_ZGVbN4vv_powf_sse4)
/*
   Vector powf for four packed single-precision lanes (x86-64, AT&T syntax).

   In:   %xmm0 = x[0..3], %xmm1 = y[0..3]  (packed floats)
   Out:  %xmm0 = result[0..3]              (moved from %xmm1 at .LBL_1_2)

   The computation is carried out in double precision on two lane pairs:
   the "low pair" (lanes 0,1) and the "high pair" (lanes 2,3).  %rdx holds
   the address of __svml_spow_data for the whole fast path; every named
   constant below is a displacement into that table.

   Frame (after %rsp is aligned down to 64 and lowered by 256):
       0(%rsp)  %rdi        8(%rsp)  %rsi       (written on the slow path)
      16(%rsp)  %r15       24(%rsp)  %r14       (written on the slow path)
      32(%rsp)  %r13       40(%rsp)  %r12       (written on the slow path)
      48(%rsp)  %xmm13     64(%rsp)  x spill    80(%rsp)  %xmm11
      96(%rsp)  %xmm9     112(%rsp)  %xmm8     128(%rsp)  y spill
     144(%rsp)  %xmm14    160(%rsp)  %xmm15    176(%rsp)  %xmm10
     192(%rsp)  result spill                   208(%rsp)  %xmm12
   The incoming %xmm8-%xmm15 are stored to the frame just before each one
   is first overwritten and are reloaded at .LBL_1_2.

   %xmm0 is never written on the fast path, so it still holds x when the
   special-case path at .LBL_1_3 spills it.

   ALGORITHM DESCRIPTION:

     We are using the next identity: pow(x,y) = 2^(y * log2(x)).

     1) log2(x) calculation
        Here we use the following formula.
        Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2.
        Let C ~= 1/ln(2),
        Rcp1 ~= 1/X1,   X2=Rcp1*X1,
        Rcp2 ~= 1/X2,   X3=Rcp2*X2,
        Rcp3 ~= 1/X3,   Rcp3C ~= C/X3.
        Then
          log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
                    log2(X1*Rcp1*Rcp2*Rcp3C/C),
        where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.

        The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
        Rcp3C, log2(C/Rcp3C) are taken from tables.
        Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C
        is exactly represented in target precision.

        log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
             = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
             = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
             = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
        where
             cq=X1*Rcp1*Rcp2*Rcp3C-C,
             a1=1/(C*ln(2))-1 is small,
             a2=1/(2*C^2*ln2),
             a3=1/(3*C^3*ln2),
                  ...
        Log2 result is split by three parts: HH+HL+HLL

     2) Calculation of y*log2(x)
        Split y into YHi+YLo.
        Get high PH and medium PL parts of y*log2|x|.
        Get low PLL part of y*log2|x|.
        Now we have PH+PL+PLL ~= y*log2|x|.

     3) Calculation of 2^(y*log2(x))
        Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
        where expK=7 in this implementation, N and j are integers,
        0<=j<=2^expK-1, |Z|<2^(-expK-1).  Hence
        2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
        where 2^(j/2^expK) is stored in a table, and
        2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5.
        We compute 2^(PH+PL+PLL) as follows:
        Break PH into PHH + PHL, where PHH = N + j/2^expK.
        Z = PHL + PL + PLL
        Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5
        Get 2^(j/2^expK) from table in the form THI+TLO.
        Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
        Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
        ResHi := THI
        ResLo := THI * Exp2Poly + TLO
        Get exponent ERes of the result:
        Res := ResHi + ResLo:
        Result := ex(Res) + N.  */

/* Prologue: frame pointer in %rbp, 64-byte aligned 256-byte frame.  */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $256, %rsp

/* Widen the inputs to double, one lane pair per register:
     %xmm9 = (double) x[0..1]     %xmm8  = (double) x[2..3]
     %xmm7 = (double) y[0..1]     %xmm11 = (double) y[2..3]  (set further down)
   %xmm4 keeps the raw x bits and %xmm5 the raw y bits for the range checks.  */
        movaps    %xmm0, %xmm3
        movhlps   %xmm0, %xmm3
        movaps    %xmm1, %xmm5
        movups    %xmm8, 112(%rsp)
        movaps    %xmm5, %xmm2
        cvtps2pd  %xmm3, %xmm8
        cvtps2pd  %xmm5, %xmm7
        movups    %xmm9, 96(%rsp)
        movaps    %xmm0, %xmm4
        cvtps2pd  %xmm0, %xmm9
        movq      __svml_spow_data@GOTPCREL(%rip), %rdx
        movups    %xmm10, 176(%rsp)
        movups    %xmm13, 48(%rsp)
        movups    _ExpMask(%rdx), %xmm6

/* preserve mantissa, set input exponent to 2^(-10) */
        movaps    %xmm6, %xmm10
        andps     %xmm8, %xmm6
        andps     %xmm9, %xmm10

/* exponent bits selection */
/* After the 20-bit shift the upper dword of each qword holds the sign and
   exponent field of the double; the shufps further down gathers those
   upper dwords of all four lanes into %xmm9.  */
        psrlq     $20, %xmm9
        orps      _Two10(%rdx), %xmm6
        psrlq     $20, %xmm8
        orps      _Two10(%rdx), %xmm10

/* reciprocal approximation good to at least 11 bits */
/* %xmm14 = reciprocal for the high pair, %xmm15 = for the low pair, both
   widened back to double.  */
        cvtpd2ps  %xmm6, %xmm13
        cvtpd2ps  %xmm10, %xmm1
        movlhps   %xmm13, %xmm13
        movhlps   %xmm5, %xmm2
        movlhps   %xmm1, %xmm1
        movups    %xmm12, 208(%rsp)
        rcpps     %xmm13, %xmm12
        movups    %xmm11, 80(%rsp)
        cvtps2pd  %xmm2, %xmm11
        rcpps     %xmm1, %xmm2
        movups    %xmm14, 144(%rsp)
        cvtps2pd  %xmm12, %xmm14
        movups    %xmm15, 160(%rsp)
        cvtps2pd  %xmm2, %xmm15
        shufps    $221, %xmm8, %xmm9

/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
        roundpd   $0, %xmm14, %xmm14

/* biased exponent in DP format */
/* %xmm2 = exponent fields of the low pair as doubles, %xmm1 = high pair.
   %xmm10 / %xmm6 become (mantissa * rounded reciprocal) - _One for the
   low / high pair.  */
        pshufd    $238, %xmm9, %xmm8
        roundpd   $0, %xmm15, %xmm15
        cvtdq2pd  %xmm8, %xmm1
        mulpd     %xmm15, %xmm10
        mulpd     %xmm14, %xmm6
        cvtdq2pd  %xmm9, %xmm2
        subpd     _One(%rdx), %xmm10
        subpd     _One(%rdx), %xmm6

/* table lookup */
/* Byte offsets into _Log2Rcp_lookup are taken from the rounded
   reciprocals shifted right by 40:
     %eax / %ecx  = low pair, element 0 / 1
     %r8d / %r9d  = high pair, element 0 / 1
   Interleaved with this: %xmm4 = x bits - _NMINNORM, %xmm3 = y & _ABSMASK
   (both feed the special-case mask), and a per-lane exponent correction
   selected by comparing the reciprocal against _Threshold.  */
        movaps    %xmm14, %xmm8
        movaps    %xmm15, %xmm9
        psrlq     $40, %xmm8
        psrlq     $40, %xmm9
        movd      %xmm8, %r8d
        movd      %xmm9, %eax
        psubd     _NMINNORM(%rdx), %xmm4
        movdqu    _ABSMASK(%rdx), %xmm3
        pextrd    $2, %xmm8, %r9d
        pand      %xmm5, %xmm3
        movups    _Threshold(%rdx), %xmm8
        pextrd    $2, %xmm9, %ecx
        movaps    %xmm8, %xmm9
        cmpltpd   %xmm15, %xmm9
        cmpltpd   %xmm14, %xmm8
        andps     _Bias(%rdx), %xmm9
        movaps    %xmm10, %xmm14
        andps     _Bias(%rdx), %xmm8
        movaps    %xmm6, %xmm15
        orps      _Bias1(%rdx), %xmm9
        orps      _Bias1(%rdx), %xmm8
        subpd     %xmm9, %xmm2
        subpd     %xmm8, %xmm1
/* With r the reduced argument (%xmm10 low pair, %xmm6 high pair):
   %xmm14 / %xmm15 = r^2, and %xmm8 / %xmm9 =
   (_poly_coeff_3 * r + _poly_coeff_4) * r^2.  The exponent terms are
   scaled by _L2.  */
        mulpd     %xmm10, %xmm14
        mulpd     %xmm6, %xmm15
        mulpd     _L2(%rdx), %xmm2
        mulpd     _L2(%rdx), %xmm1
        movups    _poly_coeff_3(%rdx), %xmm9
        movaps    %xmm9, %xmm8
        mulpd     %xmm10, %xmm8
        mulpd     %xmm6, %xmm9
        addpd     _poly_coeff_4(%rdx), %xmm8
        addpd     _poly_coeff_4(%rdx), %xmm9
        mulpd     %xmm14, %xmm8
        mulpd     %xmm15, %xmm9

/* reconstruction */
/* log term = scaled exponent + table entry + (r + polynomial); it is then
   multiplied by y: %xmm2 = low pair, %xmm1 = high pair.  */
        addpd     %xmm8, %xmm10
        addpd     %xmm9, %xmm6
        movslq    %eax, %rax
        movslq    %r8d, %r8
        movslq    %ecx, %rcx
        movslq    %r9d, %r9
        movsd     _Log2Rcp_lookup(%rdx,%rax), %xmm13
        movsd     _Log2Rcp_lookup(%rdx,%r8), %xmm12
        movhpd    _Log2Rcp_lookup(%rdx,%rcx), %xmm13
        movhpd    _Log2Rcp_lookup(%rdx,%r9), %xmm12
        addpd     %xmm10, %xmm13
        addpd     %xmm6, %xmm12
        addpd     %xmm13, %xmm2
        addpd     %xmm12, %xmm1
        mulpd     %xmm7, %xmm2
        mulpd     %xmm11, %xmm1
/* %xmm10 / %xmm11 = the products above scaled by __dbInvLn2;
   %xmm12 = copy of (x bits - _NMINNORM) for the range test below.  */
        movups    __dbInvLn2(%rdx), %xmm11
        movdqa    %xmm4, %xmm12
        movaps    %xmm11, %xmm10
        mulpd     %xmm2, %xmm10
        mulpd     %xmm1, %xmm11

/* to round down; if dR is an integer we will get R = 1, which is ok */
/* %xmm8 / %xmm9 = value - __dbHALF + __dbShifter (integer part left in the
   low mantissa bits); %xmm6 / %xmm7 = the same with the shifter removed.  */
        movaps    %xmm10, %xmm8
        movaps    %xmm11, %xmm9
        subpd     __dbHALF(%rdx), %xmm8
        subpd     __dbHALF(%rdx), %xmm9
        addpd     __dbShifter(%rdx), %xmm8
        addpd     __dbShifter(%rdx), %xmm9
        movaps    %xmm8, %xmm6
        movaps    %xmm9, %xmm7
        subpd     __dbShifter(%rdx), %xmm6
        subpd     __dbShifter(%rdx), %xmm7

/* [0..1) */
        subpd     %xmm6, %xmm10
        subpd     %xmm7, %xmm11
        mulpd     __dbC1(%rdx), %xmm10
        mulpd     __dbC1(%rdx), %xmm11

/* hi bits */
/* %xmm2 = upper dwords of the four y*log products.  %xmm12 accumulates
   the special-case mask; a lane is flagged when any of these holds
   (integer compares are signed):
     (x bits - _NMINNORM) >= _NMAXVAL
     (y & _ABSMASK)       >= _INF
     (hi bits & __iAbsMask) > __iDomainRange  */
        shufps    $221, %xmm1, %xmm2
        movdqu    _NMAXVAL(%rdx), %xmm1
        pcmpgtd   %xmm1, %xmm12
        pcmpeqd   %xmm1, %xmm4
        por       %xmm4, %xmm12
        movdqa    %xmm3, %xmm1
        movdqu    _INF(%rdx), %xmm4
        pcmpgtd   %xmm4, %xmm1
        pcmpeqd   %xmm4, %xmm3

/* iAbsX = iAbsX&iAbsMask */
        pand      __iAbsMask(%rdx), %xmm2
        por       %xmm3, %xmm1

/* iRangeMask = (iAbsX>iDomainRange) */
        pcmpgtd   __iDomainRange(%rdx), %xmm2
        por       %xmm1, %xmm12
        movups    __lbLOWKBITS(%rdx), %xmm3
        por       %xmm2, %xmm12

/* low K bits */
        movaps    %xmm3, %xmm2
        andps     %xmm9, %xmm3
        andps     %xmm8, %xmm2
        psrlq     $11, %xmm8

/* dpP= _dbT+lJ*T_ITEM_GRAN */
/* Table indices: %r10d / %r11d = low pair element 0 / 1,
   %ecx / %r8d = high pair element 0 / 1 (pextrw $4 reads the low word of
   the upper qword).  Each index is scaled by 8 below.
   NOTE(review): displacement 13952 is presumably the offset of the _dbT
   table inside __svml_spow_data -- confirm against svml_s_powf_data.h.  */
        movd      %xmm2, %r10d
        psrlq     $11, %xmm9
        movd      %xmm3, %ecx

/* NB : including +/- sign for the exponent!! */
/* The bits above the table index are moved into the double exponent field
   so that paddq can apply them to the result.  %eax receives the 4-bit
   special-case lane mask.  */
        psllq     $52, %xmm8
        psllq     $52, %xmm9
        pextrw    $4, %xmm2, %r11d
        pextrw    $4, %xmm3, %r8d
        movmskps  %xmm12, %eax
        shll      $3, %r10d
        shll      $3, %ecx
        shll      $3, %r11d
        shll      $3, %r8d
        movq      13952(%rdx,%r10), %xmm6
        movq      13952(%rdx,%rcx), %xmm7
        movhpd    13952(%rdx,%r11), %xmm6
        movhpd    13952(%rdx,%r8), %xmm7
/* result = T + T * (fraction * __dbC1), exponent adjusted by paddq, then
   narrowed to float and merged into %xmm1 = result[0..3].  */
        mulpd     %xmm6, %xmm10
        mulpd     %xmm7, %xmm11
        addpd     %xmm10, %xmm6
        addpd     %xmm11, %xmm7
        paddq     %xmm8, %xmm6
        paddq     %xmm9, %xmm7
        cvtpd2ps  %xmm6, %xmm1
        cvtpd2ps  %xmm7, %xmm4
        movlhps   %xmm4, %xmm1
/* Any flagged lane is recomputed with scalar powf at .LBL_1_3.  */
        testl     %eax, %eax
        jne       .LBL_1_3
282
/* Common exit.  On entry %xmm1 holds the four results.  Reloads the
   %xmm8-%xmm15 values stored in the frame by the fast path, returns the
   result in %xmm0 and tears the frame down through %rbp.  */
.LBL_1_2:
        cfi_remember_state
        movups    112(%rsp), %xmm8
        movaps    %xmm1, %xmm0
        movups    96(%rsp), %xmm9
        movups    176(%rsp), %xmm10
        movups    80(%rsp), %xmm11
        movups    208(%rsp), %xmm12
        movups    48(%rsp), %xmm13
        movups    144(%rsp), %xmm14
        movups    160(%rsp), %xmm15
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
300
/* Special-case path, reached when the lane mask in %eax is non-zero.
   Spills x (%xmm0), y (%xmm5) and the vector result (%xmm1) to the frame
   so that individual lanes can be recomputed with scalar powf, then sets
   up the scan loop state in callee-saved registers:
     %r12b = lane-pair index (starts at 0)
     %r13d = special-case lane mask
     %r14d = bit number of the even lane of the current pair (starts at 0)
   %rsi, %rdi and %r12-%r15 are saved in the frame first.  */
.LBL_1_3:
        cfi_restore_state
        movups    %xmm0, 64(%rsp)
        movups    %xmm5, 128(%rsp)
        movups    %xmm1, 192(%rsp)
/* In the visible code this label is reached only through the
   "testl %eax, %eax; jne" pair, and the stores above do not modify the
   flags, so ZF is clear here and this branch is not taken.  */
        je        .LBL_1_2

        xorb      %cl, %cl
        xorl      %edx, %edx
        movq      %rsi, 8(%rsp)
        movq      %rdi, (%rsp)
        movq      %r12, 40(%rsp)
        cfi_offset_rel_rsp (12, 40)
        movb      %cl, %r12b
        movq      %r13, 32(%rsp)
        cfi_offset_rel_rsp (13, 32)
        movl      %eax, %r13d
        movq      %r14, 24(%rsp)
        cfi_offset_rel_rsp (14, 24)
        movl      %edx, %r14d
        movq      %r15, 16(%rsp)
        cfi_offset_rel_rsp (15, 16)
        cfi_remember_state
324
/* Scan loop over lane pairs.  %r13d is the lane mask, %r14d the bit number
   of the even lane of the current pair and %r12b the pair index.  A set
   bit sends the lane to the scalar fallback (.LBL_1_12 for the even lane,
   .LBL_1_10 for the odd lane), which jumps back in at .LBL_1_7 or
   .LBL_1_8 respectively.  */
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

/* Advance to the next pair.  The loop runs for 16 pair indices; the mask
   was produced by movmskps on a four-lane vector, so only the first two
   iterations can find a set bit.  */
.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_1_6

/* Done: reload the registers saved at .LBL_1_3, pick up the patched
   result vector from the frame and leave through the common exit.  */
        movq      8(%rsp), %rsi
        movq      (%rsp), %rdi
        movq      40(%rsp), %r12
        cfi_restore (%r12)
        movq      32(%rsp), %r13
        cfi_restore (%r13)
        movq      24(%rsp), %r14
        cfi_restore (%r14)
        movq      16(%rsp), %r15
        cfi_restore (%r15)
        movups    192(%rsp), %xmm1
        jmp       .LBL_1_2
352
/* Scalar fallback for the odd lane of pair %r12b: loads x and y for lane
   2*pair+1 from the spill areas (x at 64(%rsp), y at 128(%rsp); 4 bytes
   per lane, 8 per pair), calls powf and stores the scalar result into the
   matching slot of the result spill at 192(%rsp).  %r15 holds the
   zero-extended pair index and is used again after the call.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        movss     68(%rsp,%r15,8), %xmm0
        movss     132(%rsp,%r15,8), %xmm1

        call      JUMPTARGET(powf)

        movss     %xmm0, 196(%rsp,%r15,8)
        jmp       .LBL_1_8
363
/* Scalar fallback for the even lane of pair %r12b: loads x and y for lane
   2*pair from the spill areas (x at 64(%rsp), y at 128(%rsp)), calls powf
   and stores the scalar result into the matching slot of the result spill
   at 192(%rsp), then resumes the scan with the odd lane of the same pair.
   %r15 holds the zero-extended pair index and is used again after the
   call.  */
.LBL_1_12:
        movzbl    %r12b, %r15d
        movss     64(%rsp,%r15,8), %xmm0
        movss     128(%rsp,%r15,8), %xmm1

        call      JUMPTARGET(powf)

        movss     %xmm0, 192(%rsp,%r15,8)
        jmp       .LBL_1_7

END (_ZGVbN4vv_powf_sse4)
375

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S