/* Function expf vectorized with AVX-512. KNL and SKX versions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_expf_data.h"
#include "svml_s_wrapper_impl.h"

	.section .text.evex512, "ax", @progbits
ENTRY (_ZGVeN16v_expf_knl)
/*
   ALGORITHM DESCRIPTION:

   Argument representation:
   M = rint(X*2^k/ln2) = 2^k*N+j
   X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
   then -ln2/2^(k+1) < r < ln2/2^(k+1)
   Alternatively:
   M = trunc(X*2^k/ln2)
   then 0 < r < ln2/2^k

   Result calculation:
   exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
   = 2^N * 2^(j/2^k) * exp(r)
   2^N is calculated by bit manipulation
   2^(j/2^k) is computed from table lookup
   exp(r) is approximated by polynomial

   The table lookup is skipped if k = 0.
   For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
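
/* Note: this version appears to use k = 0, so the 2^(j/2^k) table
   lookup collapses to 1 and exp(r) is evaluated with a degree-5
   polynomial (coefficients __sPC0..__sPC5 below).  */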

	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$1280, %rsp
	movq	__svml_sexp_data@GOTPCREL(%rip), %rax

/* r = x-n*ln2_hi/2^k */
	vmovaps	%zmm0, %zmm6

/* compare against threshold */
	movl	$-1, %ecx
	vmovups	__sInvLn2(%rax), %zmm3
	vmovups	__sLn2hi(%rax), %zmm5

/* m = x*2^k/ln2 + shifter */
	vfmadd213ps __sShifter(%rax), %zmm0, %zmm3
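
/* Adding the shifter (a large constant, typically 1.5*2^23 for
   single precision) forces rounding of x*2^k/ln2 so that the
   integer part lands in the low mantissa bits of m.  */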
	vmovups	__sPC5(%rax), %zmm9

/* n = m - shifter = rint(x*2^k/ln2) */
	vsubps	__sShifter(%rax), %zmm3, %zmm7

/* remove sign of x by "and" operation */
	vpandd	__iAbsMask(%rax), %zmm0, %zmm1
	vpaddd	__iBias(%rax), %zmm3, %zmm4
	vpcmpgtd __iDomainRange(%rax), %zmm1, %k1

/* compute 2^N with "shift" */
	vpslld	$23, %zmm4, %zmm8
	vfnmadd231ps %zmm7, %zmm5, %zmm6
	vpbroadcastd %ecx, %zmm2{%k1}{z}
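
/* The zero-masked broadcast of -1 leaves zmm2 all-ones in the lanes
   where |x| exceeds __iDomainRange (k1 set) and zero elsewhere.  */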

/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
	vfnmadd132ps __sLn2lo(%rax), %zmm6, %zmm7

/* set mask for overflow/underflow */
	vptestmd %zmm2, %zmm2, %k0
	kmovw	%k0, %ecx

/* c5*r+c4 */
	vfmadd213ps __sPC4(%rax), %zmm7, %zmm9

/* (c5*r+c4)*r+c3 */
	vfmadd213ps __sPC3(%rax), %zmm7, %zmm9

/* ((c5*r+c4)*r+c3)*r+c2 */
	vfmadd213ps __sPC2(%rax), %zmm7, %zmm9

/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
	vfmadd213ps __sPC1(%rax), %zmm7, %zmm9

/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
	vfmadd213ps __sPC0(%rax), %zmm7, %zmm9

/* 2^N*exp(r) */
	vmulps	%zmm9, %zmm8, %zmm1
	testl	%ecx, %ecx
	jne	.LBL_1_3

.LBL_1_2:
	cfi_remember_state
	vmovaps	%zmm1, %zmm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	cfi_restore_state
	vmovups	%zmm0, 1152(%rsp)
	vmovups	%zmm1, 1216(%rsp)
	je	.LBL_1_2

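/* Save the state the scalar calls below may clobber; zmm16-zmm31 and
   k4-k7 are assumed to be callee-saved under the vector ABI, which is
   why they are spilled here and restored before returning.  */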
	xorb	%dl, %dl
	kmovw	%k4, 1048(%rsp)
	xorl	%eax, %eax
	kmovw	%k5, 1040(%rsp)
	kmovw	%k6, 1032(%rsp)
	kmovw	%k7, 1024(%rsp)
	vmovups	%zmm16, 960(%rsp)
	vmovups	%zmm17, 896(%rsp)
	vmovups	%zmm18, 832(%rsp)
	vmovups	%zmm19, 768(%rsp)
	vmovups	%zmm20, 704(%rsp)
	vmovups	%zmm21, 640(%rsp)
	vmovups	%zmm22, 576(%rsp)
	vmovups	%zmm23, 512(%rsp)
	vmovups	%zmm24, 448(%rsp)
	vmovups	%zmm25, 384(%rsp)
	vmovups	%zmm26, 320(%rsp)
	vmovups	%zmm27, 256(%rsp)
	vmovups	%zmm28, 192(%rsp)
	vmovups	%zmm29, 128(%rsp)
	vmovups	%zmm30, 64(%rsp)
	vmovups	%zmm31, (%rsp)
	movq	%rsi, 1064(%rsp)
	movq	%rdi, 1056(%rsp)
	movq	%r12, 1096(%rsp)
	cfi_offset_rel_rsp (12, 1096)
	movb	%dl, %r12b
	movq	%r13, 1088(%rsp)
	cfi_offset_rel_rsp (13, 1088)
	movl	%ecx, %r13d
	movq	%r14, 1080(%rsp)
	cfi_offset_rel_rsp (14, 1080)
	movl	%eax, %r14d
	movq	%r15, 1072(%rsp)
	cfi_offset_rel_rsp (15, 1072)
	cfi_remember_state

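/* Scalar fallback loop: each iteration tests a pair of mask bits in
   r13d and calls expf for any lane flagged as a special case.  */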
.LBL_1_6:
	btl	%r14d, %r13d
	jc	.LBL_1_12

.LBL_1_7:
	lea	1(%r14), %esi
	btl	%esi, %r13d
	jc	.LBL_1_10

.LBL_1_8:
	addb	$1, %r12b
	addl	$2, %r14d
	cmpb	$16, %r12b
	jb	.LBL_1_6

	kmovw	1048(%rsp), %k4
	movq	1064(%rsp), %rsi
	kmovw	1040(%rsp), %k5
	movq	1056(%rsp), %rdi
	kmovw	1032(%rsp), %k6
	movq	1096(%rsp), %r12
	cfi_restore (%r12)
	movq	1088(%rsp), %r13
	cfi_restore (%r13)
	kmovw	1024(%rsp), %k7
	vmovups	960(%rsp), %zmm16
	vmovups	896(%rsp), %zmm17
	vmovups	832(%rsp), %zmm18
	vmovups	768(%rsp), %zmm19
	vmovups	704(%rsp), %zmm20
	vmovups	640(%rsp), %zmm21
	vmovups	576(%rsp), %zmm22
	vmovups	512(%rsp), %zmm23
	vmovups	448(%rsp), %zmm24
	vmovups	384(%rsp), %zmm25
	vmovups	320(%rsp), %zmm26
	vmovups	256(%rsp), %zmm27
	vmovups	192(%rsp), %zmm28
	vmovups	128(%rsp), %zmm29
	vmovups	64(%rsp), %zmm30
	vmovups	(%rsp), %zmm31
	movq	1080(%rsp), %r14
	cfi_restore (%r14)
	movq	1072(%rsp), %r15
	cfi_restore (%r15)
	vmovups	1216(%rsp), %zmm1
	jmp	.LBL_1_2

.LBL_1_10:
	cfi_restore_state
	movzbl	%r12b, %r15d
	vmovss	1156(%rsp,%r15,8), %xmm0
	call	JUMPTARGET(expf)
	vmovss	%xmm0, 1220(%rsp,%r15,8)
	jmp	.LBL_1_8

.LBL_1_12:
	movzbl	%r12b, %r15d
	vmovss	1152(%rsp,%r15,8), %xmm0
	call	JUMPTARGET(expf)
	vmovss	%xmm0, 1216(%rsp,%r15,8)
	jmp	.LBL_1_7

END (_ZGVeN16v_expf_knl)

ENTRY (_ZGVeN16v_expf_skx)
/*
   ALGORITHM DESCRIPTION:

   Argument representation:
   M = rint(X*2^k/ln2) = 2^k*N+j
   X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
   then -ln2/2^(k+1) < r < ln2/2^(k+1)
   Alternatively:
   M = trunc(X*2^k/ln2)
   then 0 < r < ln2/2^k

   Result calculation:
   exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
   = 2^N * 2^(j/2^k) * exp(r)
   2^N is calculated by bit manipulation
   2^(j/2^k) is computed from table lookup
   exp(r) is approximated by polynomial

   The table lookup is skipped if k = 0.
   For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
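
/* As in the KNL version, k = 0 appears to be used here: no table
   lookup is performed and exp(r) is evaluated with the degree-5
   polynomial __sPC0..__sPC5.  */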

	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$1280, %rsp
	movq	__svml_sexp_data@GOTPCREL(%rip), %rax

/* r = x-n*ln2_hi/2^k */
	vmovaps	%zmm0, %zmm7

/* compare against threshold */
	vpternlogd $0xff, %zmm3, %zmm3, %zmm3
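
/* vpternlogd with immediate 0xff sets every bit of zmm3; the
   in-range lanes are cleared further below.  */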
	vmovups	__sInvLn2(%rax), %zmm4
	vmovups	__sShifter(%rax), %zmm1
	vmovups	__sLn2hi(%rax), %zmm6
	vmovups	__sPC5(%rax), %zmm10

/* m = x*2^k/ln2 + shifter */
	vfmadd213ps %zmm1, %zmm0, %zmm4

/* n = m - shifter = rint(x*2^k/ln2) */
	vsubps	%zmm1, %zmm4, %zmm8
	vpaddd	__iBias(%rax), %zmm4, %zmm5
	vfnmadd231ps %zmm8, %zmm6, %zmm7

/* compute 2^N with "shift" */
	vpslld	$23, %zmm5, %zmm9

/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
	vfnmadd132ps __sLn2lo(%rax), %zmm7, %zmm8

/* c5*r+c4 */
	vfmadd213ps __sPC4(%rax), %zmm8, %zmm10

/* (c5*r+c4)*r+c3 */
	vfmadd213ps __sPC3(%rax), %zmm8, %zmm10

/* ((c5*r+c4)*r+c3)*r+c2 */
	vfmadd213ps __sPC2(%rax), %zmm8, %zmm10

/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
	vfmadd213ps __sPC1(%rax), %zmm8, %zmm10

/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
	vfmadd213ps __sPC0(%rax), %zmm8, %zmm10

/* 2^N*exp(r) */
	vmulps	%zmm10, %zmm9, %zmm1

/* remove sign of x by "and" operation */
	vpandd	__iAbsMask(%rax), %zmm0, %zmm2
	vpcmpd	$2, __iDomainRange(%rax), %zmm2, %k1
	vpandnd	%zmm2, %zmm2, %zmm3{%k1}
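
/* k1 marks the in-range lanes (|x| <= __iDomainRange, predicate 2 =
   LE), so the masked vpandnd clears those lanes of zmm3, leaving
   all-ones only where the special-case path is needed.  */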

/* set mask for overflow/underflow */
	vptestmd %zmm3, %zmm3, %k0
	kmovw	%k0, %ecx
	testl	%ecx, %ecx
	jne	.LBL_2_3

.LBL_2_2:
	cfi_remember_state
	vmovaps	%zmm1, %zmm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_2_3:
	cfi_restore_state
	vmovups	%zmm0, 1152(%rsp)
	vmovups	%zmm1, 1216(%rsp)
	je	.LBL_2_2

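/* As in the KNL path, save the state the scalar calls may clobber
   (zmm16-zmm31 and k4-k7 are assumed callee-saved under the vector
   ABI).  */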
	xorb	%dl, %dl
	xorl	%eax, %eax
	kmovw	%k4, 1048(%rsp)
	kmovw	%k5, 1040(%rsp)
	kmovw	%k6, 1032(%rsp)
	kmovw	%k7, 1024(%rsp)
	vmovups	%zmm16, 960(%rsp)
	vmovups	%zmm17, 896(%rsp)
	vmovups	%zmm18, 832(%rsp)
	vmovups	%zmm19, 768(%rsp)
	vmovups	%zmm20, 704(%rsp)
	vmovups	%zmm21, 640(%rsp)
	vmovups	%zmm22, 576(%rsp)
	vmovups	%zmm23, 512(%rsp)
	vmovups	%zmm24, 448(%rsp)
	vmovups	%zmm25, 384(%rsp)
	vmovups	%zmm26, 320(%rsp)
	vmovups	%zmm27, 256(%rsp)
	vmovups	%zmm28, 192(%rsp)
	vmovups	%zmm29, 128(%rsp)
	vmovups	%zmm30, 64(%rsp)
	vmovups	%zmm31, (%rsp)
	movq	%rsi, 1064(%rsp)
	movq	%rdi, 1056(%rsp)
	movq	%r12, 1096(%rsp)
	cfi_offset_rel_rsp (12, 1096)
	movb	%dl, %r12b
	movq	%r13, 1088(%rsp)
	cfi_offset_rel_rsp (13, 1088)
	movl	%ecx, %r13d
	movq	%r14, 1080(%rsp)
	cfi_offset_rel_rsp (14, 1080)
	movl	%eax, %r14d
	movq	%r15, 1072(%rsp)
	cfi_offset_rel_rsp (15, 1072)
	cfi_remember_state

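/* Scalar fallback loop, same structure as in the KNL version.  */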
.LBL_2_6:
	btl	%r14d, %r13d
	jc	.LBL_2_12

.LBL_2_7:
	lea	1(%r14), %esi
	btl	%esi, %r13d
	jc	.LBL_2_10

.LBL_2_8:
	incb	%r12b
	addl	$2, %r14d
	cmpb	$16, %r12b
	jb	.LBL_2_6

	kmovw	1048(%rsp), %k4
	kmovw	1040(%rsp), %k5
	kmovw	1032(%rsp), %k6
	kmovw	1024(%rsp), %k7
	vmovups	960(%rsp), %zmm16
	vmovups	896(%rsp), %zmm17
	vmovups	832(%rsp), %zmm18
	vmovups	768(%rsp), %zmm19
	vmovups	704(%rsp), %zmm20
	vmovups	640(%rsp), %zmm21
	vmovups	576(%rsp), %zmm22
	vmovups	512(%rsp), %zmm23
	vmovups	448(%rsp), %zmm24
	vmovups	384(%rsp), %zmm25
	vmovups	320(%rsp), %zmm26
	vmovups	256(%rsp), %zmm27
	vmovups	192(%rsp), %zmm28
	vmovups	128(%rsp), %zmm29
	vmovups	64(%rsp), %zmm30
	vmovups	(%rsp), %zmm31
	vmovups	1216(%rsp), %zmm1
	movq	1064(%rsp), %rsi
	movq	1056(%rsp), %rdi
	movq	1096(%rsp), %r12
	cfi_restore (%r12)
	movq	1088(%rsp), %r13
	cfi_restore (%r13)
	movq	1080(%rsp), %r14
	cfi_restore (%r14)
	movq	1072(%rsp), %r15
	cfi_restore (%r15)
	jmp	.LBL_2_2

.LBL_2_10:
	cfi_restore_state
	movzbl	%r12b, %r15d
	vmovss	1156(%rsp,%r15,8), %xmm0
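
/* Clear the upper ZMM/YMM state before calling the scalar routine,
   presumably to avoid AVX/SSE transition penalties in the callee;
   xmm0 is reloaded afterwards.  */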
	vzeroupper
	vmovss	1156(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(expf)

	vmovss	%xmm0, 1220(%rsp,%r15,8)
	jmp	.LBL_2_8

.LBL_2_12:
	movzbl	%r12b, %r15d
	vmovss	1152(%rsp,%r15,8), %xmm0
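
/* As above: clear the upper vector state before the scalar call.  */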
	vzeroupper
	vmovss	1152(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(expf)

	vmovss	%xmm0, 1216(%rsp,%r15,8)
	jmp	.LBL_2_7

END (_ZGVeN16v_expf_skx)
