/* Function powf vectorized with AVX-512. KNL and SKX versions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_powf_data.h"
#include "svml_s_wrapper_impl.h"

/*
   ALGORITHM DESCRIPTION:

     We use the following identity: pow(x,y) = 2^(y * log2(x)).

     1) log2(x) calculation
        Here we use the following formula.
        Let |x| = 2^k1 * X1, where k1 is an integer, 1 <= X1 < 2.
        Let C ~= 1/ln(2),
        Rcp1 ~= 1/X1, X2 = Rcp1*X1,
        Rcp2 ~= 1/X2, X3 = Rcp2*X2,
        Rcp3 ~= 1/X3, Rcp3C ~= C/X3.
        Then
          log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
                    log2(X1*Rcp1*Rcp2*Rcp3C/C),
        where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.

        The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
        Rcp3C, log2(C/Rcp3C) are taken from tables.
        The values of Rcp1, Rcp2 and Rcp3C are such that
        RcpC = Rcp1*Rcp2*Rcp3C is exactly representable in the target
        precision.

        log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
          = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
          = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
          = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
        where
          cq = X1*Rcp1*Rcp2*Rcp3C - C,
          a1 = 1/(C*ln(2)) - 1 is small,
          a2 = -1/(2*C^2*ln2),
          a3 = 1/(3*C^3*ln2),
          ...
        The log2 result is split into three parts: HH + HL + HLL.
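
        As a rough illustration of this reduction, a scalar C sketch might
        look as follows (illustration only, not the code below: it performs
        a single reciprocal step, computes log2(1/Rcp) directly instead of
        reading _Log2Rcp_lookup, assumes x > 0, and returns one double
        instead of the HH+HL+HLL triple; log2_via_reciprocal is a
        hypothetical name):

          #include <math.h>

          static double log2_via_reciprocal (double x)
          {
            int k;
            double m = frexp (x, &k);          // x = m * 2^k, 0.5 <= m < 1
            m *= 2.0;                          // normalize to 1 <= m < 2
            k -= 1;
            // Round 1/m to a few mantissa bits; the code below uses
            // vrcp28pd/vrcp14pd followed by vrndscalepd for this.
            double r = ldexp (nearbyint (ldexp (1.0 / m, 9)), -9);
            double q = m * r - 1.0;            // small, since m*r ~= 1
            double l2 = 1.4426950408889634;    // 1/ln(2)
            // Short log2(1+q) series; the code below uses the
            // _poly_coeff_* table entries instead.
            double p = q * (l2 + q * (-l2 / 2 + q * (l2 / 3)));
            return (double) k - log2 (r) + p;  // log2(1/r) comes from a
                                               // table in the real code
          }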

     2) Calculation of y*log2(x)
        Split y into YHi + YLo.
        Get high PH and medium PL parts of y*log2|x|.
        Get low PLL part of y*log2|x|.
        Now we have PH + PL + PLL ~= y*log2|x|.
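
        The splitting follows the usual extended-precision product idea;
        a generic Dekker/Veltkamp-style sketch (not the exact scheme used
        in the code below; two_prod is a hypothetical helper returning
        hi + lo == y * l up to rounding of the low terms):

          static void two_prod (double y, double l, double *hi, double *lo)
          {
            const double split = 0x1p27 + 1.0;        // Veltkamp constant
            double t = y * split;
            double yhi = t - (t - y), ylo = y - yhi;  // y = yhi + ylo
            t = l * split;
            double lhi = t - (t - l), llo = l - lhi;  // l = lhi + llo
            *hi = y * l;                              // PH
            *lo = ((yhi * lhi - *hi) + yhi * llo + ylo * lhi) + ylo * llo;
          }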

     3) Calculation of 2^(y*log2(x))
        Let's represent PH + PL + PLL in the form N + j/2^expK + Z,
        where expK = 7 in this implementation, N and j are integers,
        0 <= j <= 2^expK - 1, |Z| < 2^(-expK-1).  Hence
          2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
        where 2^(j/2^expK) is stored in a table, and
          2^Z ~= 1 + B1*Z + B2*Z^2 + ... + B5*Z^5.
        We compute 2^(PH+PL+PLL) as follows:
          Break PH into PHH + PHL, where PHH = N + j/2^expK.
          Z = PHL + PL + PLL
          Exp2Poly = B1*Z + B2*Z^2 + ... + B5*Z^5
          Get 2^(j/2^expK) from the table in the form THI + TLO.
        Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
        Get the significand of 2^(PH+PL+PLL) in the form ResHi + ResLo:
          ResHi := THI
          ResLo := THI * Exp2Poly + TLO
        Get the exponent ERes of the result:
          Res := ResHi + ResLo
          Result := ex(Res) + N.  */
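
/* A rough scalar C sketch of step 3 (illustration only: expK = 7 as above,
   but the 2^(j/2^expK) values are computed with exp2 () here rather than
   loaded from the table at offset 13952 used below, and the 2^Z polynomial
   degree is reduced; exp2_reduced is a hypothetical name):

     #include <math.h>

     static float exp2_reduced (double t)          // t ~= y * log2|x|
     {
       double s = nearbyint (t * 128.0);           // s = 128*N + j
       double z = t - s / 128.0;                   // |z| <= 2^-8
       int n = (int) floor (s / 128.0);            // N
       int j = (int) (s - 128.0 * n);              // 0 <= j <= 127
       double thi = exp2 (j / 128.0);              // THI (+ TLO in the code)
       double l2 = 0.6931471805599453;             // ln(2)
       double p = z * l2 * (1.0 + z * l2 / 2 + z * z * l2 * l2 / 6);
       return (float) scalbn (thi + thi * p, n);   // 2^N * THI * (1 + poly)
     }  */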

        .section .text.evex512, "ax", @progbits
ENTRY (_ZGVeN16vv_powf_knl)
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $1344, %rsp
        movq __svml_spow_data@GOTPCREL(%rip), %rdx
        vmovaps %zmm1, %zmm9
        vshuff32x4 $238, %zmm0, %zmm0, %zmm7
        kxnorw %k3, %k3, %k3
        vcvtps2pd %ymm0, %zmm14
        vcvtps2pd %ymm7, %zmm10
        movl $-1, %eax
        movq $-1, %rcx
        vpandd _ABSMASK(%rdx), %zmm9, %zmm4
        vmovups _ExpMask(%rdx), %zmm6

/* exponent bits selection */
        vpsrlq $20, %zmm14, %zmm13
        vshuff32x4 $238, %zmm9, %zmm9, %zmm8
        vpcmpd $5, _INF(%rdx), %zmm4, %k2
        vpsrlq $32, %zmm13, %zmm15
        vcvtps2pd %ymm8, %zmm2
        vmovups _Two10(%rdx), %zmm4
        vpmovqd %zmm15, %ymm12
        vcvtps2pd %ymm9, %zmm1
        vpsubd _NMINNORM(%rdx), %zmm0, %zmm3
        vpbroadcastd %eax, %zmm8{%k2}{z}
        vpcmpd $5, _NMAXVAL(%rdx), %zmm3, %k1

/* preserve mantissa, set input exponent to 2^(-10) */
        vmovaps %zmm6, %zmm3
        vpternlogq $248, %zmm6, %zmm10, %zmm4
        vpsrlq $20, %zmm10, %zmm10
        vpternlogq $234, _Two10(%rdx), %zmm14, %zmm3

/* reciprocal approximation good to at least 11 bits */
        vrcp28pd %zmm4, %zmm11
        vpsrlq $32, %zmm10, %zmm14
        vpbroadcastd %eax, %zmm7{%k1}{z}
        kxnorw %k1, %k1, %k1
        vrcp28pd %zmm3, %zmm5
        vpmovqd %zmm14, %ymm6
        vshufi32x4 $68, %zmm6, %zmm12, %zmm13
        vmovups _One(%rdx), %zmm6

/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
        vrndscalepd $8, %zmm5, %zmm14

/* biased exponent in DP format */
        vshuff32x4 $238, %zmm13, %zmm13, %zmm5
        vrndscalepd $8, %zmm11, %zmm11
        vcmppd $30, _Threshold(%rdx), %zmm14, %k2
        vcvtdq2pd %ymm13, %zmm10
        vcvtdq2pd %ymm5, %zmm15

/* table lookup */
        vpsrlq $40, %zmm14, %zmm13
        vpxord %zmm5, %zmm5, %zmm5
        vgatherqpd _Log2Rcp_lookup(%rdx,%zmm13), %zmm5{%k3}
        vfmsub213pd %zmm6, %zmm14, %zmm3
        vfmsub213pd %zmm6, %zmm11, %zmm4
        vcmppd $30, _Threshold(%rdx), %zmm11, %k3
        vpbroadcastq %rcx, %zmm14{%k2}{z}

/* dpP= _dbT+lJ*T_ITEM_GRAN */
        kxnorw %k2, %k2, %k2
        vpsrlq $40, %zmm11, %zmm12
        vpxord %zmm6, %zmm6, %zmm6
        vpbroadcastq %rcx, %zmm11{%k3}{z}
        kxnorw %k3, %k3, %k3
        vgatherqpd _Log2Rcp_lookup(%rdx,%zmm12), %zmm6{%k1}
        vmovups _Bias1(%rdx), %zmm12
        vpternlogq $236, _Bias(%rdx), %zmm12, %zmm14
        vpternlogq $248, _Bias(%rdx), %zmm11, %zmm12
        vsubpd %zmm14, %zmm10, %zmm13
        vsubpd %zmm12, %zmm15, %zmm10
        vmovups _poly_coeff_3(%rdx), %zmm11
        vmovups _poly_coeff_4(%rdx), %zmm15
        vfmadd213pd %zmm15, %zmm4, %zmm11
        vmulpd %zmm4, %zmm4, %zmm12
        vmovaps %zmm15, %zmm14
        vmulpd %zmm3, %zmm3, %zmm15
        vfmadd231pd _poly_coeff_3(%rdx), %zmm3, %zmm14

/* reconstruction */
        vfmadd213pd %zmm4, %zmm12, %zmm11
        vfmadd213pd %zmm3, %zmm15, %zmm14
        vaddpd %zmm6, %zmm11, %zmm11
        vaddpd %zmm5, %zmm14, %zmm3
        vfmadd231pd _L2(%rdx), %zmm10, %zmm11
        vfmadd132pd _L2(%rdx), %zmm3, %zmm13
        vmulpd %zmm2, %zmm11, %zmm12
        vmulpd %zmm1, %zmm13, %zmm10
        vmulpd __dbInvLn2(%rdx), %zmm12, %zmm6

/* hi bits */
        vpsrlq $32, %zmm12, %zmm12
        vmulpd __dbInvLn2(%rdx), %zmm10, %zmm1

/* to round down; if dR is an integer we will get R = 1, which is ok */
        vsubpd __dbHALF(%rdx), %zmm6, %zmm4
        vpsrlq $32, %zmm10, %zmm11
        vpmovqd %zmm11, %ymm3
        vsubpd __dbHALF(%rdx), %zmm1, %zmm2
        vaddpd __dbShifter(%rdx), %zmm4, %zmm14
        vpmovqd %zmm12, %ymm4
        vshufi32x4 $68, %zmm4, %zmm3, %zmm5
        vpxord %zmm4, %zmm4, %zmm4
        vaddpd __dbShifter(%rdx), %zmm2, %zmm2

/* iAbsX = iAbsX&iAbsMask; */
        vpandd __iAbsMask(%rdx), %zmm5, %zmm11
        vpxord %zmm5, %zmm5, %zmm5
        vsubpd __dbShifter(%rdx), %zmm14, %zmm13

/* iRangeMask = (iAbsX>iDomainRange) */
        vpcmpgtd __iDomainRange(%rdx), %zmm11, %k1
        vsubpd __dbShifter(%rdx), %zmm2, %zmm15
        vpbroadcastd %eax, %zmm10{%k1}{z}
        vpternlogd $254, %zmm8, %zmm7, %zmm10

/* [0..1) */
        vsubpd %zmm15, %zmm1, %zmm1

/* low K bits */
        vpandq __lbLOWKBITS(%rdx), %zmm14, %zmm11
        vgatherqpd 13952(%rdx,%zmm11,8), %zmm5{%k3}
        vsubpd %zmm13, %zmm6, %zmm7
        vptestmd %zmm10, %zmm10, %k0
        vpandq __lbLOWKBITS(%rdx), %zmm2, %zmm10
        vmulpd __dbC1(%rdx), %zmm1, %zmm1
        vmulpd __dbC1(%rdx), %zmm7, %zmm3
        vpsrlq $11, %zmm2, %zmm8
        vpsrlq $11, %zmm14, %zmm2

/* NB : including +/- sign for the exponent!! */
        vpsllq $52, %zmm8, %zmm8
        kmovw %k0, %ecx
        vpsllq $52, %zmm2, %zmm6
        vfmadd213pd %zmm5, %zmm3, %zmm5
        vgatherqpd 13952(%rdx,%zmm10,8), %zmm4{%k2}
        vfmadd213pd %zmm4, %zmm1, %zmm4
        vpaddq %zmm6, %zmm5, %zmm10
        vcvtpd2ps %zmm10, %ymm12
        vpaddq %zmm8, %zmm4, %zmm7
        vcvtpd2ps %zmm7, %ymm11
        vshuff32x4 $68, %zmm12, %zmm11, %zmm1
        testl %ecx, %ecx
        jne .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovaps %zmm1, %zmm0
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
        vmovups %zmm0, 1152(%rsp)
        vmovups %zmm9, 1216(%rsp)
        vmovups %zmm1, 1280(%rsp)
        je .LBL_1_2

        xorb %dl, %dl
        kmovw %k4, 1048(%rsp)
        xorl %eax, %eax
        kmovw %k5, 1040(%rsp)
        kmovw %k6, 1032(%rsp)
        kmovw %k7, 1024(%rsp)
        vmovups %zmm16, 960(%rsp)
        vmovups %zmm17, 896(%rsp)
        vmovups %zmm18, 832(%rsp)
        vmovups %zmm19, 768(%rsp)
        vmovups %zmm20, 704(%rsp)
        vmovups %zmm21, 640(%rsp)
        vmovups %zmm22, 576(%rsp)
        vmovups %zmm23, 512(%rsp)
        vmovups %zmm24, 448(%rsp)
        vmovups %zmm25, 384(%rsp)
        vmovups %zmm26, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq %rsi, 1064(%rsp)
        movq %rdi, 1056(%rsp)
        movq %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb %dl, %r12b
        movq %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl %ecx, %r13d
        movq %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl %eax, %r14d
        movq %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

.LBL_1_6:
        btl %r14d, %r13d
        jc .LBL_1_12

.LBL_1_7:
        lea 1(%r14), %esi
        btl %esi, %r13d
        jc .LBL_1_10

.LBL_1_8:
        addb $1, %r12b
        addl $2, %r14d
        cmpb $16, %r12b
        jb .LBL_1_6

        kmovw 1048(%rsp), %k4
        movq 1064(%rsp), %rsi
        kmovw 1040(%rsp), %k5
        movq 1056(%rsp), %rdi
        kmovw 1032(%rsp), %k6
        movq 1096(%rsp), %r12
        cfi_restore (%r12)
        movq 1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw 1024(%rsp), %k7
        vmovups 960(%rsp), %zmm16
        vmovups 896(%rsp), %zmm17
        vmovups 832(%rsp), %zmm18
        vmovups 768(%rsp), %zmm19
        vmovups 704(%rsp), %zmm20
        vmovups 640(%rsp), %zmm21
        vmovups 576(%rsp), %zmm22
        vmovups 512(%rsp), %zmm23
        vmovups 448(%rsp), %zmm24
        vmovups 384(%rsp), %zmm25
        vmovups 320(%rsp), %zmm26
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        movq 1080(%rsp), %r14
        cfi_restore (%r14)
        movq 1072(%rsp), %r15
        cfi_restore (%r15)
        vmovups 1280(%rsp), %zmm1
        jmp .LBL_1_2

.LBL_1_10:
        cfi_restore_state
        movzbl %r12b, %r15d
        vmovss 1156(%rsp,%r15,8), %xmm0
        vmovss 1220(%rsp,%r15,8), %xmm1
        call JUMPTARGET(powf)
        vmovss %xmm0, 1284(%rsp,%r15,8)
        jmp .LBL_1_8

.LBL_1_12:
        movzbl %r12b, %r15d
        vmovss 1152(%rsp,%r15,8), %xmm0
        vmovss 1216(%rsp,%r15,8), %xmm1
        call JUMPTARGET(powf)
        vmovss %xmm0, 1280(%rsp,%r15,8)
        jmp .LBL_1_7
END (_ZGVeN16vv_powf_knl)
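
/* Both the KNL and SKX versions use the same special-case scheme: the fast
   path above computes all 16 lanes, a bit mask of lanes whose x or y fell
   outside the fast-path domain is accumulated (kmovw %k0, %ecx), and the
   .LBL_*_6 .. .LBL_*_12 loop recomputes only the flagged lanes with the
   scalar powf.  In rough C (fixup_lanes is a hypothetical name; the real
   code spills x, y and the vector result to the stack and walks the mask
   two bits per iteration):

     #include <math.h>

     static void fixup_lanes (const float *x, const float *y, float *res,
                              unsigned mask)   // one bit per lane
     {
       for (int i = 0; i < 16; i++)
         if (mask & (1u << i))
           res[i] = powf (x[i], y[i]);         // scalar call for this lane
     }  */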

ENTRY (_ZGVeN16vv_powf_skx)
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $1344, %rsp
        movq __svml_spow_data@GOTPCREL(%rip), %rax
        vextractf32x8 $1, %zmm1, %ymm14
        vextractf32x8 $1, %zmm0, %ymm15
        vpsubd _NMINNORM(%rax), %zmm0, %zmm9
        vmovups %zmm26, 1280(%rsp)
        vmovups _ExpMask(%rax), %zmm6
        vpcmpd $1, _NMAXVAL(%rax), %zmm9, %k1
        vcvtps2pd %ymm0, %zmm5
        vcvtps2pd %ymm1, %zmm12
        kxnorw %k3, %k3, %k3

/* exponent bits selection */
        vpsrlq $20, %zmm5, %zmm3
        vpsrlq $32, %zmm3, %zmm2
        vpmovqd %zmm2, %ymm11
        vcvtps2pd %ymm14, %zmm13
        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
        vmovaps %zmm14, %zmm26
        vpandd _ABSMASK(%rax), %zmm1, %zmm8
        vpcmpd $1, _INF(%rax), %zmm8, %k2
        vpandnd %zmm9, %zmm9, %zmm26{%k1}
        vmovups _Two10(%rax), %zmm9
        kxnorw %k1, %k1, %k1
        vcvtps2pd %ymm15, %zmm4
        vmovaps %zmm14, %zmm15

/* preserve mantissa, set input exponent to 2^(-10) */
        vpternlogq $248, %zmm6, %zmm4, %zmm9
        vpsrlq $20, %zmm4, %zmm4

/* reciprocal approximation good to at least 11 bits */
        vrcp14pd %zmm9, %zmm10

/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
        vrndscalepd $8, %zmm10, %zmm3
        vmovups _One(%rax), %zmm10
        vfmsub213pd %zmm10, %zmm3, %zmm9
        vpandnd %zmm8, %zmm8, %zmm15{%k2}
        vmovaps %zmm6, %zmm8
        vpternlogq $234, _Two10(%rax), %zmm5, %zmm8
        vpsrlq $32, %zmm4, %zmm5
        vrcp14pd %zmm8, %zmm7
        vpmovqd %zmm5, %ymm6
        vrndscalepd $8, %zmm7, %zmm2
        vfmsub213pd %zmm10, %zmm2, %zmm8

/* table lookup */
        vpsrlq $40, %zmm2, %zmm10
        vinserti32x8 $1, %ymm6, %zmm11, %zmm4
        vpsrlq $40, %zmm3, %zmm11

/* biased exponent in DP format */
        vextracti32x8 $1, %zmm4, %ymm7
        vcvtdq2pd %ymm4, %zmm6
        vpmovqd %zmm10, %ymm4
        vpmovqd %zmm11, %ymm5
        vpxord %zmm10, %zmm10, %zmm10
        vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
        vpxord %zmm11, %zmm11, %zmm11
        vcvtdq2pd %ymm7, %zmm7
        vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
        vmovups _Threshold(%rax), %zmm5
        vcmppd $21, %zmm2, %zmm5, %k2
        vcmppd $21, %zmm3, %zmm5, %k3
        vmovups _Bias1(%rax), %zmm3
        vmovaps %zmm4, %zmm2
        vpandnq %zmm5, %zmm5, %zmm2{%k2}
        vpternlogq $236, _Bias(%rax), %zmm3, %zmm2

/* dpP= _dbT+lJ*T_ITEM_GRAN */
        kxnorw %k2, %k2, %k2
        vpandnq %zmm5, %zmm5, %zmm4{%k3}
        vpternlogq $248, _Bias(%rax), %zmm4, %zmm3
        vsubpd %zmm2, %zmm6, %zmm4
        vmovups _poly_coeff_3(%rax), %zmm6
        vmovups _poly_coeff_4(%rax), %zmm2
        vsubpd %zmm3, %zmm7, %zmm5
        vmulpd %zmm8, %zmm8, %zmm7
        vfmadd213pd %zmm2, %zmm9, %zmm6
        kxnorw %k3, %k3, %k3
        vmovaps %zmm2, %zmm3
        vmulpd %zmm9, %zmm9, %zmm2
        vfmadd231pd _poly_coeff_3(%rax), %zmm8, %zmm3

/* reconstruction */
        vfmadd213pd %zmm9, %zmm2, %zmm6
        vfmadd213pd %zmm8, %zmm7, %zmm3
        vaddpd %zmm11, %zmm6, %zmm8
        vaddpd %zmm10, %zmm3, %zmm9
        vfmadd231pd _L2(%rax), %zmm5, %zmm8
        vfmadd132pd _L2(%rax), %zmm9, %zmm4
        vmulpd %zmm13, %zmm8, %zmm13
        vmulpd %zmm12, %zmm4, %zmm3
        vmulpd __dbInvLn2(%rax), %zmm13, %zmm10
        vmulpd __dbInvLn2(%rax), %zmm3, %zmm8

/* hi bits */
        vpsrlq $32, %zmm3, %zmm4
        vpsrlq $32, %zmm13, %zmm13

/* to round down; if dR is an integer we will get R = 1, which is ok */
        vsubpd __dbHALF(%rax), %zmm8, %zmm12
        vpmovqd %zmm4, %ymm5
        vpmovqd %zmm13, %ymm2
        vsubpd __dbHALF(%rax), %zmm10, %zmm9
        vaddpd __dbShifter(%rax), %zmm12, %zmm7
        vaddpd __dbShifter(%rax), %zmm9, %zmm9
        vsubpd __dbShifter(%rax), %zmm7, %zmm11
        vsubpd __dbShifter(%rax), %zmm9, %zmm12
        vinserti32x8 $1, %ymm2, %zmm5, %zmm3

/* iAbsX = iAbsX&iAbsMask */
        vpandd __iAbsMask(%rax), %zmm3, %zmm4

/* iRangeMask = (iAbsX>iDomainRange) */
        vpcmpd $2, __iDomainRange(%rax), %zmm4, %k1
        vpandnd %zmm4, %zmm4, %zmm14{%k1}
        vpternlogd $254, %zmm15, %zmm26, %zmm14

/* [0..1) */
        vsubpd %zmm11, %zmm8, %zmm15
        vsubpd %zmm12, %zmm10, %zmm26
        vptestmd %zmm14, %zmm14, %k0
        vpsrlq $11, %zmm7, %zmm8
        vpsrlq $11, %zmm9, %zmm10
        vmulpd __dbC1(%rax), %zmm26, %zmm26
        vmulpd __dbC1(%rax), %zmm15, %zmm15

/* NB : including +/- sign for the exponent!! */
        vpsllq $52, %zmm10, %zmm13
        vpsllq $52, %zmm8, %zmm12
        kmovw %k0, %ecx

/* low K bits */
        vpandq __lbLOWKBITS(%rax), %zmm9, %zmm14
        vpandq __lbLOWKBITS(%rax), %zmm7, %zmm6
        vpmovqd %zmm14, %ymm7
        vpmovqd %zmm6, %ymm9
        vpxord %zmm2, %zmm2, %zmm2
        vgatherdpd 13952(%rax,%ymm7,8), %zmm2{%k3}
        vfmadd213pd %zmm2, %zmm26, %zmm2
        vpaddq %zmm13, %zmm2, %zmm2
        vcvtpd2ps %zmm2, %ymm4
        vpxord %zmm11, %zmm11, %zmm11
        vgatherdpd 13952(%rax,%ymm9,8), %zmm11{%k2}
        vfmadd213pd %zmm11, %zmm15, %zmm11
        vpaddq %zmm12, %zmm11, %zmm3
        vcvtpd2ps %zmm3, %ymm5
        vinsertf32x8 $1, %ymm4, %zmm5, %zmm2
        testl %ecx, %ecx
        jne .LBL_2_3

.LBL_2_2:
        cfi_remember_state
        vmovups 1280(%rsp), %zmm26
        vmovaps %zmm2, %zmm0
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
        cfi_restore_state
        vmovups %zmm0, 1088(%rsp)
        vmovups %zmm1, 1152(%rsp)
        vmovups %zmm2, 1216(%rsp)
        je .LBL_2_2

        xorb %dl, %dl
        xorl %eax, %eax
        kmovw %k4, 984(%rsp)
        kmovw %k5, 976(%rsp)
        kmovw %k6, 968(%rsp)
        kmovw %k7, 960(%rsp)
        vmovups %zmm16, 896(%rsp)
        vmovups %zmm17, 832(%rsp)
        vmovups %zmm18, 768(%rsp)
        vmovups %zmm19, 704(%rsp)
        vmovups %zmm20, 640(%rsp)
        vmovups %zmm21, 576(%rsp)
        vmovups %zmm22, 512(%rsp)
        vmovups %zmm23, 448(%rsp)
        vmovups %zmm24, 384(%rsp)
        vmovups %zmm25, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq %rsi, 1000(%rsp)
        movq %rdi, 992(%rsp)
        movq %r12, 1032(%rsp)
        cfi_offset_rel_rsp (12, 1032)
        movb %dl, %r12b
        movq %r13, 1024(%rsp)
        cfi_offset_rel_rsp (13, 1024)
        movl %ecx, %r13d
        movq %r14, 1016(%rsp)
        cfi_offset_rel_rsp (14, 1016)
        movl %eax, %r14d
        movq %r15, 1008(%rsp)
        cfi_offset_rel_rsp (15, 1008)
        cfi_remember_state

.LBL_2_6:
        btl %r14d, %r13d
        jc .LBL_2_12

.LBL_2_7:
        lea 1(%r14), %esi
        btl %esi, %r13d
        jc .LBL_2_10

.LBL_2_8:
        incb %r12b
        addl $2, %r14d
        cmpb $16, %r12b
        jb .LBL_2_6

        kmovw 984(%rsp), %k4
        kmovw 976(%rsp), %k5
        kmovw 968(%rsp), %k6
        kmovw 960(%rsp), %k7
        vmovups 896(%rsp), %zmm16
        vmovups 832(%rsp), %zmm17
        vmovups 768(%rsp), %zmm18
        vmovups 704(%rsp), %zmm19
        vmovups 640(%rsp), %zmm20
        vmovups 576(%rsp), %zmm21
        vmovups 512(%rsp), %zmm22
        vmovups 448(%rsp), %zmm23
        vmovups 384(%rsp), %zmm24
        vmovups 320(%rsp), %zmm25
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        vmovups 1216(%rsp), %zmm2
        movq 1000(%rsp), %rsi
        movq 992(%rsp), %rdi
        movq 1032(%rsp), %r12
        cfi_restore (%r12)
        movq 1024(%rsp), %r13
        cfi_restore (%r13)
        movq 1016(%rsp), %r14
        cfi_restore (%r14)
        movq 1008(%rsp), %r15
        cfi_restore (%r15)
        jmp .LBL_2_2

.LBL_2_10:
        cfi_restore_state
        movzbl %r12b, %r15d
        vmovss 1156(%rsp,%r15,8), %xmm1
        vzeroupper
        vmovss 1092(%rsp,%r15,8), %xmm0
        call JUMPTARGET(powf)
        vmovss %xmm0, 1220(%rsp,%r15,8)
        jmp .LBL_2_8

.LBL_2_12:
        movzbl %r12b, %r15d
        vmovss 1152(%rsp,%r15,8), %xmm1
        vzeroupper
        vmovss 1088(%rsp,%r15,8), %xmm0
        call JUMPTARGET(powf)
        vmovss %xmm0, 1216(%rsp,%r15,8)
        jmp .LBL_2_7
END (_ZGVeN16vv_powf_skx)
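
/* These entry points provide the 16-lane AVX-512 variant of powf for
   libmvec under the x86_64 vector ABI name _ZGVeN16vv_powf.  They are
   normally reached through compiler vectorization rather than called
   directly; a typical way to get such calls generated is sketched below
   (flags are illustrative, e.g. gcc -O2 -march=skylake-avx512 -ffast-math
   -fopenmp-simd, linking with -lmvec):

     #include <math.h>

     void vec_pow (const float *x, const float *y, float *r, int n)
     {
     #pragma omp simd
       for (int i = 0; i < n; i++)
         r[i] = powf (x[i], y[i]);   // may become a _ZGVeN16vv_powf call
     }  */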