/* Function log1pf vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *    1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1, 2)
 *    Get short reciprocal approximation Rcp ~ 1/xh
 *    R = (Rcp*xh - 1.0) + Rcp*xl
 *    log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
 *    log(Rcp) is tabulated
 *
 *
 */
30
31/* Offsets for data table __svml_slog1p_data_internal
32 */
33#define SgnMask 0
34#define sOne 64
35#define sPoly_1 128
36#define sPoly_2 192
37#define sPoly_3 256
38#define sPoly_4 320
39#define sPoly_5 384
40#define sPoly_6 448
41#define sPoly_7 512
42#define sPoly_8 576
43#define iHiDelta 640
44#define iLoRange 704
45#define iBrkValue 768
46#define iOffExpoMask 832
47#define sLn2 896
48
49#include <sysdep.h>
50
51 .section .text.evex512, "ax", @progbits
52ENTRY(_ZGVeN16v_log1pf_skx)
53 pushq %rbp
54 cfi_def_cfa_offset(16)
55 movq %rsp, %rbp
56 cfi_def_cfa(6, 16)
57 cfi_offset(6, -16)
58 andq $-64, %rsp
59 subq $192, %rsp
60 vmovups sOne+__svml_slog1p_data_internal(%rip), %zmm2
61
62 /* reduction: compute r, n */
63 vmovups iBrkValue+__svml_slog1p_data_internal(%rip), %zmm12
64 vmovups SgnMask+__svml_slog1p_data_internal(%rip), %zmm4
65 vmovaps %zmm0, %zmm3
66
67 /* compute 1+x as high, low parts */
68 vmaxps {sae}, %zmm3, %zmm2, %zmm5
69 vminps {sae}, %zmm3, %zmm2, %zmm7
70 vandnps %zmm3, %zmm4, %zmm1
71 vpternlogd $255, %zmm4, %zmm4, %zmm4
72 vaddps {rn-sae}, %zmm7, %zmm5, %zmm9
73 vpsubd %zmm12, %zmm9, %zmm10
74 vsubps {rn-sae}, %zmm9, %zmm5, %zmm6
75
76 /* check argument value ranges */
77 vpaddd iHiDelta+__svml_slog1p_data_internal(%rip), %zmm9, %zmm8
78 vpsrad $23, %zmm10, %zmm13
79 vmovups sPoly_5+__svml_slog1p_data_internal(%rip), %zmm9
80 vpcmpd $5, iLoRange+__svml_slog1p_data_internal(%rip), %zmm8, %k1
81 vpslld $23, %zmm13, %zmm14
82 vaddps {rn-sae}, %zmm7, %zmm6, %zmm15
83 vcvtdq2ps {rn-sae}, %zmm13, %zmm0
84 vpsubd %zmm14, %zmm2, %zmm13
85 vmovups sPoly_8+__svml_slog1p_data_internal(%rip), %zmm7
86 vmovups sPoly_1+__svml_slog1p_data_internal(%rip), %zmm14
87 vmulps {rn-sae}, %zmm13, %zmm15, %zmm6
88 vpandd iOffExpoMask+__svml_slog1p_data_internal(%rip), %zmm10, %zmm11
89 vpaddd %zmm12, %zmm11, %zmm5
90 vmovups sPoly_4+__svml_slog1p_data_internal(%rip), %zmm10
91 vmovups sPoly_3+__svml_slog1p_data_internal(%rip), %zmm11
92 vmovups sPoly_2+__svml_slog1p_data_internal(%rip), %zmm12
93
94 /* polynomial evaluation */
95 vsubps {rn-sae}, %zmm2, %zmm5, %zmm2
96 vaddps {rn-sae}, %zmm6, %zmm2, %zmm15
97 vmovups sPoly_7+__svml_slog1p_data_internal(%rip), %zmm2
98 vfmadd231ps {rn-sae}, %zmm15, %zmm7, %zmm2
99 vpandnd %zmm8, %zmm8, %zmm4{%k1}
100 vmovups sPoly_6+__svml_slog1p_data_internal(%rip), %zmm8
101
102 /* combine and get argument value range mask */
103 vptestmd %zmm4, %zmm4, %k0
104 vfmadd213ps {rn-sae}, %zmm8, %zmm15, %zmm2
105 kmovw %k0, %edx
106 vfmadd213ps {rn-sae}, %zmm9, %zmm15, %zmm2
107 vfmadd213ps {rn-sae}, %zmm10, %zmm15, %zmm2
108 vfmadd213ps {rn-sae}, %zmm11, %zmm15, %zmm2
109 vfmadd213ps {rn-sae}, %zmm12, %zmm15, %zmm2
110 vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm2
111 vmulps {rn-sae}, %zmm15, %zmm2, %zmm4
112 vfmadd213ps {rn-sae}, %zmm15, %zmm15, %zmm4
113
114 /* final reconstruction */
115 vmovups sLn2+__svml_slog1p_data_internal(%rip), %zmm15
116 vfmadd213ps {rn-sae}, %zmm4, %zmm15, %zmm0
117 vorps %zmm1, %zmm0, %zmm0
118 testl %edx, %edx
119
120 /* Go to special inputs processing branch */
121 jne L(SPECIAL_VALUES_BRANCH)
122 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm3
123
124 /* Restore registers
125 * and exit the function
126 */
127
128L(EXIT):
129 movq %rbp, %rsp
130 popq %rbp
131 cfi_def_cfa(7, 8)
132 cfi_restore(6)
133 ret
134 cfi_def_cfa(6, 16)
135 cfi_offset(6, -16)
136
137 /* Branch to process
138 * special inputs
139 */
140
141L(SPECIAL_VALUES_BRANCH):
142 vmovups %zmm3, 64(%rsp)
143 vmovups %zmm0, 128(%rsp)
144 # LOE rbx r12 r13 r14 r15 edx zmm0
145
146 xorl %eax, %eax
147 # LOE rbx r12 r13 r14 r15 eax edx
148
149 vzeroupper
150 movq %r12, 16(%rsp)
151 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
152 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
153 movl %eax, %r12d
154 movq %r13, 8(%rsp)
155 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
156 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
157 movl %edx, %r13d
158 movq %r14, (%rsp)
159 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
160 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
161 # LOE rbx r15 r12d r13d
162
163 /* Range mask
164 * bits check
165 */
166
167L(RANGEMASK_CHECK):
168 btl %r12d, %r13d
169
170 /* Call scalar math function */
171 jc L(SCALAR_MATH_CALL)
172 # LOE rbx r15 r12d r13d
173
174 /* Special inputs
175 * processing loop
176 */
177
178L(SPECIAL_VALUES_LOOP):
179 incl %r12d
180 cmpl $16, %r12d
181
182 /* Check bits in range mask */
183 jl L(RANGEMASK_CHECK)
184 # LOE rbx r15 r12d r13d
185
186 movq 16(%rsp), %r12
187 cfi_restore(12)
188 movq 8(%rsp), %r13
189 cfi_restore(13)
190 movq (%rsp), %r14
191 cfi_restore(14)
192 vmovups 128(%rsp), %zmm0
193
194 /* Go to exit */
195 jmp L(EXIT)
196 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
197 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
198 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
199 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
200 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
201 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
202 # LOE rbx r12 r13 r14 r15 zmm0
203
204 /* Scalar math function call
205 * to process special input
206 */
207
208L(SCALAR_MATH_CALL):
209 movl %r12d, %r14d
210 vmovss 64(%rsp, %r14, 4), %xmm0
211 call log1pf@PLT
212 # LOE rbx r14 r15 r12d r13d xmm0
213
214 vmovss %xmm0, 128(%rsp, %r14, 4)
215
216 /* Process special inputs in loop */
217 jmp L(SPECIAL_VALUES_LOOP)
218 # LOE rbx r15 r12d r13d
219END(_ZGVeN16v_log1pf_skx)
220
221 .section .rodata, "a"
222 .align 64
223
224#ifdef __svml_slog1p_data_internal_typedef
225typedef unsigned int VUINT32;
226typedef struct {
227 __declspec(align(64)) VUINT32 SgnMask[16][1];
228 __declspec(align(64)) VUINT32 sOne[16][1];
229 __declspec(align(64)) VUINT32 sPoly[8][16][1];
230 __declspec(align(64)) VUINT32 iHiDelta[16][1];
231 __declspec(align(64)) VUINT32 iLoRange[16][1];
232 __declspec(align(64)) VUINT32 iBrkValue[16][1];
233 __declspec(align(64)) VUINT32 iOffExpoMask[16][1];
234 __declspec(align(64)) VUINT32 sLn2[16][1];
235} __svml_slog1p_data_internal;
236#endif
237__svml_slog1p_data_internal:
238 /* SgnMask */
239 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
240 /* sOne = SP 1.0 */
241 .align 64
242 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
243 /* sPoly[] = SP polynomial */
244 .align 64
245 .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
246 .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
247 .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
248 .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
249 .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
250 .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
251 .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
252 .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
253 /* iHiDelta = SP 80000000-7f000000 */
254 .align 64
255 .long 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000
256 /* iLoRange = SP 00800000+iHiDelta */
257 .align 64
258 .long 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000
259 /* iBrkValue = SP 2/3 */
260 .align 64
261 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
262 /* iOffExpoMask = SP significand mask */
263 .align 64
264 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
265 /* sLn2 = SP ln(2) */
266 .align 64
267 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
268 .align 64
269 .type __svml_slog1p_data_internal, @object
270 .size __svml_slog1p_data_internal, .-__svml_slog1p_data_internal
271

/* source: glibc sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S */