/* Function log2f vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Get short reciprocal approximation Rcp ~ 1/mantissa(x)
 *   R = Rcp*x - 1.0
 *   log2(x) = k - log2(Rcp) + poly_approximation(R)
 *   log2(Rcp) is tabulated
 *
 *
 */

30/* Offsets for data table __svml_slog2_data_internal_avx512
31 */
32#define One 0
33#define coeff4 64
34#define coeff3 128
35#define coeff2 192
36#define coeff1 256
37
38#include <sysdep.h>
39
40 .section .text.evex512, "ax", @progbits
41ENTRY(_ZGVeN16v_log2f_skx)
42 pushq %rbp
43 cfi_def_cfa_offset(16)
44 movq %rsp, %rbp
45 cfi_def_cfa(6, 16)
46 cfi_offset(6, -16)
47 andq $-64, %rsp
48 subq $192, %rsp
49 vgetmantps $11, {sae}, %zmm0, %zmm3
50 vmovups __svml_slog2_data_internal_avx512(%rip), %zmm1
51 vgetexpps {sae}, %zmm0, %zmm5
52
53 /* x<=0? */
54 vfpclassps $94, %zmm0, %k0
55 vsubps {rn-sae}, %zmm1, %zmm3, %zmm9
56 vpsrld $19, %zmm3, %zmm7
57 vgetexpps {sae}, %zmm3, %zmm6
58 vpermps coeff4+__svml_slog2_data_internal_avx512(%rip), %zmm7, %zmm1
59 vpermps coeff3+__svml_slog2_data_internal_avx512(%rip), %zmm7, %zmm2
60 vpermps coeff2+__svml_slog2_data_internal_avx512(%rip), %zmm7, %zmm4
61 vpermps coeff1+__svml_slog2_data_internal_avx512(%rip), %zmm7, %zmm8
62 vsubps {rn-sae}, %zmm6, %zmm5, %zmm10
63 vfmadd213ps {rn-sae}, %zmm2, %zmm9, %zmm1
64 kmovw %k0, %edx
65 vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm1
66 vfmadd213ps {rn-sae}, %zmm8, %zmm9, %zmm1
67 vfmadd213ps {rn-sae}, %zmm10, %zmm9, %zmm1
68 testl %edx, %edx
69
70 /* Go to special inputs processing branch */
71 jne L(SPECIAL_VALUES_BRANCH)
72 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
73
74 /* Restore registers
75 * and exit the function
76 */
77
78L(EXIT):
79 vmovaps %zmm1, %zmm0
80 movq %rbp, %rsp
81 popq %rbp
82 cfi_def_cfa(7, 8)
83 cfi_restore(6)
84 ret
85 cfi_def_cfa(6, 16)
86 cfi_offset(6, -16)
87
88 /* Branch to process
89 * special inputs
90 */
91
92L(SPECIAL_VALUES_BRANCH):
93 vmovups %zmm0, 64(%rsp)
94 vmovups %zmm1, 128(%rsp)
95 # LOE rbx r12 r13 r14 r15 edx zmm1
96
97 xorl %eax, %eax
98 # LOE rbx r12 r13 r14 r15 eax edx
99
100 vzeroupper
101 movq %r12, 16(%rsp)
102 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
103 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
104 movl %eax, %r12d
105 movq %r13, 8(%rsp)
106 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
107 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
108 movl %edx, %r13d
109 movq %r14, (%rsp)
110 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
111 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
112 # LOE rbx r15 r12d r13d
113
114 /* Range mask
115 * bits check
116 */
117
118L(RANGEMASK_CHECK):
119 btl %r12d, %r13d
120
121 /* Call scalar math function */
122 jc L(SCALAR_MATH_CALL)
123 # LOE rbx r15 r12d r13d
124
125 /* Special inputs
126 * processing loop
127 */
128
129L(SPECIAL_VALUES_LOOP):
130 incl %r12d
131 cmpl $16, %r12d
132
133 /* Check bits in range mask */
134 jl L(RANGEMASK_CHECK)
135 # LOE rbx r15 r12d r13d
136
137 movq 16(%rsp), %r12
138 cfi_restore(12)
139 movq 8(%rsp), %r13
140 cfi_restore(13)
141 movq (%rsp), %r14
142 cfi_restore(14)
143 vmovups 128(%rsp), %zmm1
144
145 /* Go to exit */
146 jmp L(EXIT)
147 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
148 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
149 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
150 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
151 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
152 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
153 # LOE rbx r12 r13 r14 r15 zmm1
154
155 /* Scalar math function call
156 * to process special input
157 */
158
159L(SCALAR_MATH_CALL):
160 movl %r12d, %r14d
161 vmovss 64(%rsp, %r14, 4), %xmm0
162 call log2f@PLT
163 # LOE rbx r14 r15 r12d r13d xmm0
164
165 vmovss %xmm0, 128(%rsp, %r14, 4)
166
167 /* Process special inputs in loop */
168 jmp L(SPECIAL_VALUES_LOOP)
169 # LOE rbx r15 r12d r13d
170END(_ZGVeN16v_log2f_skx)
171
172 .section .rodata, "a"
173 .align 64
174
175#ifdef __svml_slog2_data_internal_avx512_typedef
176typedef unsigned int VUINT32;
177typedef struct {
178 __declspec(align(64)) VUINT32 One[16][1];
179 __declspec(align(64)) VUINT32 coeff4[16][1];
180 __declspec(align(64)) VUINT32 coeff3[16][1];
181 __declspec(align(64)) VUINT32 coeff2[16][1];
182 __declspec(align(64)) VUINT32 coeff1[16][1];
183} __svml_slog2_data_internal_avx512;
184#endif
185__svml_slog2_data_internal_avx512:
186 /* One */
187 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
188 // c4
189 .align 64
190 .long 0xbea77e4a, 0xbe8aae3d
191 .long 0xbe67fe32, 0xbe43d1b6
192 .long 0xbe26a589, 0xbe0ee09b
193 .long 0xbdf6a8a1, 0xbdd63b49
194 .long 0xbf584e51, 0xbf3e80a1
195 .long 0xbf2892f0, 0xbf15d377
196 .long 0xbf05b525, 0xbeef8e30
197 .long 0xbed75c8f, 0xbec24184
198 // c3
199 .align 64
200 .long 0x3ef5910c, 0x3ef045a1
201 .long 0x3ee7d87e, 0x3eddbb84
202 .long 0x3ed2d6df, 0x3ec7bbd2
203 .long 0x3ebcc42f, 0x3eb22616
204 .long 0x3e8f3399, 0x3eb1223e
205 .long 0x3ec9db4a, 0x3edb7a09
206 .long 0x3ee79a1a, 0x3eef77cb
207 .long 0x3ef407a4, 0x3ef607b4
208 // c2
209 .align 64
210 .long 0xbf38a934, 0xbf387de6
211 .long 0xbf37f6f0, 0xbf37048b
212 .long 0xbf35a88a, 0xbf33ed04
213 .long 0xbf31df56, 0xbf2f8d82
214 .long 0xbf416814, 0xbf3daf58
215 .long 0xbf3b5c08, 0xbf39fa2a
216 .long 0xbf393713, 0xbf38d7e1
217 .long 0xbf38b2cd, 0xbf38aa62
218 // c1
219 .align 64
220 .long 0x3fb8aa3b, 0x3fb8a9c0
221 .long 0x3fb8a6e8, 0x3fb89f4e
222 .long 0x3fb890cb, 0x3fb879b1
223 .long 0x3fb858d8, 0x3fb82d90
224 .long 0x3fb8655e, 0x3fb8883a
225 .long 0x3fb89aea, 0x3fb8a42f
226 .long 0x3fb8a848, 0x3fb8a9c9
227 .long 0x3fb8aa2f, 0x3fb8aa3b
228 .align 64
229 .type __svml_slog2_data_internal_avx512, @object
230 .size __svml_slog2_data_internal_avx512, .-__svml_slog2_data_internal_avx512
231

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S