/* strlen/wcslen optimized with 256/512-bit EVEX instructions.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ vpcmpeqd
#  define VPCMPNEQ vpcmpneqd
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd
#  define VPMINU vpminud
#  define CHAR_SIZE 4
#  define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg
# else
#  define VPCMPEQ vpcmpeqb
#  define VPCMPNEQ vpcmpneqb
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb
#  define VPMINU vpminub
#  define CHAR_SIZE 1
#  define CHAR_SIZE_SHIFT_REG(reg)

#  define REG_WIDTH VEC_SIZE
# endif

# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"

# if CHAR_PER_VEC == 64

#  define TAIL_RETURN_LBL first_vec_x2
#  define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)

#  define FALLTHROUGH_RETURN_LBL first_vec_x3
#  define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)

# else

#  define TAIL_RETURN_LBL first_vec_x3
#  define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)

#  define FALLTHROUGH_RETURN_LBL first_vec_x2
#  define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
# endif

# define XZERO VMM_128(0)
# define VZERO VMM(0)
# define PAGE_SIZE 4096

        .section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(STRLEN, 6)
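        /* If the VEC_SIZE-byte load at the (potentially unaligned) start
           of the string cannot cross a page boundary, check it directly;
           otherwise take the cross-page path.  */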
        movl %edi, %eax
        vpxorq %XZERO, %XZERO, %XZERO
        andl $(PAGE_SIZE - 1), %eax
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        ja L(cross_page_boundary)

        /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
           null CHAR.  */
        VPCMPEQ (%rdi), %VZERO, %k0
        KMOV %k0, %VRAX
        test %VRAX, %VRAX
        jz L(aligned_more)
        bsf %VRAX, %VRAX
        ret

        .p2align 4,, 8
L(first_vec_x4):
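        /* %rcx still holds the original pointer and %rdi the VEC_SIZE
           aligned copy of it, so %edi - %ecx is minus the misalignment
           in bytes.  Convert it to CHARs and add 4 * CHAR_PER_VEC plus
           the index of the null within this vector.  */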
        bsf %VRAX, %VRAX
        subl %ecx, %edi
        CHAR_SIZE_SHIFT_REG (edi)
        leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
        ret



        /* For the strnlen variant this is where the remaining length is
           compared against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 *
           CHAR_PER_VEC before going to the loop; plain strlen simply
           checks the next four vectors one at a time.  */
        .p2align 4,, 10
L(aligned_more):
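        /* Save the original pointer in %rcx for the final length
           computation and round %rdi down to a VEC_SIZE boundary so the
           following loads are aligned.  */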
        movq %rdi, %rcx
        andq $(VEC_SIZE * -1), %rdi
L(cross_page_continue):
        /* For strnlen the remaining length is >= 2 * CHAR_PER_VEC here,
           so the first two vectors can be checked without rechecking
           bounds; strlen has no bounds to check.  */
        VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
        KMOV %k0, %VRAX
        test %VRAX, %VRAX
        jnz L(first_vec_x1)

        VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
        KMOV %k0, %VRAX
        test %VRAX, %VRAX
        jnz L(first_vec_x2)

        VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
        KMOV %k0, %VRAX
        test %VRAX, %VRAX
        jnz L(first_vec_x3)

        VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
        KMOV %k0, %VRAX
        test %VRAX, %VRAX
        jnz L(first_vec_x4)

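        /* No null in the first 4 * VEC_SIZE.  Advance %rdi by VEC_SIZE
           and align it down to 4 * VEC_SIZE; the loop below reads from
           (VEC_SIZE * 4)(%rdi), which may re-scan up to three
           already-checked vectors but never skips data.  */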
        subq $(VEC_SIZE * -1), %rdi

# if CHAR_PER_VEC == 64
        /* Processors we use evex512 on have no partial-register stalls,
           and clearing the low byte is smaller than an andq.  */
        xorb %dil, %dil
# else
        andq $-(VEC_SIZE * 4), %rdi
# endif



        /* Compare 4 * VEC at a time forward.  */
        .p2align 4
L(loop_4x_vec):
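        /* VPMINU of two vectors is zero in a lane iff either input is,
           so one VPTESTN per pair detects a null in either vector;
           KORTEST of the two resulting masks covers all four vectors
           loaded this iteration.  */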
        VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
        VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
        VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
        VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
        VPTESTN %VMM(2), %VMM(2), %k0
        VPTESTN %VMM(4), %VMM(4), %k2

        subq $-(VEC_SIZE * 4), %rdi
        KORTEST %k0, %k2
        jz L(loop_4x_vec)

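        /* A null was found in one of the four vectors just scanned.
           %rdi has already been advanced, so they sit at offsets
           0..3 * VEC_SIZE from it; test each mask in order to locate the
           first null.  */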
        VPTESTN %VMM(1), %VMM(1), %k1
        KMOV %k1, %VRAX
        test %VRAX, %VRAX
        jnz L(first_vec_x0)

        KMOV %k0, %VRAX
        test %VRAX, %VRAX
        jnz L(first_vec_x1)

        VPTESTN %VMM(3), %VMM(3), %k0

# if CHAR_PER_VEC == 64
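        /* With 64 chars per vector the two remaining masks do not fit in
           one 64-bit register: test the third vector's mask alone and
           fall through with the combined mask of the last two (the third
           is known clean there, so any hit is in the fourth).  */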
        KMOV %k0, %VRAX
        test %VRAX, %VRAX
        jnz L(first_vec_x2)
        KMOV %k2, %VRAX
# else
        /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.  */
        kmovd %k2, %edx
        kmovd %k0, %eax
        salq $CHAR_PER_VEC, %rdx
        orq %rdx, %rax
# endif

        /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.  */
        .p2align 4,, 2
L(FALLTHROUGH_RETURN_LBL):
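        /* Length is (%rdi - %rcx) converted from bytes to CHARs, plus
           FALLTHROUGH_RETURN_OFFSET, plus the index of the first set bit
           in %rax.  */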
        bsfq %rax, %rax
        subq %rcx, %rdi
        CHAR_SIZE_SHIFT_REG (rdi)
        leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
        ret

        .p2align 4,, 8
L(first_vec_x0):
        bsf %VRAX, %VRAX
        sub %rcx, %rdi
        CHAR_SIZE_SHIFT_REG (rdi)
        addq %rdi, %rax
        ret

        .p2align 4,, 10
L(first_vec_x1):
        bsf %VRAX, %VRAX
        sub %rcx, %rdi
        CHAR_SIZE_SHIFT_REG (rdi)
        leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
        ret

        .p2align 4,, 10
        /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.  */
L(TAIL_RETURN_LBL):
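        /* Same length computation as the fall-through return above, but
           with TAIL_RETURN_OFFSET.  */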
        bsf %VRAX, %VRAX
        sub %VRCX, %VRDI
        CHAR_SIZE_SHIFT_REG (VRDI)
        lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
        ret

        .p2align 4,, 8
L(cross_page_boundary):
        movq %rdi, %rcx
        /* Align data to VEC_SIZE.  */
        andq $-VEC_SIZE, %rdi

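        /* The aligned load below may read bytes before the start of the
           string, but it cannot cross the page; bits for those bytes are
           shifted out of the mask before testing.  */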
        VPCMPEQ (%rdi), %VZERO, %k0

        KMOV %k0, %VRAX
# ifdef USE_AS_WCSLEN
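        /* Each mask bit covers one wide character, so convert the byte
           misalignment in %ecx to a character count before shifting.
           shrx takes the count from %edx, leaving %rcx (the original
           pointer) untouched.  */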
        movl %ecx, %edx
        shrl $2, %edx
        andl $(CHAR_PER_VEC - 1), %edx
        shrx %edx, %eax, %eax
        testl %eax, %eax
# else
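        /* Mask bits map 1:1 to bytes.  The shift count is implicitly
           taken modulo the register width, which for VEC_SIZE of 32 or
           64 is exactly the in-vector byte offset held in %cl.  */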
        shr %cl, %VRAX
# endif
        jz L(cross_page_continue)
        bsf %VRAX, %VRAX
        ret

END(STRLEN)
#endif

