/* Fast SSE2 memrchr with a 64 byte loop.

   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#define VEC_SIZE 16
#define PAGE_SIZE 4096

        .text
ENTRY_P2ALIGN(__memrchr, 6)
#ifdef __ILP32__
        /* Clear upper bits.  */
        mov %RDX_LP, %RDX_LP
#endif
        movd %esi, %xmm0

        /* Get end pointer.  */
        leaq (%rdx, %rdi), %rcx

        punpcklbw %xmm0, %xmm0
        punpcklwd %xmm0, %xmm0
        pshufd $0, %xmm0, %xmm0
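        /* At this point the three shuffles above have broadcast the low byte
           of CHAR to all 16 lanes of %xmm0 (roughly what _mm_set1_epi8 lowers
           to when SSSE3's pshufb is unavailable).  */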

        /* Check if we can load 1x VEC without crossing a page.  */
        testl $(PAGE_SIZE - VEC_SIZE), %ecx
        jz L(page_cross)
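        /* A hedged C sketch of the test above (illustrative only, not part of
           this file): the branch is taken exactly when

               ((uintptr_t) endptr & (PAGE_SIZE - VEC_SIZE)) == 0

           i.e. when endptr lies in the first VEC_SIZE bytes of a page, so the
           VEC_SIZE-byte load ending at endptr might touch the preceding
           page.  */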

        /* NB: This load happens regardless of whether rdx (len) is zero.  Since
           it doesn't cross a page and the standard guarantees any pointer has
           at least one valid byte, this load must be safe.  For the entire
           history of the x86 memrchr implementation this has been possible so
           no code "should" be relying on a zero-length check before this load.
           The zero-length check is moved to the page cross case because it is
           1) pretty cold and 2) including it pushes the hot case len <= VEC_SIZE
           into 2 cache lines.  */
        movups -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subq $VEC_SIZE, %rdx
        ja L(more_1x_vec)
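        /* Illustrative only: a hedged C sketch (not part of glibc, names are
           made up) of the len <= VEC_SIZE hot path handled below.  The
           unaligned 16-byte load ending at the end pointer is safe because the
           page-cross case was already filtered out above.

               #include <emmintrin.h>
               #include <stddef.h>

               static void *
               memrchr_small_sketch (const void *s, int c, size_t len)
               {
                 const unsigned char *end = (const unsigned char *) s + len;
                 __m128i match = _mm_set1_epi8 ((char) c);
                 __m128i data = _mm_loadu_si128 ((const __m128i *) (end - 16));
                 unsigned int mask
                   = (unsigned int) _mm_movemask_epi8 (_mm_cmpeq_epi8 (data, match));
                 if (mask == 0)
                   return NULL;
                 int pos = 31 - __builtin_clz (mask);   // last match in the vector
                 if (pos + (int) len - 16 < 0)           // match lies before s
                   return NULL;
                 return (void *) (end - 16 + pos);
               }
           */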
L(ret_vec_x0_test):
        /* Zero-flag set if eax (src) is zero.  Destination unchanged if src is
           zero.  */
        bsrl %eax, %eax
        jz L(ret_0)
        /* Check if the CHAR match is in bounds.  Need to truly zero `eax` here
           if out of bounds.  */
        addl %edx, %eax
        jl L(zero_0)
        /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
           ptr.  */
        addq %rdi, %rax
L(ret_0):
        ret
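        /* Worked example for the bounds check above (illustrative): with
           len = 5, edx holds 5 - VEC_SIZE = -11.  A match at vector index 9
           gives 9 + (-11) < 0, i.e. it lies before the buffer, so NULL is
           returned.  A match at index 12 gives 12 + (-11) = 1 and the result
           is rdi + 1, which is exactly end - VEC_SIZE + 12.  */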

        .p2align 4,, 5
L(ret_vec_x0):
        bsrl %eax, %eax
        leaq -(VEC_SIZE)(%rcx, %rax), %rax
        ret

        .p2align 4,, 2
L(zero_0):
        xorl %eax, %eax
        ret


        .p2align 4,, 8
L(more_1x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x0)

        /* Align rcx (the end pointer) down to the start of the 16-byte block
           containing the last byte.  */
        decq %rcx
        andq $-VEC_SIZE, %rcx

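        /* rdx becomes the distance from the string start (rdi) to the aligned
           position in rcx, i.e. the number of bytes still to be searched.  */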
        movq %rcx, %rdx
        /* NB: We could consistently save 1-byte in this pattern with `movaps
           %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`.  The reason against it is
           it adds more frontend uops (even if the moves can be eliminated) and
           some percentage of the time actual backend uops.  */
        movaps -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        subq %rdi, %rdx
        pmovmskb %xmm1, %eax

        cmpq $(VEC_SIZE * 2), %rdx
        ja L(more_2x_vec)
L(last_2x_vec):
        subl $VEC_SIZE, %edx
        jbe L(ret_vec_x0_test)

        testl %eax, %eax
        jnz L(ret_vec_x0)

        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $VEC_SIZE, %edx
        bsrl %eax, %eax
        jz L(ret_1)
        addl %edx, %eax
        jl L(zero_0)
        addq %rdi, %rax
L(ret_1):
        ret

        /* Don't align.  Otherwise, losing the 2-byte encoding of the jump to
           L(page_cross) causes the hot path (length <= VEC_SIZE) to span
           multiple cache lines.  Naturally aligned to 8 bytes (mod 16).  */
L(page_cross):
        /* Zero length check.  */
        testq %rdx, %rdx
        jz L(zero_0)

        leaq -1(%rcx), %r8
        andq $-(VEC_SIZE), %r8

        movaps (%r8), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %esi
        /* Shift out negative alignment (because we are starting from endptr and
           working backwards).  */
        negl %ecx
        /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
           explicitly.  */
        andl $(VEC_SIZE - 1), %ecx
        shl %cl, %esi
        movzwl %si, %eax
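        /* Hedged C sketch of the fix-up above (illustrative only, helper name
           is made up): the compare mask of the aligned block containing the
           last byte is shifted left so that bytes at or past the end pointer
           drop out of the low 16 bits and the remaining bit positions become
           end-relative.

               #include <stdint.h>

               static unsigned int
               page_cross_mask_sketch (uintptr_t endptr, unsigned int raw_mask)
               {
                 unsigned int shift = (unsigned int) (0 - endptr) % 16;
                 return (raw_mask << shift) & 0xffff;
               }
           */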
        leaq (%rdi, %rdx), %rcx
        cmpq %rdi, %r8
        ja L(more_1x_vec)
        subl $VEC_SIZE, %edx
        bsrl %eax, %eax
        jz L(ret_2)
        addl %edx, %eax
        jl L(zero_1)
        addq %rdi, %rax
L(ret_2):
        ret

        /* Fits in the aligning bytes.  */
L(zero_1):
        xorl %eax, %eax
        ret

        .p2align 4,, 5
L(ret_vec_x1):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
        ret

        .p2align 4,, 8
L(more_2x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x0)

        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        testl %eax, %eax
        jnz L(ret_vec_x1)


        movaps -(VEC_SIZE * 3)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subq $(VEC_SIZE * 4), %rdx
        ja L(more_4x_vec)

        addl $(VEC_SIZE), %edx
        jle L(ret_vec_x2_test)

L(last_vec):
        testl %eax, %eax
        jnz L(ret_vec_x2)

        movaps -(VEC_SIZE * 4)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $(VEC_SIZE), %edx
        bsrl %eax, %eax
        jz L(ret_3)
        addl %edx, %eax
        jl L(zero_2)
        addq %rdi, %rax
L(ret_3):
        ret

        .p2align 4,, 6
L(ret_vec_x2_test):
        bsrl %eax, %eax
        jz L(zero_2)
        addl %edx, %eax
        jl L(zero_2)
        addq %rdi, %rax
        ret

L(zero_2):
        xorl %eax, %eax
        ret


        .p2align 4,, 5
L(ret_vec_x2):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
        ret

        .p2align 4,, 5
L(ret_vec_x3):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
        ret

        .p2align 4,, 8
L(more_4x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x2)

        movaps -(VEC_SIZE * 4)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        testl %eax, %eax
        jnz L(ret_vec_x3)

        addq $-(VEC_SIZE * 4), %rcx
        cmpq $(VEC_SIZE * 4), %rdx
        jbe L(last_4x_vec)

        /* Offset everything by 4x VEC_SIZE here to save a few bytes at the
           end, keeping the code from spilling into the next cache line.  */
        addq $(VEC_SIZE * 4 - 1), %rcx
        andq $-(VEC_SIZE * 4), %rcx
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        andq $-(VEC_SIZE * 4), %rdx
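        /* Hedged, illustrative C sketch of the 4-vector main loop below (not
           part of glibc; p, stop and match are made-up names): walk backwards
           64 bytes at a time, OR the four compare results together, and only
           work out which vector actually hit after leaving the loop.

               #include <emmintrin.h>

               // p and stop are 64-byte aligned with p > stop; match holds the
               // broadcast byte.  Returns stop if the scanned range had no
               // match, otherwise a p such that a match lies in [p - 64, p).
               static const unsigned char *
               loop_sketch (const unsigned char *p, const unsigned char *stop,
                            __m128i match)
               {
                 while (p != stop)
                   {
                     __m128i v1 = _mm_load_si128 ((const __m128i *) (p - 16));
                     __m128i v2 = _mm_load_si128 ((const __m128i *) (p - 32));
                     __m128i v3 = _mm_load_si128 ((const __m128i *) (p - 48));
                     __m128i v4 = _mm_load_si128 ((const __m128i *) (p - 64));
                     __m128i any
                       = _mm_or_si128 (_mm_or_si128 (_mm_cmpeq_epi8 (v1, match),
                                                     _mm_cmpeq_epi8 (v2, match)),
                                       _mm_or_si128 (_mm_cmpeq_epi8 (v3, match),
                                                     _mm_cmpeq_epi8 (v4, match)));
                     if (_mm_movemask_epi8 (any) != 0)
                       break;            // decode the exact byte afterwards
                     p -= 64;
                   }
                 return p;
               }
           */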

        .p2align 4,, 11
L(loop_4x_vec):
        movaps (VEC_SIZE * -1)(%rcx), %xmm1
        movaps (VEC_SIZE * -2)(%rcx), %xmm2
        movaps (VEC_SIZE * -3)(%rcx), %xmm3
        movaps (VEC_SIZE * -4)(%rcx), %xmm4
        pcmpeqb %xmm0, %xmm1
        pcmpeqb %xmm0, %xmm2
        pcmpeqb %xmm0, %xmm3
        pcmpeqb %xmm0, %xmm4

        por %xmm1, %xmm2
        por %xmm3, %xmm4
        por %xmm2, %xmm4

        pmovmskb %xmm4, %esi
        testl %esi, %esi
        jnz L(loop_end)

        addq $-(VEC_SIZE * 4), %rcx
        cmpq %rdx, %rcx
        jne L(loop_4x_vec)

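        /* Fell through: rdx still holds the 64-byte aligned stop pointer, so
           convert it back into a length relative to the string start for the
           tail handling below.  */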
        subl %edi, %edx

        /* Ends up being 1-byte nop.  */
        .p2align 4,, 2
L(last_4x_vec):
        movaps -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        cmpl $(VEC_SIZE * 2), %edx
        jbe L(last_2x_vec)

        testl %eax, %eax
        jnz L(ret_vec_x0)


        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        testl %eax, %eax
        jnz L(ret_vec_end)

        movaps -(VEC_SIZE * 3)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $(VEC_SIZE * 3), %edx
        ja L(last_vec)
        bsrl %eax, %eax
        jz L(ret_4)
        addl %edx, %eax
        jl L(zero_3)
        addq %rdi, %rax
L(ret_4):
        ret

        /* Ends up being 1-byte nop.  */
        .p2align 4,, 3
L(loop_end):
        pmovmskb %xmm1, %eax
        sall $16, %eax
        jnz L(ret_vec_end)

        pmovmskb %xmm2, %eax
        testl %eax, %eax
        jnz L(ret_vec_end)

        pmovmskb %xmm3, %eax
        /* Combine last 2 VEC matches.  If eax (VEC3) is zero (no CHAR in VEC3)
           then it won't affect the result in esi (VEC4).  If eax is non-zero
           then CHAR is in VEC3 and bsrl will use that position.  */
        sall $16, %eax
        orl %esi, %eax
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
        ret

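        /* Reached with eax holding either the VEC1 mask shifted left by 16 or
           the VEC2 mask unshifted, so in both cases bit position i corresponds
           to the byte at (rcx - VEC_SIZE * 2 + i).  */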
L(ret_vec_end):
        bsrl %eax, %eax
        leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
        ret
        /* Used in L(last_4x_vec).  In the same cache line.  These are just
           spare aligning bytes.  */
L(zero_3):
        xorl %eax, %eax
        ret
        /* 2 bytes from the next cache line.  */
END(__memrchr)
weak_alias (__memrchr, memrchr)
