/* fast SSE2 memrchr with a 64-byte (4x VEC) loop.

   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#define VEC_SIZE 16
#define PAGE_SIZE 4096

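/* For reference, the semantics implemented below match this minimal C
   sketch (illustration only; this is not part of the build):

       void *
       memrchr (const void *s, int c, size_t n)
       {
         const unsigned char *p = (const unsigned char *) s + n;
         while (n--)
           if (*--p == (unsigned char) c)
             return (void *) p;
         return NULL;
       }

   The assembly performs the same search VEC_SIZE (16) bytes at a time,
   walking backwards from the end pointer.  */
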
        .text
ENTRY_P2ALIGN(__memrchr, 6)
#ifdef __ILP32__
        /* Clear upper bits.  */
        mov %RDX_LP, %RDX_LP
#endif
        movd %esi, %xmm0

        /* Get end pointer.  */
        leaq (%rdx, %rdi), %rcx

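        /* Broadcast CHAR to all 16 bytes of xmm0: punpcklbw doubles the
           byte into a word, punpcklwd widens that to a dword of four
           copies, and pshufd $0 splats the dword across the register.  */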
        punpcklbw %xmm0, %xmm0
        punpcklwd %xmm0, %xmm0
        pshufd $0, %xmm0, %xmm0

        /* Check if we can load 1x VEC without crossing a page.  ZF is
           set iff (rcx % PAGE_SIZE) < VEC_SIZE, i.e. iff the VEC_SIZE-byte
           load ending at rcx could touch the preceding page (an exactly
           page-aligned rcx is treated conservatively as a page cross).  */
        testl $(PAGE_SIZE - VEC_SIZE), %ecx
        jz L(page_cross)

        /* NB: This load happens regardless of whether rdx (len) is zero.
           Since it doesn't cross a page and the standard guarantees that
           any valid pointer has at least one addressable byte, this load
           must be safe.  For the entire history of the x86 memrchr
           implementation this has been possible, so no code "should" be
           relying on a zero-length check before this load.  The
           zero-length check is moved to the page-cross case because it is
           pretty cold, and including it here would push the hot case
           (len <= VEC_SIZE) onto 2 cache lines.  */
        movups -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax
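        /* Bit i of eax is now set iff byte i of the window
           [rcx - VEC_SIZE, rcx) equals CHAR, so the highest set bit
           marks the last match in the window.  */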

        subq $VEC_SIZE, %rdx
        ja L(more_1x_vec)
L(ret_vec_x0_test):
        /* bsr sets ZF if eax (src) is zero and leaves the destination
           unchanged in that case.  */
        bsrl %eax, %eax
        jz L(ret_0)
        /* Check if the CHAR match is in bounds.  rdx is <= 0 here and
           the match at bit eax lives at rdi + rdx + eax, so a negative
           sum means it precedes the buffer and eax must be truly
           zeroed.  */
        addl %edx, %eax
        jl L(zero_0)
        /* Since we already subtracted VEC_SIZE from rdx, we can just add
           the result to the base pointer.  */
        addq %rdi, %rax
L(ret_0):
        ret

        .p2align 4,, 5
L(ret_vec_x0):
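        /* The VEC just scanned ends at rcx, so the last match is at
           rcx - VEC_SIZE + bsr(eax).  */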
        bsrl %eax, %eax
        leaq -(VEC_SIZE)(%rcx, %rax), %rax
        ret

        .p2align 4,, 2
L(zero_0):
        xorl %eax, %eax
        ret


        .p2align 4,, 8
L(more_1x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x0)

        /* Align rcx (the end pointer) down to a VEC_SIZE boundary.  The
           decq first means an already-aligned rcx still steps back a
           full VEC; bytes that overlap the unaligned first VEC simply
           get rechecked, which is harmless.  */
        decq %rcx
        andq $-VEC_SIZE, %rcx

        movq %rcx, %rdx
        /* NB: We could consistently save 1 byte in this pattern with
           `movaps %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`.  The reason
           against it is that it adds more frontend uops (even if the
           moves can be eliminated) and, some percentage of the time,
           actual backend uops.  */
        movaps -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        subq %rdi, %rdx
        pmovmskb %xmm1, %eax

        cmpq $(VEC_SIZE * 2), %rdx
        ja L(more_2x_vec)
L(last_2x_vec):
        subl $VEC_SIZE, %edx
        jbe L(ret_vec_x0_test)

        testl %eax, %eax
        jnz L(ret_vec_x0)

        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

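        /* After the subl below, edx = (aligned end) - rdi - 2 * VEC_SIZE;
           a match at bit b of eax sits at rdi + edx + b, so jl drops
           matches that precede the buffer start.  */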
        subl $VEC_SIZE, %edx
        bsrl %eax, %eax
        jz L(ret_1)
        addl %edx, %eax
        jl L(zero_0)
        addq %rdi, %rax
L(ret_1):
        ret

        /* Don't align.  Otherwise we lose the 2-byte encoding of the
           jump to L(page_cross), which causes the hot path
           (length <= VEC_SIZE) to span multiple cache lines.  Naturally
           aligned % 16 to 8 bytes.  */
L(page_cross):
        /* Zero length check.  */
        testq %rdx, %rdx
        jz L(zero_0)

        leaq -1(%rcx), %r8
        andq $-(VEC_SIZE), %r8

        movaps (%r8), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %esi
        /* Shift out the out-of-bounds bytes; the count is the negative
           alignment of the end pointer (because we are starting from
           endptr and working backwards).  */
        negl %ecx
        /* The shift is 32-bit but VEC_SIZE is 16, so the hardware's
           count masking (mod 32) doesn't help; mask the count to
           VEC_SIZE - 1 explicitly.  */
        andl $(VEC_SIZE - 1), %ecx
        shl %cl, %esi
        movzwl %si, %eax
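        /* Worked example: if rcx % VEC_SIZE == 5, the shift count is 11;
           the bits for the 5 valid bytes (r8 .. rcx - 1) land in bits
           11..15 of si, while bits for bytes at or past rcx are pushed
           above bit 15 and dropped by the movzwl.  The mask now looks as
           if the load had ended exactly at rcx.  */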
        leaq (%rdi, %rdx), %rcx
        cmpq %rdi, %r8
        ja L(more_1x_vec)
        subl $VEC_SIZE, %edx
        bsrl %eax, %eax
        jz L(ret_2)
        addl %edx, %eax
        jl L(zero_1)
        addq %rdi, %rax
L(ret_2):
        ret

        /* Fits in the aligning bytes.  */
L(zero_1):
        xorl %eax, %eax
        ret

        .p2align 4,, 5
L(ret_vec_x1):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
        ret

        .p2align 4,, 8
L(more_2x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x0)

        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        testl %eax, %eax
        jnz L(ret_vec_x1)


        movaps -(VEC_SIZE * 3)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subq $(VEC_SIZE * 4), %rdx
        ja L(more_4x_vec)

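        /* edx is (aligned end) - rdi - 4 * VEC_SIZE here; adding
           VEC_SIZE back re-biases it for the VEC at
           (aligned end) - 3 * VEC_SIZE that L(ret_vec_x2_test) checks.  */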
        addl $(VEC_SIZE), %edx
        jle L(ret_vec_x2_test)

L(last_vec):
        testl %eax, %eax
        jnz L(ret_vec_x2)

        movaps -(VEC_SIZE * 4)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $(VEC_SIZE), %edx
        bsrl %eax, %eax
        jz L(ret_3)
        addl %edx, %eax
        jl L(zero_2)
        addq %rdi, %rax
L(ret_3):
        ret

        .p2align 4,, 6
L(ret_vec_x2_test):
        bsrl %eax, %eax
        jz L(zero_2)
        addl %edx, %eax
        jl L(zero_2)
        addq %rdi, %rax
        ret

L(zero_2):
        xorl %eax, %eax
        ret


        .p2align 4,, 5
L(ret_vec_x2):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
        ret

        .p2align 4,, 5
L(ret_vec_x3):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
        ret

        .p2align 4,, 8
L(more_4x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x2)

        movaps -(VEC_SIZE * 4)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        testl %eax, %eax
        jnz L(ret_vec_x3)

        addq $-(VEC_SIZE * 4), %rcx
        cmpq $(VEC_SIZE * 4), %rdx
        jbe L(last_4x_vec)

        /* Offset everything by 4x VEC_SIZE here to save a few bytes at
           the end, keeping the code from spilling onto the next cache
           line.  */
        addq $(VEC_SIZE * 4 - 1), %rcx
        andq $-(VEC_SIZE * 4), %rcx
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        andq $-(VEC_SIZE * 4), %rdx
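        /* rcx is now rounded up to a (VEC_SIZE * 4) boundary, and rdx is
           the first such boundary strictly above rdi.  The loop walks
           64-byte blocks downward until the two meet; L(last_4x_vec)
           handles the remaining tail.  */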

        .p2align 4,, 11
L(loop_4x_vec):
        movaps (VEC_SIZE * -1)(%rcx), %xmm1
        movaps (VEC_SIZE * -2)(%rcx), %xmm2
        movaps (VEC_SIZE * -3)(%rcx), %xmm3
        movaps (VEC_SIZE * -4)(%rcx), %xmm4
        pcmpeqb %xmm0, %xmm1
        pcmpeqb %xmm0, %xmm2
        pcmpeqb %xmm0, %xmm3
        pcmpeqb %xmm0, %xmm4

        por %xmm1, %xmm2
        por %xmm3, %xmm4
        por %xmm2, %xmm4
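        /* The three por instructions fold the four compare results
           together so a single pmovmskb/test detects a match anywhere in
           the 64-byte block; L(loop_end) re-derives which VEC actually
           matched.  */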

        pmovmskb %xmm4, %esi
        testl %esi, %esi
        jnz L(loop_end)

        addq $-(VEC_SIZE * 4), %rcx
        cmpq %rdx, %rcx
        jne L(loop_4x_vec)

        subl %edi, %edx
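        /* The loop exited with rcx == rdx (the aligned lower bound), so
           edx now becomes the count of unsearched bytes between rdi and
           rcx, which the tail code below expects.  */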

        /* Ends up being 1-byte nop.  */
        .p2align 4,, 2
L(last_4x_vec):
        movaps -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        cmpl $(VEC_SIZE * 2), %edx
        jbe L(last_2x_vec)

        testl %eax, %eax
        jnz L(ret_vec_x0)


        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        testl %eax, %eax
        jnz L(ret_vec_end)

        movaps -(VEC_SIZE * 3)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $(VEC_SIZE * 3), %edx
        ja L(last_vec)
        bsrl %eax, %eax
        jz L(ret_4)
        addl %edx, %eax
        jl L(zero_3)
        addq %rdi, %rax
L(ret_4):
        ret

        /* Ends up being 1-byte nop.  */
        .p2align 4,, 3
L(loop_end):
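        /* VEC1's mask is shifted into bits 16..31 below so that the
           single (VEC_SIZE * -2) displacement in L(ret_vec_end) is
           correct for a match in either VEC1 or VEC2; sall also sets ZF,
           so the branch needs no separate test.  */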
        pmovmskb %xmm1, %eax
        sall $16, %eax
        jnz L(ret_vec_end)

        pmovmskb %xmm2, %eax
        testl %eax, %eax
        jnz L(ret_vec_end)

        pmovmskb %xmm3, %eax
        /* Combine the last 2 VEC matches.  eax holds VEC3's mask and esi
           the combined mask (only VEC3/VEC4 bits can still be set here).
           If eax is zero (no CHAR in VEC3) the or leaves esi's VEC4 bits
           to decide the result; if eax is non-zero, CHAR is in VEC3 and
           bsrl will use that (higher) position.  */
        sall $16, %eax
        orl %esi, %eax
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
        ret

L(ret_vec_end):
        bsrl %eax, %eax
        leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
        ret
        /* Used in L(last_4x_vec); in the same cache line.  This just
           uses the spare aligning bytes.  */
L(zero_3):
        xorl %eax, %eax
        ret
        /* 2 bytes from the next cache line.  */
END(__memrchr)
weak_alias (__memrchr, memrchr)