/* Optimized memchr implementation for POWER8.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5])  */

#ifndef MEMCHR
# define MEMCHR __memchr
#endif
	.machine power8
ENTRY_TOCLESS (MEMCHR)
	CALL_MCOUNT 3
	dcbt 0, r3
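	/* Round s down to a doubleword boundary for the aligned loads;
	   the mask computed below discards any bytes that precede s.  */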
	clrrdi r8, r3, 3
	insrdi r4, r4, 8, 48

	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   r7 = r3 + r5
	   r7 |= -(r7 < r3)  */
	add r7, r3, r5
	subfc r6, r3, r7
	subfe r9, r9, r9
	extsw r6, r9
	or r7, r7, r6
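	/* In C terms, the sequence above computes roughly (an
	   illustrative sketch only, using a hypothetical variable end):

	     uintptr_t end = (uintptr_t) s + n;
	     if (end < (uintptr_t) s)
	       end = UINTPTR_MAX;

	   i.e. the limit saturates instead of wrapping around the top
	   of the address space.  */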

	insrdi r4, r4, 16, 32
	cmpldi r5, 32
	li r9, -1
	rlwinm r6, r3, 3, 26, 28	/* Calculate padding: (s & 7) * 8 bits.  */
	insrdi r4, r4, 32, 0
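	/* The three insrdi above replicate the low byte of c into every
	   byte of r4, equivalent to (c & 0xff) * 0x0101010101010101 in C
	   (an illustrative equivalence, not the emitted code).  */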
	mr r10, r7
	addi r7, r7, -1
#ifdef __LITTLE_ENDIAN__
	sld r9, r9, r6
#else
	srd r9, r9, r6
#endif
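	/* r9 is now a mask with 1-bits only at byte positions at or after
	   s within the first aligned doubleword, so stray matches in the
	   bytes preceding s are ignored.  */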
	ble L(small_range)
	andi. r11, r3, 63
	beq cr0, L(align_qw)
	clrldi r11, r3, 61
	ld r12, 0(r8)	/* Load doubleword from memory.  */
	cmpb r3, r12, r4	/* Check for BYTEs in DWORD1.  */
	and r3, r3, r9
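	/* cmpb sets each byte of r3 to 0xff where the corresponding byte
	   of r12 equals the search byte, and to 0x00 elsewhere; the and
	   with r9 drops any match that falls before s.  */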
	clrldi r6, r7, 61	/* Byte count - 1 in last dword.  */
	clrrdi r7, r7, 3	/* Address of last doubleword.  */
	cmpldi cr7, r3, 0	/* Does r3 indicate we got a hit?  */
	bne cr7, L(done)
	addi r8, r8, 8
	addi r5, r5, -8
	add r5, r5, r11

	/* Are we now aligned to a quadword boundary?  */
	andi. r11, r8, 15
	beq cr0, L(align_qw)

	/* Process one doubleword to reach quadword alignment.  */
	ld r12, 0(r8)
	cmpb r3, r12, r4
	cmpldi cr7, r3, 0
	bne cr7, L(done)
	addi r5, r5, -8
	addi r8, r8, 8
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	vspltisb v0, 0
	/* Precompute the vbpermq constant.  */
	vspltisb v10, 3
	li r0, 0
	lvsl v11, r0, r0
	vslb v10, v11, v10
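	/* lvsl yields {0, 1, ..., 15}; shifting each element left by 3
	   gives {0, 8, ..., 120}, the bit index of the first bit of each
	   byte, which vbpermq uses below to gather one bit per byte.  */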
	mtvrd v1, r4
	vspltb v1, v1, 7
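	/* v1 now holds the search byte replicated in all 16 lanes.  */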
	cmpldi r5, 64
	ble L(tail64)
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary necessarily slows down
	   performance for strings around 64 bytes in length, due to the
	   extra comparisons required to check alignment for the
	   vectorized loop.  This is a tradeoff we are willing to take in
	   order to speed up the calculation for larger strings.  */
	andi. r11, r8, 63
	beq cr0, L(preloop_64B)
	/* The address must be 64-byte aligned before entering the 64B
	   loop, so process up to three 16B blocks until it is.  */
	lvx v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl cr6, L(found_16B)
	addi r8, r8, 16
	addi r5, r5, -16

	andi. r11, r8, 63
	beq cr0, L(preloop_64B)
	lvx v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl cr6, L(found_16B)
	addi r8, r8, 16
	addi r5, r5, -16

	andi. r11, r8, 63
	beq cr0, L(preloop_64B)
	lvx v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl cr6, L(found_16B)
	addi r8, r8, 16
	addi r5, r5, -16
	/* At this point r8 is 64B aligned.  Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi r5, 64	/* Check if r5 <= 64.  */
	ble L(tail64)
	sub r6, r10, r8
	srdi r9, r6, 6	/* Number of loop iterations.  */
	mtctr r9	/* Set up the counter.  */
	li r11, 16	/* Load required offsets.  */
	li r9, 32
	li r7, 48

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align 4
L(loop):
	lvx v2, 0, r8	/* Load 4 quadwords.  */
	lvx v3, r8, r11
	lvx v4, r8, r9
	lvx v5, r8, r7
	vcmpequb v6, v1, v2
	vcmpequb v7, v1, v3
	vcmpequb v8, v1, v4
	vcmpequb v9, v1, v5
	vor v11, v6, v7
	vor v12, v8, v9
	vor v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb. v11, v0, v11
	bnl cr6, L(found)
	addi r8, r8, 64	/* Adjust address for the next iteration.  */
	bdnz L(loop)
	clrldi r5, r6, 58
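	/* In C-like terms, the loop above does roughly (an illustrative
	   sketch; p, c, and iterations stand for the live registers):

	     while (iterations-- != 0)
	       {
	         int hit = 0;
	         for (int i = 0; i < 64; i++)
	           hit |= (p[i] == c);
	         if (hit)
	           goto found;
	         p += 64;
	       }

	   On exit, r5 holds the number of trailing bytes that did not
	   fill a whole 64B block.  */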

	/* Handle the remainder of the 64B loop, or the case r5 <= 64.  */
	.align 4
L(tail64):
	cmpldi r5, 0
	beq L(null)
	lvx v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl cr6, L(found_16B)
	addi r8, r8, 16
	cmpldi cr6, r5, 16
	ble cr6, L(null)
	addi r5, r5, -16

	lvx v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl cr6, L(found_16B)
	addi r8, r8, 16
	cmpldi cr6, r5, 16
	ble cr6, L(null)
	addi r5, r5, -16

	lvx v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl cr6, L(found_16B)
	addi r8, r8, 16
	cmpldi cr6, r5, 16
	ble cr6, L(null)
	addi r5, r5, -16

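	/* The last 16B block needs no explicit length check here:
	   L(found_16B) compares the match offset against the remaining
	   length and returns NULL if the match falls past the end.  */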
	lvx v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl cr6, L(found_16B)
	li r3, 0
	blr

	/* Found a match in the 64B loop.  */
	.align 4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq v6, v6, v10
	vbpermq v7, v7, v10
	vbpermq v8, v8, v10
	vbpermq v9, v9, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi v7, v7, v7, 2
	vsldoi v8, v8, v8, 4
	vsldoi v9, v9, v9, 6
#else
	vsldoi v6, v6, v6, 6
	vsldoi v7, v7, v7, 4
	vsldoi v8, v8, v8, 2
#endif
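	/* Each vbpermq result lives in bits 48-63 of its doubleword; the
	   shifts above move the four 16-bit fields to distinct positions
	   so the vor chain below yields one 64-bit mask with one bit per
	   byte of the 64B block.  */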
	/* Merge the results and move to a GPR.  */
	vor v11, v6, v7
	vor v4, v9, v8
	vor v4, v11, v4
	mfvrd r5, v4
#ifdef __LITTLE_ENDIAN__
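	/* (r5 - 1) & ~r5 isolates the bits below the lowest set bit, so
	   popcntd counts the trailing zeros, i.e. the byte offset of the
	   first match in load order.  */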
	addi r6, r5, -1
	andc r6, r6, r5
	popcntd r6, r6
#else
	cntlzd r6, r5	/* Count leading zeros before the match.  */
#endif
	add r3, r8, r6	/* Compute the final address.  */
	blr

	/* Found a match within a 16B block.  */
	.align 4
L(found_16B):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq v6, v6, v10
	/* Compute the byte offset of the match within the block.  */
#ifdef __LITTLE_ENDIAN__
	mfvrd r7, v6
	addi r6, r7, -1
	andc r6, r6, r7
	popcntd r6, r6
#else
	vsldoi v6, v6, v6, 6
	mfvrd r7, v6
	cntlzd r6, r7	/* Count leading zeros before the match.  */
#endif
	add r3, r8, r6	/* Compute the final address.  */
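	/* The 16B block may extend past s + n, so the match is only valid
	   if its offset is below the remaining length in r5.  */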
	cmpld r6, r5
	bltlr
	li r3, 0
	blr

	.align 4
	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   doubleword from the string.  Use that to calculate the pointer.
	   We need to make sure BYTE is *before* the end of the range.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi r0, r3, -1
	andc r0, r0, r3
	popcntd r0, r0	/* Count trailing zeros.  */
#else
	cntlzd r0, r3	/* Count leading zeros before the match.  */
#endif
	cmpld r8, r7	/* Are we on the last dword?  */
	srdi r0, r0, 3	/* Convert leading/trailing zeros to bytes.  */
	add r3, r8, r0
	cmpld cr7, r0, r6	/* If on the last dword, check the byte offset.  */
	bnelr
	blelr cr7
	li r3, 0
	blr

	.align 4
L(null):
	li r3, 0
	blr

	/* Deals with size <= 32.  */
	.align 4
L(small_range):
	cmpldi r5, 0
	beq L(null)
	ld r12, 0(r8)	/* Load doubleword from memory.  */
	cmpb r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and r3, r3, r9
	cmpldi cr7, r3, 0
	clrldi r6, r7, 61	/* Byte count - 1 in last dword.  */
	clrrdi r7, r7, 3	/* Address of last doubleword.  */
	cmpld r8, r7	/* Are we done already?  */
	bne cr7, L(done)
	beqlr

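	/* Each block below advances one doubleword.  When the branch to
	   L(done) is not taken, r3 is already zero (cmpb found nothing),
	   so beqlr returns NULL once the last doubleword is checked.  */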
	ldu r12, 8(r8)
	cmpb r3, r12, r4
	cmpldi cr6, r3, 0
	cmpld r8, r7
	bne cr6, L(done)	/* Found something.  */
	beqlr			/* Hit end of string (length).  */

	ldu r12, 8(r8)
	cmpb r3, r12, r4
	cmpldi cr6, r3, 0
	cmpld r8, r7
	bne cr6, L(done)
	beqlr

	ldu r12, 8(r8)
	cmpb r3, r12, r4
	cmpldi cr6, r3, 0
	cmpld r8, r7
	bne cr6, L(done)
	beqlr

	ldu r12, 8(r8)
	cmpb r3, r12, r4
	cmpldi cr6, r3, 0
	bne cr6, L(done)
	blr

END (MEMCHR)
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)