1 | /* Optimized strnlen implementation for POWER8 using a vmx loop. |
2 | |
3 | Copyright (C) 2017-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | The GNU C Library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | You should have received a copy of the GNU Lesser General Public |
14 | License along with the GNU C Library; if not, see |
15 | <https://www.gnu.org/licenses/>. */ |
16 | |
17 | /* The following heuristic is implemented: |
18 | 1. Case maxlen <= 32: align the pointer to 8 bytes to loop through |
19 | reading doublewords. Uses the POWER7 algorithm. |
20 | 2. Case maxlen > 32: check for null bytes in the first 16 bytes using |
21 | unaligned accesses. Return the length if found. Otherwise: |
22 | 2.1 Case maxlen < 64: deduct the bytes previously read, align |
23 | the pointer to 16 bytes and loop through reading quadwords |
24 | until null bytes are found or maxlen is reached. |
25 | 2.2 Case maxlen >= 64: deduct the bytes previously read, align |
26 | the pointer to 64 bytes and set up a counter to loop through |
27 | reading in strides of 64 bytes. If the loop finishes with |
28 | null bytes not found, process the remainder bytes by |
29 | switching to the 16-byte loop of heuristic 2.1. */ |
30 | |
31 | #include <sysdep.h> |
32 | |
33 | /* Define default page size to 4KB. */ |
34 | #define PAGE_SIZE 4096 |
35 | |
36 | |
37 | /* size_t [r3] strnlen (const char *s [r3], size_t maxlen [r4]) */ |
38 | .machine power8 |
39 | ENTRY_TOCLESS (__strnlen) |
40 | CALL_MCOUNT 2 |
41 | dcbt 0,r3 /* Prefetch the start of the string. */ |
42 | |
43 | cmpldi r4,32 /* Check if maxlen <= 32. */ |
44 | ble L(small_range) /* If maxlen <= 32. */ |
45 | |
46 | /* Upcoming 16 bytes unaligned accesses cannot cross the page boundary |
47 | otherwise the processor throws a memory access error. |
48 | Use the following code to check there is room for such accesses: |
49 | (((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16) |
50 | If it is disallowed then switch to the code that handles |
51 | the string when maxlen <= 32. */ |
52 | clrldi r10,r3,52 /* r10 = s % PAGE_SIZE (low 12 bits). */ |
53 | cmpldi cr7,r10,PAGE_SIZE-16 |
54 | bgt cr7,L(small_range) /* If less than 16B of page end. */ |
55 | |
56 | /* Compute our permute constant r8. */ |
57 | li r7,0 /* Null-byte search pattern for cmpb. */ |
58 | /* Compute a bpermd constant to move bit 0 of each word into |
59 | a halfword value, and count trailing zeros. */ |
60 | #ifdef __LITTLE_ENDIAN__ |
61 | li r8,0x2820 |
62 | oris r8,r8,0x3830 |
63 | sldi r8,r8,32 |
64 | ori r8,r8,0x0800 |
65 | oris r8,r8,0x1810 |
66 | #else |
67 | li r8,0x1018 |
68 | oris r8,r8,0x0008 |
69 | sldi r8,r8,32 |
70 | ori r8,r8,0x3038 |
71 | oris r8,r8,0x2028 |
72 | #endif |
73 | |
74 | /* maxlen > 32. Optimistically check for null bytes in the first |
75 | 16 bytes of the string using unaligned accesses. */ |
76 | ld r5,0(r3) |
77 | ld r6,8(r3) |
78 | cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */ |
79 | cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */ |
80 | or. r7,r10,r11 |
81 | bne cr0, L(early_find) /* If found null bytes. */ |
82 | |
83 | /* At this point maxlen > 32 and null bytes were not found at first |
84 | 16 bytes. Prepare for loop using VMX. */ |
85 | |
86 | /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */ |
87 | |
88 | addi r5,r3,16 /* Align up, or just add the 16B we |
89 | already checked. */ |
90 | li r0,15 |
91 | and r7,r5,r0 /* Find offset into 16B alignment. */ |
92 | andc r5,r5,r0 /* Quadword align up s to the next quadword. */ |
93 | li r0,16 |
94 | subf r0,r7,r0 |
95 | subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */ |
96 | |
97 | |
98 | /* Compute offsets for vmx loads, and precompute the vbpermq |
99 | constants for both the 64B and 16B loops. */ |
100 | li r6,0 /* Index 0 for the indexed vector loads. */ |
101 | vspltisb v0,0 /* v0 = all zeros (null-byte comparand). */ |
102 | vspltisb v10,3 |
103 | lvsl v11,r6,r6 /* v11 = bytes 0..15. */ |
104 | vslb v10,v11,v10 /* v10 = 0,8,...,120: vbpermq bit indices. */ |
105 | |
106 | cmpldi r4,64 /* Check maxlen < 64. */ |
107 | blt L(smaller) /* If maxlen < 64 */ |
108 | |
109 | /* In order to begin the 64B loop, the pointer needs to be 64 |
110 | bytes aligned. So read quadwords until it is aligned or null |
111 | bytes are found. At worst case it will be aligned after the fourth |
112 | iteration, so unroll the loop to avoid counter checking. */ |
113 | andi. r7,r5,63 /* Check if is 64 bytes aligned. */ |
114 | beq cr0,L(preloop_64B) /* If it is already 64B aligned. */ |
115 | lvx v1,r5,r6 |
116 | vcmpequb. v1,v1,v0 |
117 | addi r5,r5,16 |
118 | addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */ |
119 | bne cr6,L(found_aligning64B) /* If found null bytes. */ |
120 | |
121 | /* Unroll 2x above code block until aligned or find null bytes. */ |
122 | andi. r7,r5,63 |
123 | beq cr0,L(preloop_64B) |
124 | lvx v1,r5,r6 |
125 | vcmpequb. v1,v1,v0 |
126 | addi r5,r5,16 |
127 | addi r4,r4,-16 |
128 | bne cr6,L(found_aligning64B) |
129 | |
130 | andi. r7,r5,63 |
131 | beq cr0,L(preloop_64B) |
132 | lvx v1,r5,r6 |
133 | vcmpequb. v1,v1,v0 |
134 | addi r5,r5,16 |
135 | addi r4,r4,-16 |
136 | bne cr6,L(found_aligning64B) |
137 | |
138 | /* At this point it should be 16 bytes aligned. |
139 | Prepare for the 64B loop. */ |
140 | .p2align 4 |
141 | L(preloop_64B): |
142 | /* Check if maxlen became less than 64, therefore disallowing the |
143 | 64B loop. If so, switch to the 16B loop code. */ |
144 | cmpldi r4,64 /* Check if maxlen < 64. */ |
145 | blt L(smaller) /* If maxlen < 64. */ |
146 | /* Set some constant values. */ |
147 | li r7,16 |
148 | li r10,32 |
149 | li r9,48 |
150 | |
151 | /* Compute the number of 64 bytes iterations needed. */ |
152 | srdi r11,r4,6 /* Compute loop count (maxlen / 64). */ |
153 | andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */ |
154 | mtctr r11 /* Move loop count to counter register. */ |
155 | |
156 | /* Handle maxlen > 64. Loop over the bytes in strides of 64B. */ |
157 | .p2align 4 |
158 | L(loop_64B): |
159 | lvx v1,r5,r6 /* r5 is the pointer to s. */ |
160 | lvx v2,r5,r7 |
161 | lvx v3,r5,r10 |
162 | lvx v4,r5,r9 |
163 | /* Compare the four 16B vectors to obtain the byte-wise minimum. |
164 | Any null byte will emerge into v7, then check for null bytes. */ |
165 | vminub v5,v1,v2 |
166 | vminub v6,v3,v4 |
167 | vminub v7,v5,v6 |
168 | vcmpequb. v7,v7,v0 /* Check for null bytes. */ |
169 | addi r5,r5,64 /* Add pointer to next iteration. */ |
170 | bne cr6,L(found_64B) /* If found null bytes. */ |
171 | bdnz L(loop_64B) /* Continue the loop if count > 0. */ |
172 | |
173 | /* Hit loop end without null match. So branch to handle the remainder. */ |
174 | |
175 | /* Prepare a 16B loop to handle two cases: |
176 | 1. If 32 < maxlen < 64. |
177 | 2. If maxlen >= 64, and reached end of the 64B loop with null |
178 | bytes not found. Thus handle the remainder bytes here. */ |
179 | .p2align 4 |
180 | L(smaller): |
181 | cmpldi r4,0 /* Check maxlen is zero. */ |
182 | beq L(done) /* If maxlen is zero. */ |
183 | |
184 | /* Place rounded up number of qw's to check into a vmx |
185 | register, and use some vector tricks to minimize |
186 | branching. */ |
187 | mtvrd v7,r4 /* copy maxlen from gpr to vector register. */ |
188 | vspltisb v5,1 |
189 | vspltisb v6,15 |
190 | vspltb v2,v7,7 |
191 | vaddubs v3,v5,v6 /* v3 = 16 in every byte (saturated). */ |
192 | |
193 | #ifdef __LITTLE_ENDIAN__ |
194 | vspltish v5,1 /* Compute 16 in each byte. */ |
195 | #endif |
196 | |
197 | /* Loop in 16B aligned increments now. */ |
198 | .p2align 4 |
199 | L(loop_16B): |
200 | lvx v1,r5,r6 /* Load quadword into vector register. */ |
201 | addi r5,r5,16 /* Increment address to next 16B block. */ |
202 | vor v7,v2,v2 /* Save loop count (v2) into v7. */ |
203 | vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */ |
204 | vminub v4,v1,v2 |
205 | vcmpequb. v4,v4,v0 /* Checking for null bytes. */ |
206 | beq cr6,L(loop_16B) /* If null bytes not found. */ |
207 | |
208 | vcmpequb v1,v1,v0 |
209 | vbpermq v1,v1,v10 |
210 | #ifdef __LITTLE_ENDIAN__ |
211 | vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */ |
212 | vandc v2,v2,v1 |
213 | vpopcnth v1,v2 /* count of trailing zeros, 16 if none. */ |
214 | #else |
215 | vclzh v1,v1 /* count the leading zeros, 16 if none. */ |
216 | #endif |
217 | /* Truncate to maximum allowable offset. */ |
218 | vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond |
219 | maxlen. */ |
220 | vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */ |
221 | |
222 | mfvrd r0,v1 /* Move the offset result to a GPR. */ |
223 | addi r5,r5,-16 /* Undo speculative bump. */ |
224 | extsb r0,r0 /* Clear whatever gunk is in the high 56b. */ |
225 | add r5,r5,r0 /* Add the offset of whatever was found. */ |
226 | L(done): |
227 | subf r3,r3,r5 /* Length is equal to the offset of null byte |
228 | matched minus the pointer to s. */ |
229 | blr /* Done. */ |
230 | |
231 | /* Handle case of maxlen > 64 and found null bytes in last block |
232 | of 64 bytes read. */ |
233 | .p2align 4 |
234 | L(found_64B): |
235 | /* A zero was found. Reduce the result. */ |
236 | vcmpequb v1,v1,v0 |
237 | vcmpequb v2,v2,v0 |
238 | vcmpequb v3,v3,v0 |
239 | vcmpequb v4,v4,v0 |
240 | |
241 | /* Permute the first bit of each byte into bits 48-63. */ |
242 | vbpermq v1,v1,v10 |
243 | vbpermq v2,v2,v10 |
244 | vbpermq v3,v3,v10 |
245 | vbpermq v4,v4,v10 |
246 | |
247 | /* Shift each component into its correct position for merging. */ |
248 | #ifdef __LITTLE_ENDIAN__ |
249 | vsldoi v2,v2,v2,2 |
250 | vsldoi v3,v3,v3,4 |
251 | vsldoi v4,v4,v4,6 |
252 | #else |
253 | vsldoi v1,v1,v1,6 |
254 | vsldoi v2,v2,v2,4 |
255 | vsldoi v3,v3,v3,2 |
256 | #endif |
257 | |
258 | /* Merge the results and move to a GPR. */ |
259 | vor v1,v2,v1 |
260 | vor v2,v3,v4 |
261 | vor v4,v1,v2 |
262 | |
263 | /* Adjust address to the start of the current 64B block. */ |
264 | addi r5,r5,-64 |
265 | |
266 | mfvrd r10,v4 |
267 | #ifdef __LITTLE_ENDIAN__ |
268 | addi r9,r10,-1 /* Form a mask from trailing zeros. */ |
269 | andc r9,r9,r10 |
270 | popcntd r0,r9 /* Count the bits in the mask. */ |
271 | #else |
272 | cntlzd r0,r10 /* Count leading zeros before the match. */ |
273 | #endif |
274 | subf r5,r3,r5 |
275 | add r3,r5,r0 /* Compute final length. */ |
276 | blr /* Done. */ |
277 | |
278 | /* Handle case where null bytes were found while aligning |
279 | as a preparation for the 64B loop. */ |
280 | .p2align 4 |
281 | L(found_aligning64B): |
282 | vbpermq v1,v1,v10 |
283 | #ifdef __LITTLE_ENDIAN__ |
284 | mfvrd r10,v1 |
285 | addi r9,r10,-1 /* Form a mask from trailing zeros. */ |
286 | andc r9,r9,r10 |
287 | popcntd r0,r9 /* Count the bits in the mask. */ |
288 | #else |
289 | vsldoi v1,v1,v1,6 |
290 | mfvrd r10,v1 |
291 | cntlzd r0,r10 /* Count leading zeros before the match. */ |
292 | #endif |
293 | addi r5,r5,-16 /* Adjust address to offset of last 16 bytes |
294 | read. */ |
295 | /* Calculate the length: offset of the last 16 bytes read minus |
296 | the pointer to s, plus the bytes before the match. */ |
297 | subf r5,r3,r5 |
298 | add r3,r5,r0 |
299 | blr /* Done. */ |
300 | |
301 | /* Handle case of maxlen > 32 and found null bytes within the first |
302 | 16 bytes of s. */ |
303 | .p2align 4 |
304 | L(early_find): |
305 | bpermd r5,r8,r10 /* r8 contains the bit permute constants. */ |
306 | bpermd r6,r8,r11 |
307 | sldi r5,r5,8 |
308 | or r5,r5,r6 /* r5 should hold a 16B mask of |
309 | a potential 0. */ |
310 | cntlzd r5,r5 /* Count leading zeros. */ |
311 | addi r3,r5,-48 /* Deduct the 48 leading zeros always |
312 | present. */ |
313 | blr /* Done. */ |
314 | |
315 | /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */ |
316 | .p2align 4 |
317 | L(small_range): |
318 | clrrdi r8,r3,3 /* Align the pointer to 8B. */ |
319 | li r0,0 /* Null-byte search pattern for cmpb. */ |
320 | /* Register's content at this point: |
321 | r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B, |
322 | r7 == last acceptable address. */ |
323 | cmpldi r4,0 /* Check if maxlen is zero. */ |
324 | beq L(end_max) /* If maxlen is zero. */ |
325 | |
326 | /* Calculate the last acceptable address and check for possible |
327 | addition overflow by using saturated math: |
328 | r7 = r3 + r4 |
329 | r7 |= -(r7 < r3) */ |
330 | add r7,r3,r4 |
331 | subfc r6,r3,r7 |
332 | subfe r9,r9,r9 |
333 | extsw r6,r9 |
334 | or r7,r7,r6 |
335 | addi r7,r7,-1 |
336 | |
337 | clrrdi r7,r7,3 /* Align to 8B address of last |
338 | acceptable address. */ |
339 | |
340 | rlwinm r6,r3,3,26,28 /* Calculate padding. */ |
341 | ld r12,0(r8) /* Load aligned doubleword. */ |
342 | cmpb r10,r12,r0 /* Check for null bytes. */ |
343 | #ifdef __LITTLE_ENDIAN__ |
344 | srd r10,r10,r6 |
345 | sld r10,r10,r6 |
346 | #else |
347 | sld r10,r10,r6 |
348 | srd r10,r10,r6 |
349 | #endif /* __LITTLE_ENDIAN__ */ |
350 | cmpldi cr7,r10,0 |
351 | bne cr7,L(done_small) /* If found null byte. */ |
352 | |
353 | cmpld r8,r7 /* Check if reached maxlen. */ |
354 | beq L(end_max) /* If reached maxlen. */ |
355 | |
356 | /* Still handling case of maxlen <= 32. Read doubleword aligned until |
357 | find null bytes or reach maxlen. */ |
358 | .p2align 4 |
359 | L(loop_small): |
360 | ldu r12,8(r8) /* Load next doubleword and update r8. */ |
361 | cmpb r10,r12,r0 /* Check for null bytes. */ |
362 | cmpldi cr6,r10,0 |
363 | bne cr6,L(done_small) /* If found null bytes. */ |
364 | cmpld r8,r7 /* Check if reached maxlen. */ |
365 | bne L(loop_small) /* If it has more bytes to read. */ |
366 | mr r3,r4 /* Reached maxlen with null bytes not found. |
367 | Length is equal to maxlen. */ |
368 | blr /* Done. */ |
369 | |
370 | /* Still handling case of maxlen <= 32. Found null bytes. |
371 | Registers: r10 == match bits within doubleword, r8 == address of |
372 | last doubleword read, r3 == pointer to s, r4 == maxlen. */ |
373 | .p2align 4 |
374 | L(done_small): |
375 | #ifdef __LITTLE_ENDIAN__ |
376 | /* Count trailing zeros. */ |
377 | addi r0,r10,-1 |
378 | andc r0,r0,r10 |
379 | popcntd r0,r0 |
380 | #else |
381 | cntlzd r0,r10 /* Count leading zeros before the match. */ |
382 | #endif |
383 | sub r3,r8,r3 /* Calculate total of bytes before the match. */ |
384 | srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ |
385 | add r3,r3,r0 /* Length until the match. */ |
386 | cmpld r3,r4 /* Check length is greater than maxlen. */ |
387 | blelr |
388 | mr r3,r4 /* If length is greater than maxlen, return |
389 | maxlen. */ |
390 | blr |
391 | |
392 | /* Handle case of reached maxlen with null bytes not found. */ |
393 | .p2align 4 |
394 | L(end_max): |
395 | mr r3,r4 /* Length is equal to maxlen. */ |
396 | blr /* Done. */ |
397 | |
398 | |
399 | END (__strnlen) |
400 | libc_hidden_def (__strnlen) |
401 | weak_alias (__strnlen, strnlen) |
402 | libc_hidden_def (strnlen) |
403 | |