1/* Optimized strnlen implementation for POWER8 using a vmx loop.
2
3 Copyright (C) 2017-2024 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9 The GNU C Library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13 You should have received a copy of the GNU Lesser General Public
14 License along with the GNU C Library; if not, see
15 <https://www.gnu.org/licenses/>. */
16
17/* It is implemented the following heuristic:
18 1. Case maxlen <= 32: align the pointer to 8 bytes to loop through
19 reading doublewords. Uses the POWER7 algorithm.
20 2. Case maxlen > 32: check for null bytes in the first 16 bytes using
21 unaligned accesses. Return length if found. Otherwise:
22 2.1 Case maxlen < 64: deduct the bytes previously read, align
23 the pointer to 16 bytes and loop through reading quadwords
24 until find null bytes or reach maxlen.
25 2.2 Case maxlen > 64: deduct the bytes previously read, align
26 the pointer to 64 bytes and set up a counter to loop through
27 reading in strides of 64 bytes. In case it finished the loop
28 with null bytes not found, process the remainder bytes by
29 switching to the loop to heuristic in 2.1. */
30
31#include <sysdep.h>
32
33/* Define default page size to 4KB. */
34#define PAGE_SIZE 4096
35
36
37/* int [r3] strnlen (char *s [r3], size_t maxlen [r4]) */
38 .machine power8
39ENTRY_TOCLESS (__strnlen)
40 CALL_MCOUNT 2
41 dcbt 0,r3
42
43 cmpldi r4,32 /* Check if maxlen <= 32. */
44 ble L(small_range) /* If maxlen <= 32. */
45
46 /* Upcoming 16 bytes unaligned accesses cannot cross the page boundary
47 otherwise the processor throws an memory access error.
48 Use following code to check there is room for such as accesses:
49 (((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16)
50 If it is disallowed then switch to the code that handles
51 the string when maxlen <= 32. */
52 clrldi r10,r3,52
53 cmpldi cr7,r10,PAGE_SIZE-16
54 bgt cr7,L(small_range) /* If less than 16B of page end. */
55
56 /* Compute our permute constant r8. */
57 li r7,0
58 /* Compute a bpermd constant to move bit 0 of each word into
59 a halfword value, and count trailing zeros. */
60#ifdef __LITTLE_ENDIAN__
61 li r8,0x2820
62 oris r8,r8,0x3830
63 sldi r8,r8,32
64 ori r8,r8,0x0800
65 oris r8,r8,0x1810
66#else
67 li r8,0x1018
68 oris r8,r8,0x0008
69 sldi r8,r8,32
70 ori r8,r8,0x3038
71 oris r8,r8,0x2028
72#endif
73
74 /* maxlen > 32. Optimistically check for null bytes in the first
75 16 bytes of the string using unaligned accesses. */
76 ld r5,0(r3)
77 ld r6,8(r3)
78 cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */
79 cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */
80 or. r7,r10,r11
81 bne cr0, L(early_find) /* If found null bytes. */
82
83 /* At this point maxlen > 32 and null bytes were not found at first
84 16 bytes. Prepare for loop using VMX. */
85
86 /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */
87
88 addi r5,r3,16 /* Align up, or just add the 16B we
89 already checked. */
90 li r0,15
91 and r7,r5,r0 /* Find offset into 16B alignment. */
92 andc r5,r5,r0 /* Quadword align up s to the next quadword. */
93 li r0,16
94 subf r0,r7,r0
95 subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */
96
97
98 /* Compute offsets for vmx loads, and precompute the vbpermq
99 constants for both the 64B and 16B loops. */
100 li r6,0
101 vspltisb v0,0
102 vspltisb v10,3
103 lvsl v11,r6,r6
104 vslb v10,v11,v10
105
106 cmpldi r4,64 /* Check maxlen < 64. */
107 blt L(smaller) /* If maxlen < 64 */
108
109 /* In order to begin the 64B loop, it needs to be 64
110 bytes aligned. So read quadwords until it is aligned or found null
111 bytes. At worst case it will be aligned after the fourth iteration,
112 so unroll the loop to avoid counter checking. */
113 andi. r7,r5,63 /* Check if is 64 bytes aligned. */
114 beq cr0,L(preloop_64B) /* If it is already 64B aligned. */
115 lvx v1,r5,r6
116 vcmpequb. v1,v1,v0
117 addi r5,r5,16
118 addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */
119 bne cr6,L(found_aligning64B) /* If found null bytes. */
120
121 /* Unroll 2x above code block until aligned or find null bytes. */
122 andi. r7,r5,63
123 beq cr0,L(preloop_64B)
124 lvx v1,r5,r6
125 vcmpequb. v1,v1,v0
126 addi r5,r5,16
127 addi r4,r4,-16
128 bne cr6,L(found_aligning64B)
129
130 andi. r7,r5,63
131 beq cr0,L(preloop_64B)
132 lvx v1,r5,r6
133 vcmpequb. v1,v1,v0
134 addi r5,r5,16
135 addi r4,r4,-16
136 bne cr6,L(found_aligning64B)
137
138 /* At this point it should be 16 bytes aligned.
139 Prepare for the 64B loop. */
140 .p2align 4
141L(preloop_64B):
142 /* Check if maxlen became is less than 64, therefore disallowing the
143 64B loop. If it happened switch to the 16B loop code. */
144 cmpldi r4,64 /* Check if maxlen < 64. */
145 blt L(smaller) /* If maxlen < 64. */
146 /* Set some constant values. */
147 li r7,16
148 li r10,32
149 li r9,48
150
151 /* Compute the number of 64 bytes iterations needed. */
152 srdi r11,r4,6 /* Compute loop count (maxlen / 64). */
153 andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */
154 mtctr r11 /* Move loop count to counter register. */
155
156 /* Handle maxlen > 64. Loop over the bytes in strides of 64B. */
157 .p2align 4
158L(loop_64B):
159 lvx v1,r5,r6 /* r5 is the pointer to s. */
160 lvx v2,r5,r7
161 lvx v3,r5,r10
162 lvx v4,r5,r9
163 /* Compare the four 16B vectors to obtain the least 16 values.
164 Null bytes should emerge into v7, then check for null bytes. */
165 vminub v5,v1,v2
166 vminub v6,v3,v4
167 vminub v7,v5,v6
168 vcmpequb. v7,v7,v0 /* Check for null bytes. */
169 addi r5,r5,64 /* Add pointer to next iteration. */
170 bne cr6,L(found_64B) /* If found null bytes. */
171 bdnz L(loop_64B) /* Continue the loop if count > 0. */
172
173/* Hit loop end without null match. So branch to handle the remainder. */
174
175 /* Prepare a 16B loop to handle two cases:
176 1. If 32 > maxlen < 64.
177 2. If maxlen >= 64, and reached end of the 64B loop with null
178 bytes not found. Thus handle the remainder bytes here. */
179 .p2align 4
180L(smaller):
181 cmpldi r4,0 /* Check maxlen is zero. */
182 beq L(done) /* If maxlen is zero. */
183
184 /* Place rounded up number of qw's to check into a vmx
185 register, and use some vector tricks to minimize
186 branching. */
187 mtvrd v7,r4 /* copy maxlen from gpr to vector register. */
188 vspltisb v5,1
189 vspltisb v6,15
190 vspltb v2,v7,7
191 vaddubs v3,v5,v6
192
193#ifdef __LITTLE_ENDIAN__
194 vspltish v5,1 /* Compute 16 in each byte. */
195#endif
196
197 /* Loop in 16B aligned incremements now. */
198 .p2align 4
199L(loop_16B):
200 lvx v1,r5,r6 /* Load quadword into vector register. */
201 addi r5,r5,16 /* Increment address to next 16B block. */
202 vor v7,v2,v2 /* Save loop count (v2) into v7. */
203 vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */
204 vminub v4,v1,v2
205 vcmpequb. v4,v4,v0 /* Checking for null bytes. */
206 beq cr6,L(loop_16B) /* If null bytes not found. */
207
208 vcmpequb v1,v1,v0
209 vbpermq v1,v1,v10
210#ifdef __LITTLE_ENDIAN__
211 vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */
212 vandc v2,v2,v1
213 vpopcnth v1,v2 /* count of trailing zeros, 16 if none. */
214#else
215 vclzh v1,v1 /* count the leading zeros, 16 if none. */
216#endif
217 /* Truncate to maximum allowable offset. */
218 vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond
219 maxlen. */
220 vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */
221
222 mfvrd r0,v1
223 addi r5,r5,-16 /* Undo speculative bump. */
224 extsb r0,r0 /* Clear whatever gunk is in the high 56b. */
225 add r5,r5,r0 /* Add the offset of whatever was found. */
226L(done):
227 subf r3,r3,r5 /* Length is equal to the offset of null byte
228 matched minus the pointer to s. */
229 blr /* Done. */
230
231 /* Handle case of maxlen > 64 and found null bytes in last block
232 of 64 bytes read. */
233 .p2align 4
234L(found_64B):
235 /* A zero was found. Reduce the result. */
236 vcmpequb v1,v1,v0
237 vcmpequb v2,v2,v0
238 vcmpequb v3,v3,v0
239 vcmpequb v4,v4,v0
240
241 /* Permute the first bit of each byte into bits 48-63. */
242 vbpermq v1,v1,v10
243 vbpermq v2,v2,v10
244 vbpermq v3,v3,v10
245 vbpermq v4,v4,v10
246
247 /* Shift each component into its correct position for merging. */
248#ifdef __LITTLE_ENDIAN__
249 vsldoi v2,v2,v2,2
250 vsldoi v3,v3,v3,4
251 vsldoi v4,v4,v4,6
252#else
253 vsldoi v1,v1,v1,6
254 vsldoi v2,v2,v2,4
255 vsldoi v3,v3,v3,2
256#endif
257
258 /* Merge the results and move to a GPR. */
259 vor v1,v2,v1
260 vor v2,v3,v4
261 vor v4,v1,v2
262
263 /* Adjust address to the start of the current 64B block. */
264 addi r5,r5,-64
265
266 mfvrd r10,v4
267#ifdef __LITTLE_ENDIAN__
268 addi r9,r10,-1 /* Form a mask from trailing zeros. */
269 andc r9,r9,r10
270 popcntd r0,r9 /* Count the bits in the mask. */
271#else
272 cntlzd r0,r10 /* Count leading zeros before the match. */
273#endif
274 subf r5,r3,r5
275 add r3,r5,r0 /* Compute final length. */
276 blr /* Done. */
277
278 /* Handle case where null bytes were found while aligning
279 as a preparation for the 64B loop. */
280 .p2align 4
281L(found_aligning64B):
282 vbpermq v1,v1,v10
283#ifdef __LITTLE_ENDIAN__
284 mfvrd r10,v1
285 addi r9,r10,-1 /* Form a mask from trailing zeros. */
286 andc r9,r9,r10
287 popcntd r0,r9 /* Count the bits in the mask. */
288#else
289 vsldoi v1,v1,v1,6
290 mfvrd r10,v1
291 cntlzd r0,r10 /* Count leading zeros before the match. */
292#endif
293 addi r5,r5,-16 /* Adjust address to offset of last 16 bytes
294 read. */
295 /* Calculate length as subtracted the pointer to s of last 16 bytes
296 offset, added with the bytes before the match. */
297 subf r5,r3,r5
298 add r3,r5,r0
299 blr /* Done. */
300
301 /* Handle case of maxlen > 32 and found a null bytes within the first
302 16 bytes of s. */
303 .p2align 4
304L(early_find):
305 bpermd r5,r8,r10 /* r8 contains the bit permute constants. */
306 bpermd r6,r8,r11
307 sldi r5,r5,8
308 or r5,r5,r6 /* r5 should hold a 16B mask of
309 a potential 0. */
310 cntlzd r5,r5 /* Count leading zeros. */
311 addi r3,r5,-48 /* Deduct the 48 leading zeros always
312 present. */
313 blr /* Done. */
314
315 /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */
316 .p2align 4
317L(small_range):
318 clrrdi r8,r3,3 /* Align the pointer to 8B. */
319 li r0,0
320 /* Register's content at this point:
321 r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B,
322 r7 == last acceptable address. */
323 cmpldi r4,0 /* Check if maxlen is zero. */
324 beq L(end_max) /* If maxlen is zero. */
325
326 /* Calculate the last acceptable address and check for possible
327 addition overflow by using satured math:
328 r7 = r3 + r4
329 r7 |= -(r7 < x) */
330 add r7,r3,r4
331 subfc r6,r3,r7
332 subfe r9,r9,r9
333 extsw r6,r9
334 or r7,r7,r6
335 addi r7,r7,-1
336
337 clrrdi r7,r7,3 /* Align to 8B address of last
338 acceptable address. */
339
340 rlwinm r6,r3,3,26,28 /* Calculate padding. */
341 ld r12,0(r8) /* Load aligned doubleword. */
342 cmpb r10,r12,r0 /* Check for null bytes. */
343#ifdef __LITTLE_ENDIAN__
344 srd r10,r10,r6
345 sld r10,r10,r6
346#else
347 sld r10,r10,r6
348 srd r10,r10,r6
349#endif /* __LITTLE_ENDIAN__ */
350 cmpldi cr7,r10,0
351 bne cr7,L(done_small) /* If found null byte. */
352
353 cmpld r8,r7 /* Check if reached maxlen. */
354 beq L(end_max) /* If reached maxlen. */
355
356 /* Still handling case of maxlen <= 32. Read doubleword aligned until
357 find null bytes or reach maxlen. */
358 .p2align 4
359L(loop_small):
360 ldu r12,8(r8) /* Load next doubleword and update r8. */
361 cmpb r10,r12,r0 /* Check for null bytes. */
362 cmpldi cr6,r10,0
363 bne cr6,L(done_small) /* If found null bytes. */
364 cmpld r8,r7 /* Check if reached maxlen. */
365 bne L(loop_small) /* If it has more bytes to read. */
366 mr r3,r4 /* Reached maxlen with null bytes not found.
367 Length is equal to maxlen. */
368 blr /* Done. */
369
370 /* Still handling case of maxlen <= 32. Found null bytes.
371 Registers: r10 == match bits within doubleword, r8 == address of
372 last doubleword read, r3 == pointer to s, r4 == maxlen. */
373 .p2align 4
374L(done_small):
375#ifdef __LITTLE_ENDIAN__
376 /* Count trailing zeros. */
377 addi r0,r10,-1
378 andc r0,r0,r10
379 popcntd r0,r0
380#else
381 cntlzd r0,r10 /* Count leading zeros before the match. */
382#endif
383 sub r3,r8,r3 /* Calculate total of bytes before the match. */
384 srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
385 add r3,r3,r0 /* Length until the match. */
386 cmpld r3,r4 /* Check length is greater than maxlen. */
387 blelr
388 mr r3,r4 /* If length is greater than maxlen, return
389 maxlen. */
390 blr
391
392 /* Handle case of reached maxlen with null bytes not found. */
393 .p2align 4
394L(end_max):
395 mr r3,r4 /* Length is equal to maxlen. */
396 blr /* Done. */
397
398
399END (__strnlen)
400libc_hidden_def (__strnlen)
401weak_alias (__strnlen, strnlen)
402libc_hidden_def (strnlen)
403

source code of glibc/sysdeps/powerpc/powerpc64/power8/strnlen.S