1 | /* Optimized strnlen implementation for POWER8 using a vmx loop. |
2 | |
3 | Copyright (C) 2017-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | The GNU C Library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | You should have received a copy of the GNU Lesser General Public |
14 | License along with the GNU C Library; if not, see |
15 | <https://www.gnu.org/licenses/>. */ |
16 | |
17 | /* The following heuristic is implemented: |
18 | 1. Case maxlen <= 32: align the pointer to 8 bytes to loop through |
19 | reading doublewords. Uses the POWER7 algorithm. |
20 | 2. Case maxlen > 32: check for null bytes in the first 16 bytes using |
21 | unaligned accesses. Return the length if found. Otherwise: |
22 | 2.1 Case maxlen < 64: deduct the bytes previously read, align |
23 | the pointer to 16 bytes and loop through reading quadwords |
24 | until null bytes are found or maxlen is reached. |
25 | 2.2 Case maxlen >= 64: deduct the bytes previously read, align |
26 | the pointer to 64 bytes and set up a counter to loop through |
27 | reading in strides of 64 bytes. If the loop finishes with |
28 | null bytes not found, process the remainder bytes by |
29 | switching to the 16-byte loop of heuristic 2.1. */ |
30 | |
31 | #include <sysdep.h> |
32 | |
33 | /* Define default page size to 4KB. */ |
34 | #define PAGE_SIZE 4096 |
35 | |
36 | |
37 | /* size_t [r3] strnlen (const char *s [r3], size_t maxlen [r4]) */ |
38 | .machine power8 |
39 | ENTRY_TOCLESS (__strnlen) |
40 | CALL_MCOUNT 2 |
41 | dcbt 0,r3 /* Prefetch the start of the string. */ |
42 | |
43 | cmpldi r4,32 /* Check if maxlen <= 32. */ |
44 | ble L(small_range) /* If maxlen <= 32. */ |
45 | |
46 | /* Upcoming 16 bytes unaligned accesses cannot cross the page boundary |
47 | otherwise the processor throws a memory access error. |
48 | Use the following code to check there is room for such accesses: |
49 | (((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16) |
50 | If it is disallowed then switch to the code that handles |
51 | the string when maxlen <= 32. */ |
52 | clrldi r10,r3,52 /* r10 = s % PAGE_SIZE (low 12 bits). */ |
53 | cmpldi cr7,r10,PAGE_SIZE-16 |
54 | bgt cr7,L(small_range) /* If less than 16B of page end. */ |
55 | |
56 | /* Compute our permute constant r8. */ |
57 | li r7,0 /* Null-byte search pattern for cmpb. */ |
58 | /* Compute a bpermd constant to move bit 0 of each word into |
59 | a halfword value, and count trailing zeros. */ |
60 | #ifdef __LITTLE_ENDIAN__ |
61 | li r8,0x2820 |
62 | oris r8,r8,0x3830 |
63 | sldi r8,r8,32 |
64 | ori r8,r8,0x0800 |
65 | oris r8,r8,0x1810 |
66 | #else |
67 | li r8,0x1018 |
68 | oris r8,r8,0x0008 |
69 | sldi r8,r8,32 |
70 | ori r8,r8,0x3038 |
71 | oris r8,r8,0x2028 |
72 | #endif |
73 | |
74 | /* maxlen > 32. Optimistically check for null bytes in the first |
75 | 16 bytes of the string using unaligned accesses. */ |
76 | ld r5,0(r3) |
77 | ld r6,8(r3) |
78 | cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */ |
79 | cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */ |
80 | or. r7,r10,r11 |
81 | bne cr0, L(early_find) /* If found null bytes. */ |
82 | |
83 | /* At this point maxlen > 32 and null bytes were not found at first |
84 | 16 bytes. Prepare for loop using VMX. */ |
85 | |
86 | /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */ |
87 | |
88 | addi r5,r3,16 /* Align up, or just add the 16B we |
89 | already checked. */ |
90 | li r0,15 |
91 | and r7,r5,r0 /* Find offset into 16B alignment. */ |
92 | andc r5,r5,r0 /* Quadword align up s to the next quadword. */ |
93 | li r0,16 |
94 | subf r0,r7,r0 |
95 | subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */ |
96 | |
97 | |
98 | /* Compute offsets for vmx loads, and precompute the vbpermq |
99 | constants for both the 64B and 16B loops. */ |
100 | li r6,0 /* Index 0 for the indexed vector loads. */ |
101 | vspltisb v0,0 /* v0 = all zeros (null-byte comparand). */ |
102 | vspltisb v10,3 |
103 | lvsl v11,r6,r6 /* v11 = bytes 0..15. */ |
104 | vslb v10,v11,v10 /* v10 = 0,8,...,120: vbpermq bit indices. */ |
105 | |
106 | cmpldi r4,64 /* Check maxlen < 64. */ |
107 | blt L(smaller) /* If maxlen < 64 */ |
108 | |
109 | /* In order to begin the 64B loop, the pointer needs to be 64 |
110 | bytes aligned. So read quadwords until it is aligned or null |
111 | bytes are found. At worst case it will be aligned after the fourth |
112 | iteration, so unroll the loop to avoid counter checking. */ |
113 | andi. r7,r5,63 /* Check if is 64 bytes aligned. */ |
114 | beq cr0,L(preloop_64B) /* If it is already 64B aligned. */ |
115 | lvx v1,r5,r6 |
116 | vcmpequb. v1,v1,v0 |
117 | addi r5,r5,16 |
118 | addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */ |
119 | bne cr6,L(found_aligning64B) /* If found null bytes. */ |
120 | |
121 | /* Unroll 2x above code block until aligned or find null bytes. */ |
122 | andi. r7,r5,63 |
123 | beq cr0,L(preloop_64B) |
124 | lvx v1,r5,r6 |
125 | vcmpequb. v1,v1,v0 |
126 | addi r5,r5,16 |
127 | addi r4,r4,-16 |
128 | bne cr6,L(found_aligning64B) |
129 | |
130 | andi. r7,r5,63 |
131 | beq cr0,L(preloop_64B) |
132 | lvx v1,r5,r6 |
133 | vcmpequb. v1,v1,v0 |
134 | addi r5,r5,16 |
135 | addi r4,r4,-16 |
136 | bne cr6,L(found_aligning64B) |
137 | |
138 | /* At this point it should be 16 bytes aligned. |
139 | Prepare for the 64B loop. */ |
140 | .p2align 4 |
141 | L(preloop_64B): |
142 | /* Check if maxlen became less than 64, therefore disallowing the |
143 | 64B loop. If so, switch to the 16B loop code. */ |
144 | cmpldi r4,64 /* Check if maxlen < 64. */ |
145 | blt L(smaller) /* If maxlen < 64. */ |
146 | /* Set some constant values. */ |
147 | li r7,16 |
148 | li r10,32 |
149 | li r9,48 |
150 | |
151 | /* Compute the number of 64 bytes iterations needed. */ |
152 | srdi r11,r4,6 /* Compute loop count (maxlen / 64). */ |
153 | andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */ |
154 | mtctr r11 /* Move loop count to counter register. */ |
155 | |
156 | /* Handle maxlen > 64. Loop over the bytes in strides of 64B. */ |
157 | .p2align 4 |
158 | L(loop_64B): |
159 | lvx v1,r5,r6 /* r5 is the pointer to s. */ |
160 | lvx v2,r5,r7 |
161 | lvx v3,r5,r10 |
162 | lvx v4,r5,r9 |
163 | /* Compare the four 16B vectors to obtain the byte-wise minimum. |
164 | Any null byte will emerge into v7, then check for null bytes. */ |
165 | vminub v5,v1,v2 |
166 | vminub v6,v3,v4 |
167 | vminub v7,v5,v6 |
168 | vcmpequb. v7,v7,v0 /* Check for null bytes. */ |
169 | addi r5,r5,64 /* Add pointer to next iteration. */ |
170 | bne cr6,L(found_64B) /* If found null bytes. */ |
171 | bdnz L(loop_64B) /* Continue the loop if count > 0. */ |
172 | |
173 | /* Hit loop end without null match. So branch to handle the remainder. */ |
174 | |
175 | /* Prepare a 16B loop to handle two cases: |
176 | 1. If 32 < maxlen < 64. |
177 | 2. If maxlen >= 64, and reached end of the 64B loop with null |
178 | bytes not found. Thus handle the remainder bytes here. */ |
179 | .p2align 4 |
180 | L(smaller): |
181 | cmpldi r4,0 /* Check maxlen is zero. */ |
182 | beq L(done) /* If maxlen is zero. */ |
183 | |
184 | /* Place rounded up number of qw's to check into a vmx |
185 | register, and use some vector tricks to minimize |
186 | branching. */ |
187 | mtvrd v7,r4 /* copy maxlen from gpr to vector register. */ |
188 | vspltisb v5,1 |
189 | vspltisb v6,15 |
190 | vspltb v2,v7,7 |
191 | vaddubs v3,v5,v6 /* v3 = 16 in every byte (saturated). */ |
192 | |
193 | #ifdef __LITTLE_ENDIAN__ |
194 | vspltish v5,1 /* Compute 16 in each byte. */ |
195 | #endif |
196 | |
197 | /* Loop in 16B aligned increments now. */ |
198 | .p2align 4 |
199 | L(loop_16B): |
200 | lvx v1,r5,r6 /* Load quadword into vector register. */ |
201 | addi r5,r5,16 /* Increment address to next 16B block. */ |
202 | vor v7,v2,v2 /* Save loop count (v2) into v7. */ |
203 | vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */ |
204 | vminub v4,v1,v2 |
205 | vcmpequb. v4,v4,v0 /* Checking for null bytes. */ |
206 | beq cr6,L(loop_16B) /* If null bytes not found. */ |
207 | |
208 | vcmpequb v1,v1,v0 |
209 | vbpermq v1,v1,v10 |
210 | #ifdef __LITTLE_ENDIAN__ |
211 | vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */ |
212 | vandc v2,v2,v1 |
213 | vpopcnth v1,v2 /* count of trailing zeros, 16 if none. */ |
214 | #else |
215 | vclzh v1,v1 /* count the leading zeros, 16 if none. */ |
216 | #endif |
217 | /* Truncate to maximum allowable offset. */ |
218 | vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond |
219 | maxlen. */ |
220 | vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */ |
221 | |
222 | mfvrd r0,v1 /* Move the offset result to a GPR. */ |
223 | addi r5,r5,-16 /* Undo speculative bump. */ |
224 | extsb r0,r0 /* Clear whatever gunk is in the high 56b. */ |
225 | add r5,r5,r0 /* Add the offset of whatever was found. */ |
226 | L(done): |
227 | subf r3,r3,r5 /* Length is equal to the offset of null byte |
228 | matched minus the pointer to s. */ |
229 | blr /* Done. */ |
230 | |
231 | /* Handle case of maxlen > 64 and found null bytes in last block |
232 | of 64 bytes read. */ |
233 | .p2align 4 |
234 | L(found_64B): |
235 | /* A zero was found. Reduce the result. */ |
236 | vcmpequb v1,v1,v0 |
237 | vcmpequb v2,v2,v0 |
238 | vcmpequb v3,v3,v0 |
239 | vcmpequb v4,v4,v0 |
240 | |
241 | /* Permute the first bit of each byte into bits 48-63. */ |
242 | vbpermq v1,v1,v10 |
243 | vbpermq v2,v2,v10 |
244 | vbpermq v3,v3,v10 |
245 | vbpermq v4,v4,v10 |
246 | |
247 | /* Shift each component into its correct position for merging. */ |
248 | #ifdef __LITTLE_ENDIAN__ |
249 | vsldoi v2,v2,v2,2 |
250 | vsldoi v3,v3,v3,4 |
251 | vsldoi v4,v4,v4,6 |
252 | #else |
253 | vsldoi v1,v1,v1,6 |
254 | vsldoi v2,v2,v2,4 |
255 | vsldoi v3,v3,v3,2 |
256 | #endif |
257 | |
258 | /* Merge the results and move to a GPR. */ |
259 | vor v1,v2,v1 |
260 | vor v2,v3,v4 |
261 | vor v4,v1,v2 |
262 | |
263 | /* Adjust address to the start of the current 64B block. */ |
264 | addi r5,r5,-64 |
265 | |
266 | mfvrd r10,v4 |
267 | #ifdef __LITTLE_ENDIAN__ |
268 | addi r9,r10,-1 /* Form a mask from trailing zeros. */ |
269 | andc r9,r9,r10 |
270 | popcntd r0,r9 /* Count the bits in the mask. */ |
271 | #else |
272 | cntlzd r0,r10 /* Count leading zeros before the match. */ |
273 | #endif |
274 | subf r5,r3,r5 |
275 | add r3,r5,r0 /* Compute final length. */ |
276 | blr /* Done. */ |
277 | |
278 | /* Handle case where null bytes were found while aligning |
279 | as a preparation for the 64B loop. */ |
280 | .p2align 4 |
281 | L(found_aligning64B): |
282 | vbpermq v1,v1,v10 |
283 | #ifdef __LITTLE_ENDIAN__ |
284 | mfvrd r10,v1 |
285 | addi r9,r10,-1 /* Form a mask from trailing zeros. */ |
286 | andc r9,r9,r10 |
287 | popcntd r0,r9 /* Count the bits in the mask. */ |
288 | #else |
289 | vsldoi v1,v1,v1,6 |
290 | mfvrd r10,v1 |
291 | cntlzd r0,r10 /* Count leading zeros before the match. */ |
292 | #endif |
293 | addi r5,r5,-16 /* Adjust address to offset of last 16 bytes |
294 | read. */ |
295 | /* Calculate the length: offset of the last 16 bytes read minus |
296 | the pointer to s, plus the bytes before the match. */ |
297 | subf r5,r3,r5 |
298 | add r3,r5,r0 |
299 | blr /* Done. */ |
300 | |
301 | /* Handle case of maxlen > 32 and found null bytes within the first |
302 | 16 bytes of s. */ |
303 | .p2align 4 |
304 | L(early_find): |
305 | bpermd r5,r8,r10 /* r8 contains the bit permute constants. */ |
306 | bpermd r6,r8,r11 |
307 | sldi r5,r5,8 |
308 | or r5,r5,r6 /* r5 should hold a 16B mask of |
309 | a potential 0. */ |
310 | cntlzd r5,r5 /* Count leading zeros. */ |
311 | addi r3,r5,-48 /* Deduct the 48 leading zeros always |
312 | present. */ |
313 | blr /* Done. */ |
314 | |
315 | /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */ |
316 | .p2align 4 |
317 | L(small_range): |
318 | clrrdi r8,r3,3 /* Align the pointer to 8B. */ |
319 | li r0,0 /* Null-byte search pattern for cmpb. */ |
320 | /* Register's content at this point: |
321 | r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B, |
322 | r7 == last acceptable address. */ |
323 | cmpldi r4,0 /* Check if maxlen is zero. */ |
324 | beq L(end_max) /* If maxlen is zero. */ |
325 | |
326 | /* Calculate the last acceptable address and check for possible |
327 | addition overflow by using saturated math: |
328 | r7 = r3 + r4 |
329 | r7 |= -(r7 < r3) */ |
330 | add r7,r3,r4 |
331 | subfc r6,r3,r7 |
332 | subfe r9,r9,r9 |
333 | extsw r6,r9 |
334 | or r7,r7,r6 |
335 | addi r7,r7,-1 |
336 | |
337 | clrrdi r7,r7,3 /* Align to 8B address of last |
338 | acceptable address. */ |
339 | |
340 | rlwinm r6,r3,3,26,28 /* Calculate padding. */ |
341 | ld r12,0(r8) /* Load aligned doubleword. */ |
342 | cmpb r10,r12,r0 /* Check for null bytes. */ |
343 | #ifdef __LITTLE_ENDIAN__ |
344 | srd r10,r10,r6 |
345 | sld r10,r10,r6 |
346 | #else |
347 | sld r10,r10,r6 |
348 | srd r10,r10,r6 |
349 | #endif /* __LITTLE_ENDIAN__ */ |
350 | cmpldi cr7,r10,0 |
351 | bne cr7,L(done_small) /* If found null byte. */ |
352 | |
353 | cmpld r8,r7 /* Check if reached maxlen. */ |
354 | beq L(end_max) /* If reached maxlen. */ |
355 | |
356 | /* Still handling case of maxlen <= 32. Read doubleword aligned until |
357 | find null bytes or reach maxlen. */ |
358 | .p2align 4 |
359 | L(loop_small): |
360 | ldu r12,8(r8) /* Load next doubleword and update r8. */ |
361 | cmpb r10,r12,r0 /* Check for null bytes. */ |
362 | cmpldi cr6,r10,0 |
363 | bne cr6,L(done_small) /* If found null bytes. */ |
364 | cmpld r8,r7 /* Check if reached maxlen. */ |
365 | bne L(loop_small) /* If it has more bytes to read. */ |
366 | mr r3,r4 /* Reached maxlen with null bytes not found. |
367 | Length is equal to maxlen. */ |
368 | blr /* Done. */ |
369 | |
370 | /* Still handling case of maxlen <= 32. Found null bytes. |
371 | Registers: r10 == match bits within doubleword, r8 == address of |
372 | last doubleword read, r3 == pointer to s, r4 == maxlen. */ |
373 | .p2align 4 |
374 | L(done_small): |
375 | #ifdef __LITTLE_ENDIAN__ |
376 | /* Count trailing zeros. */ |
377 | addi r0,r10,-1 |
378 | andc r0,r0,r10 |
379 | popcntd r0,r0 |
380 | #else |
381 | cntlzd r0,r10 /* Count leading zeros before the match. */ |
382 | #endif |
383 | sub r3,r8,r3 /* Calculate total of bytes before the match. */ |
384 | srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ |
385 | add r3,r3,r0 /* Length until the match. */ |
386 | cmpld r3,r4 /* Check length is greater than maxlen. */ |
387 | blelr |
388 | mr r3,r4 /* If length is greater than maxlen, return |
389 | maxlen. */ |
390 | blr |
391 | |
392 | /* Handle case of reached maxlen with null bytes not found. */ |
393 | .p2align 4 |
394 | L(end_max): |
395 | mr r3,r4 /* Length is equal to maxlen. */ |
396 | blr /* Done. */ |
397 | |
398 | |
399 | END (__strnlen) |
400 | libc_hidden_def (__strnlen) |
401 | weak_alias (__strnlen, strnlen) |
402 | libc_hidden_def (strnlen) |
403 | |