1 | /* Optimized strstr implementation for PowerPC64/POWER7. |
2 | Copyright (C) 2015-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | /* Char * [r3] strstr (char *s [r3], char * pat[r4]) */ |
22 | |
23 | /* The performance gain is obtained using aligned memory access, load |
24 | * doubleword and usage of cmpb instruction for quicker comparison. */ |
25 | |
26 | #define ITERATIONS 64 |
27 | |
28 | #ifndef STRSTR |
29 | # define STRSTR strstr |
30 | #endif |
31 | |
32 | #ifndef STRLEN |
33 | /* For builds with no IFUNC support, local calls should be made to internal |
34 | GLIBC symbol (created by libc_hidden_builtin_def). */ |
35 | # ifdef SHARED |
36 | # define STRLEN __GI_strlen |
37 | # define STRLEN_is_local |
38 | # else |
39 | # define STRLEN strlen |
40 | # endif |
41 | #endif |
42 | |
43 | #ifndef STRNLEN |
44 | /* For builds with no IFUNC support, local calls should be made to internal |
45 | GLIBC symbol (created by libc_hidden_builtin_def). */ |
46 | # ifdef SHARED |
47 | # define STRNLEN __GI_strnlen |
48 | # define STRNLEN_is_local |
49 | # else |
50 | # define STRNLEN __strnlen |
51 | # endif |
52 | #endif |
53 | |
54 | #ifndef STRCHR |
55 | # ifdef SHARED |
56 | # define STRCHR __GI_strchr |
57 | # define STRCHR_is_local |
58 | # else |
59 | # define STRCHR strchr |
60 | # endif |
61 | #endif |
62 | |
63 | #define FRAMESIZE (FRAME_MIN_SIZE+32) |
64 | .machine power7 |
65 | /* Can't be ENTRY_TOCLESS due to calling __strstr_ppc which uses r2. */ |
66 | ENTRY (STRSTR, 4) |
67 | CALL_MCOUNT 2 |
68 | mflr r0 /* Load link register LR to r0. */ |
69 | std r31, -8(r1) /* Save callers register r31. */ |
70 | std r30, -16(r1) /* Save callers register r30. */ |
71 | std r29, -24(r1) /* Save callers register r29. */ |
72 | std r28, -32(r1) /* Save callers register r28. */ |
73 | std r0, 16(r1) /* Store the link register. */ |
74 | cfi_offset(r31, -8) |
75 | cfi_offset(r30, -16) |
76 | cfi_offset(r28, -32) |
77 | cfi_offset(r29, -24) |
78 | cfi_offset(lr, 16) |
79 | stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ |
80 | cfi_adjust_cfa_offset(FRAMESIZE) |
81 | |
82 | dcbt 0, r3 |
83 | dcbt 0, r4 |
84 | cmpdi cr7, r3, 0 |
85 | beq cr7, L(retnull) |
86 | cmpdi cr7, r4, 0 |
87 | beq cr7, L(retnull) |
88 | |
89 | mr r29, r3 |
90 | mr r30, r4 |
91 | mr r3, r4 |
92 | bl STRLEN |
93 | #ifndef STRLEN_is_local |
94 | nop |
95 | #endif |
96 | |
97 | cmpdi cr7, r3, 0 /* If search str is null. */ |
98 | beq cr7, L(ret_r3) |
99 | |
100 | mr r31, r3 |
101 | mr r4, r3 |
102 | mr r3, r29 |
103 | bl STRNLEN |
104 | #ifndef STRNLEN_is_local |
105 | nop |
106 | #endif |
107 | |
108 | cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ |
109 | blt cr7, L(retnull) |
110 | mr r3, r29 |
111 | lbz r4, 0(r30) |
112 | bl STRCHR |
113 | #ifndef STRCHR_is_local |
114 | nop |
115 | #endif |
116 | |
117 | mr r11, r3 |
118 | /* If first char of search str is not present. */ |
119 | cmpdi cr7, r3, 0 |
120 | ble cr7, L(end) |
121 | /* Reg r28 is used to count the number of iterations. */ |
122 | li r28, 0 |
123 | rldicl r8, r3, 0, 52 /* Page cross check. */ |
124 | cmpldi cr7, r8, 4096-16 |
125 | bgt cr7, L(bytebybyte) |
126 | |
127 | rldicl r8, r30, 0, 52 |
128 | cmpldi cr7, r8, 4096-16 |
129 | bgt cr7, L(bytebybyte) |
130 | |
131 | /* If len(r4) < 8 handle in a different way. */ |
132 | /* Shift position based on null and use cmpb. */ |
133 | cmpdi cr7, r31, 8 |
134 | blt cr7, L(lessthan8) |
135 | |
136 | /* Len(r4) >= 8 reaches here. */ |
137 | mr r8, r3 /* Save r3 for future use. */ |
138 | mr r4, r30 /* Restore r4. */ |
139 | li r0, 0 |
140 | rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */ |
141 | clrrdi r4, r4, 3 /* Make r4 aligned to 8. */ |
142 | ld r6, 0(r4) |
143 | addi r4, r4, 8 |
144 | cmpdi cr7, r10, 0 /* Check if its already aligned? */ |
145 | beq cr7, L(begin1) |
146 | #ifdef __LITTLE_ENDIAN__ |
147 | srd r6, r6, r10 /* Discard unwanted bits. */ |
148 | #else |
149 | sld r6, r6, r10 |
150 | #endif |
151 | ld r9, 0(r4) |
152 | subfic r10, r10, 64 |
153 | #ifdef __LITTLE_ENDIAN__ |
154 | sld r9, r9, r10 /* Discard unwanted bits. */ |
155 | #else |
156 | srd r9, r9, r10 |
157 | #endif |
158 | or r6, r6, r9 /* Form complete search str. */ |
159 | L(begin1): |
160 | mr r29, r6 |
161 | rlwinm r10, r3, 3, 26, 28 |
162 | clrrdi r3, r3, 3 |
163 | ld r5, 0(r3) |
164 | cmpb r9, r0, r6 /* Check if input has null. */ |
165 | cmpdi cr7, r9, 0 |
166 | bne cr7, L(return3) |
167 | cmpb r9, r0, r5 /* Check if input has null. */ |
168 | #ifdef __LITTLE_ENDIAN__ |
169 | srd r9, r9, r10 |
170 | #else |
171 | sld r9, r9, r10 |
172 | #endif |
173 | cmpdi cr7, r9, 0 |
174 | bne cr7, L(retnull) |
175 | |
176 | li r12, -8 /* Shift values. */ |
177 | li r11, 72 /* Shift values. */ |
178 | cmpdi cr7, r10, 0 |
179 | beq cr7, L(nextbyte1) |
180 | mr r12, r10 |
181 | addi r12, r12, -8 |
182 | subfic r11, r12, 64 |
183 | |
184 | L(nextbyte1): |
185 | ldu r7, 8(r3) /* Load next dw. */ |
186 | addi r12, r12, 8 /* Shift one byte and compare. */ |
187 | addi r11, r11, -8 |
188 | #ifdef __LITTLE_ENDIAN__ |
189 | srd r9, r5, r12 /* Rotate based on mask. */ |
190 | sld r10, r7, r11 |
191 | #else |
192 | sld r9, r5, r12 |
193 | srd r10, r7, r11 |
194 | #endif |
195 | /* Form single dw from few bytes on first load and second load. */ |
196 | or r10, r9, r10 |
197 | /* Check for null in the formed dw. */ |
198 | cmpb r9, r0, r10 |
199 | cmpdi cr7, r9, 0 |
200 | bne cr7, L(retnull) |
201 | /* Cmpb search str and input str. */ |
202 | cmpb r9, r10, r6 |
203 | cmpdi cr7, r9, -1 |
204 | beq cr7, L(match) |
205 | addi r8, r8, 1 |
206 | b L(begin) |
207 | |
208 | .align 4 |
209 | L(match): |
210 | /* There is a match of 8 bytes, check next bytes. */ |
211 | cmpdi cr7, r31, 8 |
212 | beq cr7, L(return) |
213 | /* Update next starting point r8. */ |
214 | srdi r9, r11, 3 |
215 | subf r9, r9, r3 |
216 | mr r8, r9 |
217 | |
218 | L(secondmatch): |
219 | mr r5, r7 |
220 | rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */ |
221 | ld r6, 0(r4) |
222 | addi r4, r4, 8 |
223 | cmpdi cr7, r10, 0 /* Check if its already aligned? */ |
224 | beq cr7, L(proceed3) |
225 | #ifdef __LITTLE_ENDIAN__ |
226 | srd r6, r6, r10 /* Discard unwanted bits. */ |
227 | cmpb r9, r0, r6 |
228 | sld r9, r9, r10 |
229 | #else |
230 | sld r6, r6, r10 |
231 | cmpb r9, r0, r6 |
232 | srd r9, r9, r10 |
233 | #endif |
234 | cmpdi cr7, r9, 0 |
235 | bne cr7, L(proceed3) |
236 | ld r9, 0(r4) |
237 | subfic r10, r10, 64 |
238 | #ifdef __LITTLE_ENDIAN__ |
239 | sld r9, r9, r10 /* Discard unwanted bits. */ |
240 | #else |
241 | srd r9, r9, r10 |
242 | #endif |
243 | or r6, r6, r9 /* Form complete search str. */ |
244 | |
245 | L(proceed3): |
246 | li r7, 0 |
247 | addi r3, r3, 8 |
248 | cmpb r9, r0, r5 |
249 | cmpdi cr7, r9, 0 |
250 | bne cr7, L(proceed4) |
251 | ld r7, 0(r3) |
252 | L(proceed4): |
253 | #ifdef __LITTLE_ENDIAN__ |
254 | srd r9, r5, r12 |
255 | sld r10, r7, r11 |
256 | #else |
257 | sld r9, r5, r12 |
258 | srd r10, r7, r11 |
259 | #endif |
260 | /* Form single dw with few bytes from first and second load. */ |
261 | or r10, r9, r10 |
262 | cmpb r9, r0, r6 |
263 | cmpdi cr7, r9, 0 |
264 | bne cr7, L(return4) |
265 | /* Check for null in the formed dw. */ |
266 | cmpb r9, r0, r10 |
267 | cmpdi cr7, r9, 0 |
268 | bne cr7, L(retnull) |
269 | /* If the next 8 bytes dont match, start search again. */ |
270 | cmpb r9, r10, r6 |
271 | cmpdi cr7, r9, -1 |
272 | bne cr7, L(reset) |
273 | /* If the next 8 bytes match, load and compare next 8. */ |
274 | b L(secondmatch) |
275 | |
276 | .align 4 |
277 | L(reset): |
278 | /* Start the search again. */ |
279 | addi r8, r8, 1 |
280 | b L(begin) |
281 | |
282 | .align 4 |
283 | L(return3): |
284 | /* Count leading zeros and compare partial dw. */ |
285 | #ifdef __LITTLE_ENDIAN__ |
286 | addi r7, r9, -1 |
287 | andc r7, r7, r9 |
288 | popcntd r7, r7 |
289 | subfic r7, r7, 64 |
290 | sld r10, r5, r7 |
291 | sld r6, r6, r7 |
292 | #else |
293 | cntlzd r7, r9 |
294 | subfic r7, r7, 64 |
295 | srd r10, r5, r7 |
296 | srd r6, r6, r7 |
297 | #endif |
298 | cmpb r9, r10, r6 |
299 | cmpdi cr7, r9, -1 |
300 | addi r8, r8, 1 |
301 | /* Start search again if there is no match. */ |
302 | bne cr7, L(begin) |
303 | /* If the words match, update return values. */ |
304 | subfic r7, r7, 64 |
305 | srdi r7, r7, 3 |
306 | add r3, r3, r7 |
307 | subf r3, r31, r3 |
308 | b L(end) |
309 | |
310 | .align 4 |
311 | L(return4): |
312 | /* Count leading zeros and compare partial dw. */ |
313 | #ifdef __LITTLE_ENDIAN__ |
314 | addi r7, r9, -1 |
315 | andc r7, r7, r9 |
316 | popcntd r7, r7 |
317 | subfic r7, r7, 64 |
318 | sld r10, r10, r7 |
319 | sld r6, r6, r7 |
320 | #else |
321 | cntlzd r7, r9 |
322 | subfic r7, r7, 64 |
323 | srd r10, r10, r7 |
324 | srd r6, r6, r7 |
325 | #endif |
326 | cmpb r9, r10, r6 |
327 | cmpdi cr7, r9, -1 |
328 | addi r8, r8, 1 |
329 | bne cr7, L(begin) |
330 | subfic r7, r7, 64 |
331 | srdi r11, r11, 3 |
332 | subf r3, r11, r3 |
333 | srdi r7, r7, 3 |
334 | add r3, r3, r7 |
335 | subf r3, r31, r3 |
336 | b L(end) |
337 | |
338 | .align 4 |
339 | L(begin): |
340 | mr r3, r8 |
341 | /* When our iterations exceed ITERATIONS,fall back to default. */ |
342 | addi r28, r28, 1 |
343 | cmpdi cr7, r28, ITERATIONS |
344 | beq cr7, L(default) |
345 | lbz r4, 0(r30) |
346 | bl STRCHR |
347 | #ifndef STRCHR_is_local |
348 | nop |
349 | #endif |
350 | /* If first char of search str is not present. */ |
351 | cmpdi cr7, r3, 0 |
352 | ble cr7, L(end) |
353 | mr r8, r3 |
354 | mr r4, r30 /* Restore r4. */ |
355 | li r0, 0 |
356 | mr r6, r29 |
357 | clrrdi r4, r4, 3 |
358 | addi r4, r4, 8 |
359 | b L(begin1) |
360 | |
361 | /* Handle less than 8 search string. */ |
362 | .align 4 |
363 | L(lessthan8): |
364 | mr r4, r3 |
365 | mr r9, r30 |
366 | li r0, 0 |
367 | |
368 | rlwinm r10, r9, 3, 26, 28 /* Calculate padding in bits. */ |
369 | srdi r8, r10, 3 /* Padding in bytes. */ |
370 | clrrdi r9, r9, 3 /* Make r4 aligned to 8. */ |
371 | ld r6, 0(r9) |
372 | cmpdi cr7, r10, 0 /* Check if its already aligned? */ |
373 | beq cr7, L(proceed2) |
374 | #ifdef __LITTLE_ENDIAN__ |
375 | srd r6, r6, r10 /* Discard unwanted bits. */ |
376 | #else |
377 | sld r6, r6, r10 |
378 | #endif |
379 | subfic r8, r8, 8 |
380 | cmpd cr7, r8, r31 /* Next load needed? */ |
381 | bge cr7, L(proceed2) |
382 | ld r7, 8(r9) |
383 | subfic r10, r10, 64 |
384 | #ifdef __LITTLE_ENDIAN__ |
385 | sld r7, r7, r10 /* Discard unwanted bits. */ |
386 | #else |
387 | srd r7, r7, r10 |
388 | #endif |
389 | or r6, r6, r7 /* Form complete search str. */ |
390 | L(proceed2): |
391 | mr r29, r6 |
392 | rlwinm r10, r3, 3, 26, 28 |
393 | clrrdi r7, r3, 3 /* Make r3 aligned. */ |
394 | ld r5, 0(r7) |
395 | sldi r8, r31, 3 |
396 | subfic r8, r8, 64 |
397 | #ifdef __LITTLE_ENDIAN__ |
398 | sld r6, r6, r8 |
399 | cmpb r9, r0, r5 |
400 | srd r9, r9, r10 |
401 | #else |
402 | srd r6, r6, r8 |
403 | cmpb r9, r0, r5 |
404 | sld r9, r9, r10 |
405 | #endif |
406 | cmpdi cr7, r9, 0 |
407 | bne cr7, L(noload) |
408 | cmpdi cr7, r10, 0 |
409 | beq cr7, L(continue) |
410 | ld r7, 8(r7) |
411 | L(continue1): |
412 | mr r12, r10 |
413 | addi r12, r12, -8 |
414 | subfic r11, r12, 64 |
415 | b L(nextbyte) |
416 | |
417 | .align 4 |
418 | L(continue): |
419 | ld r7, 8(r7) |
420 | li r12, -8 /* Shift values. */ |
421 | li r11, 72 /* Shift values. */ |
422 | L(nextbyte): |
423 | addi r12, r12, 8 /* Mask for rotation. */ |
424 | addi r11, r11, -8 |
425 | #ifdef __LITTLE_ENDIAN__ |
426 | srd r9, r5, r12 |
427 | sld r10, r7, r11 |
428 | or r10, r9, r10 |
429 | sld r10, r10, r8 |
430 | cmpb r9, r0, r10 |
431 | srd r9, r9, r8 |
432 | #else |
433 | sld r9, r5, r12 |
434 | srd r10, r7, r11 |
435 | or r10, r9, r10 |
436 | srd r10, r10, r8 |
437 | cmpb r9, r0, r10 |
438 | sld r9, r9, r8 |
439 | #endif |
440 | cmpdi cr7, r9, 0 |
441 | bne cr7, L(retnull) |
442 | cmpb r9, r10, r6 |
443 | cmpdi cr7, r9, -1 |
444 | beq cr7, L(end) |
445 | addi r3, r4, 1 |
446 | /* When our iterations exceed ITERATIONS,fall back to default. */ |
447 | addi r28, r28, 1 |
448 | cmpdi cr7, r28, ITERATIONS |
449 | beq cr7, L(default) |
450 | lbz r4, 0(r30) |
451 | bl STRCHR |
452 | #ifndef STRCHR_is_local |
453 | nop |
454 | #endif |
455 | /* If first char of search str is not present. */ |
456 | cmpdi cr7, r3, 0 |
457 | ble cr7, L(end) |
458 | mr r4, r3 |
459 | mr r6, r29 |
460 | li r0, 0 |
461 | b L(proceed2) |
462 | |
463 | .align 4 |
464 | L(noload): |
465 | /* Reached null in r3, so skip next load. */ |
466 | li r7, 0 |
467 | b L(continue1) |
468 | |
469 | .align 4 |
470 | L(return): |
471 | /* Update return values. */ |
472 | srdi r9, r11, 3 |
473 | subf r3, r9, r3 |
474 | b L(end) |
475 | |
476 | /* Handling byte by byte. */ |
477 | .align 4 |
478 | L(bytebybyte): |
479 | mr r8, r3 |
480 | addi r8, r8, -1 |
481 | L(loop1): |
482 | addi r8, r8, 1 |
483 | mr r3, r8 |
484 | mr r4, r30 |
485 | lbz r6, 0(r4) |
486 | cmpdi cr7, r6, 0 |
487 | beq cr7, L(updater3) |
488 | L(loop): |
489 | lbz r5, 0(r3) |
490 | cmpdi cr7, r5, 0 |
491 | beq cr7, L(retnull) |
492 | cmpld cr7, r6, r5 |
493 | bne cr7, L(loop1) |
494 | addi r3, r3, 1 |
495 | addi r4, r4, 1 |
496 | lbz r6, 0(r4) |
497 | cmpdi cr7, r6, 0 |
498 | beq cr7, L(updater3) |
499 | b L(loop) |
500 | |
501 | /* Handling return values. */ |
502 | .align 4 |
503 | L(updater3): |
504 | subf r3, r31, r3 /* Reduce len of r4 from r3. */ |
505 | b L(end) |
506 | |
507 | .align 4 |
508 | L(ret_r3): |
509 | mr r3, r29 /* Return r3. */ |
510 | b L(end) |
511 | |
512 | .align 4 |
513 | L(retnull): |
514 | li r3, 0 /* Return NULL. */ |
515 | b L(end) |
516 | |
517 | .align 4 |
518 | L(default): |
519 | mr r4, r30 |
520 | bl __strstr_ppc |
521 | nop |
522 | |
523 | .align 4 |
524 | L(end): |
525 | addi r1, r1, FRAMESIZE /* Restore stack pointer. */ |
526 | cfi_adjust_cfa_offset(-FRAMESIZE) |
527 | ld r0, 16(r1) /* Restore the saved link register. */ |
528 | ld r28, -32(r1) /* Restore callers save register r28. */ |
529 | ld r29, -24(r1) /* Restore callers save register r29. */ |
530 | ld r30, -16(r1) /* Restore callers save register r30. */ |
531 | ld r31, -8(r1) /* Restore callers save register r31. */ |
532 | mtlr r0 /* Branch to link register. */ |
533 | blr |
534 | END (STRSTR) |
535 | libc_hidden_builtin_def (strstr) |
536 | |