1/* Optimized strstr implementation for PowerPC64/POWER7.
2 Copyright (C) 2015-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
21/* Char * [r3] strstr (char *s [r3], char * pat[r4]) */
22
23/* The performance gain is obtained using aligned memory access, load
24 * doubleword and usage of cmpb instruction for quicker comparison. */
25
26#define ITERATIONS 64
27
28#ifndef STRSTR
29# define STRSTR strstr
30#endif
31
32#ifndef STRLEN
33/* For builds with no IFUNC support, local calls should be made to internal
34 GLIBC symbol (created by libc_hidden_builtin_def). */
35# ifdef SHARED
36# define STRLEN __GI_strlen
37# define STRLEN_is_local
38# else
39# define STRLEN strlen
40# endif
41#endif
42
43#ifndef STRNLEN
44/* For builds with no IFUNC support, local calls should be made to internal
45 GLIBC symbol (created by libc_hidden_builtin_def). */
46# ifdef SHARED
47# define STRNLEN __GI_strnlen
48# define STRNLEN_is_local
49# else
50# define STRNLEN __strnlen
51# endif
52#endif
53
54#ifndef STRCHR
55# ifdef SHARED
56# define STRCHR __GI_strchr
57# define STRCHR_is_local
58# else
59# define STRCHR strchr
60# endif
61#endif
62
63#define FRAMESIZE (FRAME_MIN_SIZE+32)
64 .machine power7
65/* Can't be ENTRY_TOCLESS due to calling __strstr_ppc which uses r2. */
66ENTRY (STRSTR, 4)
67 CALL_MCOUNT 2
68 mflr r0 /* Load link register LR to r0. */
69 std r31, -8(r1) /* Save callers register r31. */
70 std r30, -16(r1) /* Save callers register r30. */
71 std r29, -24(r1) /* Save callers register r29. */
72 std r28, -32(r1) /* Save callers register r28. */
73 std r0, 16(r1) /* Store the link register. */
74 cfi_offset(r31, -8)
75 cfi_offset(r30, -16)
76 cfi_offset(r28, -32)
77 cfi_offset(r29, -24)
78 cfi_offset(lr, 16)
79 stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
80 cfi_adjust_cfa_offset(FRAMESIZE)
81
82 dcbt 0, r3
83 dcbt 0, r4
84 cmpdi cr7, r3, 0
85 beq cr7, L(retnull)
86 cmpdi cr7, r4, 0
87 beq cr7, L(retnull)
88
89 mr r29, r3
90 mr r30, r4
91 mr r3, r4
92 bl STRLEN
93#ifndef STRLEN_is_local
94 nop
95#endif
96
97 cmpdi cr7, r3, 0 /* If search str is null. */
98 beq cr7, L(ret_r3)
99
100 mr r31, r3
101 mr r4, r3
102 mr r3, r29
103 bl STRNLEN
104#ifndef STRNLEN_is_local
105 nop
106#endif
107
108 cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
109 blt cr7, L(retnull)
110 mr r3, r29
111 lbz r4, 0(r30)
112 bl STRCHR
113#ifndef STRCHR_is_local
114 nop
115#endif
116
117 mr r11, r3
118 /* If first char of search str is not present. */
119 cmpdi cr7, r3, 0
120 ble cr7, L(end)
121 /* Reg r28 is used to count the number of iterations. */
122 li r28, 0
123 rldicl r8, r3, 0, 52 /* Page cross check. */
124 cmpldi cr7, r8, 4096-16
125 bgt cr7, L(bytebybyte)
126
127 rldicl r8, r30, 0, 52
128 cmpldi cr7, r8, 4096-16
129 bgt cr7, L(bytebybyte)
130
131 /* If len(r4) < 8 handle in a different way. */
132 /* Shift position based on null and use cmpb. */
133 cmpdi cr7, r31, 8
134 blt cr7, L(lessthan8)
135
136 /* Len(r4) >= 8 reaches here. */
137 mr r8, r3 /* Save r3 for future use. */
138 mr r4, r30 /* Restore r4. */
139 li r0, 0
140 rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
141 clrrdi r4, r4, 3 /* Make r4 aligned to 8. */
142 ld r6, 0(r4)
143 addi r4, r4, 8
144 cmpdi cr7, r10, 0 /* Check if its already aligned? */
145 beq cr7, L(begin1)
146#ifdef __LITTLE_ENDIAN__
147 srd r6, r6, r10 /* Discard unwanted bits. */
148#else
149 sld r6, r6, r10
150#endif
151 ld r9, 0(r4)
152 subfic r10, r10, 64
153#ifdef __LITTLE_ENDIAN__
154 sld r9, r9, r10 /* Discard unwanted bits. */
155#else
156 srd r9, r9, r10
157#endif
158 or r6, r6, r9 /* Form complete search str. */
159L(begin1):
160 mr r29, r6
161 rlwinm r10, r3, 3, 26, 28
162 clrrdi r3, r3, 3
163 ld r5, 0(r3)
164 cmpb r9, r0, r6 /* Check if input has null. */
165 cmpdi cr7, r9, 0
166 bne cr7, L(return3)
167 cmpb r9, r0, r5 /* Check if input has null. */
168#ifdef __LITTLE_ENDIAN__
169 srd r9, r9, r10
170#else
171 sld r9, r9, r10
172#endif
173 cmpdi cr7, r9, 0
174 bne cr7, L(retnull)
175
176 li r12, -8 /* Shift values. */
177 li r11, 72 /* Shift values. */
178 cmpdi cr7, r10, 0
179 beq cr7, L(nextbyte1)
180 mr r12, r10
181 addi r12, r12, -8
182 subfic r11, r12, 64
183
184L(nextbyte1):
185 ldu r7, 8(r3) /* Load next dw. */
186 addi r12, r12, 8 /* Shift one byte and compare. */
187 addi r11, r11, -8
188#ifdef __LITTLE_ENDIAN__
189 srd r9, r5, r12 /* Rotate based on mask. */
190 sld r10, r7, r11
191#else
192 sld r9, r5, r12
193 srd r10, r7, r11
194#endif
195 /* Form single dw from few bytes on first load and second load. */
196 or r10, r9, r10
197 /* Check for null in the formed dw. */
198 cmpb r9, r0, r10
199 cmpdi cr7, r9, 0
200 bne cr7, L(retnull)
201 /* Cmpb search str and input str. */
202 cmpb r9, r10, r6
203 cmpdi cr7, r9, -1
204 beq cr7, L(match)
205 addi r8, r8, 1
206 b L(begin)
207
208 .align 4
209L(match):
210 /* There is a match of 8 bytes, check next bytes. */
211 cmpdi cr7, r31, 8
212 beq cr7, L(return)
213 /* Update next starting point r8. */
214 srdi r9, r11, 3
215 subf r9, r9, r3
216 mr r8, r9
217
218L(secondmatch):
219 mr r5, r7
220 rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
221 ld r6, 0(r4)
222 addi r4, r4, 8
223 cmpdi cr7, r10, 0 /* Check if its already aligned? */
224 beq cr7, L(proceed3)
225#ifdef __LITTLE_ENDIAN__
226 srd r6, r6, r10 /* Discard unwanted bits. */
227 cmpb r9, r0, r6
228 sld r9, r9, r10
229#else
230 sld r6, r6, r10
231 cmpb r9, r0, r6
232 srd r9, r9, r10
233#endif
234 cmpdi cr7, r9, 0
235 bne cr7, L(proceed3)
236 ld r9, 0(r4)
237 subfic r10, r10, 64
238#ifdef __LITTLE_ENDIAN__
239 sld r9, r9, r10 /* Discard unwanted bits. */
240#else
241 srd r9, r9, r10
242#endif
243 or r6, r6, r9 /* Form complete search str. */
244
245L(proceed3):
246 li r7, 0
247 addi r3, r3, 8
248 cmpb r9, r0, r5
249 cmpdi cr7, r9, 0
250 bne cr7, L(proceed4)
251 ld r7, 0(r3)
252L(proceed4):
253#ifdef __LITTLE_ENDIAN__
254 srd r9, r5, r12
255 sld r10, r7, r11
256#else
257 sld r9, r5, r12
258 srd r10, r7, r11
259#endif
260 /* Form single dw with few bytes from first and second load. */
261 or r10, r9, r10
262 cmpb r9, r0, r6
263 cmpdi cr7, r9, 0
264 bne cr7, L(return4)
265 /* Check for null in the formed dw. */
266 cmpb r9, r0, r10
267 cmpdi cr7, r9, 0
268 bne cr7, L(retnull)
269 /* If the next 8 bytes dont match, start search again. */
270 cmpb r9, r10, r6
271 cmpdi cr7, r9, -1
272 bne cr7, L(reset)
273 /* If the next 8 bytes match, load and compare next 8. */
274 b L(secondmatch)
275
276 .align 4
277L(reset):
278 /* Start the search again. */
279 addi r8, r8, 1
280 b L(begin)
281
282 .align 4
283L(return3):
284 /* Count leading zeros and compare partial dw. */
285#ifdef __LITTLE_ENDIAN__
286 addi r7, r9, -1
287 andc r7, r7, r9
288 popcntd r7, r7
289 subfic r7, r7, 64
290 sld r10, r5, r7
291 sld r6, r6, r7
292#else
293 cntlzd r7, r9
294 subfic r7, r7, 64
295 srd r10, r5, r7
296 srd r6, r6, r7
297#endif
298 cmpb r9, r10, r6
299 cmpdi cr7, r9, -1
300 addi r8, r8, 1
301 /* Start search again if there is no match. */
302 bne cr7, L(begin)
303 /* If the words match, update return values. */
304 subfic r7, r7, 64
305 srdi r7, r7, 3
306 add r3, r3, r7
307 subf r3, r31, r3
308 b L(end)
309
310 .align 4
311L(return4):
312 /* Count leading zeros and compare partial dw. */
313#ifdef __LITTLE_ENDIAN__
314 addi r7, r9, -1
315 andc r7, r7, r9
316 popcntd r7, r7
317 subfic r7, r7, 64
318 sld r10, r10, r7
319 sld r6, r6, r7
320#else
321 cntlzd r7, r9
322 subfic r7, r7, 64
323 srd r10, r10, r7
324 srd r6, r6, r7
325#endif
326 cmpb r9, r10, r6
327 cmpdi cr7, r9, -1
328 addi r8, r8, 1
329 bne cr7, L(begin)
330 subfic r7, r7, 64
331 srdi r11, r11, 3
332 subf r3, r11, r3
333 srdi r7, r7, 3
334 add r3, r3, r7
335 subf r3, r31, r3
336 b L(end)
337
338 .align 4
339L(begin):
340 mr r3, r8
341 /* When our iterations exceed ITERATIONS,fall back to default. */
342 addi r28, r28, 1
343 cmpdi cr7, r28, ITERATIONS
344 beq cr7, L(default)
345 lbz r4, 0(r30)
346 bl STRCHR
347#ifndef STRCHR_is_local
348 nop
349#endif
350 /* If first char of search str is not present. */
351 cmpdi cr7, r3, 0
352 ble cr7, L(end)
353 mr r8, r3
354 mr r4, r30 /* Restore r4. */
355 li r0, 0
356 mr r6, r29
357 clrrdi r4, r4, 3
358 addi r4, r4, 8
359 b L(begin1)
360
361 /* Handle less than 8 search string. */
362 .align 4
363L(lessthan8):
364 mr r4, r3
365 mr r9, r30
366 li r0, 0
367
368 rlwinm r10, r9, 3, 26, 28 /* Calculate padding in bits. */
369 srdi r8, r10, 3 /* Padding in bytes. */
370 clrrdi r9, r9, 3 /* Make r4 aligned to 8. */
371 ld r6, 0(r9)
372 cmpdi cr7, r10, 0 /* Check if its already aligned? */
373 beq cr7, L(proceed2)
374#ifdef __LITTLE_ENDIAN__
375 srd r6, r6, r10 /* Discard unwanted bits. */
376#else
377 sld r6, r6, r10
378#endif
379 subfic r8, r8, 8
380 cmpd cr7, r8, r31 /* Next load needed? */
381 bge cr7, L(proceed2)
382 ld r7, 8(r9)
383 subfic r10, r10, 64
384#ifdef __LITTLE_ENDIAN__
385 sld r7, r7, r10 /* Discard unwanted bits. */
386#else
387 srd r7, r7, r10
388#endif
389 or r6, r6, r7 /* Form complete search str. */
390L(proceed2):
391 mr r29, r6
392 rlwinm r10, r3, 3, 26, 28
393 clrrdi r7, r3, 3 /* Make r3 aligned. */
394 ld r5, 0(r7)
395 sldi r8, r31, 3
396 subfic r8, r8, 64
397#ifdef __LITTLE_ENDIAN__
398 sld r6, r6, r8
399 cmpb r9, r0, r5
400 srd r9, r9, r10
401#else
402 srd r6, r6, r8
403 cmpb r9, r0, r5
404 sld r9, r9, r10
405#endif
406 cmpdi cr7, r9, 0
407 bne cr7, L(noload)
408 cmpdi cr7, r10, 0
409 beq cr7, L(continue)
410 ld r7, 8(r7)
411L(continue1):
412 mr r12, r10
413 addi r12, r12, -8
414 subfic r11, r12, 64
415 b L(nextbyte)
416
417 .align 4
418L(continue):
419 ld r7, 8(r7)
420 li r12, -8 /* Shift values. */
421 li r11, 72 /* Shift values. */
422L(nextbyte):
423 addi r12, r12, 8 /* Mask for rotation. */
424 addi r11, r11, -8
425#ifdef __LITTLE_ENDIAN__
426 srd r9, r5, r12
427 sld r10, r7, r11
428 or r10, r9, r10
429 sld r10, r10, r8
430 cmpb r9, r0, r10
431 srd r9, r9, r8
432#else
433 sld r9, r5, r12
434 srd r10, r7, r11
435 or r10, r9, r10
436 srd r10, r10, r8
437 cmpb r9, r0, r10
438 sld r9, r9, r8
439#endif
440 cmpdi cr7, r9, 0
441 bne cr7, L(retnull)
442 cmpb r9, r10, r6
443 cmpdi cr7, r9, -1
444 beq cr7, L(end)
445 addi r3, r4, 1
446 /* When our iterations exceed ITERATIONS,fall back to default. */
447 addi r28, r28, 1
448 cmpdi cr7, r28, ITERATIONS
449 beq cr7, L(default)
450 lbz r4, 0(r30)
451 bl STRCHR
452#ifndef STRCHR_is_local
453 nop
454#endif
455 /* If first char of search str is not present. */
456 cmpdi cr7, r3, 0
457 ble cr7, L(end)
458 mr r4, r3
459 mr r6, r29
460 li r0, 0
461 b L(proceed2)
462
463 .align 4
464L(noload):
465 /* Reached null in r3, so skip next load. */
466 li r7, 0
467 b L(continue1)
468
469 .align 4
470L(return):
471 /* Update return values. */
472 srdi r9, r11, 3
473 subf r3, r9, r3
474 b L(end)
475
476 /* Handling byte by byte. */
477 .align 4
478L(bytebybyte):
479 mr r8, r3
480 addi r8, r8, -1
481L(loop1):
482 addi r8, r8, 1
483 mr r3, r8
484 mr r4, r30
485 lbz r6, 0(r4)
486 cmpdi cr7, r6, 0
487 beq cr7, L(updater3)
488L(loop):
489 lbz r5, 0(r3)
490 cmpdi cr7, r5, 0
491 beq cr7, L(retnull)
492 cmpld cr7, r6, r5
493 bne cr7, L(loop1)
494 addi r3, r3, 1
495 addi r4, r4, 1
496 lbz r6, 0(r4)
497 cmpdi cr7, r6, 0
498 beq cr7, L(updater3)
499 b L(loop)
500
501 /* Handling return values. */
502 .align 4
503L(updater3):
504 subf r3, r31, r3 /* Reduce len of r4 from r3. */
505 b L(end)
506
507 .align 4
508L(ret_r3):
509 mr r3, r29 /* Return r3. */
510 b L(end)
511
512 .align 4
513L(retnull):
514 li r3, 0 /* Return NULL. */
515 b L(end)
516
517 .align 4
518L(default):
519 mr r4, r30
520 bl __strstr_ppc
521 nop
522
523 .align 4
524L(end):
525 addi r1, r1, FRAMESIZE /* Restore stack pointer. */
526 cfi_adjust_cfa_offset(-FRAMESIZE)
527 ld r0, 16(r1) /* Restore the saved link register. */
528 ld r28, -32(r1) /* Restore callers save register r28. */
529 ld r29, -24(r1) /* Restore callers save register r29. */
530 ld r30, -16(r1) /* Restore callers save register r30. */
531 ld r31, -8(r1) /* Restore callers save register r31. */
532 mtlr r0 /* Branch to link register. */
533 blr
534END (STRSTR)
535libc_hidden_builtin_def (strstr)
536

source code of glibc/sysdeps/powerpc/powerpc64/power7/strstr.S