1/* Optimized strcasestr implementation for PowerPC64/POWER8.
2 Copyright (C) 2016-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include <locale-defines.h>
21
22/* Char * [r3] strcasestr (char *s [r3], char * pat[r4]) */
23
24/* The performance gain is obtained by comparing 16 bytes. */
25
26/* When the first char of r4 is hit ITERATIONS times in r3
27 fallback to default. */
28#define ITERATIONS 64
29
30#ifndef STRCASESTR
31# define STRCASESTR __strcasestr
32#endif
33
34#ifndef STRLEN
35/* For builds without IFUNC support, local calls should be made to internal
36 GLIBC symbol (created by libc_hidden_builtin_def). */
37# ifdef SHARED
38# define STRLEN __GI_strlen
39# else
40# define STRLEN strlen
41# endif
42#endif
43
44#ifndef STRNLEN
45/* For builds without IFUNC support, local calls should be made to internal
46 GLIBC symbol (created by libc_hidden_builtin_def). */
47# ifdef SHARED
48# define STRNLEN __GI_strnlen
49# else
50# define STRNLEN __strnlen
51# endif
52#endif
53
54#ifndef STRCHR
55# ifdef SHARED
56# define STRCHR __GI_strchr
57# else
58# define STRCHR strchr
59# endif
60#endif
61
62/* Convert 16 bytes of v4 and reg to lowercase and compare. */
63#define TOLOWER(reg) \
64 vcmpgtub v6, v4, v1; \
65 vcmpgtub v7, v2, v4; \
66 vand v8, v7, v6; \
67 vand v8, v8, v3; \
68 vor v4, v8, v4; \
69 vcmpgtub v6, reg, v1; \
70 vcmpgtub v7, v2, reg; \
71 vand v8, v7, v6; \
72 vand v8, v8, v3; \
73 vor reg, v8, reg; \
74 vcmpequb. v6, reg, v4;
75
76#define FRAMESIZE (FRAME_MIN_SIZE+48)
77 .machine power8
78ENTRY (STRCASESTR, 4)
79 CALL_MCOUNT 2
80 mflr r0 /* Load link register LR to r0. */
81 std r31, -8(r1) /* Save callers register r31. */
82 std r30, -16(r1) /* Save callers register r30. */
83 std r29, -24(r1) /* Save callers register r29. */
84 std r28, -32(r1) /* Save callers register r28. */
85 std r27, -40(r1) /* Save callers register r27. */
86 std r0, 16(r1) /* Store the link register. */
87 cfi_offset(r31, -8)
88 cfi_offset(r30, -16)
89 cfi_offset(r29, -24)
90 cfi_offset(r28, -32)
91 cfi_offset(r27, -40)
92 cfi_offset(lr, 16)
93 stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
94 cfi_adjust_cfa_offset(FRAMESIZE)
95
96 dcbt 0, r3
97 dcbt 0, r4
98 cmpdi cr7, r3, 0 /* Input validation. */
99 beq cr7, L(retnull)
100 cmpdi cr7, r4, 0
101 beq cr7, L(retnull)
102
103 mr r29, r3
104 mr r30, r4
105 /* Load first byte from r4 and check if its null. */
106 lbz r6, 0(r4)
107 cmpdi cr7, r6, 0
108 beq cr7, L(ret_r3)
109
110 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
111 add r9, r10, __libc_tsd_LOCALE@tls
112 ld r9, 0(r9)
113 ld r9, LOCALE_CTYPE_TOUPPER(r9)
114 sldi r10, r6, 2 /* Convert to upper case. */
115 lwzx r28, r9, r10
116
117 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
118 add r11, r10, __libc_tsd_LOCALE@tls
119 ld r11, 0(r11)
120 ld r11, LOCALE_CTYPE_TOLOWER(r11)
121 sldi r10, r6, 2 /* Convert to lower case. */
122 lwzx r27, r11, r10
123
124 /* Check if the first char is present. */
125 mr r4, r27
126 bl STRCHR
127 nop
128 mr r5, r3
129 mr r3, r29
130 mr r29, r5
131 mr r4, r28
132 bl STRCHR
133 nop
134 cmpdi cr7, r29, 0
135 beq cr7, L(firstpos)
136 cmpdi cr7, r3, 0
137 beq cr7, L(skipcheck)
138 cmpw cr7, r3, r29
139 ble cr7, L(firstpos)
140 /* Move r3 to the first occurrence. */
141L(skipcheck):
142 mr r3, r29
143L(firstpos):
144 mr r29, r3
145
146 sldi r9, r27, 8
147 or r28, r9, r28
148 /* Reg r27 is used to count the number of iterations. */
149 li r27, 0
150 /* If first char of search str is not present. */
151 cmpdi cr7, r3, 0
152 ble cr7, L(end)
153
154 /* Find the length of pattern. */
155 mr r3, r30
156 bl STRLEN
157 nop
158
159 cmpdi cr7, r3, 0 /* If search str is null. */
160 beq cr7, L(ret_r3)
161
162 mr r31, r3
163 mr r4, r3
164 mr r3, r29
165 bl STRNLEN
166 nop
167
168 cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
169 blt cr7, L(retnull)
170
171 mr r3, r29
172
173 /* Locales not matching ASCII for single bytes. */
174 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
175 add r9, r10, __libc_tsd_LOCALE@tls
176 ld r9, 0(r9)
177 ld r7, 0(r9)
178 addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
179 lwz r8, 0(r7)
180 cmpdi cr7, r8, 1
181 beq cr7, L(bytebybyte)
182
183 /* If len(r4) < 16 handle byte by byte. */
184 /* For shorter strings we will not use vector registers. */
185 cmpdi cr7, r31, 16
186 blt cr7, L(bytebybyte)
187
188 /* Comparison values used for TOLOWER. */
189 /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */
190 vspltish v0, 0
191 vspltisb v5, 2
192 vspltisb v4, 4
193 vsl v3, v5, v4
194 vaddubm v1, v3, v3
195 vspltisb v5, 15
196 vaddubm v2, v5, v5
197 vaddubm v2, v1, v2
198 vspltisb v4, -3
199 vaddubm v2, v2, v4
200
201 /*
202 1. Load 16 bytes from r3 and r4
203 2. Check if there is null, If yes, proceed byte by byte path.
204 3. Else,Convert both to lowercase and compare.
205 4. If they are same proceed to 1.
206 5. If they dont match, find if first char of r4 is present in the
207 loaded 16 byte of r3.
208 6. If yes, move position, load next 16 bytes of r3 and proceed to 2.
209 */
210
211 mr r8, r3 /* Save r3 for future use. */
212 mr r4, r30 /* Restore r4. */
213 clrldi r10, r4, 60
214 lvx v5, 0, r4 /* Load 16 bytes from r4. */
215 cmpdi cr7, r10, 0
216 beq cr7, L(begin2)
217 /* If r4 is unaligned, load another 16 bytes. */
218#ifdef __LITTLE_ENDIAN__
219 lvsr v7, 0, r4
220#else
221 lvsl v7, 0, r4
222#endif
223 addi r5, r4, 16
224 lvx v9, 0, r5
225#ifdef __LITTLE_ENDIAN__
226 vperm v5, v9, v5, v7
227#else
228 vperm v5, v5, v9, v7
229#endif
230L(begin2):
231 lvx v4, 0, r3
232 vcmpequb. v7, v0, v4 /* Check for null. */
233 beq cr6, L(nullchk6)
234 b L(trailcheck)
235
236 .align 4
237L(nullchk6):
238 clrldi r10, r3, 60
239 cmpdi cr7, r10, 0
240 beq cr7, L(next16)
241#ifdef __LITTLE_ENDIAN__
242 lvsr v7, 0, r3
243#else
244 lvsl v7, 0, r3
245#endif
246 addi r5, r3, 16
247 /* If r3 is unaligned, load another 16 bytes. */
248 lvx v10, 0, r5
249#ifdef __LITTLE_ENDIAN__
250 vperm v4, v10, v4, v7
251#else
252 vperm v4, v4, v10, v7
253#endif
254L(next16):
255 vcmpequb. v6, v0, v5 /* Check for null. */
256 beq cr6, L(nullchk)
257 b L(trailcheck)
258
259 .align 4
260L(nullchk):
261 vcmpequb. v6, v0, v4
262 beq cr6, L(nullchk1)
263 b L(retnull)
264
265 .align 4
266L(nullchk1):
267 /* Convert both v3 and v4 to lower. */
268 TOLOWER(v5)
269 /* If both are same, branch to match. */
270 blt cr6, L(match)
271 /* Find if the first char is present in next 15 bytes. */
272#ifdef __LITTLE_ENDIAN__
273 vspltb v6, v5, 15
274 vsldoi v7, v0, v4, 15
275#else
276 vspltb v6, v5, 0
277 vspltisb v7, 8
278 vslo v7, v4, v7
279#endif
280 vcmpequb v7, v6, v7
281 vcmpequb. v6, v0, v7
282 /* Shift r3 by 16 bytes and proceed. */
283 blt cr6, L(shift16)
284 vclzd v8, v7
285#ifdef __LITTLE_ENDIAN__
286 vspltb v6, v8, 15
287#else
288 vspltb v6, v8, 7
289#endif
290 vcmpequb. v6, v6, v1
291 /* Shift r3 by 8 bytes and proceed. */
292 blt cr6, L(shift8)
293 b L(begin)
294
295 .align 4
296L(match):
297 /* There is a match of 16 bytes, check next bytes. */
298 cmpdi cr7, r31, 16
299 mr r29, r3
300 beq cr7, L(ret_r3)
301
302L(secondmatch):
303 addi r3, r3, 16
304 addi r4, r4, 16
305 /* Load next 16 bytes of r3 and r4 and compare. */
306 clrldi r10, r4, 60
307 cmpdi cr7, r10, 0
308 beq cr7, L(nextload)
309 /* Handle unaligned case. */
310 vor v6, v9, v9
311 vcmpequb. v7, v0, v6
312 beq cr6, L(nullchk2)
313 b L(trailcheck)
314
315 .align 4
316L(nullchk2):
317#ifdef __LITTLE_ENDIAN__
318 lvsr v7, 0, r4
319#else
320 lvsl v7, 0, r4
321#endif
322 addi r5, r4, 16
323 /* If r4 is unaligned, load another 16 bytes. */
324 lvx v9, 0, r5
325#ifdef __LITTLE_ENDIAN__
326 vperm v11, v9, v6, v7
327#else
328 vperm v11, v6, v9, v7
329#endif
330 b L(compare)
331
332 .align 4
333L(nextload):
334 lvx v11, 0, r4
335L(compare):
336 vcmpequb. v7, v0, v11
337 beq cr6, L(nullchk3)
338 b L(trailcheck)
339
340 .align 4
341L(nullchk3):
342 clrldi r10, r3, 60
343 cmpdi cr7, r10, 0
344 beq cr7, L(nextload1)
345 /* Handle unaligned case. */
346 vor v4, v10, v10
347 vcmpequb. v7, v0, v4
348 beq cr6, L(nullchk4)
349 b L(retnull)
350
351 .align 4
352L(nullchk4):
353#ifdef __LITTLE_ENDIAN__
354 lvsr v7, 0, r3
355#else
356 lvsl v7, 0, r3
357#endif
358 addi r5, r3, 16
359 /* If r3 is unaligned, load another 16 bytes. */
360 lvx v10, 0, r5
361#ifdef __LITTLE_ENDIAN__
362 vperm v4, v10, v4, v7
363#else
364 vperm v4, v4, v10, v7
365#endif
366 b L(compare1)
367
368 .align 4
369L(nextload1):
370 lvx v4, 0, r3
371L(compare1):
372 vcmpequb. v7, v0, v4
373 beq cr6, L(nullchk5)
374 b L(retnull)
375
376 .align 4
377L(nullchk5):
378 /* Convert both v3 and v4 to lower. */
379 TOLOWER(v11)
380 /* If both are same, branch to secondmatch. */
381 blt cr6, L(secondmatch)
382 /* Continue the search. */
383 b L(begin)
384
385 .align 4
386L(trailcheck):
387 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
388 add r11, r10, __libc_tsd_LOCALE@tls
389 ld r11, 0(r11)
390 ld r11, LOCALE_CTYPE_TOLOWER(r11)
391L(loop2):
392 lbz r5, 0(r3) /* Load byte from r3. */
393 lbz r6, 0(r4) /* Load next byte from r4. */
394 cmpdi cr7, r6, 0 /* Is it null? */
395 beq cr7, L(updater3)
396 cmpdi cr7, r5, 0 /* Is it null? */
397 beq cr7, L(retnull) /* If yes, return. */
398 addi r3, r3, 1
399 addi r4, r4, 1 /* Increment r4. */
400 sldi r10, r5, 2 /* Convert to lower case. */
401 lwzx r10, r11, r10
402 sldi r7, r6, 2 /* Convert to lower case. */
403 lwzx r7, r11, r7
404 cmpw cr7, r7, r10 /* Compare with byte from r4. */
405 bne cr7, L(begin)
406 b L(loop2)
407
408 .align 4
409L(shift8):
410 addi r8, r8, 7
411 b L(begin)
412 .align 4
413L(shift16):
414 addi r8, r8, 15
415 .align 4
416L(begin):
417 addi r8, r8, 1
418 mr r3, r8
419 /* When our iterations exceed ITERATIONS,fall back to default. */
420 addi r27, r27, 1
421 cmpdi cr7, r27, ITERATIONS
422 beq cr7, L(default)
423 mr r4, r30 /* Restore r4. */
424 b L(begin2)
425
426 /* Handling byte by byte. */
427 .align 4
428L(loop1):
429 mr r3, r8
430 addi r27, r27, 1
431 cmpdi cr7, r27, ITERATIONS
432 beq cr7, L(default)
433 mr r29, r8
434 srdi r4, r28, 8
435 /* Check if the first char is present. */
436 bl STRCHR
437 nop
438 mr r5, r3
439 mr r3, r29
440 mr r29, r5
441 sldi r4, r28, 56
442 srdi r4, r4, 56
443 bl STRCHR
444 nop
445 cmpdi cr7, r29, 0
446 beq cr7, L(nextpos)
447 cmpdi cr7, r3, 0
448 beq cr7, L(skipcheck1)
449 cmpw cr7, r3, r29
450 ble cr7, L(nextpos)
451 /* Move r3 to first occurrence. */
452L(skipcheck1):
453 mr r3, r29
454L(nextpos):
455 mr r29, r3
456 cmpdi cr7, r3, 0
457 ble cr7, L(retnull)
458L(bytebybyte):
459 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
460 add r11, r10, __libc_tsd_LOCALE@tls
461 ld r11, 0(r11)
462 ld r11, LOCALE_CTYPE_TOLOWER(r11)
463 mr r4, r30 /* Restore r4. */
464 mr r8, r3 /* Save r3. */
465 addi r8, r8, 1
466
467L(loop):
468 addi r3, r3, 1
469 lbz r5, 0(r3) /* Load byte from r3. */
470 addi r4, r4, 1 /* Increment r4. */
471 lbz r6, 0(r4) /* Load next byte from r4. */
472 cmpdi cr7, r6, 0 /* Is it null? */
473 beq cr7, L(updater3)
474 cmpdi cr7, r5, 0 /* Is it null? */
475 beq cr7, L(retnull) /* If yes, return. */
476 sldi r10, r5, 2 /* Convert to lower case. */
477 lwzx r10, r11, r10
478 sldi r7, r6, 2 /* Convert to lower case. */
479 lwzx r7, r11, r7
480 cmpw cr7, r7, r10 /* Compare with byte from r4. */
481 bne cr7, L(loop1)
482 b L(loop)
483
484 /* Handling return values. */
485 .align 4
486L(updater3):
487 subf r3, r31, r3 /* Reduce r31 (len of r4) from r3. */
488 b L(end)
489
490 .align 4
491L(ret_r3):
492 mr r3, r29 /* Return point of match. */
493 b L(end)
494
495 .align 4
496L(retnull):
497 li r3, 0 /* Substring was not found. */
498 b L(end)
499
500 .align 4
501L(default):
502 mr r4, r30
503 bl __strcasestr_ppc
504 nop
505
506 .align 4
507L(end):
508 addi r1, r1, FRAMESIZE /* Restore stack pointer. */
509 cfi_adjust_cfa_offset(-FRAMESIZE)
510 ld r0, 16(r1) /* Restore the saved link register. */
511 ld r27, -40(r1)
512 ld r28, -32(r1)
513 ld r29, -24(r1) /* Restore callers save register r29. */
514 ld r30, -16(r1) /* Restore callers save register r30. */
515 ld r31, -8(r1) /* Restore callers save register r31. */
516 cfi_restore(lr)
517 cfi_restore(r27)
518 cfi_restore(r28)
519 cfi_restore(r29)
520 cfi_restore(r30)
521 cfi_restore(r31)
522 mtlr r0 /* Branch to link register. */
523 blr
524END (STRCASESTR)
525
526weak_alias (__strcasestr, strcasestr)
527libc_hidden_def (__strcasestr)
528libc_hidden_builtin_def (strcasestr)
529

source code of glibc/sysdeps/powerpc/powerpc64/power8/strcasestr.S