1 | /* Optimized strcasestr implementation for PowerPC64/POWER8. |
2 | Copyright (C) 2016-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | #include <locale-defines.h> |
21 | |
22 | /* Char * [r3] strcasestr (char *s [r3], char * pat[r4]) */ |
23 | |
24 | /* The performance gain is obtained by comparing 16 bytes. */ |
25 | |
26 | /* When the first char of r4 is hit ITERATIONS times in r3 |
27 | fallback to default. */ |
28 | #define ITERATIONS 64 |
29 | |
30 | #ifndef STRCASESTR |
31 | # define STRCASESTR __strcasestr |
32 | #endif |
33 | |
34 | #ifndef STRLEN |
35 | /* For builds without IFUNC support, local calls should be made to internal |
36 | GLIBC symbol (created by libc_hidden_builtin_def). */ |
37 | # ifdef SHARED |
38 | # define STRLEN __GI_strlen |
39 | # else |
40 | # define STRLEN strlen |
41 | # endif |
42 | #endif |
43 | |
44 | #ifndef STRNLEN |
45 | /* For builds without IFUNC support, local calls should be made to internal |
46 | GLIBC symbol (created by libc_hidden_builtin_def). */ |
47 | # ifdef SHARED |
48 | # define STRNLEN __GI_strnlen |
49 | # else |
50 | # define STRNLEN __strnlen |
51 | # endif |
52 | #endif |
53 | |
54 | #ifndef STRCHR |
55 | # ifdef SHARED |
56 | # define STRCHR __GI_strchr |
57 | # else |
58 | # define STRCHR strchr |
59 | # endif |
60 | #endif |
61 | |
62 | /* Convert 16 bytes of v4 and reg to lowercase and compare. */ |
63 | #define TOLOWER(reg) \ |
64 | vcmpgtub v6, v4, v1; \ |
65 | vcmpgtub v7, v2, v4; \ |
66 | vand v8, v7, v6; \ |
67 | vand v8, v8, v3; \ |
68 | vor v4, v8, v4; \ |
69 | vcmpgtub v6, reg, v1; \ |
70 | vcmpgtub v7, v2, reg; \ |
71 | vand v8, v7, v6; \ |
72 | vand v8, v8, v3; \ |
73 | vor reg, v8, reg; \ |
74 | vcmpequb. v6, reg, v4; |
75 | |
76 | #define FRAMESIZE (FRAME_MIN_SIZE+48) |
77 | .machine power8 |
78 | ENTRY (STRCASESTR, 4) |
79 | CALL_MCOUNT 2 |
80 | mflr r0 /* Load link register LR to r0. */ |
81 | std r31, -8(r1) /* Save callers register r31. */ |
82 | std r30, -16(r1) /* Save callers register r30. */ |
83 | std r29, -24(r1) /* Save callers register r29. */ |
84 | std r28, -32(r1) /* Save callers register r28. */ |
85 | std r27, -40(r1) /* Save callers register r27. */ |
86 | std r0, 16(r1) /* Store the link register. */ |
87 | cfi_offset(r31, -8) |
88 | cfi_offset(r30, -16) |
89 | cfi_offset(r29, -24) |
90 | cfi_offset(r28, -32) |
91 | cfi_offset(r27, -40) |
92 | cfi_offset(lr, 16) |
93 | stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ |
94 | cfi_adjust_cfa_offset(FRAMESIZE) |
95 | |
96 | dcbt 0, r3 |
97 | dcbt 0, r4 |
98 | cmpdi cr7, r3, 0 /* Input validation. */ |
99 | beq cr7, L(retnull) |
100 | cmpdi cr7, r4, 0 |
101 | beq cr7, L(retnull) |
102 | |
103 | mr r29, r3 |
104 | mr r30, r4 |
105 | /* Load first byte from r4 and check if its null. */ |
106 | lbz r6, 0(r4) |
107 | cmpdi cr7, r6, 0 |
108 | beq cr7, L(ret_r3) |
109 | |
110 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) |
111 | add r9, r10, __libc_tsd_LOCALE@tls |
112 | ld r9, 0(r9) |
113 | ld r9, LOCALE_CTYPE_TOUPPER(r9) |
114 | sldi r10, r6, 2 /* Convert to upper case. */ |
115 | lwzx r28, r9, r10 |
116 | |
117 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) |
118 | add r11, r10, __libc_tsd_LOCALE@tls |
119 | ld r11, 0(r11) |
120 | ld r11, LOCALE_CTYPE_TOLOWER(r11) |
121 | sldi r10, r6, 2 /* Convert to lower case. */ |
122 | lwzx r27, r11, r10 |
123 | |
124 | /* Check if the first char is present. */ |
125 | mr r4, r27 |
126 | bl STRCHR |
127 | nop |
128 | mr r5, r3 |
129 | mr r3, r29 |
130 | mr r29, r5 |
131 | mr r4, r28 |
132 | bl STRCHR |
133 | nop |
134 | cmpdi cr7, r29, 0 |
135 | beq cr7, L(firstpos) |
136 | cmpdi cr7, r3, 0 |
137 | beq cr7, L(skipcheck) |
138 | cmpw cr7, r3, r29 |
139 | ble cr7, L(firstpos) |
140 | /* Move r3 to the first occurrence. */ |
141 | L(skipcheck): |
142 | mr r3, r29 |
143 | L(firstpos): |
144 | mr r29, r3 |
145 | |
146 | sldi r9, r27, 8 |
147 | or r28, r9, r28 |
148 | /* Reg r27 is used to count the number of iterations. */ |
149 | li r27, 0 |
150 | /* If first char of search str is not present. */ |
151 | cmpdi cr7, r3, 0 |
152 | ble cr7, L(end) |
153 | |
154 | /* Find the length of pattern. */ |
155 | mr r3, r30 |
156 | bl STRLEN |
157 | nop |
158 | |
159 | cmpdi cr7, r3, 0 /* If search str is null. */ |
160 | beq cr7, L(ret_r3) |
161 | |
162 | mr r31, r3 |
163 | mr r4, r3 |
164 | mr r3, r29 |
165 | bl STRNLEN |
166 | nop |
167 | |
168 | cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ |
169 | blt cr7, L(retnull) |
170 | |
171 | mr r3, r29 |
172 | |
173 | /* Locales not matching ASCII for single bytes. */ |
174 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) |
175 | add r9, r10, __libc_tsd_LOCALE@tls |
176 | ld r9, 0(r9) |
177 | ld r7, 0(r9) |
178 | addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES |
179 | lwz r8, 0(r7) |
180 | cmpdi cr7, r8, 1 |
181 | beq cr7, L(bytebybyte) |
182 | |
183 | /* If len(r4) < 16 handle byte by byte. */ |
184 | /* For shorter strings we will not use vector registers. */ |
185 | cmpdi cr7, r31, 16 |
186 | blt cr7, L(bytebybyte) |
187 | |
188 | /* Comparison values used for TOLOWER. */ |
189 | /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */ |
190 | vspltish v0, 0 |
191 | vspltisb v5, 2 |
192 | vspltisb v4, 4 |
193 | vsl v3, v5, v4 |
194 | vaddubm v1, v3, v3 |
195 | vspltisb v5, 15 |
196 | vaddubm v2, v5, v5 |
197 | vaddubm v2, v1, v2 |
198 | vspltisb v4, -3 |
199 | vaddubm v2, v2, v4 |
200 | |
201 | /* |
202 | 1. Load 16 bytes from r3 and r4 |
203 | 2. Check if there is null, If yes, proceed byte by byte path. |
204 | 3. Else,Convert both to lowercase and compare. |
205 | 4. If they are same proceed to 1. |
206 | 5. If they dont match, find if first char of r4 is present in the |
207 | loaded 16 byte of r3. |
208 | 6. If yes, move position, load next 16 bytes of r3 and proceed to 2. |
209 | */ |
210 | |
211 | mr r8, r3 /* Save r3 for future use. */ |
212 | mr r4, r30 /* Restore r4. */ |
213 | clrldi r10, r4, 60 |
214 | lvx v5, 0, r4 /* Load 16 bytes from r4. */ |
215 | cmpdi cr7, r10, 0 |
216 | beq cr7, L(begin2) |
217 | /* If r4 is unaligned, load another 16 bytes. */ |
218 | #ifdef __LITTLE_ENDIAN__ |
219 | lvsr v7, 0, r4 |
220 | #else |
221 | lvsl v7, 0, r4 |
222 | #endif |
223 | addi r5, r4, 16 |
224 | lvx v9, 0, r5 |
225 | #ifdef __LITTLE_ENDIAN__ |
226 | vperm v5, v9, v5, v7 |
227 | #else |
228 | vperm v5, v5, v9, v7 |
229 | #endif |
230 | L(begin2): |
231 | lvx v4, 0, r3 |
232 | vcmpequb. v7, v0, v4 /* Check for null. */ |
233 | beq cr6, L(nullchk6) |
234 | b L(trailcheck) |
235 | |
236 | .align 4 |
237 | L(nullchk6): |
238 | clrldi r10, r3, 60 |
239 | cmpdi cr7, r10, 0 |
240 | beq cr7, L(next16) |
241 | #ifdef __LITTLE_ENDIAN__ |
242 | lvsr v7, 0, r3 |
243 | #else |
244 | lvsl v7, 0, r3 |
245 | #endif |
246 | addi r5, r3, 16 |
247 | /* If r3 is unaligned, load another 16 bytes. */ |
248 | lvx v10, 0, r5 |
249 | #ifdef __LITTLE_ENDIAN__ |
250 | vperm v4, v10, v4, v7 |
251 | #else |
252 | vperm v4, v4, v10, v7 |
253 | #endif |
254 | L(next16): |
255 | vcmpequb. v6, v0, v5 /* Check for null. */ |
256 | beq cr6, L(nullchk) |
257 | b L(trailcheck) |
258 | |
259 | .align 4 |
260 | L(nullchk): |
261 | vcmpequb. v6, v0, v4 |
262 | beq cr6, L(nullchk1) |
263 | b L(retnull) |
264 | |
265 | .align 4 |
266 | L(nullchk1): |
267 | /* Convert both v3 and v4 to lower. */ |
268 | TOLOWER(v5) |
269 | /* If both are same, branch to match. */ |
270 | blt cr6, L(match) |
271 | /* Find if the first char is present in next 15 bytes. */ |
272 | #ifdef __LITTLE_ENDIAN__ |
273 | vspltb v6, v5, 15 |
274 | vsldoi v7, v0, v4, 15 |
275 | #else |
276 | vspltb v6, v5, 0 |
277 | vspltisb v7, 8 |
278 | vslo v7, v4, v7 |
279 | #endif |
280 | vcmpequb v7, v6, v7 |
281 | vcmpequb. v6, v0, v7 |
282 | /* Shift r3 by 16 bytes and proceed. */ |
283 | blt cr6, L(shift16) |
284 | vclzd v8, v7 |
285 | #ifdef __LITTLE_ENDIAN__ |
286 | vspltb v6, v8, 15 |
287 | #else |
288 | vspltb v6, v8, 7 |
289 | #endif |
290 | vcmpequb. v6, v6, v1 |
291 | /* Shift r3 by 8 bytes and proceed. */ |
292 | blt cr6, L(shift8) |
293 | b L(begin) |
294 | |
295 | .align 4 |
296 | L(match): |
297 | /* There is a match of 16 bytes, check next bytes. */ |
298 | cmpdi cr7, r31, 16 |
299 | mr r29, r3 |
300 | beq cr7, L(ret_r3) |
301 | |
302 | L(secondmatch): |
303 | addi r3, r3, 16 |
304 | addi r4, r4, 16 |
305 | /* Load next 16 bytes of r3 and r4 and compare. */ |
306 | clrldi r10, r4, 60 |
307 | cmpdi cr7, r10, 0 |
308 | beq cr7, L(nextload) |
309 | /* Handle unaligned case. */ |
310 | vor v6, v9, v9 |
311 | vcmpequb. v7, v0, v6 |
312 | beq cr6, L(nullchk2) |
313 | b L(trailcheck) |
314 | |
315 | .align 4 |
316 | L(nullchk2): |
317 | #ifdef __LITTLE_ENDIAN__ |
318 | lvsr v7, 0, r4 |
319 | #else |
320 | lvsl v7, 0, r4 |
321 | #endif |
322 | addi r5, r4, 16 |
323 | /* If r4 is unaligned, load another 16 bytes. */ |
324 | lvx v9, 0, r5 |
325 | #ifdef __LITTLE_ENDIAN__ |
326 | vperm v11, v9, v6, v7 |
327 | #else |
328 | vperm v11, v6, v9, v7 |
329 | #endif |
330 | b L(compare) |
331 | |
332 | .align 4 |
333 | L(nextload): |
334 | lvx v11, 0, r4 |
335 | L(compare): |
336 | vcmpequb. v7, v0, v11 |
337 | beq cr6, L(nullchk3) |
338 | b L(trailcheck) |
339 | |
340 | .align 4 |
341 | L(nullchk3): |
342 | clrldi r10, r3, 60 |
343 | cmpdi cr7, r10, 0 |
344 | beq cr7, L(nextload1) |
345 | /* Handle unaligned case. */ |
346 | vor v4, v10, v10 |
347 | vcmpequb. v7, v0, v4 |
348 | beq cr6, L(nullchk4) |
349 | b L(retnull) |
350 | |
351 | .align 4 |
352 | L(nullchk4): |
353 | #ifdef __LITTLE_ENDIAN__ |
354 | lvsr v7, 0, r3 |
355 | #else |
356 | lvsl v7, 0, r3 |
357 | #endif |
358 | addi r5, r3, 16 |
359 | /* If r3 is unaligned, load another 16 bytes. */ |
360 | lvx v10, 0, r5 |
361 | #ifdef __LITTLE_ENDIAN__ |
362 | vperm v4, v10, v4, v7 |
363 | #else |
364 | vperm v4, v4, v10, v7 |
365 | #endif |
366 | b L(compare1) |
367 | |
368 | .align 4 |
369 | L(nextload1): |
370 | lvx v4, 0, r3 |
371 | L(compare1): |
372 | vcmpequb. v7, v0, v4 |
373 | beq cr6, L(nullchk5) |
374 | b L(retnull) |
375 | |
376 | .align 4 |
377 | L(nullchk5): |
378 | /* Convert both v3 and v4 to lower. */ |
379 | TOLOWER(v11) |
380 | /* If both are same, branch to secondmatch. */ |
381 | blt cr6, L(secondmatch) |
382 | /* Continue the search. */ |
383 | b L(begin) |
384 | |
385 | .align 4 |
386 | L(trailcheck): |
387 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) |
388 | add r11, r10, __libc_tsd_LOCALE@tls |
389 | ld r11, 0(r11) |
390 | ld r11, LOCALE_CTYPE_TOLOWER(r11) |
391 | L(loop2): |
392 | lbz r5, 0(r3) /* Load byte from r3. */ |
393 | lbz r6, 0(r4) /* Load next byte from r4. */ |
394 | cmpdi cr7, r6, 0 /* Is it null? */ |
395 | beq cr7, L(updater3) |
396 | cmpdi cr7, r5, 0 /* Is it null? */ |
397 | beq cr7, L(retnull) /* If yes, return. */ |
398 | addi r3, r3, 1 |
399 | addi r4, r4, 1 /* Increment r4. */ |
400 | sldi r10, r5, 2 /* Convert to lower case. */ |
401 | lwzx r10, r11, r10 |
402 | sldi r7, r6, 2 /* Convert to lower case. */ |
403 | lwzx r7, r11, r7 |
404 | cmpw cr7, r7, r10 /* Compare with byte from r4. */ |
405 | bne cr7, L(begin) |
406 | b L(loop2) |
407 | |
408 | .align 4 |
409 | L(shift8): |
410 | addi r8, r8, 7 |
411 | b L(begin) |
412 | .align 4 |
413 | L(shift16): |
414 | addi r8, r8, 15 |
415 | .align 4 |
416 | L(begin): |
417 | addi r8, r8, 1 |
418 | mr r3, r8 |
419 | /* When our iterations exceed ITERATIONS,fall back to default. */ |
420 | addi r27, r27, 1 |
421 | cmpdi cr7, r27, ITERATIONS |
422 | beq cr7, L(default) |
423 | mr r4, r30 /* Restore r4. */ |
424 | b L(begin2) |
425 | |
426 | /* Handling byte by byte. */ |
427 | .align 4 |
428 | L(loop1): |
429 | mr r3, r8 |
430 | addi r27, r27, 1 |
431 | cmpdi cr7, r27, ITERATIONS |
432 | beq cr7, L(default) |
433 | mr r29, r8 |
434 | srdi r4, r28, 8 |
435 | /* Check if the first char is present. */ |
436 | bl STRCHR |
437 | nop |
438 | mr r5, r3 |
439 | mr r3, r29 |
440 | mr r29, r5 |
441 | sldi r4, r28, 56 |
442 | srdi r4, r4, 56 |
443 | bl STRCHR |
444 | nop |
445 | cmpdi cr7, r29, 0 |
446 | beq cr7, L(nextpos) |
447 | cmpdi cr7, r3, 0 |
448 | beq cr7, L(skipcheck1) |
449 | cmpw cr7, r3, r29 |
450 | ble cr7, L(nextpos) |
451 | /* Move r3 to first occurrence. */ |
452 | L(skipcheck1): |
453 | mr r3, r29 |
454 | L(nextpos): |
455 | mr r29, r3 |
456 | cmpdi cr7, r3, 0 |
457 | ble cr7, L(retnull) |
458 | L(bytebybyte): |
459 | ld r10, __libc_tsd_LOCALE@got@tprel(r2) |
460 | add r11, r10, __libc_tsd_LOCALE@tls |
461 | ld r11, 0(r11) |
462 | ld r11, LOCALE_CTYPE_TOLOWER(r11) |
463 | mr r4, r30 /* Restore r4. */ |
464 | mr r8, r3 /* Save r3. */ |
465 | addi r8, r8, 1 |
466 | |
467 | L(loop): |
468 | addi r3, r3, 1 |
469 | lbz r5, 0(r3) /* Load byte from r3. */ |
470 | addi r4, r4, 1 /* Increment r4. */ |
471 | lbz r6, 0(r4) /* Load next byte from r4. */ |
472 | cmpdi cr7, r6, 0 /* Is it null? */ |
473 | beq cr7, L(updater3) |
474 | cmpdi cr7, r5, 0 /* Is it null? */ |
475 | beq cr7, L(retnull) /* If yes, return. */ |
476 | sldi r10, r5, 2 /* Convert to lower case. */ |
477 | lwzx r10, r11, r10 |
478 | sldi r7, r6, 2 /* Convert to lower case. */ |
479 | lwzx r7, r11, r7 |
480 | cmpw cr7, r7, r10 /* Compare with byte from r4. */ |
481 | bne cr7, L(loop1) |
482 | b L(loop) |
483 | |
484 | /* Handling return values. */ |
485 | .align 4 |
486 | L(updater3): |
487 | subf r3, r31, r3 /* Reduce r31 (len of r4) from r3. */ |
488 | b L(end) |
489 | |
490 | .align 4 |
491 | L(ret_r3): |
492 | mr r3, r29 /* Return point of match. */ |
493 | b L(end) |
494 | |
495 | .align 4 |
496 | L(retnull): |
497 | li r3, 0 /* Substring was not found. */ |
498 | b L(end) |
499 | |
500 | .align 4 |
501 | L(default): |
502 | mr r4, r30 |
503 | bl __strcasestr_ppc |
504 | nop |
505 | |
506 | .align 4 |
507 | L(end): |
508 | addi r1, r1, FRAMESIZE /* Restore stack pointer. */ |
509 | cfi_adjust_cfa_offset(-FRAMESIZE) |
510 | ld r0, 16(r1) /* Restore the saved link register. */ |
511 | ld r27, -40(r1) |
512 | ld r28, -32(r1) |
513 | ld r29, -24(r1) /* Restore callers save register r29. */ |
514 | ld r30, -16(r1) /* Restore callers save register r30. */ |
515 | ld r31, -8(r1) /* Restore callers save register r31. */ |
516 | cfi_restore(lr) |
517 | cfi_restore(r27) |
518 | cfi_restore(r28) |
519 | cfi_restore(r29) |
520 | cfi_restore(r30) |
521 | cfi_restore(r31) |
522 | mtlr r0 /* Branch to link register. */ |
523 | blr |
524 | END (STRCASESTR) |
525 | |
526 | weak_alias (__strcasestr, strcasestr) |
527 | libc_hidden_def (__strcasestr) |
528 | libc_hidden_builtin_def (strcasestr) |
529 | |