1/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
2 Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
21/* char *[r3] strrchr (char *s [r3], int c [r4]) */
22
23#ifdef __LITTLE_ENDIAN__
24/* Find the match position from v6 and place result in r6. */
25# define CALCULATE_MATCH() \
26 vbpermq v6, v6, v10; \
27 vsldoi v6, v6, v6, 6; \
28 mfvrd r7, v6; \
29 cntlzd r6, r7; \
30 subfic r6, r6, 15;
31/*
32 * Find the first null position to mask bytes after null.
33 * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
34 * Result placed at v2.
35 */
36# define FIND_NULL_POS(reg) \
37 vspltisb v11, -1; \
38 vadduqm v11, reg, v11; \
39 vandc v11, v11, reg; \
40 vpopcntd v2, v11; \
41 vspltb v11, v2, 15; \
42 vcmpequb. v11, v11, v9; \
43 blt cr6, 1f; \
44 vsldoi v9, v0, v9, 1; \
45 vslo v2, v2, v9; \
461: \
47 vsumsws v2, v2, v0;
48#else
49# define CALCULATE_MATCH() \
50 vbpermq v6, v6, v10; \
51 mfvrd r7, v6; \
52 addi r6, r7, -1; \
53 andc r6, r6, r7; \
54 popcntd r6, r6; \
55 subfic r6, r6, 15;
56# define FIND_NULL_POS(reg) \
57 vclzd v2, reg; \
58 vspltb v11, v2, 7; \
59 vcmpequb. v11, v11, v9; \
60 blt cr6, 1f; \
61 vsldoi v9, v0, v9, 1; \
62 vsro v2, v2, v9; \
631: \
64 vsumsws v2, v2, v0;
65#endif /* !__LITTLE_ENDIAN__ */
66
67#ifndef STRRCHR
68# define STRRCHR strrchr
69#endif
70 .machine power8
71ENTRY_TOCLESS (STRRCHR)
72 CALL_MCOUNT 2
73 dcbt 0,r3
74 clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
75 cmpdi cr7,r4,0
76 ld r12,0(r8) /* Load doubleword from memory. */
77 li r9,0 /* Used to store last occurrence. */
78 li r0,0 /* Doubleword with null chars to use
79 with cmpb. */
80
81 rlwinm r6,r3,3,26,28 /* Calculate padding. */
82
83 beq cr7,L(null_match)
84
85 /* Replicate byte to doubleword. */
86 insrdi r4,r4,8,48
87 insrdi r4,r4,16,32
88 insrdi r4,r4,32,0
89
90 /* r4 is changed now. If it's passed more chars, then
91 check for null again. */
92 cmpdi cr7,r4,0
93 beq cr7,L(null_match)
94 /* Now r4 has a doubleword of c bytes and r0 has
95 a doubleword of null bytes. */
96
97 cmpb r10,r12,r4 /* Compare each byte against c byte. */
98 cmpb r11,r12,r0 /* Compare each byte against null byte. */
99
100 /* Move the doublewords left and right to discard the bits that are
101 not part of the string and bring them back as zeros. */
102#ifdef __LITTLE_ENDIAN__
103 srd r10,r10,r6
104 srd r11,r11,r6
105 sld r10,r10,r6
106 sld r11,r11,r6
107#else
108 sld r10,r10,r6
109 sld r11,r11,r6
110 srd r10,r10,r6
111 srd r11,r11,r6
112#endif
113 or r5,r10,r11 /* OR the results to speed things up. */
114 cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
115 have been found. */
116 bne cr7,L(done)
117
118L(align):
119 andi. r12, r8, 15
120
121 /* Are we now aligned to a doubleword boundary? If so, skip to
122 the main loop. Otherwise, go through the alignment code. */
123
124 bne cr0, L(loop)
125
126 /* Handle WORD2 of pair. */
127 ldu r12,8(r8)
128 cmpb r10,r12,r4
129 cmpb r11,r12,r0
130 or r5,r10,r11
131 cmpdi cr7,r5,0
132 bne cr7,L(done)
133 b L(loop) /* We branch here (rather than falling through)
134 to skip the nops due to heavy alignment
135 of the loop below. */
136 .p2align 5
137L(loop):
138 /* Load two doublewords, compare and merge in a
139 single register for speed. This is an attempt
140 to speed up the null-checking process for bigger strings. */
141 ld r12,8(r8)
142 ldu r7,16(r8)
143 cmpb r10,r12,r4
144 cmpb r11,r12,r0
145 cmpb r6,r7,r4
146 cmpb r7,r7,r0
147 or r12,r10,r11
148 or r5,r6,r7
149 or r5,r12,r5
150 cmpdi cr7,r5,0
151 beq cr7,L(vector)
152
153 /* OK, one (or both) of the doublewords contains a c/null byte. Check
154 the first doubleword and decrement the address in case the first
155 doubleword really contains a c/null byte. */
156 cmpdi cr6,r12,0
157 addi r8,r8,-8
158 bne cr6,L(done)
159
160 /* The c/null byte must be in the second doubleword. Adjust the
161 address again and move the result of cmpb to r10 so we can calculate
162 the pointer. */
163
164 mr r10,r6
165 mr r11,r7
166 addi r8,r8,8
167
168 /* r10/r11 have the output of the cmpb instructions, that is,
169 0xff in the same position as the c/null byte in the original
170 doubleword from the string. Use that to calculate the pointer. */
171
172L(done):
173 /* If there are more than one 0xff in r11, find the first position of
174 0xff in r11 and fill r10 with 0 from that position. */
175 cmpdi cr7,r11,0
176 beq cr7,L(no_null)
177#ifdef __LITTLE_ENDIAN__
178 addi r3,r11,-1
179 andc r3,r3,r11
180 popcntd r0,r3
181#else
182 cntlzd r0,r11
183#endif
184 subfic r0,r0,63
185 li r6,-1
186#ifdef __LITTLE_ENDIAN__
187 srd r0,r6,r0
188#else
189 sld r0,r6,r0
190#endif
191 and r10,r0,r10
192L(no_null):
193#ifdef __LITTLE_ENDIAN__
194 cntlzd r0,r10 /* Count leading zeros before c matches. */
195 addi r3,r10,-1
196 andc r3,r3,r10
197 addi r10,r11,-1
198 andc r10,r10,r11
199 cmpld cr7,r3,r10
200 bgt cr7,L(no_match)
201#else
202 addi r3,r10,-1 /* Count trailing zeros before c matches. */
203 andc r3,r3,r10
204 popcntd r0,r3
205 cmpld cr7,r11,r10
206 bgt cr7,L(no_match)
207#endif
208 srdi r0,r0,3 /* Convert trailing zeros to bytes. */
209 subfic r0,r0,7
210 add r9,r8,r0 /* Return address of the matching c byte
211 or null in case c was not found. */
212 li r0,0
213 cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
214 beq cr7,L(align)
215
216 .align 4
217L(no_match):
218 mr r3,r9
219 blr
220
221/* Check the first 32B in GPR's and move to vectorized loop. */
222 .p2align 5
223L(vector):
224 addi r3, r8, 8
225 /* Make sure 32B aligned. */
226 andi. r10, r3, 31
227 bne cr0, L(loop)
228 vspltisb v0, 0
229 /* Precompute vbpermq constant. */
230 vspltisb v10, 3
231 lvsl v11, r0, r0
232 vslb v10, v11, v10
233 mtvrd v1, r4
234 li r5, 16
235 vspltb v1, v1, 7
236 /* Compare 32 bytes in each loop. */
237L(continue):
238 lvx v4, 0, r3
239 lvx v5, r3, r5
240 vcmpequb v2, v0, v4
241 vcmpequb v3, v0, v5
242 vcmpequb v6, v1, v4
243 vcmpequb v7, v1, v5
244 vor v8, v2, v3
245 vor v9, v6, v7
246 vor v11, v8, v9
247 vcmpequb. v11, v0, v11
248 addi r3, r3, 32
249 blt cr6, L(continue)
250 vcmpequb. v8, v0, v8
251 blt cr6, L(match)
252
253 /* One (or both) of the quadwords contains c/null. */
254 vspltisb v8, 2
255 vspltisb v9, 5
256 /* Precompute values used for comparison. */
257 vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
258 vaddubm v8, v9, v9
259 vsldoi v8, v0, v8, 1 /* v8 = 0x80. */
260
261 /* Check if null is in second qw. */
262 vcmpequb. v11, v0, v2
263 blt cr6, L(secondqw)
264
265 /* Null found in first qw. */
266 addi r8, r3, -32
267 /* Calculate the null position. */
268 FIND_NULL_POS(v2)
269 /* Check if null is in the first byte. */
270 vcmpequb. v11, v0, v2
271 blt cr6, L(no_match)
272 vsububm v2, v8, v2
273 /* Mask unwanted bytes after null. */
274#ifdef __LITTLE_ENDIAN__
275 vslo v6, v6, v2
276 vsro v6, v6, v2
277#else
278 vsro v6, v6, v2
279 vslo v6, v6, v2
280#endif
281 vcmpequb. v11, v0, v6
282 blt cr6, L(no_match)
283 /* Found a match before null. */
284 CALCULATE_MATCH()
285 add r3, r8, r6
286 blr
287
288L(secondqw):
289 addi r8, r3, -16
290 FIND_NULL_POS(v3)
291 vcmpequb. v11, v0, v2
292 blt cr6, L(no_match1)
293 vsububm v2, v8, v2
294 /* Mask unwanted bytes after null. */
295#ifdef __LITTLE_ENDIAN__
296 vslo v7, v7, v2
297 vsro v7, v7, v2
298#else
299 vsro v7, v7, v2
300 vslo v7, v7, v2
301#endif
302 vcmpequb. v11, v0, v7
303 blt cr6, L(no_match1)
304 addi r8, r8, 16
305 vor v6, v0, v7
306L(no_match1):
307 addi r8, r8, -16
308 vcmpequb. v11, v0, v6
309 blt cr6, L(no_match)
310 /* Found a match before null. */
311 CALCULATE_MATCH()
312 add r3, r8, r6
313 blr
314
315L(match):
316 /* One (or both) of the quadwords contains a match. */
317 mr r8, r3
318 vcmpequb. v8, v0, v7
319 blt cr6, L(firstqw)
320 /* Match found in second qw. */
321 addi r8, r8, 16
322 vor v6, v0, v7
323L(firstqw):
324 addi r8, r8, -32
325 CALCULATE_MATCH()
326 add r9, r8, r6 /* Compute final length. */
327 b L(continue)
328/* We are here because strrchr was called with a null byte. */
329 .align 4
330L(null_match):
331 /* r0 has a doubleword of null bytes. */
332
333 cmpb r5,r12,r0 /* Compare each byte against null bytes. */
334
335 /* Move the doublewords left and right to discard the bits that are
336 not part of the string and bring them back as zeros. */
337#ifdef __LITTLE_ENDIAN__
338 srd r5,r5,r6
339 sld r5,r5,r6
340#else
341 sld r5,r5,r6
342 srd r5,r5,r6
343#endif
344 cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
345 have been found. */
346 bne cr7,L(done_null)
347
348 andi. r12, r8, 15
349
350 /* Are we now aligned to a quadword boundary? If so, skip to
351 the main loop. Otherwise, go through the alignment code. */
352
353 bne cr0, L(loop_null)
354
355 /* Handle WORD2 of pair. */
356 ldu r12,8(r8)
357 cmpb r5,r12,r0
358 cmpdi cr7,r5,0
359 bne cr7,L(done_null)
360 b L(loop_null) /* We branch here (rather than falling through)
361 to skip the nops due to heavy alignment
362 of the loop below. */
363
364 /* Main loop to look for the end of the string. Since it's a
365 small loop (< 8 instructions), align it to 32-bytes. */
366 .p2align 5
367L(loop_null):
368 /* Load two doublewords, compare and merge in a
369 single register for speed. This is an attempt
370 to speed up the null-checking process for bigger strings. */
371 ld r12,8(r8)
372 ldu r11,16(r8)
373 cmpb r5,r12,r0
374 cmpb r10,r11,r0
375 or r6,r5,r10
376 cmpdi cr7,r6,0
377 beq cr7,L(vector1)
378
379 /* OK, one (or both) of the doublewords contains a null byte. Check
380 the first doubleword and decrement the address in case the first
381 doubleword really contains a null byte. */
382
383 cmpdi cr6,r5,0
384 addi r8,r8,-8
385 bne cr6,L(done_null)
386
387 /* The null byte must be in the second doubleword. Adjust the address
388 again and move the result of cmpb to r10 so we can calculate the
389 pointer. */
390
391 mr r5,r10
392 addi r8,r8,8
393
394 /* r5 has the output of the cmpb instruction, that is, it contains
395 0xff in the same position as the null byte in the original
396 doubleword from the string. Use that to calculate the pointer. */
397L(done_null):
398#ifdef __LITTLE_ENDIAN__
399 addi r0,r5,-1
400 andc r0,r0,r5
401 popcntd r0,r0
402#else
403 cntlzd r0,r5 /* Count leading zeros before the match. */
404#endif
405 srdi r0,r0,3 /* Convert trailing zeros to bytes. */
406 add r3,r8,r0 /* Return address of the matching null byte. */
407 blr
408/* Check the first 32B in GPR's and move to vectorized loop. */
409 .p2align 5
410L(vector1):
411 addi r3, r8, 8
412 /* Make sure 32B aligned. */
413 andi. r10, r3, 31
414 bne cr0, L(loop_null)
415 vspltisb v0, 0
416 /* Precompute vbpermq constant. */
417 vspltisb v10, 3
418 lvsl v11, r0, r0
419 vslb v10, v11, v10
420 li r5, 16
421 /* Compare 32 bytes in each loop. */
422L(continue1):
423 lvx v4, 0, r3
424 lvx v5, r3, r5
425 vcmpequb v2, v0, v4
426 vcmpequb v3, v0, v5
427 vor v8, v2, v3
428 vcmpequb. v11, v0, v8
429 addi r3, r3, 32
430 blt cr6, L(continue1)
431 addi r3, r3, -32
432 vbpermq v2, v2, v10
433 vbpermq v3, v3, v10
434 /* Shift each component into its correct position for merging. */
435#ifdef __LITTLE_ENDIAN__
436 vsldoi v3, v3, v3, 2
437#else
438 vsldoi v2, v2, v2, 6
439 vsldoi v3, v3, v3, 4
440#endif
441 /* Merge the results and move to a GPR. */
442 vor v4, v3, v2
443 mfvrd r5, v4
444#ifdef __LITTLE_ENDIAN__
445 addi r6, r5, -1
446 andc r6, r6, r5
447 popcntd r6, r6
448#else
449 cntlzd r6, r5 /* Count leading zeros before the match. */
450#endif
451 add r3, r3, r6 /* Compute final length. */
452 blr
453END_GEN_TB (STRRCHR, TB_TOCLESS)
454weak_alias (strrchr, rindex)
455libc_hidden_builtin_def (strrchr)
456

source code of glibc/sysdeps/powerpc/powerpc64/power8/strrchr.S