1 | /* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn. |
2 | Copyright (C) 2017-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | /* char *[r3] strrchr (char *s [r3], int c [r4]) */ |
22 | |
23 | #ifdef __LITTLE_ENDIAN__ |
24 | /* Find the match position from v6 and place result in r6. */ |
25 | # define CALCULATE_MATCH() \ |
26 | vbpermq v6, v6, v10; \ |
27 | vsldoi v6, v6, v6, 6; \ |
28 | mfvrd r7, v6; \ |
29 | cntlzd r6, r7; \ |
30 | subfic r6, r6, 15; |
31 | /* |
32 | * Find the first null position to mask bytes after null. |
33 | * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw. |
34 | * Result placed at v2. |
35 | */ |
36 | # define FIND_NULL_POS(reg) \ |
37 | vspltisb v11, -1; \ |
38 | vadduqm v11, reg, v11; \ |
39 | vandc v11, v11, reg; \ |
40 | vpopcntd v2, v11; \ |
41 | vspltb v11, v2, 15; \ |
42 | vcmpequb. v11, v11, v9; \ |
43 | blt cr6, 1f; \ |
44 | vsldoi v9, v0, v9, 1; \ |
45 | vslo v2, v2, v9; \ |
46 | 1: \ |
47 | vsumsws v2, v2, v0; |
48 | #else |
49 | # define CALCULATE_MATCH() \ |
50 | vbpermq v6, v6, v10; \ |
51 | mfvrd r7, v6; \ |
52 | addi r6, r7, -1; \ |
53 | andc r6, r6, r7; \ |
54 | popcntd r6, r6; \ |
55 | subfic r6, r6, 15; |
56 | # define FIND_NULL_POS(reg) \ |
57 | vclzd v2, reg; \ |
58 | vspltb v11, v2, 7; \ |
59 | vcmpequb. v11, v11, v9; \ |
60 | blt cr6, 1f; \ |
61 | vsldoi v9, v0, v9, 1; \ |
62 | vsro v2, v2, v9; \ |
63 | 1: \ |
64 | vsumsws v2, v2, v0; |
65 | #endif /* !__LITTLE_ENDIAN__ */ |
66 | |
67 | #ifndef STRRCHR |
68 | # define STRRCHR strrchr |
69 | #endif |
70 | .machine power8 |
71 | ENTRY_TOCLESS (STRRCHR) |
72 | CALL_MCOUNT 2 |
73 | dcbt 0,r3 |
74 | clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ |
75 | cmpdi cr7,r4,0 |
76 | ld r12,0(r8) /* Load doubleword from memory. */ |
77 | li r9,0 /* Used to store last occurrence. */ |
78 | li r0,0 /* Doubleword with null chars to use |
79 | with cmpb. */ |
80 | |
81 | rlwinm r6,r3,3,26,28 /* Calculate padding. */ |
82 | |
83 | beq cr7,L(null_match) |
84 | |
85 | /* Replicate byte to doubleword. */ |
86 | insrdi r4,r4,8,48 |
87 | insrdi r4,r4,16,32 |
88 | insrdi r4,r4,32,0 |
89 | |
90 | /* r4 is changed now. If it's passed more chars, then |
91 | check for null again. */ |
92 | cmpdi cr7,r4,0 |
93 | beq cr7,L(null_match) |
94 | /* Now r4 has a doubleword of c bytes and r0 has |
95 | a doubleword of null bytes. */ |
96 | |
97 | cmpb r10,r12,r4 /* Compare each byte against c byte. */ |
98 | cmpb r11,r12,r0 /* Compare each byte against null byte. */ |
99 | |
100 | /* Move the doublewords left and right to discard the bits that are |
101 | not part of the string and bring them back as zeros. */ |
102 | #ifdef __LITTLE_ENDIAN__ |
103 | srd r10,r10,r6 |
104 | srd r11,r11,r6 |
105 | sld r10,r10,r6 |
106 | sld r11,r11,r6 |
107 | #else |
108 | sld r10,r10,r6 |
109 | sld r11,r11,r6 |
110 | srd r10,r10,r6 |
111 | srd r11,r11,r6 |
112 | #endif |
113 | or r5,r10,r11 /* OR the results to speed things up. */ |
114 | cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes |
115 | have been found. */ |
116 | bne cr7,L(done) |
117 | |
118 | L(align): |
119 | andi. r12, r8, 15 |
120 | |
121 | /* Are we now aligned to a doubleword boundary? If so, skip to |
122 | the main loop. Otherwise, go through the alignment code. */ |
123 | |
124 | bne cr0, L(loop) |
125 | |
126 | /* Handle WORD2 of pair. */ |
127 | ldu r12,8(r8) |
128 | cmpb r10,r12,r4 |
129 | cmpb r11,r12,r0 |
130 | or r5,r10,r11 |
131 | cmpdi cr7,r5,0 |
132 | bne cr7,L(done) |
133 | b L(loop) /* We branch here (rather than falling through) |
134 | to skip the nops due to heavy alignment |
135 | of the loop below. */ |
136 | .p2align 5 |
137 | L(loop): |
138 | /* Load two doublewords, compare and merge in a |
139 | single register for speed. This is an attempt |
140 | to speed up the null-checking process for bigger strings. */ |
141 | ld r12,8(r8) |
142 | ldu r7,16(r8) |
143 | cmpb r10,r12,r4 |
144 | cmpb r11,r12,r0 |
145 | cmpb r6,r7,r4 |
146 | cmpb r7,r7,r0 |
147 | or r12,r10,r11 |
148 | or r5,r6,r7 |
149 | or r5,r12,r5 |
150 | cmpdi cr7,r5,0 |
151 | beq cr7,L(vector) |
152 | |
153 | /* OK, one (or both) of the doublewords contains a c/null byte. Check |
154 | the first doubleword and decrement the address in case the first |
155 | doubleword really contains a c/null byte. */ |
156 | cmpdi cr6,r12,0 |
157 | addi r8,r8,-8 |
158 | bne cr6,L(done) |
159 | |
160 | /* The c/null byte must be in the second doubleword. Adjust the |
161 | address again and move the result of cmpb to r10 so we can calculate |
162 | the pointer. */ |
163 | |
164 | mr r10,r6 |
165 | mr r11,r7 |
166 | addi r8,r8,8 |
167 | |
168 | /* r10/r11 have the output of the cmpb instructions, that is, |
169 | 0xff in the same position as the c/null byte in the original |
170 | doubleword from the string. Use that to calculate the pointer. */ |
171 | |
172 | L(done): |
173 | /* If there are more than one 0xff in r11, find the first position of |
174 | 0xff in r11 and fill r10 with 0 from that position. */ |
175 | cmpdi cr7,r11,0 |
176 | beq cr7,L(no_null) |
177 | #ifdef __LITTLE_ENDIAN__ |
178 | addi r3,r11,-1 |
179 | andc r3,r3,r11 |
180 | popcntd r0,r3 |
181 | #else |
182 | cntlzd r0,r11 |
183 | #endif |
184 | subfic r0,r0,63 |
185 | li r6,-1 |
186 | #ifdef __LITTLE_ENDIAN__ |
187 | srd r0,r6,r0 |
188 | #else |
189 | sld r0,r6,r0 |
190 | #endif |
191 | and r10,r0,r10 |
192 | L(no_null): |
193 | #ifdef __LITTLE_ENDIAN__ |
194 | cntlzd r0,r10 /* Count leading zeros before c matches. */ |
195 | addi r3,r10,-1 |
196 | andc r3,r3,r10 |
197 | addi r10,r11,-1 |
198 | andc r10,r10,r11 |
199 | cmpld cr7,r3,r10 |
200 | bgt cr7,L(no_match) |
201 | #else |
202 | addi r3,r10,-1 /* Count trailing zeros before c matches. */ |
203 | andc r3,r3,r10 |
204 | popcntd r0,r3 |
205 | cmpld cr7,r11,r10 |
206 | bgt cr7,L(no_match) |
207 | #endif |
208 | srdi r0,r0,3 /* Convert trailing zeros to bytes. */ |
209 | subfic r0,r0,7 |
210 | add r9,r8,r0 /* Return address of the matching c byte |
211 | or null in case c was not found. */ |
212 | li r0,0 |
213 | cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */ |
214 | beq cr7,L(align) |
215 | |
216 | .align 4 |
217 | L(no_match): |
218 | mr r3,r9 |
219 | blr |
220 | |
221 | /* Check the first 32B in GPR's and move to vectorized loop. */ |
222 | .p2align 5 |
223 | L(vector): |
224 | addi r3, r8, 8 |
225 | /* Make sure 32B aligned. */ |
226 | andi. r10, r3, 31 |
227 | bne cr0, L(loop) |
228 | vspltisb v0, 0 |
229 | /* Precompute vbpermq constant. */ |
230 | vspltisb v10, 3 |
231 | lvsl v11, r0, r0 |
232 | vslb v10, v11, v10 |
233 | mtvrd v1, r4 |
234 | li r5, 16 |
235 | vspltb v1, v1, 7 |
236 | /* Compare 32 bytes in each loop. */ |
237 | L(continue): |
238 | lvx v4, 0, r3 |
239 | lvx v5, r3, r5 |
240 | vcmpequb v2, v0, v4 |
241 | vcmpequb v3, v0, v5 |
242 | vcmpequb v6, v1, v4 |
243 | vcmpequb v7, v1, v5 |
244 | vor v8, v2, v3 |
245 | vor v9, v6, v7 |
246 | vor v11, v8, v9 |
247 | vcmpequb. v11, v0, v11 |
248 | addi r3, r3, 32 |
249 | blt cr6, L(continue) |
250 | vcmpequb. v8, v0, v8 |
251 | blt cr6, L(match) |
252 | |
253 | /* One (or both) of the quadwords contains c/null. */ |
254 | vspltisb v8, 2 |
255 | vspltisb v9, 5 |
256 | /* Precompute values used for comparison. */ |
257 | vsl v9, v8, v9 /* v9 = 0x4040404040404040. */ |
258 | vaddubm v8, v9, v9 |
259 | vsldoi v8, v0, v8, 1 /* v8 = 0x80. */ |
260 | |
261 | /* Check if null is in second qw. */ |
262 | vcmpequb. v11, v0, v2 |
263 | blt cr6, L(secondqw) |
264 | |
265 | /* Null found in first qw. */ |
266 | addi r8, r3, -32 |
267 | /* Calculate the null position. */ |
268 | FIND_NULL_POS(v2) |
269 | /* Check if null is in the first byte. */ |
270 | vcmpequb. v11, v0, v2 |
271 | blt cr6, L(no_match) |
272 | vsububm v2, v8, v2 |
273 | /* Mask unwanted bytes after null. */ |
274 | #ifdef __LITTLE_ENDIAN__ |
275 | vslo v6, v6, v2 |
276 | vsro v6, v6, v2 |
277 | #else |
278 | vsro v6, v6, v2 |
279 | vslo v6, v6, v2 |
280 | #endif |
281 | vcmpequb. v11, v0, v6 |
282 | blt cr6, L(no_match) |
283 | /* Found a match before null. */ |
284 | CALCULATE_MATCH() |
285 | add r3, r8, r6 |
286 | blr |
287 | |
288 | L(secondqw): |
289 | addi r8, r3, -16 |
290 | FIND_NULL_POS(v3) |
291 | vcmpequb. v11, v0, v2 |
292 | blt cr6, L(no_match1) |
293 | vsububm v2, v8, v2 |
294 | /* Mask unwanted bytes after null. */ |
295 | #ifdef __LITTLE_ENDIAN__ |
296 | vslo v7, v7, v2 |
297 | vsro v7, v7, v2 |
298 | #else |
299 | vsro v7, v7, v2 |
300 | vslo v7, v7, v2 |
301 | #endif |
302 | vcmpequb. v11, v0, v7 |
303 | blt cr6, L(no_match1) |
304 | addi r8, r8, 16 |
305 | vor v6, v0, v7 |
306 | L(no_match1): |
307 | addi r8, r8, -16 |
308 | vcmpequb. v11, v0, v6 |
309 | blt cr6, L(no_match) |
310 | /* Found a match before null. */ |
311 | CALCULATE_MATCH() |
312 | add r3, r8, r6 |
313 | blr |
314 | |
315 | L(match): |
316 | /* One (or both) of the quadwords contains a match. */ |
317 | mr r8, r3 |
318 | vcmpequb. v8, v0, v7 |
319 | blt cr6, L(firstqw) |
320 | /* Match found in second qw. */ |
321 | addi r8, r8, 16 |
322 | vor v6, v0, v7 |
323 | L(firstqw): |
324 | addi r8, r8, -32 |
325 | CALCULATE_MATCH() |
326 | add r9, r8, r6 /* Compute final length. */ |
327 | b L(continue) |
328 | /* We are here because strrchr was called with a null byte. */ |
329 | .align 4 |
330 | L(null_match): |
331 | /* r0 has a doubleword of null bytes. */ |
332 | |
333 | cmpb r5,r12,r0 /* Compare each byte against null bytes. */ |
334 | |
335 | /* Move the doublewords left and right to discard the bits that are |
336 | not part of the string and bring them back as zeros. */ |
337 | #ifdef __LITTLE_ENDIAN__ |
338 | srd r5,r5,r6 |
339 | sld r5,r5,r6 |
340 | #else |
341 | sld r5,r5,r6 |
342 | srd r5,r5,r6 |
343 | #endif |
344 | cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes |
345 | have been found. */ |
346 | bne cr7,L(done_null) |
347 | |
348 | andi. r12, r8, 15 |
349 | |
350 | /* Are we now aligned to a quadword boundary? If so, skip to |
351 | the main loop. Otherwise, go through the alignment code. */ |
352 | |
353 | bne cr0, L(loop_null) |
354 | |
355 | /* Handle WORD2 of pair. */ |
356 | ldu r12,8(r8) |
357 | cmpb r5,r12,r0 |
358 | cmpdi cr7,r5,0 |
359 | bne cr7,L(done_null) |
360 | b L(loop_null) /* We branch here (rather than falling through) |
361 | to skip the nops due to heavy alignment |
362 | of the loop below. */ |
363 | |
364 | /* Main loop to look for the end of the string. Since it's a |
365 | small loop (< 8 instructions), align it to 32-bytes. */ |
366 | .p2align 5 |
367 | L(loop_null): |
368 | /* Load two doublewords, compare and merge in a |
369 | single register for speed. This is an attempt |
370 | to speed up the null-checking process for bigger strings. */ |
371 | ld r12,8(r8) |
372 | ldu r11,16(r8) |
373 | cmpb r5,r12,r0 |
374 | cmpb r10,r11,r0 |
375 | or r6,r5,r10 |
376 | cmpdi cr7,r6,0 |
377 | beq cr7,L(vector1) |
378 | |
379 | /* OK, one (or both) of the doublewords contains a null byte. Check |
380 | the first doubleword and decrement the address in case the first |
381 | doubleword really contains a null byte. */ |
382 | |
383 | cmpdi cr6,r5,0 |
384 | addi r8,r8,-8 |
385 | bne cr6,L(done_null) |
386 | |
387 | /* The null byte must be in the second doubleword. Adjust the address |
388 | again and move the result of cmpb to r10 so we can calculate the |
389 | pointer. */ |
390 | |
391 | mr r5,r10 |
392 | addi r8,r8,8 |
393 | |
394 | /* r5 has the output of the cmpb instruction, that is, it contains |
395 | 0xff in the same position as the null byte in the original |
396 | doubleword from the string. Use that to calculate the pointer. */ |
397 | L(done_null): |
398 | #ifdef __LITTLE_ENDIAN__ |
399 | addi r0,r5,-1 |
400 | andc r0,r0,r5 |
401 | popcntd r0,r0 |
402 | #else |
403 | cntlzd r0,r5 /* Count leading zeros before the match. */ |
404 | #endif |
405 | srdi r0,r0,3 /* Convert trailing zeros to bytes. */ |
406 | add r3,r8,r0 /* Return address of the matching null byte. */ |
407 | blr |
408 | /* Check the first 32B in GPR's and move to vectorized loop. */ |
409 | .p2align 5 |
410 | L(vector1): |
411 | addi r3, r8, 8 |
412 | /* Make sure 32B aligned. */ |
413 | andi. r10, r3, 31 |
414 | bne cr0, L(loop_null) |
415 | vspltisb v0, 0 |
416 | /* Precompute vbpermq constant. */ |
417 | vspltisb v10, 3 |
418 | lvsl v11, r0, r0 |
419 | vslb v10, v11, v10 |
420 | li r5, 16 |
421 | /* Compare 32 bytes in each loop. */ |
422 | L(continue1): |
423 | lvx v4, 0, r3 |
424 | lvx v5, r3, r5 |
425 | vcmpequb v2, v0, v4 |
426 | vcmpequb v3, v0, v5 |
427 | vor v8, v2, v3 |
428 | vcmpequb. v11, v0, v8 |
429 | addi r3, r3, 32 |
430 | blt cr6, L(continue1) |
431 | addi r3, r3, -32 |
432 | vbpermq v2, v2, v10 |
433 | vbpermq v3, v3, v10 |
434 | /* Shift each component into its correct position for merging. */ |
435 | #ifdef __LITTLE_ENDIAN__ |
436 | vsldoi v3, v3, v3, 2 |
437 | #else |
438 | vsldoi v2, v2, v2, 6 |
439 | vsldoi v3, v3, v3, 4 |
440 | #endif |
441 | /* Merge the results and move to a GPR. */ |
442 | vor v4, v3, v2 |
443 | mfvrd r5, v4 |
444 | #ifdef __LITTLE_ENDIAN__ |
445 | addi r6, r5, -1 |
446 | andc r6, r6, r5 |
447 | popcntd r6, r6 |
448 | #else |
449 | cntlzd r6, r5 /* Count leading zeros before the match. */ |
450 | #endif |
451 | add r3, r3, r6 /* Compute final length. */ |
452 | blr |
453 | END_GEN_TB (STRRCHR, TB_TOCLESS) |
454 | weak_alias (strrchr, rindex) |
455 | libc_hidden_builtin_def (strrchr) |
456 | |