/* Optimized strchr implementation for PowerPC64/POWER8.
   Copyright (C) 2016-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18 | |
19 | #include <sysdep.h> |
20 | |
21 | #ifdef USE_AS_STRCHRNUL |
22 | # ifndef STRCHRNUL |
23 | # define FUNC_NAME __strchrnul |
24 | # else |
25 | # define FUNC_NAME STRCHRNUL |
26 | # endif |
27 | #else |
28 | # ifndef STRCHR |
29 | # define FUNC_NAME strchr |
30 | # else |
31 | # define FUNC_NAME STRCHR |
32 | # endif |
33 | #endif /* !USE_AS_STRCHRNUL */ |
34 | |
/* char * [r3] strchr (char *s [r3], int c [r4])

   Return a pointer to the first occurrence of byte C in string S, or
   NULL if C does not occur before the terminating null.  When built
   with USE_AS_STRCHRNUL the function instead returns a pointer to the
   terminating null when C is not found.

   Strategy: scan 8 bytes at a time with cmpb in GPRs until the
   address is 32-byte aligned, then switch to a VSX loop that checks
   32 bytes per iteration.  A separate, simpler path handles c == 0.  */
	.machine  power8
ENTRY_TOCLESS (FUNC_NAME)
	CALL_MCOUNT 2
	dcbt	0,r3
	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
	cmpdi	cr7,r4,0      /* c == 0 takes the dedicated null_match
				 path below.  */
	ld	r12,0(r8)     /* Load doubleword from memory.  */
	li	r0,0	      /* Doubleword with null chars to use
				 with cmpb.  */

	rlwinm	r6,r3,3,26,28 /* Calculate padding: (s & 7) * 8, the number
				 of bits before the string starts within
				 the aligned doubleword.  */

	beq	cr7,L(null_match)

	/* Replicate byte to doubleword.  */
	insrdi	r4,r4,8,48
	insrdi	r4,r4,16,32
	insrdi	r4,r4,32,0

	/* Now r4 has a doubleword of c bytes and r0 has
	   a doubleword of null bytes.  */

	cmpb	r10,r12,r4    /* Compare each byte against c byte.  */
	cmpb	r11,r12,r0    /* Compare each byte against null byte.  */

	/* Move the doublewords left and right to discard the bits that are
	   not part of the string and bring them back as zeros.  */
#ifdef __LITTLE_ENDIAN__
	srd	r10,r10,r6
	srd	r11,r11,r6
	sld	r10,r10,r6
	sld	r11,r11,r6
#else
	sld	r10,r10,r6
	sld	r11,r11,r6
	srd	r10,r10,r6
	srd	r11,r11,r6
#endif
	or	r5,r10,r11    /* OR the results to speed things up.  */
	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
				 have been found.  */
	bne	cr7,L(done)

	mtcrf	0x01,r8       /* Copy the low address bits into CR7 so the
				 bt below can test alignment.  */

	/* Are we now aligned to a doubleword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */

	bt	28,L(loop)

	/* Handle WORD2 of pair.  */
	ldu	r12,8(r8)
	cmpb	r10,r12,r4
	cmpb	r11,r12,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	bne	cr7,L(done)
	b	L(loop)	      /* We branch here (rather than falling through)
				 to skip the nops due to heavy alignment
				 of the loop below.  */

	.p2align  5
L(loop):
	/* Load two doublewords, compare and merge in a
	   single register for speed.  This is an attempt
	   to speed up the null-checking process for bigger strings.  */
	ld	r12,8(r8)
	ldu	r9,16(r8)
	cmpb	r10,r12,r4
	cmpb	r11,r12,r0
	cmpb	r6,r9,r4
	cmpb	r7,r9,r0
	or	r5,r10,r11
	or	r9,r6,r7
	or	r12,r5,r9
	cmpdi	cr7,r12,0
	beq	cr7,L(vector)
	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a c/null byte.  */

	cmpdi	cr6,r5,0
	addi	r8,r8,-8
	bne	cr6,L(done)

	/* The c/null byte must be in the second doubleword.  Adjust the
	   address again and move the result of cmpb to r10 so we can calculate
	   the pointer.  */

	mr	r10,r6
	mr	r11,r7
	addi	r8,r8,8
#ifdef USE_AS_STRCHRNUL
	mr	r5, r9	      /* Merged c/null mask of the second doubleword;
				 strchrnul does not care which one hit.  */
#endif
	/* r10/r11 have the output of the cmpb instructions, that is,
	   0xff in the same position as the c/null byte in the original
	   doubleword from the string.  Use that to calculate the pointer.  */
L(done):
#ifdef USE_AS_STRCHRNUL
	mr	r10, r5       /* strchrnul stops at whichever of c/null
				 comes first, so use the merged mask.  */
#endif
#ifdef __LITTLE_ENDIAN__
	addi	r3,r10,-1     /* Isolate the lowest set bit of the mask...  */
	andc	r3,r3,r10
	popcntd	r0,r3	      /* ...so popcntd yields the count of trailing
				 zero bits (bit index of first c match).  */
# ifndef USE_AS_STRCHRNUL
	addi	r4,r11,-1     /* Same find-first-set trick on the null
				 mask.  */
	andc	r4,r4,r11
	cmpld	cr7,r3,r4
	bgt	cr7,L(no_match)  /* Null byte precedes c: not found.  */
# endif
#else
	cntlzd	r0,r10	      /* Count leading zeros before c matches.  */
# ifndef USE_AS_STRCHRNUL
	cmpld	cr7,r11,r10
	bgt	cr7,L(no_match)  /* Null byte precedes c: not found.  */
# endif
#endif
	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
	add	r3,r8,r0      /* Return address of the matching c byte
				 or null in case c was not found.  */
	blr

	/* Check the first 32B in GPR's and move to vectorized loop.  */
	.p2align  5
L(vector):
	addi	r3, r8, 8     /* r3 = address of next unread doubleword.  */
	andi.	r10, r3, 31
	bne	cr0, L(loop)  /* Keep scanning with GPRs until r3 is
				 32-byte aligned.  */
	vspltisb	v0, 0      /* v0 = all-zero vector (null compares).  */
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	mtvrd	v1, r4
	li	r5, 16
	vspltb	v1, v1, 7     /* v1 = c replicated into all 16 bytes.  */
	/* Compare 32 bytes in each loop.  */
L(continue):
	lvx	v4, 0, r3     /* First quadword.  */
	lvx	v5, r3, r5    /* Second quadword.  */
	vcmpequb	v2, v0, v4    /* Null matches, first quadword.  */
	vcmpequb	v3, v0, v5    /* Null matches, second quadword.  */
	vcmpequb	v6, v1, v4    /* c matches, first quadword.  */
	vcmpequb	v7, v1, v5    /* c matches, second quadword.  */
	vor	v8, v2, v3
	vor	v9, v6, v7
	vor	v11, v8, v9   /* All c/null matches merged.  */
	vcmpequb.	v11, v0, v11
	addi	r3, r3, 32
	blt	cr6, L(continue)  /* cr6 "all equal": merged mask is all
				     zero, i.e. nothing found yet.  */
	/* One (or both) of the quadwords contains a c/null byte.  */
	addi	r3, r3, -32
#ifndef USE_AS_STRCHRNUL
	vcmpequb.	v11, v0, v9
	blt	cr6, L(no_match)  /* No c match at all in these 32 bytes,
				     so the hit was a terminating null.  */
#endif
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v2, v2, v10
	vbpermq	v3, v3, v10
	vbpermq	v6, v6, v10
	vbpermq	v7, v7, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v3, v3, v3, 2
	vsldoi	v7, v7, v7, 2
#else
	vsldoi	v2, v2, v2, 6
	vsldoi	v3, v3, v3, 4
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1, v3, v2
	vor	v2, v6, v7
	vor	v4, v1, v2
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	addi	r6, r5, -1    /* Find-first-set: count trailing zeros to get
				 the byte index of the first match.  */
	andc	r6, r6, r5
	popcntd	r6, r6
#else
	cntlzd	r6, r5	      /* Count leading zeros before the match.  */
#endif
	add	r3, r3, r6    /* Compute final length.  */
	/* Return NULL if null found before c.  */
#ifndef USE_AS_STRCHRNUL
	lbz	r4, 0(r3)
	cmpdi	cr7, r4, 0
	beq	cr7, L(no_match)  /* First match was the terminating null,
				     not c.  */
#endif
	blr

#ifndef USE_AS_STRCHRNUL
	.align	4
L(no_match):
	li	r3,0	      /* strchr returns NULL when c is not found.  */
	blr
#endif

	/* We are here because strchr was called with a null byte.  */
	.align	4
L(null_match):
	/* r0 has a doubleword of null bytes.  */

	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */

	/* Move the doublewords left and right to discard the bits that are
	   not part of the string and bring them back as zeros.  */
#ifdef __LITTLE_ENDIAN__
	srd	r5,r5,r6
	sld	r5,r5,r6
#else
	sld	r5,r5,r6
	srd	r5,r5,r6
#endif
	cmpdi	cr7,r5,0      /* If r5 == 0, no null bytes
				 have been found.  */
	bne	cr7,L(done_null)

	mtcrf	0x01,r8       /* Low address bits into CR7 for the
				 alignment test below.  */

	/* Are we now aligned to a quadword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */

	bt	28,L(loop_null)

	/* Handle WORD2 of pair.  */
	ldu	r12,8(r8)
	cmpb	r5,r12,r0
	cmpdi	cr7,r5,0
	bne	cr7,L(done_null)
	b	L(loop_null)  /* We branch here (rather than falling through)
				 to skip the nops due to heavy alignment
				 of the loop below.  */

	/* Main loop to look for the end of the string.  Since it's a
	   small loop (< 8 instructions), align it to 32-bytes.  */
	.p2align  5
L(loop_null):
	/* Load two doublewords, compare and merge in a
	   single register for speed.  This is an attempt
	   to speed up the null-checking process for bigger strings.  */
	ld	r12,8(r8)
	ldu	r11,16(r8)
	cmpb	r5,r12,r0
	cmpb	r10,r11,r0
	or	r6,r5,r10
	cmpdi	cr7,r6,0
	beq	cr7,L(vector1)

	/* OK, one (or both) of the doublewords contains a null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a null byte.  */

	cmpdi	cr6,r5,0
	addi	r8,r8,-8
	bne	cr6,L(done_null)

	/* The null byte must be in the second doubleword.  Adjust the address
	   again and move the result of cmpb to r5 so we can calculate the
	   pointer.  */

	mr	r5,r10
	addi	r8,r8,8

	/* r5 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as the null byte in the original
	   doubleword from the string.  Use that to calculate the pointer.  */
L(done_null):
#ifdef __LITTLE_ENDIAN__
	addi	r0,r5,-1      /* Find-first-set: trailing-zero count is the
				 bit index of the first null byte.  */
	andc	r0,r0,r5
	popcntd	r0,r0
#else
	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
#endif
	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
	add	r3,r8,r0      /* Return address of the matching null byte.  */
	blr
	.p2align  5
L(vector1):
	addi	r3, r8, 8     /* r3 = address of next unread doubleword.  */
	andi.	r10, r3, 31
	bne	cr0, L(loop_null)  /* Stay in the GPR loop until 32-byte
				      aligned.  */
	vspltisb	v8, -1     /* NOTE(review): v8 is unconditionally
				 overwritten by the vor in the loop below
				 before any use; this looks dead.  */
	vspltisb	v0, 0      /* v0 = all-zero vector.  */
	vspltisb	v10, 3     /* Precompute vbpermq constant.  */
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	li	r5, 16
L(continue1):
	lvx	v4, 0, r3
	lvx	v5, r3, r5
	vcmpequb	v2, v0, v4    /* Null matches, first quadword.  */
	vcmpequb	v3, v0, v5    /* Null matches, second quadword.  */
	vor	v8, v2, v3
	vcmpequb.	v11, v0, v8
	addi	r3, r3, 32
	blt	cr6, L(continue1)  /* No null byte in these 32 bytes yet.  */
	addi	r3, r3, -32
L(end1):
	vbpermq	v2, v2, v10
	vbpermq	v3, v3, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v3, v3, v3, 2
#else
	vsldoi	v2, v2, v2, 6
	vsldoi	v3, v3, v3, 4
#endif

	/* Merge the results and move to a GPR.  */
	vor	v4, v3, v2
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	addi	r6, r5, -1    /* Find-first-set via trailing-zero count.  */
	andc	r6, r6, r5
	popcntd	r6, r6
#else
	cntlzd	r6, r5	      /* Count leading zeros before the match.  */
#endif
	add	r3, r3, r6    /* Compute final length.  */
	blr
END (FUNC_NAME)
363 | |
#ifndef USE_AS_STRCHRNUL
/* The historical BSD name index() is provided as a weak alias of
   strchr; the hidden-builtin definition lets intra-libc callers bind
   directly without going through the PLT.  */
weak_alias (strchr, index)
libc_hidden_builtin_def (strchr)
#endif
368 | |