1/* Optimized strchr implementation for PowerPC64/POWER8.
2 Copyright (C) 2016-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
21#ifdef USE_AS_STRCHRNUL
22# ifndef STRCHRNUL
23# define FUNC_NAME __strchrnul
24# else
25# define FUNC_NAME STRCHRNUL
26# endif
27#else
28# ifndef STRCHR
29# define FUNC_NAME strchr
30# else
31# define FUNC_NAME STRCHR
32# endif
33#endif /* !USE_AS_STRCHRNUL */
34
35/* int [r3] strchr (char *s [r3], int c [r4]) */
36 .machine power8
37ENTRY_TOCLESS (FUNC_NAME)
38 CALL_MCOUNT 2
39 dcbt 0,r3
40 clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
41 cmpdi cr7,r4,0
42 ld r12,0(r8) /* Load doubleword from memory. */
43 li r0,0 /* Doubleword with null chars to use
44 with cmpb. */
45
46 rlwinm r6,r3,3,26,28 /* Calculate padding. */
47
48 beq cr7,L(null_match)
49
50 /* Replicate byte to doubleword. */
51 insrdi r4,r4,8,48
52 insrdi r4,r4,16,32
53 insrdi r4,r4,32,0
54
55 /* Now r4 has a doubleword of c bytes and r0 has
56 a doubleword of null bytes. */
57
58 cmpb r10,r12,r4 /* Compare each byte against c byte. */
59 cmpb r11,r12,r0 /* Compare each byte against null byte. */
60
61 /* Move the doublewords left and right to discard the bits that are
62 not part of the string and bring them back as zeros. */
63#ifdef __LITTLE_ENDIAN__
64 srd r10,r10,r6
65 srd r11,r11,r6
66 sld r10,r10,r6
67 sld r11,r11,r6
68#else
69 sld r10,r10,r6
70 sld r11,r11,r6
71 srd r10,r10,r6
72 srd r11,r11,r6
73#endif
74 or r5,r10,r11 /* OR the results to speed things up. */
75 cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
76 have been found. */
77 bne cr7,L(done)
78
79 mtcrf 0x01,r8
80
81 /* Are we now aligned to a doubleword boundary? If so, skip to
82 the main loop. Otherwise, go through the alignment code. */
83
84 bt 28,L(loop)
85
86 /* Handle WORD2 of pair. */
87 ldu r12,8(r8)
88 cmpb r10,r12,r4
89 cmpb r11,r12,r0
90 or r5,r10,r11
91 cmpdi cr7,r5,0
92 bne cr7,L(done)
93 b L(loop) /* We branch here (rather than falling through)
94 to skip the nops due to heavy alignment
95 of the loop below. */
96
97 .p2align 5
98L(loop):
99 /* Load two doublewords, compare and merge in a
100 single register for speed. This is an attempt
101 to speed up the null-checking process for bigger strings. */
102 ld r12,8(r8)
103 ldu r9,16(r8)
104 cmpb r10,r12,r4
105 cmpb r11,r12,r0
106 cmpb r6,r9,r4
107 cmpb r7,r9,r0
108 or r5,r10,r11
109 or r9,r6,r7
110 or r12,r5,r9
111 cmpdi cr7,r12,0
112 beq cr7,L(vector)
113 /* OK, one (or both) of the doublewords contains a c/null byte. Check
114 the first doubleword and decrement the address in case the first
115 doubleword really contains a c/null byte. */
116
117 cmpdi cr6,r5,0
118 addi r8,r8,-8
119 bne cr6,L(done)
120
121 /* The c/null byte must be in the second doubleword. Adjust the
122 address again and move the result of cmpb to r10 so we can calculate
123 the pointer. */
124
125 mr r10,r6
126 mr r11,r7
127 addi r8,r8,8
128#ifdef USE_AS_STRCHRNUL
129 mr r5, r9
130#endif
131 /* r10/r11 have the output of the cmpb instructions, that is,
132 0xff in the same position as the c/null byte in the original
133 doubleword from the string. Use that to calculate the pointer. */
134L(done):
135#ifdef USE_AS_STRCHRNUL
136 mr r10, r5
137#endif
138#ifdef __LITTLE_ENDIAN__
139 addi r3,r10,-1
140 andc r3,r3,r10
141 popcntd r0,r3
142# ifndef USE_AS_STRCHRNUL
143 addi r4,r11,-1
144 andc r4,r4,r11
145 cmpld cr7,r3,r4
146 bgt cr7,L(no_match)
147# endif
148#else
149 cntlzd r0,r10 /* Count leading zeros before c matches. */
150# ifndef USE_AS_STRCHRNUL
151 cmpld cr7,r11,r10
152 bgt cr7,L(no_match)
153# endif
154#endif
155 srdi r0,r0,3 /* Convert leading zeros to bytes. */
156 add r3,r8,r0 /* Return address of the matching c byte
157 or null in case c was not found. */
158 blr
159
160 /* Check the first 32B in GPR's and move to vectorized loop. */
161 .p2align 5
162L(vector):
163 addi r3, r8, 8
164 andi. r10, r3, 31
165 bne cr0, L(loop)
166 vspltisb v0, 0
167 /* Precompute vbpermq constant. */
168 vspltisb v10, 3
169 lvsl v11, r0, r0
170 vslb v10, v11, v10
171 mtvrd v1, r4
172 li r5, 16
173 vspltb v1, v1, 7
174 /* Compare 32 bytes in each loop. */
175L(continue):
176 lvx v4, 0, r3
177 lvx v5, r3, r5
178 vcmpequb v2, v0, v4
179 vcmpequb v3, v0, v5
180 vcmpequb v6, v1, v4
181 vcmpequb v7, v1, v5
182 vor v8, v2, v3
183 vor v9, v6, v7
184 vor v11, v8, v9
185 vcmpequb. v11, v0, v11
186 addi r3, r3, 32
187 blt cr6, L(continue)
188 /* One (or both) of the quadwords contains a c/null byte. */
189 addi r3, r3, -32
190#ifndef USE_AS_STRCHRNUL
191 vcmpequb. v11, v0, v9
192 blt cr6, L(no_match)
193#endif
194 /* Permute the first bit of each byte into bits 48-63. */
195 vbpermq v2, v2, v10
196 vbpermq v3, v3, v10
197 vbpermq v6, v6, v10
198 vbpermq v7, v7, v10
199 /* Shift each component into its correct position for merging. */
200#ifdef __LITTLE_ENDIAN__
201 vsldoi v3, v3, v3, 2
202 vsldoi v7, v7, v7, 2
203#else
204 vsldoi v2, v2, v2, 6
205 vsldoi v3, v3, v3, 4
206 vsldoi v6, v6, v6, 6
207 vsldoi v7, v7, v7, 4
208#endif
209
210 /* Merge the results and move to a GPR. */
211 vor v1, v3, v2
212 vor v2, v6, v7
213 vor v4, v1, v2
214 mfvrd r5, v4
215#ifdef __LITTLE_ENDIAN__
216 addi r6, r5, -1
217 andc r6, r6, r5
218 popcntd r6, r6
219#else
220 cntlzd r6, r5 /* Count leading zeros before the match. */
221#endif
222 add r3, r3, r6 /* Compute final length. */
223 /* Return NULL if null found before c. */
224#ifndef USE_AS_STRCHRNUL
225 lbz r4, 0(r3)
226 cmpdi cr7, r4, 0
227 beq cr7, L(no_match)
228#endif
229 blr
230
231#ifndef USE_AS_STRCHRNUL
232 .align 4
233L(no_match):
234 li r3,0
235 blr
236#endif
237
238/* We are here because strchr was called with a null byte. */
239 .align 4
240L(null_match):
241 /* r0 has a doubleword of null bytes. */
242
243 cmpb r5,r12,r0 /* Compare each byte against null bytes. */
244
245 /* Move the doublewords left and right to discard the bits that are
246 not part of the string and bring them back as zeros. */
247#ifdef __LITTLE_ENDIAN__
248 srd r5,r5,r6
249 sld r5,r5,r6
250#else
251 sld r5,r5,r6
252 srd r5,r5,r6
253#endif
254 cmpdi cr7,r5,0 /* If r10 == 0, no c or null bytes
255 have been found. */
256 bne cr7,L(done_null)
257
258 mtcrf 0x01,r8
259
260 /* Are we now aligned to a quadword boundary? If so, skip to
261 the main loop. Otherwise, go through the alignment code. */
262
263 bt 28,L(loop_null)
264
265 /* Handle WORD2 of pair. */
266 ldu r12,8(r8)
267 cmpb r5,r12,r0
268 cmpdi cr7,r5,0
269 bne cr7,L(done_null)
270 b L(loop_null) /* We branch here (rather than falling through)
271 to skip the nops due to heavy alignment
272 of the loop below. */
273
274 /* Main loop to look for the end of the string. Since it's a
275 small loop (< 8 instructions), align it to 32-bytes. */
276 .p2align 5
277L(loop_null):
278 /* Load two doublewords, compare and merge in a
279 single register for speed. This is an attempt
280 to speed up the null-checking process for bigger strings. */
281 ld r12,8(r8)
282 ldu r11,16(r8)
283 cmpb r5,r12,r0
284 cmpb r10,r11,r0
285 or r6,r5,r10
286 cmpdi cr7,r6,0
287 beq cr7,L(vector1)
288
289 /* OK, one (or both) of the doublewords contains a null byte. Check
290 the first doubleword and decrement the address in case the first
291 doubleword really contains a null byte. */
292
293 cmpdi cr6,r5,0
294 addi r8,r8,-8
295 bne cr6,L(done_null)
296
297 /* The null byte must be in the second doubleword. Adjust the address
298 again and move the result of cmpb to r10 so we can calculate the
299 pointer. */
300
301 mr r5,r10
302 addi r8,r8,8
303
304 /* r5 has the output of the cmpb instruction, that is, it contains
305 0xff in the same position as the null byte in the original
306 doubleword from the string. Use that to calculate the pointer. */
307L(done_null):
308#ifdef __LITTLE_ENDIAN__
309 addi r0,r5,-1
310 andc r0,r0,r5
311 popcntd r0,r0
312#else
313 cntlzd r0,r5 /* Count leading zeros before the match. */
314#endif
315 srdi r0,r0,3 /* Convert leading zeros to bytes. */
316 add r3,r8,r0 /* Return address of the matching null byte. */
317 blr
318 .p2align 5
319L(vector1):
320 addi r3, r8, 8
321 andi. r10, r3, 31
322 bne cr0, L(loop_null)
323 vspltisb v8, -1
324 vspltisb v0, 0
325 vspltisb v10, 3
326 lvsl v11, r0, r0
327 vslb v10, v11, v10
328 li r5, 16
329L(continue1):
330 lvx v4, 0, r3
331 lvx v5, r3, r5
332 vcmpequb v2, v0, v4
333 vcmpequb v3, v0, v5
334 vor v8, v2, v3
335 vcmpequb. v11, v0, v8
336 addi r3, r3, 32
337 blt cr6, L(continue1)
338 addi r3, r3, -32
339L(end1):
340 vbpermq v2, v2, v10
341 vbpermq v3, v3, v10
342 /* Shift each component into its correct position for merging. */
343#ifdef __LITTLE_ENDIAN__
344 vsldoi v3, v3, v3, 2
345#else
346 vsldoi v2, v2, v2, 6
347 vsldoi v3, v3, v3, 4
348#endif
349
350 /* Merge the results and move to a GPR. */
351 vor v4, v3, v2
352 mfvrd r5, v4
353#ifdef __LITTLE_ENDIAN__
354 addi r6, r5, -1
355 andc r6, r6, r5
356 popcntd r6, r6
357#else
358 cntlzd r6, r5 /* Count leading zeros before the match. */
359#endif
360 add r3, r3, r6 /* Compute final length. */
361 blr
362END (FUNC_NAME)
363
364#ifndef USE_AS_STRCHRNUL
365weak_alias (strchr, index)
366libc_hidden_builtin_def (strchr)
367#endif
368

source code of glibc/sysdeps/powerpc/powerpc64/power8/strchr.S