/* Optimized memchr implementation for POWER8.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5])  */
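
/* Overview: the first (possibly partial) doubleword is checked with cmpb,
   masking off the bytes before S; for larger lengths the address is then
   aligned to a 16B and a 64B boundary so the main loop can compare 64
   bytes per iteration with vcmpequb/vbpermq.  Small lengths (n <= 32) and
   the tail are handled a doubleword or a quadword at a time.  */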

#ifndef MEMCHR
# define MEMCHR __memchr
#endif
	.machine  power8
ENTRY_TOCLESS (MEMCHR)
	CALL_MCOUNT 3
	dcbt	0, r3
	clrrdi	r8, r3, 3	/* Align the address to a doubleword boundary.  */
	insrdi	r4, r4, 8, 48	/* Splat c: byte -> halfword.  */

	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   r7 = r3 + r5
	   r7 |= -(r7 < r3)  */
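	/* In the sequence below, subfc sets CA iff the addition did not
	   wrap (r7 >= r3); subfe r9,r9,r9 then yields 0 on no overflow and
	   -1 on overflow, so the final 'or' saturates r7 to all ones when
	   r3 + r5 wrapped around.  */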
	add	r7, r3, r5
	subfc	r6, r3, r7
	subfe	r9, r9, r9
	extsw	r6, r9
	or	r7, r7, r6

	insrdi	r4, r4, 16, 32	/* Splat c: halfword -> word.  */
	cmpldi	r5, 32
	li	r9, -1
	rlwinm	r6, r3, 3, 26, 28	/* Calculate padding: (r3 & 7) * 8.  */
	insrdi	r4, r4, 32, 0	/* Splat c: word -> doubleword.  */
	mr	r10, r7
	addi	r7, r7, -1
#ifdef __LITTLE_ENDIAN__
	sld	r9, r9, r6
#else
	srd	r9, r9, r6
#endif
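	/* r9 now selects only the bytes at or after S within its doubleword,
	   masking off any cmpb matches in the padding bytes before S.  */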
	ble	L(small_range)
	andi.	r11, r3, 63
	beq	cr0, L(align_qw)
	clrldi	r11, r3, 61	/* r11 = s & 7.  */
	ld	r12, 0(r8)	/* Load doubleword from memory.  */
	cmpb	r3, r12, r4	/* Check for BYTEs in DWORD1.  */
	and	r3, r3, r9
	clrldi	r6, r7, 61	/* Byte count - 1 in last dword.  */
	clrrdi	r7, r7, 3	/* Address of last doubleword.  */
	cmpldi	cr7, r3, 0	/* Does r3 indicate we got a hit?  */
	bne	cr7, L(done)
	addi	r8, r8, 8
	addi	r5, r5, -8
	add	r5, r5, r11	/* Net effect: r5 -= 8 - (s & 7) bytes consumed.  */

	/* Are we now aligned to a quadword boundary?  */
	andi.	r11, r8, 15
	beq	cr0, L(align_qw)

	/* Handle another DWORD to make it QW aligned.  */
	ld	r12, 0(r8)
	cmpb	r3, r12, r4
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)
	addi	r5, r5, -8
	addi	r8, r8, 8
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	vspltisb	v0, 0	/* Zero VR, used to test compare results for a hit.  */
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	li	r0, 0
	lvsl	v11, r0, r0	/* v11 = {0, 1, ..., 15}.  */
	vslb	v10, v11, v10	/* v10 = {0, 8, ..., 120}.  */
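	/* v10 holds the bit index of the most significant bit of each byte
	   (0, 8, ..., 120), so vbpermq below gathers the MSB of every byte
	   of a vcmpequb result into a 16-bit match mask.  */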
	mtvrd	v1, r4
	vspltb	v1, v1, 7	/* Splat c into all 16 bytes of v1.  */
	cmpldi	r5, 64
	ble	L(tail64)
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary will necessarily slow down
	   performance for strings around 64 bytes in length due to the extra
	   comparisons required to check alignment for the vectorized loop.
	   This is a necessary tradeoff we are willing to take in order to
	   speed up the calculation for larger strings.  */
	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	/* The 64B loop requires a 64-byte aligned address, so check 16 bytes
	   at a time until r8 is 64B aligned.  */
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)	/* Branch if any byte matched c.  */
	addi	r8, r8, 16
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16
	/* At this point r8 is 64B aligned.  Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* If 64 bytes or fewer remain...  */
	ble	L(tail64)	/* ...handle them in the tail code.  */
	sub	r6, r10, r8	/* Bytes left up to the saturated end (r10).  */
	srdi	r9, r6, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
	li	r9, 32
	li	r7, 48
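	/* The loop processes r6 / 64 full 64-byte blocks; the remaining
	   r6 mod 64 bytes are left for L(tail64).  The offsets in r11, r9
	   and r7 address the 2nd, 3rd and 4th quadword of each block.  */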

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align	4
L(loop):
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	lvx	v3, r8, r11
	lvx	v4, r8, r9
	lvx	v5, r8, r7
	vcmpequb	v6, v1, v2
	vcmpequb	v7, v1, v3
	vcmpequb	v8, v1, v4
	vcmpequb	v9, v1, v5
	vor	v11, v6, v7
	vor	v12, v8, v9
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11, v0, v11
	bnl	cr6, L(found)
	addi	r8, r8, 64	/* Adjust address for the next iteration.  */
	bdnz	L(loop)
	clrldi	r5, r6, 58	/* Bytes remaining after the loop: r6 mod 64.  */

	/* Handle the remainder of the 64B loop, or r5 <= 64.  */
	.align	4
L(tail64):
	cmpldi	r5, 0
	beq	L(null)
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	li	r3, 0
	blr

	/* Found a match in 64B loop.  */
	.align	4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	vbpermq	v7, v7, v10
	vbpermq	v8, v8, v10
	vbpermq	v9, v9, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7, v7, v7, 2
	vsldoi	v8, v8, v8, 4
	vsldoi	v9, v9, v9, 6
#else
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
	vsldoi	v8, v8, v8, 2
#endif
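	/* Each vbpermq result is a 16-bit mask; the shifts above place the
	   four masks in distinct 16-bit fields so that, once merged and
	   moved to a GPR, the position of the first set bit gives the byte
	   offset of the match within the 64-byte block.  */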
	/* Merge the results and move to a GPR.  */
	vor	v11, v6, v7
	vor	v4, v9, v8
	vor	v4, v11, v4
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6	/* Count trailing zeros before the match.  */
#else
	cntlzd	r6, r5	/* Count leading zeros before the match.  */
#endif
	add	r3, r8, r6	/* Compute the address of the match.  */
	blr

	/* Found a match in the last 16 bytes checked.  */
	.align	4
L(found_16B):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	/* Move the 16-bit mask to a GPR and locate the first match bit.  */
#ifdef __LITTLE_ENDIAN__
	mfvrd	r7, v6
	addi	r6, r7, -1
	andc	r6, r6, r7
	popcntd	r6, r6	/* Count trailing zeros before the match.  */
#else
	vsldoi	v6, v6, v6, 6
	mfvrd	r7, v6
	cntlzd	r6, r7	/* Count leading zeros before the match.  */
#endif
	add	r3, r8, r6	/* Compute the address of the match.  */
	cmpld	r6, r5		/* Is the match within the remaining bytes?  */
	bltlr
	li	r3, 0		/* Match is past the end of the range.  */
	blr

	.align	4
	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   doubleword from the string.  Use that to calculate the pointer.
	   We need to make sure BYTE is *before* the end of the range.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r0, r3, -1
	andc	r0, r0, r3
	popcntd	r0, r0	/* Count trailing zeros.  */
#else
	cntlzd	r0, r3	/* Count leading zeros before the match.  */
#endif
	cmpld	r8, r7	/* Are we on the last dword?  */
	srdi	r0, r0, 3	/* Convert leading/trailing zeros to bytes.  */
	add	r3, r8, r0
	cmpld	cr7, r0, r6	/* If on the last dword, check byte offset.  */
	bnelr			/* Not on the last dword: the match is valid.  */
	blelr	cr7		/* On the last dword and within range.  */
	li	r3, 0
	blr

	.align	4
L(null):
	li	r3, 0
	blr

/* Deals with size <= 32.  */
	.align	4
L(small_range):
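	/* With n <= 32 the string spans at most five doublewords (the first
	   may be only partially inside the range), so check one doubleword
	   at a time, stopping at the address of the last doubleword (r7).  */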
	cmpldi	r5, 0
	beq	L(null)
	ld	r12, 0(r8)	/* Load doubleword from memory.  */
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0
	clrldi	r6, r7, 61	/* Byte count - 1 in last dword.  */
	clrrdi	r7, r7, 3	/* Address of last doubleword.  */
	cmpld	r8, r7		/* Are we done already?  */
	bne	cr7, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)	/* Found something.  */
	beqlr			/* Hit end of string (length).  */

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	bne	cr6, L(done)
	blr

END (MEMCHR)
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)