/* Optimized memrchr implementation for PowerPC64/POWER8.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void *[r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5])  */

#ifndef MEMRCHR
# define MEMRCHR __memrchr
#endif
	.machine  power8
ENTRY_TOCLESS (MEMRCHR)
	CALL_MCOUNT 3
	add	r7, r3, r5	/* Calculate the last acceptable address.  */
	neg	r0, r7
	addi	r7, r7, -1
	mr	r10, r3
	clrrdi	r6, r7, 7
	li	r9, 3<<5
	dcbt	r9, r6, 8	/* Stream hint, decreasing addresses.  */

	/* Replicate BYTE to doubleword.  */
	insrdi	r4, r4, 8, 48
	insrdi	r4, r4, 16, 32
	insrdi	r4, r4, 32, 0
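	/* Illustrative note (not part of the original source): after the
	   three insrdi instructions, if BYTE is e.g. 0x2f, r4 holds
	   0x2f2f2f2f2f2f2f2f, so a single cmpb can test all eight bytes of a
	   doubleword against BYTE at once.  */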
	li	r6, -8
	li	r9, -1
	rlwinm	r0, r0, 3, 26, 28 /* Calculate padding.  */
	clrrdi	r8, r7, 3
	srd	r9, r9, r0
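	/* Note (assumption): r8 is the aligned doubleword that holds the last
	   byte of the range, and r9 is now an all-ones value shifted right by
	   the padding so that the byte positions beyond s + size are zero.
	   ANDing r9 with the cmpb result below discards matches that fall
	   past the end of the range.  */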
	cmpldi	r5, 32
	clrrdi	r0, r10, 3
	ble	L(small_range)

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0	/* If r3 == 0, no BYTEs have been found.  */
	bne	cr7, L(done)
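	/* Illustrative sketch (assumption, not part of the original code):
	   the cmpb-based test above behaves roughly like the following C,
	   where "dword" is the doubleword just loaded, "byte" is the searched
	   character and "mask" is r9:

	     uint64_t m = 0;
	     for (int i = 0; i < 8; i++)
	       if (((dword >> (i * 8)) & 0xff) == byte)
	         m |= 0xffULL << (i * 8);
	     m &= mask;
	     if (m != 0)
	       goto done;    // some in-range byte of this doubleword matched
	*/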

	/* Are we now aligned to a quadword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */
	andi.	r12, r8, 15
	beq	cr0, L(align_qw)

	/* Handle DWORD2 of pair.  */
#ifdef __LITTLE_ENDIAN__
	ldx	r12, r8, r6
#else
	ldbrx	r12, r8, r6
#endif
	addi	r8, r8, -8
	cmpb	r3, r12, r4
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	.align	4
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	sub	r5, r8, r0
	vspltisb	v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	li	r0, 0
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	mtvrd	v1, r4
	vspltb	v1, v1, 7
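	/* Note (assumption): lvsl with a zero effective address yields the
	   sequence {0, 1, ..., 15}; shifting each element left by 3 gives
	   {0, 8, 16, ..., 120}, the bit index of the most significant bit of
	   each byte.  vbpermq with this selector (v10) gathers those 16 bits
	   into a single 16-bit mask, one bit per byte of a quadword.  v1 now
	   holds BYTE splatted across all 16 lanes for the vcmpequb compares
	   below.  */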
	cmpldi	r5, 64
	ble	L(tail64)
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary will necessarily slow down
	   performance for strings around 64 bytes in length due to the extra
	   comparisons required to check alignment for the vectorized loop.
	   This is a tradeoff we accept in order to speed up the calculation
	   for larger strings.  */
	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	/* The 64B loop requires a 64-byte aligned address, so step backwards
	   in 16B blocks until the address is 64B aligned.  */
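	/* Note (assumption): vcmpequb. v11, v0, v6 compares the all-zero
	   vector with the compare result and sets CR6; "bnl cr6" branches
	   when the two vectors are NOT equal in every element, i.e. when at
	   least one byte of the 16B block matched BYTE.  The same idiom is
	   used in the 64B loop and the tail code below.  */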
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16
	/* At this point it should be 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* Check if r5 <= 64.  */
	ble	L(tail64)
	srdi	r9, r5, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
	li	r9, 32
	li	r7, 48

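	/* Illustrative sketch (assumption, not part of the original code):
	   the 64B loop below scans backwards one 64-byte block per
	   iteration, roughly like:

	     while (iterations--)
	       {
	         p -= 64;
	         int any = 0;
	         for (int i = 0; i < 64; i++)     // done with four 16B
	           any |= (p[i] == byte);         // vector compares
	         if (any)
	           goto found;                    // L(found) locates the byte
	       }
	*/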
	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align	4
L(loop):
	addi	r8, r8, -64	/* Adjust address for the next iteration.  */
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	lvx	v3, r8, r11
	lvx	v4, r8, r9
	lvx	v5, r8, r7
	vcmpequb	v6, v1, v2
	vcmpequb	v7, v1, v3
	vcmpequb	v8, v1, v4
	vcmpequb	v9, v1, v5
	vor	v11, v6, v7
	vor	v12, v8, v9
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11, v0, v11
	bnl	cr6, L(found)
	bdnz	L(loop)
	clrldi	r5, r5, 58
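	/* Note (assumption): clrldi keeps the low 6 bits of r5, i.e. the
	   number of bytes still left between the aligned start and the
	   current r8 after the full 64B iterations; L(tail64) processes them
	   in 16B blocks.  */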

	/* Handle the remainder of the 64B loop, or sizes of at most 64
	   bytes.  */
	.align	4
L(tail64):
	cmpldi	r5, 0
	beq	L(null)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	li	r3, 0
	blr

	/* Found a match in 64B loop.  */
	.align	4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	vbpermq	v7, v7, v10
	vbpermq	v8, v8, v10
	vbpermq	v9, v9, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7, v7, v7, 2
	vsldoi	v8, v8, v8, 4
	vsldoi	v9, v9, v9, 6
#else
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
	vsldoi	v8, v8, v8, 2
#endif
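	/* Note (assumption): each vbpermq above packs one bit per byte of its
	   quadword into a 16-bit field; the vsldoi rotations place the four
	   fields in four distinct halfwords of one doubleword, so the
	   vor/mfvrd sequence below yields a single 64-bit mask with one bit
	   per byte of the 64B block.  */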
	/* Merge the results and move to a GPR.  */
	vor	v11, v6, v7
	vor	v4, v9, v8
	vor	v4, v11, v4
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	cntlzd	r6, r5	/* Count leading zeros before the match.  */
#else
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6
#endif
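	/* Note (assumption): on both endiannesses r6 now holds the distance,
	   in bytes, from the last byte of this 64B block to the last
	   matching byte, so r8 + 63 - r6 below is the address of that match;
	   the big-endian path uses (x - 1) & ~x plus popcntd as a count of
	   trailing zeros.  The cmpld/bgelr pair returns the address only if
	   it is not below the caller's starting pointer (r10); otherwise
	   NULL is returned.  */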
	addi	r8, r8, 63
	sub	r3, r8, r6	/* Compute final address.  */
	cmpld	cr7, r3, r10
	bgelr	cr7
	li	r3, 0
	blr

	/* Found a match in last 16 bytes.  */
	.align	4
L(found_16B):
	cmpld	r8, r10		/* Are we on the last QW?  */
	bge	L(last)
	/* Now discard bytes before starting address.  */
	sub	r9, r10, r8
	mtvrd	v9, r9
	vspltisb	v8, 3
	/* Mask unwanted bytes.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v7, 0, r10
	vperm	v6, v0, v6, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vslo	v6, v6, v9
#else
	lvsl	v7, 0, r10
	vperm	v6, v6, v0, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vsro	v6, v6, v9
#endif
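	/* Note (assumption): this quadword may extend below the starting
	   address s (r10).  r9 = s - r8 is the number of out-of-range bytes;
	   it is converted into a shift amount (vsl by 3 turns bytes into
	   bits) and the vector of compare results is rotated and shifted so
	   that positions belonging to bytes before s become zero and cannot
	   be reported as matches.  */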
L(last):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v6, v6, v6, 6
	mfvrd	r7, v6
	cntlzd	r6, r7	/* Count leading zeros before the match.  */
#else
	mfvrd	r7, v6
	addi	r6, r7, -1
	andc	r6, r6, r7
	popcntd	r6, r6
#endif
	addi	r8, r8, 15
	sub	r3, r8, r6	/* Compute final address.  */
	cmpld	r6, r5
	bltlr
	li	r3, 0
	blr

	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same byte positions as BYTE in the original
	   doubleword from the string.  Use that to calculate the pointer.
	   We need to make sure the matching BYTE is inside the range, i.e.
	   not before the start of the buffer.  */
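	/* Note (assumption): the doubleword is loaded so that the
	   lowest-addressed byte ends up in the least significant position
	   (ldx on LE, ldbrx on BE), so the leading zeros of r3 count the
	   bytes above the last match and r8 + 7 - (cntlzd (r3) >> 3) is the
	   address of that match.  The start-of-buffer check is only needed
	   for the doubleword that contains s itself (r8 == r0); bnelr
	   returns the computed pointer immediately otherwise.  */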
L(done):
	cntlzd	r9, r3		/* Count leading zeros before the match.  */
	cmpld	r8, r0		/* Are we on the last word?  */
	srdi	r6, r9, 3	/* Convert leading zeros to bytes.  */
	addi	r0, r6, -7
	sub	r3, r8, r0
	cmpld	cr7, r3, r10
	bnelr
	bgelr	cr7
	li	r3, 0
	blr

	.align	4
L(null):
	li	r3, 0
	blr

/* Deals with size <= 32.  */
	.align	4
L(small_range):
	cmpldi	r5, 0
	beq	L(null)

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	/* Are we done already?  */
	cmpld	r8, r0
	addi	r8, r8, -8
	beqlr
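	/* Illustrative sketch (assumption, not part of the original code):
	   the small-range path scans one doubleword at a time, highest
	   address first, roughly like:

	     p = (s + size - 1) & ~7;       // doubleword holding last byte
	     for (;;)
	       {
	         if (dword_has_byte (p))    // cmpb + mask; hypothetical helper
	           goto done;
	         if (p == ((uintptr_t) s & ~7))
	           return NULL;
	         p -= 8;
	       }
	*/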

	.align	5
L(loop_small):
#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8
#endif
	cmpb	r3, r12, r4
	cmpld	r8, r0
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)
	addi	r8, r8, -8
	bne	L(loop_small)
	blr

END (MEMRCHR)
libc_hidden_def (__memrchr)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)