/* Optimized memrchr implementation for PowerPC64/POWER8.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void *[r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5])  */

#ifndef MEMRCHR
# define MEMRCHR __memrchr
#endif
	.machine power8
ENTRY_TOCLESS (MEMRCHR)
	CALL_MCOUNT 3
	add	r7, r3, r5	/* Calculate the last acceptable address.  */
	neg	r0, r7
	addi	r7, r7, -1
	mr	r10, r3
	clrrdi	r6, r7, 7
	li	r9, 3<<5
	dcbt	r9, r6, 16	/* Stream hint, decreasing addresses.  */

	/* Replicate BYTE to doubleword.  */
	insrdi	r4, r4, 8, 48
	insrdi	r4, r4, 16, 32
	insrdi	r4, r4, 32, 0
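
	/* In C terms the replication above is roughly (a sketch; c8
	   stands for r4):

	       uint64_t c8 = byte & 0xff;
	       c8 |= c8 << 8;
	       c8 |= c8 << 16;
	       c8 |= c8 << 32;  */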
	li	r6, -8
	li	r9, -1
	rlwinm	r0, r0, 3, 26, 28	/* Calculate padding.  */
	clrrdi	r8, r7, 3
	srd	r9, r9, r0
	cmpldi	r5, 32
	clrrdi	r0, r10, 3
	ble	L(small_range)
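
	/* A sketch in C of the setup above (variable names are
	   illustrative and mirror the registers; s = r10, size = r5):

	       uintptr_t last = (uintptr_t) s + size - 1;	    /. r7
	       unsigned int pad = (-((uintptr_t) s + size) & 7) * 8;
	       uint64_t mask = ~0ULL >> pad;	// r9: drop bytes past LAST
	       char *p = (char *) (last & ~7UL);		    // r8
	       char *begin_dw = (char *) ((uintptr_t) s & ~7UL);    // r0

	   The loads below place higher addresses in more significant
	   bytes, so MASK clears exactly the bytes beyond the buffer.  */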

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0	/* If r3 == 0, no BYTEs have been found.  */
	bne	cr7, L(done)

	/* Are we now aligned to a quadword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */
	andi.	r12, r8, 15
	beq	cr0, L(align_qw)

	/* Handle DWORD2 of pair.  */
#ifdef __LITTLE_ENDIAN__
	ldx	r12, r8, r6
#else
	ldbrx	r12, r8, r6
#endif
	addi	r8, r8, -8
	cmpb	r3, r12, r4
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	.align	4
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	sub	r5, r8, r0
	vspltisb	v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	li	r0, 0
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	mtvrd	v1, r4
	vspltb	v1, v1, 7
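
	/* After the sequence above, v10 holds the bit indices
	   0x00, 0x08, ..., 0x78: lvsl with a zero address yields the
	   bytes 0, 1, ..., 15, and vslb by 3 multiplies each by 8.
	   Used as the vbpermq control, those indices gather bit 0 (the
	   most significant bit) of each byte, i.e. one bit per 0x00/0xff
	   lane of a vcmpequb result.  v1 holds BYTE splatted to all 16
	   lanes.  */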
	cmpldi	r5, 64
	ble	L(tail64)
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary necessarily slows down
	   strings of around 64 bytes in length, due to the extra
	   comparisons required to check alignment for the vectorized
	   loop.  This is a tradeoff we accept in order to speed up the
	   calculation for larger strings.  */
	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	/* The 64B loop requires a 64-byte-aligned address.  Read 16B
	   chunks backwards until r8 is 64B aligned.  */
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16
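
	/* The three unrolled blocks above behave like (a sketch; p = r8,
	   len = r5, and check16 stands for the lvx/vcmpequb pair):

	       while ((p & 63) != 0)
	         {
	           p -= 16;
	           if (check16 (p))
	             goto found_16B;
	           len -= 16;
	         }

	   At most three iterations can occur, since p is already 16B
	   aligned on entry.  */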
	/* At this point r8 is 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* Check if at most 64B remain.  */
	ble	L(tail64)
	srdi	r9, r5, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
	li	r9, 32
	li	r7, 48

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align	4
L(loop):
	addi	r8, r8, -64	/* Adjust address for the next iteration.  */
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	lvx	v3, r8, r11
	lvx	v4, r8, r9
	lvx	v5, r8, r7
	vcmpequb	v6, v1, v2
	vcmpequb	v7, v1, v3
	vcmpequb	v8, v1, v4
	vcmpequb	v9, v1, v5
	vor	v11, v6, v7
	vor	v12, v8, v9
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11, v0, v11
	bnl	cr6, L(found)
	bdnz	L(loop)
	clrldi	r5, r5, 58
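
	/* In C terms the loop above is roughly (a sketch; p = r8,
	   len = r5, and check64 stands for the four lvx/vcmpequb pairs
	   ORed together so a single branch tests all 64 bytes):

	       for (iters = len >> 6; iters != 0; iters--)
	         {
	           p -= 64;
	           if (check64 (p))
	             goto found;
	         }
	       len &= 63;	// clrldi r5, r5, 58  */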

	/* Handle the remainder of the 64B loop, or sizes of at most
	   64 bytes.  */
	.align	4
L(tail64):
	cmpldi	r5, 0
	beq	L(null)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	li	r3, 0
	blr

	/* Found a match in 64B loop.  */
	.align	4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	vbpermq	v7, v7, v10
	vbpermq	v8, v8, v10
	vbpermq	v9, v9, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7, v7, v7, 2
	vsldoi	v8, v8, v8, 4
	vsldoi	v9, v9, v9, 6
#else
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
	vsldoi	v8, v8, v8, 2
#endif
	/* Merge the results and move to a GPR.  */
	vor	v11, v6, v7
	vor	v4, v9, v8
	vor	v4, v11, v4
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	cntlzd	r6, r5	/* Count leading zeros before the match.  */
#else
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6
#endif
	addi	r8, r8, 63
	sub	r3, r8, r6	/* Compute final address.  */
	cmpld	cr7, r3, r10
	bgelr	cr7
	li	r3, 0
	blr
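
	/* The vbpermq/vsldoi merge above leaves a 64-bit mask in r5 with
	   one bit per byte of the 64B block.  On little-endian the most
	   significant set bit corresponds to the highest matching
	   address, so the index computation is, in effect (a sketch):

	       idx = 63 - clz (mask);
	       ret = p + idx;		// addi r8, r8, 63; sub r3, r8, r6
	       return ret >= s ? ret : NULL;

	   The big-endian path uses the mirrored bit order, hence the
	   count-trailing-zeros idiom ((x - 1) & ~x, then popcount).  */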

	/* Found a match in last 16 bytes.  */
	.align	4
L(found_16B):
	cmpld	r8, r10		/* Are we on the last QW?  */
	bge	L(last)
	/* Now discard bytes before starting address.  */
	sub	r9, r10, r8
	mtvrd	v9, r9
	vspltisb	v8, 3
	/* Mask unwanted bytes.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v7, 0, r10
	vperm	v6, v0, v6, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vslo	v6, v6, v9
#else
	lvsl	v7, 0, r10
	vperm	v6, v6, v0, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vsro	v6, v6, v9
#endif
L(last):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v6, v6, v6, 6
	mfvrd	r7, v6
	cntlzd	r6, r7	/* Count leading zeros before the match.  */
#else
	mfvrd	r7, v6
	addi	r6, r7, -1
	andc	r6, r6, r7
	popcntd	r6, r6
#endif
	addi	r8, r8, 15
	sub	r3, r8, r6	/* Compute final address.  */
	cmpld	r3, r10
	bgelr
	li	r3, 0
	blr
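
	/* Same computation as L(found), restricted to one quadword: when
	   the quadword extends below the starting address, the bytes
	   before s are first shifted out of v6, then r6 ends up holding
	   the distance in bytes from the top of the quadword to the last
	   match, so in effect (a sketch):

	       ret = p + 15 - dist;	// addi r8, r8, 15; sub r3, r8, r6
	       return ret >= s ? ret : NULL;  */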

/* r3 has the output of the cmpb instruction, that is, it contains
   0xff in the same position as BYTE in the original doubleword from
   the string.  Use that to calculate the pointer.  We need to make
   sure BYTE is *before* the end of the range.  */
L(done):
	cntlzd	r9, r3	/* Count leading zeros before the match.  */
	cmpld	r8, r0	/* Are we on the last word?  */
	srdi	r6, r9, 3	/* Convert leading zeros to bytes.  */
	addi	r0, r6, -7
	sub	r3, r8, r0
	cmpld	cr7, r3, r10
	bnelr
	bgelr	cr7
	li	r3, 0
	blr
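
	/* A sketch of the scalar computation above (m is the cmpb
	   result, with higher addresses in more significant bytes;
	   p = r8):

	       zbytes = clz (m) / 8;	// zero bytes above the match
	       ret = p + 7 - zbytes;	// addi r0, r6, -7; sub r3, r8, r0
	       return ret >= s ? ret : NULL;

	   The "are we on the last word" test skips the range check for
	   doublewords that lie entirely inside the buffer.  */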

	.align	4
L(null):
	li	r3, 0
	blr

/* Deals with size <= 32.  */
	.align	4
L(small_range):
	cmpldi	r5, 0
	beq	L(null)

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	/* Are we done already?  */
	cmpld	r8, r0
	addi	r8, r8, -8
	beqlr

	.align	5
L(loop_small):
#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8
#endif
	cmpb	r3, r12, r4
	cmpld	r8, r0
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)
	addi	r8, r8, -8
	bne	L(loop_small)
	blr
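
	/* The small-range path above in C terms (a sketch; p = r8,
	   begin_dw = r0, and mask/c8 are as in the setup sketch):

	       m = cmpb (load_reversed (p), c8) & mask;	// last doubleword
	       if (m != 0)
	         goto done;
	       while (p != begin_dw)
	         {
	           p -= 8;
	           m = cmpb (load_reversed (p), c8);
	           if (m != 0)
	             goto done;
	         }
	       return NULL;	// r3 still holds 0 from the last cmpb  */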

END (MEMRCHR)
libc_hidden_def (__memrchr)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)