1 | /* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8. |
2 | Copyright (C) 2015-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | #ifdef USE_AS_STPCPY /* Building the stpcpy variant. */ |
22 | # ifndef STPCPY |
23 | # define FUNC_NAME __stpcpy /* Default symbol name for the stpcpy build. */ |
24 | # else |
25 | # define FUNC_NAME STPCPY /* Use the name supplied through STPCPY. */ |
26 | # endif |
27 | #else /* Building the strcpy variant. */ |
28 | # ifndef STRCPY |
29 | # define FUNC_NAME strcpy /* Default symbol name for the strcpy build. */ |
30 | # else |
31 | # define FUNC_NAME STRCPY /* Use the name supplied through STRCPY. */ |
32 | # endif |
33 | #endif /* !USE_AS_STPCPY */ |
34 | |
35 | /* Implements the function |
36 | |
37 | char * [r3] strcpy (char *dest [r3], const char *src [r4]) |
38 | |
39 | or |
40 | |
41 | char * [r3] stpcpy (char *dest [r3], const char *src [r4]) |
42 | |
43 | if USE_AS_STPCPY is defined. |
44 | |
45 | The implementation uses unaligned doubleword accesses to avoid specialized |
46 | code paths depending on data alignment. Although recent powerpc64 systems |
47 | use 64K pages by default, the page-cross handling assumes a minimum page |
48 | size of 4K. */ |
49 | |
50 | .machine power8 |
51 | ENTRY_TOCLESS (FUNC_NAME, 4) |
52 | li r0,0 /* Doubleword with null chars to use |
53 | with cmpb. */ |
54 | /* r0 is not written again below; it also serves as the zero index for lvx/stvx. */ |
55 | /* Check if the [src]+15 will cross a 4K page by checking if the bit |
56 | indicating the page size changes. Basically: |
57 | |
58 | uint64_t srcin = (uint64_t)src; |
59 | uint64_t ob = srcin & 4096UL; |
60 | uint64_t nb = (srcin+15UL) & 4096UL; |
61 | if (ob ^ nb) |
62 | goto pagecross; */ |
63 | |
64 | addi r9,r4,15 /* r9 = src + 15. */ |
65 | xor r9,r9,r4 /* Bits that differ between src and src + 15. */ |
66 | rlwinm. r9,r9,0,19,19 /* Keep only the 4096 bit; the record form sets cr0. */ |
67 | bne L(pagecross) /* Bit changed: a 16-byte read from src would cross a 4K boundary. */ |
68 | |
69 | /* For a short string (less than 16 bytes), just calculate its size as |
70 | strlen would and copy it memcpy-style once the null is found. */ |
71 | mr r7,r4 /* r7 = read cursor, starts at src. */ |
72 | ld r12,0(r7) /* Load doubleword from memory. */ |
73 | cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ |
74 | cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ |
75 | bne cr7,L(done) /* Null within src[0..7]. */ |
76 | |
77 | ldu r8,8(r7) /* r8 = src[8..15]; r7 advances to src + 8. */ |
78 | cmpb r10,r8,r0 |
79 | cmpdi cr7,r10,0 |
80 | bne cr7,L(done) /* Null within src[8..15]. */ |
81 | |
82 | b L(loop_before) /* No null in the first 16 bytes; r12/r8 still hold them. */ |
83 | |
84 | .align 4 |
85 | L(pagecross): |
86 | clrrdi r7,r4,3 /* Align the address to doubleword boundary. */ |
87 | rlwinm r6,r4,3,26,28 /* Calculate padding: r6 = (src & 7) * 8 bits. */ |
88 | li r5,-1 /* MASK = 0xffffffffffffffff. */ |
89 | ld r12,0(r7) /* Load doubleword from memory. */ |
90 | #ifdef __LITTLE_ENDIAN__ |
91 | sld r5,r5,r6 /* MASK = MASK << padding. */ |
92 | #else |
93 | srd r5,r5,r6 /* MASK = MASK >> padding. */ |
94 | #endif |
95 | orc r9,r12,r5 /* r9 = r12 | ~MASK: bytes before src become 0xff so they cannot match. */ |
96 | cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ |
97 | cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ |
98 | bne cr7,L(done) /* Null in the first (partial) doubleword. */ |
99 | |
100 | ldu r6,8(r7) /* Second aligned doubleword; r7 advances by 8. */ |
101 | cmpb r10,r6,r0 |
102 | cmpdi cr7,r10,0 |
103 | bne cr7,L(done) |
104 | |
105 | ld r12,0(r7) /* Offset 0: re-reads the doubleword the ldu above just loaded. */ |
106 | cmpb r10,r12,r0 |
107 | cmpdi cr7,r10,0 |
108 | bne cr7,L(done) |
109 | |
110 | ldu r6,8(r7) /* Third aligned doubleword; r7 advances by 8. */ |
111 | cmpb r10,r6,r0 |
112 | cmpdi cr7,r10,0 |
113 | bne cr7,L(done) |
114 | |
115 | /* We checked 24 - x bytes, with x being the offset of the source within |
116 | its doubleword (0 <= x <= 7), and no zero has been found. Reload the |
117 | first 16 bytes from the unaligned source address for the copy. */ |
118 | mr r7,r4 /* Restart the read cursor at src. */ |
119 | ld r12, 0(r7) /* r12 = src[0..7]. */ |
120 | ldu r8, 8(r7) /* r8 = src[8..15]; r7 = src + 8. */ |
121 | |
122 | L(loop_before): |
123 | /* Save the two doublewords read from source and align the source |
124 | to 16 bytes for the loop. */ |
125 | mr r11,r3 /* r11 = write cursor, starts at dest. */ |
126 | std r12,0(r11) |
127 | std r8,8(r11) |
128 | addi r11,r11,16 |
129 | rldicl r9,r4,0,60 /* r9 = src & 15. */ |
130 | subf r7,r9,r7 /* r7 -= r9. */ |
131 | subf r11,r9,r11 /* r11 -= r9, keeping it in step with the source. */ |
132 | /* Source is adjusted to 16B alignment and destination r11 is |
133 | also moved based on that adjustment. Now check if r11 is |
134 | also 16B aligned to move to vectorized loop. */ |
135 | andi. r6, r11, 0xF |
136 | bne L(loop_start) /* r11 not 16B aligned: use the doubleword loop instead. */ |
137 | |
138 | /* Prepare for the loop. */ |
139 | subf r4, r9, r4 /* Adjust r4 based on alignment. */ |
140 | li r7, 16 /* Load required offsets. */ |
141 | li r8, 32 |
142 | li r9, 48 |
143 | vspltisb v0, 0 /* v0 = sixteen zero bytes for the vector null checks. */ |
144 | addi r4, r4, 16 /* r4 = (src & ~15) + 16. */ |
145 | /* Are we 64-byte aligned? If so, jump to the vectorized loop. |
146 | Else copy 16B till r4 is 64B aligned. */ |
147 | andi. r6, r4, 63 |
148 | beq L(qw_loop) |
149 | |
150 | lvx v6, 0, r4 /* Load 16 bytes from memory. */ |
151 | vcmpequb. v5, v0, v6 /* Check for null. */ |
152 | bne cr6, L(qw_done) /* Taken when some byte of v6 is zero. */ |
153 | stvx v6, 0, r11 /* Store 16 bytes. */ |
154 | addi r4, r4, 16 /* Increment the address. */ |
155 | addi r11, r11, 16 |
156 | andi. r6, r4, 63 |
157 | beq L(qw_loop) |
158 | |
159 | lvx v6, 0, r4 /* Second 16-byte alignment step. */ |
160 | vcmpequb. v5, v0, v6 |
161 | bne cr6, L(qw_done) |
162 | stvx v6, 0, r11 |
163 | addi r4, r4, 16 |
164 | addi r11, r11, 16 |
165 | andi. r6, r4, 63 |
166 | beq L(qw_loop) |
167 | |
168 | lvx v6, 0, r4 /* Third and last 16-byte alignment step. */ |
169 | vcmpequb. v5, v0, v6 |
170 | bne cr6, L(qw_done) |
171 | stvx v6, 0, r11 |
172 | addi r4, r4, 16 |
173 | addi r11, r11, 16 |
174 | |
175 | .align 4 |
176 | L(qw_loop): |
177 | lvx v1, r4, r0 /* Load 4 quadwords. */ |
178 | lvx v2, r4, r7 |
179 | lvx v3, r4, r8 |
180 | lvx v4, r4, r9 |
181 | vminub v5, v1, v2 /* Compare and merge into one VR for speed. */ |
182 | vminub v8, v3, v4 |
183 | vminub v7, v5, v8 /* A zero byte in any input survives the unsigned byte minimum. */ |
184 | vcmpequb. v7, v7, v0 /* Check for NULLs. */ |
185 | bne cr6, L(qw_loop_done) /* Null somewhere in these 64 bytes; none of them stored yet. */ |
186 | stvx v1, r11, r0 /* Store 4 quadwords. */ |
187 | stvx v2, r11, r7 |
188 | stvx v3, r11, r8 |
189 | stvx v4, r11, r9 |
190 | addi r4, r4, 64 /* Adjust address for the next iteration. */ |
191 | addi r11, r11, 64 /* Adjust address for the next iteration. */ |
192 | /* Second unrolled copy of the same 64-byte step. */ |
193 | lvx v1, r4, r0 /* Load 4 quadwords. */ |
194 | lvx v2, r4, r7 |
195 | lvx v3, r4, r8 |
196 | lvx v4, r4, r9 |
197 | vminub v5, v1, v2 /* Compare and merge into one VR for speed. */ |
198 | vminub v8, v3, v4 |
199 | vminub v7, v5, v8 |
200 | vcmpequb. v7, v7, v0 /* Check for NULLs. */ |
201 | bne cr6, L(qw_loop_done) |
202 | stvx v1, r11, r0 /* Store 4 quadwords. */ |
203 | stvx v2, r11, r7 |
204 | stvx v3, r11, r8 |
205 | stvx v4, r11, r9 |
206 | addi r4, r4, 64 /* Adjust address for the next iteration. */ |
207 | addi r11, r11, 64 /* Adjust address for the next iteration. */ |
208 | /* Third unrolled copy of the same 64-byte step. */ |
209 | lvx v1, r4, r0 /* Load 4 quadwords. */ |
210 | lvx v2, r4, r7 |
211 | lvx v3, r4, r8 |
212 | lvx v4, r4, r9 |
213 | vminub v5, v1, v2 /* Compare and merge into one VR for speed. */ |
214 | vminub v8, v3, v4 |
215 | vminub v7, v5, v8 |
216 | vcmpequb. v7, v7, v0 /* Check for NULLs. */ |
217 | bne cr6, L(qw_loop_done) |
218 | stvx v1, r11, r0 /* Store 4 quadwords. */ |
219 | stvx v2, r11, r7 |
220 | stvx v3, r11, r8 |
221 | stvx v4, r11, r9 |
222 | addi r4, r4, 64 /* Adjust address for the next iteration. */ |
223 | addi r11, r11, 64 /* Adjust address for the next iteration. */ |
224 | b L(qw_loop) |
225 | |
226 | .align 4 |
227 | L(qw_loop_done): |
228 | /* Null found in one of the 4 loads. */ |
229 | vcmpequb. v7, v1, v0 |
230 | vor v6, v1, v1 /* v6 = v1: the quadword L(qw_done) will examine. */ |
231 | bne cr6, L(qw_done) |
232 | /* Not on the first 16B, So store it. */ |
233 | stvx v1, r11, r0 |
234 | addi r4, r4, 16 |
235 | addi r11, r11, 16 |
236 | vcmpequb. v7, v2, v0 |
237 | vor v6, v2, v2 /* v6 = v2. */ |
238 | bne cr6, L(qw_done) |
239 | /* Not on the second 16B, So store it. */ |
240 | stvx v2, r11, r0 |
241 | addi r4, r4, 16 |
242 | addi r11, r11, 16 |
243 | vcmpequb. v7, v3, v0 |
244 | vor v6, v3, v3 /* v6 = v3. */ |
245 | bne cr6, L(qw_done) |
246 | /* Not on the third 16B, So store it. */ |
247 | stvx v6, r11, r0 /* v6 still equals v3 here. */ |
248 | addi r4, r4, 16 |
249 | addi r11, r11, 16 |
250 | vor v6, v4, v4 /* Only the fourth quadword is left, so the null is there. */ |
251 | /* Execution continues into L(qw_done) below. */ |
252 | .align 4 |
253 | L(qw_done): |
254 | mr r7, r4 /* r7 = address of the quadword held in v6. */ |
255 | /* Move the result to GPR. */ |
256 | #ifdef __LITTLE_ENDIAN__ |
257 | vsldoi v4, v6, v0, 8 /* v4 = v6 shifted left 8 bytes, zero filled from v0. */ |
258 | mfvrd r12, v4 /* r12 = the 8 bytes at the lower address. */ |
259 | #else |
260 | mfvrd r12, v6 |
261 | #endif |
262 | /* Check for null in the first 8 bytes. */ |
263 | cmpb r10, r12, r0 |
264 | cmpdi cr6, r10, 0 |
265 | bne cr6, L(done2) /* Null in the first doubleword; r7 == r4. */ |
266 | /* Null found in second doubleword. */ |
267 | #ifdef __LITTLE_ENDIAN__ |
268 | mfvrd r6, v6 |
269 | #else |
270 | vsldoi v6, v6, v0, 8 |
271 | mfvrd r6, v6 |
272 | #endif |
273 | cmpb r10, r6, r0 /* r10 = match mask for the second doubleword. */ |
274 | addi r7, r7, 8 /* Point r7 at the second doubleword. */ |
275 | b L(done2) |
276 | |
277 | .align 5 |
278 | L(loop): |
279 | std r12, 0(r11) /* Store the pair that was just checked. */ |
280 | std r6, 8(r11) |
281 | addi r11,r11,16 |
282 | L(loop_start): |
283 | /* Load two doublewords, compare and merge in a |
284 | single register for speed. This is an attempt |
285 | to speed up the null-checking process for bigger strings. */ |
286 | |
287 | ld r12, 8(r7) /* First doubleword of the pair. */ |
288 | ldu r6, 16(r7) /* Second doubleword; r7 advances by 16 and points at it. */ |
289 | cmpb r10,r12,r0 |
290 | cmpb r9,r6,r0 |
291 | or r8,r9,r10 /* Merge everything in one doubleword. */ |
292 | cmpdi cr7,r8,0 |
293 | beq cr7,L(loop) /* No null in either doubleword: store them and continue. */ |
294 | |
295 | |
296 | /* OK, one (or both) of the doublewords contains a null byte. Check |
297 | the first doubleword and decrement the address in case the first |
298 | doubleword really contains a null byte. */ |
299 | |
300 | addi r4,r7,-8 /* r4 = address of the first doubleword: the tail copy starts here. */ |
301 | cmpdi cr6,r10,0 |
302 | addi r7,r7,-8 /* r7 = same address, so r7 - r4 is 0 at L(done2). */ |
303 | bne cr6,L(done2) |
304 | |
305 | /* The null byte must be in the second doubleword. Adjust the address |
306 | again and move the result of cmpb to r10 so we can calculate the |
307 | length. */ |
308 | |
309 | mr r10,r9 |
310 | addi r7,r7,8 |
311 | b L(done2) |
312 | |
313 | /* r10 has the output of the cmpb instruction, that is, it contains |
314 | 0xff in the same position as the null byte in the original |
315 | doubleword from the string. Use that to calculate the length. */ |
316 | L(done): |
317 | mr r11,r3 /* Nothing stored yet: the tail copy goes from src (r4) to dest. */ |
318 | L(done2): |
319 | #ifdef __LITTLE_ENDIAN__ |
320 | addi r9, r10, -1 /* Form a mask from trailing zeros. */ |
321 | andc r9, r9, r10 |
322 | popcntd r6, r9 /* Count the bits in the mask. */ |
323 | #else |
324 | cntlzd r6,r10 /* Count leading zeros before the match. */ |
325 | #endif |
326 | subf r5,r4,r7 /* r5 = r7 - r4: offset of the matching doubleword from the copy start. */ |
327 | srdi r6,r6,3 /* Convert leading/trailing zeros to bytes. */ |
328 | add r8,r5,r6 /* Compute final length. */ |
329 | #ifdef USE_AS_STPCPY |
330 | /* stpcpy returns the dest address plus the size not counting the |
331 | final '\0'. */ |
332 | add r3,r11,r8 |
333 | #endif |
334 | addi r8,r8,1 /* Final '\0'. */ |
335 | /* Without USE_AS_STPCPY r3 is never written, so dest is returned unchanged. */ |
336 | cmpldi cr6,r8,8 /* r8 = bytes left to copy, including the null. */ |
337 | mtocrf 0x01,r8 /* cr7 = low four bits of r8, tested below by bf 28..31. */ |
338 | ble cr6,L(copy_LE_8) |
339 | |
340 | cmpldi cr1,r8,16 |
341 | blt cr1,8f /* 9..15 bytes: skip the 16-byte copy. */ |
342 | |
343 | /* Handle copies of 0~31 bytes. */ |
344 | .align 4 |
345 | L(copy_LT_32): |
346 | /* Fewer than 16 bytes: skip the 16-byte copy. On the fall-through path r8 >= 16. */ |
347 | blt cr1,8f |
348 | |
349 | /* Copy 16 bytes. */ |
350 | ld r6,0(r4) |
351 | ld r8,8(r4) |
352 | addi r4,r4,16 |
353 | std r6,0(r11) |
354 | std r8,8(r11) |
355 | addi r11,r11,16 |
356 | 8: /* Copy 8 bytes. */ |
357 | bf 28,L(tail4) /* Skip unless the 8s bit of the length is set. */ |
358 | ld r6,0(r4) |
359 | addi r4,r4,8 |
360 | std r6,0(r11) |
361 | addi r11,r11,8 |
362 | |
363 | .align 4 |
364 | /* Copies 4~7 bytes. */ |
365 | L(tail4): |
366 | bf 29,L(tail2) /* 4s bit clear: at most 3 bytes remain. */ |
367 | lwz r6,0(r4) |
368 | stw r6,0(r11) |
369 | bf 30,L(tail5) /* 2s bit clear: at most one more byte, at offset 4. */ |
370 | lhz r7,4(r4) |
371 | sth r7,4(r11) |
372 | bflr 31 /* Return unless a final odd byte remains. */ |
373 | lbz r8,6(r4) |
374 | stb r8,6(r11) |
375 | blr |
376 | |
377 | .align 4 |
378 | /* Copies 2~3 bytes. */ |
379 | L(tail2): |
380 | bf 30,1f /* 2s bit clear: zero or one byte remains. */ |
381 | lhz r6,0(r4) |
382 | sth r6,0(r11) |
383 | bflr 31 /* Return unless a final odd byte remains. */ |
384 | lbz r7,2(r4) |
385 | stb r7,2(r11) |
386 | blr |
387 | |
388 | .align 4 |
389 | L(tail5): |
390 | bf 31,1f /* 1s bit clear: the code at 1: returns immediately. */ |
391 | lbz r6,4(r4) |
392 | stb r6,4(r11) |
393 | blr |
394 | |
395 | .align 4 |
396 | 1: |
397 | bflr 31 /* Return if no single byte is left. */ |
398 | lbz r6,0(r4) |
399 | stb r6,0(r11) |
400 | blr |
401 | |
402 | /* Handles copies of 0~8 bytes. */ |
403 | .align 4 |
404 | L(copy_LE_8): |
405 | bne cr6,L(tail4) /* r8 != 8, so fewer than 8 bytes remain. */ |
406 | ld r6,0(r4) /* Exactly 8 bytes: one doubleword copy. */ |
407 | std r6,0(r11) |
408 | blr |
409 | END (FUNC_NAME) |
410 | |
411 | #ifndef USE_AS_STPCPY /* strcpy build only; the stpcpy build skips this. */ |
412 | libc_hidden_builtin_def (strcpy) |
413 | #endif |
414 | |