1 | /* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8. |
2 | Copyright (C) 2015-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
#include <sysdep.h>

/* Select the exported symbol name: the stpncpy variant exports __stpncpy
   (or an override supplied via STPNCPY); otherwise export strncpy (or an
   override supplied via STRNCPY).  */
#ifdef USE_AS_STPNCPY
# ifndef STPNCPY
#  define FUNC_NAME __stpncpy
# else
#  define FUNC_NAME STPNCPY
# endif
#else
# ifndef STRNCPY
#  define FUNC_NAME strncpy
# else
#  define FUNC_NAME STRNCPY
# endif
#endif /* !USE_AS_STPNCPY  */

#ifndef MEMSET
/* For builds without IFUNC support, local calls should be made to internal
   GLIBC symbol (created by libc_hidden_builtin_def).  */
# ifdef SHARED
#  define MEMSET_is_local
#  define MEMSET __GI_memset
# else
#  define MEMSET memset
# endif
#endif

/* Stack frame: the ABI minimum frame plus 48 bytes, which is exactly the
   room taken by the six non-volatile GPRs (r26-r31) stored at r1-48..r1-8
   before the frame is created (see the std/ld pairs in the body).  */
#define FRAMESIZE (FRAME_MIN_SIZE+48)
47 | |
48 | /* Implements the function |
49 | |
50 | char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) |
51 | |
52 | or |
53 | |
54 | char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5]) |
55 | |
   if USE_AS_STPNCPY is defined.
57 | |
58 | The implementation uses unaligned doubleword access to avoid specialized |
   code paths depending on data alignment.  Although recent powerpc64 uses
60 | 64K as default, the page cross handling assumes minimum page size of |
61 | 4k. */ |
62 | |
	.machine power8
#ifdef MEMSET_is_local
ENTRY_TOCLESS (FUNC_NAME, 4)
#else
ENTRY (FUNC_NAME, 4)
#endif
	CALL_MCOUNT 3

	/* Check if [src]+15 will cross a 4K page by checking if the bit
	   indicating the page size changes.  Basically:

	   uint64_t srcin = (uint64_t)src;
	   uint64_t ob = srcin & 4096UL;
	   uint64_t nb = (srcin+15UL) & 4096UL;
	   if (ob ^ nb)
	     goto pagecross;

	   (The code actually tests src+16, which is conservative: it may take
	   the pagecross path when src+16 lands exactly on a new page even
	   though only bytes up to src+15 are read.)  */

	addi	r10,r4,16
	rlwinm	r9,r4,0,19,19	/* Extract the 4K-page bit of src.  */

	/* Save some non-volatile registers on the stack (below r1; the frame
	   covering them is only created later, on the memset path).  */
	std	r26,-48(r1)
	std	r27,-40(r1)

	rlwinm	r8,r10,0,19,19	/* Extract the 4K-page bit of src+16.  */

	std	r28,-32(r1)
	std	r29,-24(r1)

	cmpld	cr7,r9,r8

	std	r30,-16(r1)
	std	r31,-8(r1)

	/* Update CFI.  */
	cfi_offset(r26, -48)
	cfi_offset(r27, -40)
	cfi_offset(r28, -32)
	cfi_offset(r29, -24)
	cfi_offset(r30, -16)
	cfi_offset(r31, -8)

	beq	cr7,L(unaligned_lt_16)
	rldicl	r9,r4,0,61	/* r9 = src & 7 (doubleword misalignment).  */
	subfic	r8,r9,8		/* r8 = bytes until src is 8-byte aligned.  */
	cmpld	cr7,r5,r8
	bgt	cr7,L(pagecross)

	/* At this point there are 1 to 15 bytes to check and write.  Since it
	   could be either from the first unaligned 16-byte access or from the
	   bulk copy, the code uses an unrolled byte read/write instead of
	   trying to analyze the cmpb results.  */
L(short_path):
	mr	r9,r3
L(short_path_1):
	/* Return if there are no more bytes to be written.  */
	cmpdi	cr7,r5,0
	beq	cr7,L(short_path_loop_end_1)
L(short_path_2):
	/* Copy one char from src (r4) and write it to dest (r9).  If it is the
	   end-of-string, start the null padding.  Continue, otherwise.  */
	lbz	r10,0(r4)
	cmpdi	cr7,r10,0
	stb	r10,0(r9)
	beq	cr7,L(zero_pad_start_1)
	/* If there are no more bytes to be written, return.  */
	cmpdi	cr0,r5,1
	addi	r8,r9,1
	addi	r6,r5,-1
	beq	cr0,L(short_path_loop_end_0)
	/* Copy another char from src (r4) to dest (r9).  Check again if it is
	   the end-of-string.  If so, start the null padding.  */
	lbz	r10,1(r4)
	cmpdi	cr7,r10,0
	stb	r10,1(r9)
	beq	cr7,L(zero_pad_start_prepare_1)
	/* Eagerly decrement r5 by 3, which is the number of bytes already
	   written, plus one write that will be performed later on.  */
	addi	r10,r5,-3
	b	L(short_path_loop_1)

	.align	4
L(short_path_loop):
	/* At this point, the induction variable, r5, as well as the pointers
	   to dest and src (r9 and r4, respectively) have been updated.

	   Note: The registers r7 and r10 are induction variables derived from
	   r5.  They are used to determine if the total number of writes has
	   been reached at every other write.

	   Copy one char from src (r4) and write it to dest (r9).  If it is the
	   end-of-string, start the null padding.  Continue, otherwise.  */
	lbz	r8,0(r4)
	addi	r7,r10,-2
	cmpdi	cr5,r8,0
	stb	r8,0(r9)
	beq	cr5,L(zero_pad_start_1)
	beq	cr7,L(short_path_loop_end_0)
	/* Copy another char from src (r4) to dest (r9).  Check again if it is
	   the end-of-string.  If so, start the null padding.  */
	lbz	r8,1(r4)
	cmpdi	cr7,r8,0
	stb	r8,1(r9)
	beq	cr7,L(zero_pad_start)
	mr	r10,r7
L(short_path_loop_1):
	/* This block is reached after two chars have been already written to
	   dest.  Nevertheless, r5 (the induction variable), r9 (the pointer to
	   dest), and r4 (the pointer to src) have not yet been updated.

	   At this point:
	   r5 holds the count of bytes yet to be written plus 2.
	   r9 points to the last two chars that were already written to dest.
	   r4 points to the last two chars that were already copied from src.

	   The algorithm continues by decrementing r5, the induction variable,
	   so that it reflects the last two writes.  The pointers to dest (r9)
	   and to src (r4) are incremented by two, for the same reason.

	   Note: Register r10 is another induction variable, derived from r5,
	   which determines if the total number of writes has been reached.  */
	addic.	r5,r5,-2
	addi	r9,r9,2
	cmpdi	cr7,r10,0	/* Eagerly check if the next write is the last.  */
	addi	r4,r4,2
	addi	r6,r9,1
	bne	cr0,L(short_path_loop)	/* Check if the total number of writes
					   has been reached at every other
					   write.  */
#ifdef USE_AS_STPNCPY
	mr	r3,r9		/* stpncpy returns a pointer past the last
				   byte written.  */
	b	L(short_path_loop_end)
#endif

L(short_path_loop_end_0):
#ifdef USE_AS_STPNCPY
	addi	r3,r9,1
	b	L(short_path_loop_end)
#endif
L(short_path_loop_end_1):
#ifdef USE_AS_STPNCPY
	mr	r3,r9
#endif
L(short_path_loop_end):
	/* Restore non-volatile registers (no frame was created on this path,
	   so they still sit below r1).  */
	ld	r26,-48(r1)
	ld	r27,-40(r1)
	ld	r28,-32(r1)
	ld	r29,-24(r1)
	ld	r30,-16(r1)
	ld	r31,-8(r1)
	blr

	/* This code pads the remainder of dest with NULL bytes.  The algorithm
	   calculates the remaining size and calls memset.  */
	.align	4
L(zero_pad_start):
	mr	r5,r10
	mr	r9,r6
L(zero_pad_start_1):
	/* At this point:
	   - r5 holds the number of bytes that still have to be written to
	     dest.
	   - r9 points to the position, in dest, where the first null byte
	     will be written.
	   The above statements are true both when control reaches this label
	   from a branch or when falling through the previous lines.  */
#ifndef USE_AS_STPNCPY
	mr	r30,r3		/* Save the return value of strncpy.  */
#endif
	/* Prepare the call to memset.  */
	mr	r3,r9		/* Pointer to the area to be zero-filled.  */
	li	r4,0		/* Byte to be written (zero).  */

	/* We delayed the creation of the stack frame, as well as the saving of
	   the link register, because only at this point, we are sure that
	   doing so is actually needed.  */

	/* Save the link register.  */
	mflr	r0
	std	r0,16(r1)

	/* Create the stack frame.  */
	stdu	r1,-FRAMESIZE(r1)
	cfi_adjust_cfa_offset(FRAMESIZE)
	cfi_offset(lr, 16)

	bl	MEMSET
#ifndef MEMSET_is_local
	nop			/* Call may be cross-module; leave the slot for
				   the linker to restore the TOC pointer.  */
#endif

	/* Reload the link register value saved at 16(r1) before the frame was
	   created; it now sits at FRAMESIZE+16 from the current r1.  */
	ld	r0,FRAMESIZE+16(r1)

#ifndef USE_AS_STPNCPY
	mr	r3,r30		/* Restore the return value of strncpy, i.e.:
				   dest.  For stpncpy, the return value is the
				   same as return value of memset.  */
#endif

	/* Restore non-volatile registers and return.  */
	ld	r26,FRAMESIZE-48(r1)
	ld	r27,FRAMESIZE-40(r1)
	ld	r28,FRAMESIZE-32(r1)
	ld	r29,FRAMESIZE-24(r1)
	ld	r30,FRAMESIZE-16(r1)
	ld	r31,FRAMESIZE-8(r1)
	/* Restore the stack frame.  */
	addi	r1,r1,FRAMESIZE
	cfi_adjust_cfa_offset(-FRAMESIZE)
	/* Restore the link register.  */
	mtlr	r0
	cfi_restore(lr)
	blr

	/* The common case where [src]+16 will not cross a 4K page boundary.
	   In this case the code quickly checks the first 16 bytes by using
	   doubleword reads/compares and updates the destination if neither
	   the total size nor a null byte is found.  */
	.align	4
L(unaligned_lt_16):
	cmpldi	cr7,r5,7
	ble	cr7,L(short_path)
	ld	r7,0(r4)
	li	r8,0
	cmpb	r8,r7,r8	/* Non-zero byte lanes mark null bytes.  */
	cmpdi	cr7,r8,0
	bne	cr7,L(short_path_prepare_2)
	addi	r6,r5,-8
	std	r7,0(r3)
	addi	r9,r3,8
	cmpldi	cr7,r6,7
	addi	r7,r4,8
	ble	cr7,L(short_path_prepare_1_1)
	ld	r4,8(r4)
	cmpb	r8,r4,r8
	cmpdi	cr7,r8,0
	bne	cr7,L(short_path_prepare_2_1)
	std	r4,8(r3)
	addi	r29,r3,16
	addi	r5,r5,-16
	/* Neither a null byte was found nor the total length was reached;
	   align to 16 bytes and issue a bulk copy/compare.  */
	b	L(align_to_16b)

	/* In the case of a 4k page boundary cross, the algorithm first aligns
	   the address down to a doubleword, calculates a mask based on the
	   alignment to ignore the leading bytes, and continues checking
	   doubleword by doubleword.  */
	.align	4
L(pagecross):
	rldicr	r11,r4,0,59	/* Align the address to 8 bytes boundary.  */
	li	r6,-1		/* MASK = 0xffffffffffffffffUL.  */
	sldi	r9,r9,3		/* Calculate padding.  */
	ld	r7,0(r11)	/* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r9,r6,r9	/* MASK = MASK << padding.  */
#else
	srd	r9,r6,r9	/* MASK = MASK >> padding.  */
#endif
	orc	r9,r7,r9	/* Mask bits that are not part of the
				   string.  */
	li	r7,0
	cmpb	r9,r9,r7	/* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r9,0
	bne	cr7,L(short_path_prepare_2)
	subf	r8,r8,r5	/* Adjust total length.  */
	cmpldi	cr7,r8,8	/* Check if length was reached.  */
	ble	cr7,L(short_path_prepare_2)

	/* For the next checks the address is aligned, so check three more
	   doublewords to make sure 16 unaligned bytes can be read to start
	   the bulk copy with 16-byte aligned addresses.  */
	ld	r7,8(r11)
	cmpb	r9,r7,r9	/* r9 is zero here (no null found above).  */
	cmpdi	cr7,r9,0
	bne	cr7,L(short_path_prepare_2)
	addi	r7,r8,-8
	cmpldi	cr7,r7,8
	ble	cr7,L(short_path_prepare_2)
	ld	r7,16(r11)
	cmpb	r9,r7,r9	/* r9 is still zero on this path.  */
	cmpdi	cr7,r9,0
	bne	cr7,L(short_path_prepare_2)
	addi	r8,r8,-16
	cmpldi	cr7,r8,8
	ble	cr7,L(short_path_prepare_2)
	ld	r8,24(r11)
	cmpb	r9,r8,r9
	cmpdi	cr7,r9,0
	bne	cr7,L(short_path_prepare_2)

	/* No null byte found in the 32 bytes read and length not reached,
	   read source again using unaligned loads and store them.  */
	ld	r9,0(r4)
	addi	r29,r3,16
	addi	r5,r5,-16
	std	r9,0(r3)
	ld	r9,8(r4)
	std	r9,8(r3)

	/* Align source to 16 bytes and adjust destination and size.  */
L(align_to_16b):
	rldicl	r9,r10,0,60	/* r9 = (src+16) & 15 (misalignment).  */
	rldicr	r28,r10,0,59	/* r28 = (src+16) & ~15 (aligned source).  */
	add	r12,r5,r9	/* Give the skipped bytes back to the length.  */
	subf	r29,r9,r29	/* Rewind dest by the same amount.  */

	/* The bulk read/compare/copy loads two doublewords, compares and
	   merges in a single register for speed.  This is an attempt to speed
	   up the null-checking process for bigger strings.  */

	cmpldi	cr7,r12,15
	ble	cr7,L(short_path_prepare_1_2)

	/* Main loop for large sizes, unrolled 2 times to get better use of
	   pipeline.  */
	/* NOTE(review): the bare "28" below is the register number of r28
	   (the rN names presumably expand to plain numbers via sysdep.h);
	   consider writing r28 for consistency with the rest of the file.  */
	ld	r8,0(28)
	ld	r10,8(28)
	li	r9,0
	cmpb	r7,r8,r9
	cmpb	r9,r10,r9
	or.	r6,r9,r7
	bne	cr0,L(short_path_prepare_2_3)
	addi	r5,r12,-16
	addi	r4,r28,16
	std	r8,0(r29)
	std	r10,8(r29)
	cmpldi	cr7,r5,15
	addi	r9,r29,16
	ble	cr7,L(short_path_1)
	mr	r11,r28
	mr	r6,r29
	li	r30,0		/* Zero pattern for the cmpb null scans.  */
	subfic	r26,r4,48
	subfic	r27,r9,48

	b	L(loop_16b)

	.align	4
L(loop_start):
	ld	r31,0(r11)
	ld	r10,8(r11)
	cmpb	r0,r31,r7	/* r7 is zero when this block is entered.  */
	cmpb	r8,r10,r7
	or.	r7,r0,r8
	addi	r5,r5,-32
	cmpldi	cr7,r5,15
	add	r4,r4,r26
	add	r9,r9,r27
	bne	cr0,L(short_path_prepare_2_2)
	add	r4,r28,r4
	std	r31,0(r6)
	add	r9,r29,r9
	std	r10,8(r6)
	ble	cr7,L(short_path_1)

L(loop_16b):
	ld	r10,16(r11)
	ld	r0,24(r11)
	cmpb	r8,r10,r30	/* Scan both doublewords for a null byte.  */
	cmpb	r7,r0,r30
	or.	r7,r8,r7
	addi	r12,r12,-32
	cmpldi	cr7,r12,15
	addi	r11,r11,32
	bne	cr0,L(short_path_2)
	std	r10,16(r6)
	addi	r6,r6,32
	std	r0,-8(r6)
	bgt	cr7,L(loop_start)

	mr	r5,r12
	mr	r4,r11
	mr	r9,r6
	b	L(short_path_1)

	/* Trampolines: reload the remaining length (r5), source (r4), and
	   destination (r9) expected by the byte-wise short path, depending on
	   which fast-path stage bailed out.  */
	.align	4
L(short_path_prepare_1_1):
	mr	r5,r6
	mr	r4,r7
	b	L(short_path_1)
L(short_path_prepare_1_2):
	mr	r5,r12
	mr	r4,r28
	mr	r9,r29
	b	L(short_path_1)
L(short_path_prepare_2):
	mr	r9,r3
	b	L(short_path_2)
L(short_path_prepare_2_1):
	mr	r5,r6
	mr	r4,r7
	b	L(short_path_2)
L(short_path_prepare_2_2):
	mr	r5,r12
	mr	r4,r11
	mr	r9,r6
	b	L(short_path_2)
L(short_path_prepare_2_3):
	mr	r5,r12
	mr	r4,r28
	mr	r9,r29
	b	L(short_path_2)
L(zero_pad_start_prepare_1):
	mr	r5,r6
	mr	r9,r8
	b	L(zero_pad_start_1)
END (FUNC_NAME)
471 | |
#ifndef USE_AS_STPNCPY
/* Also emit the internal hidden symbol (__GI_strncpy) so that calls from
   within glibc bind locally instead of going through the PLT.  */
libc_hidden_builtin_def (strncpy)
#endif
475 | |