1/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
2 Copyright (C) 2015-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
/* Pick the symbol name this file defines.  With USE_AS_STPNCPY defined the
   routine is emitted as STPNCPY if that macro is already defined, otherwise
   as __stpncpy.  Without USE_AS_STPNCPY it is emitted as STRNCPY if that
   macro is already defined, otherwise as strncpy.  */
21#ifdef USE_AS_STPNCPY
22# ifndef STPNCPY
23# define FUNC_NAME __stpncpy
24# else
25# define FUNC_NAME STPNCPY
26# endif
27#else
28# ifndef STRNCPY
29# define FUNC_NAME strncpy
30# else
31# define FUNC_NAME STRNCPY
32# endif
33#endif /* !USE_AS_STPNCPY */
34
35#ifndef MEMSET
36/* For builds without IFUNC support, local calls should be made to internal
37 GLIBC symbol (created by libc_hidden_builtin_def). */
/* MEMSET_is_local is tested later in this file: it selects ENTRY_TOCLESS
   instead of ENTRY for the function header and removes the nop that
   otherwise follows the bl MEMSET.  */
38# ifdef SHARED
39# define MEMSET_is_local
40# define MEMSET __GI_memset
41# else
42# define MEMSET memset
43# endif
44#endif
45
/* Number of bytes r1 is lowered by (stdu) right before the call to MEMSET.
   The 48 bytes added to FRAME_MIN_SIZE match the size of the six
   doublewords (r26-r31) that the entry code stores at -48(r1)..-8(r1);
   presumably this keeps those saved values inside the new frame - confirm
   against the ABI frame layout.  */
46#define FRAMESIZE (FRAME_MIN_SIZE+48)
47
48/* Implements the function
49
50 char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
51
52 or
53
54 char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
55
56 if USE_AS_STPNCPY is defined.
57
58 The implementation uses unaligned doubleword access to avoid specialized
59 code paths depending on data alignment. Although recent powerpc64 uses
60 64K as default, the page cross handling assumes minimum page size of
61 4k. */
62
/* Function entry.  Arguments: r3 = dest, r4 = src, r5 = n.  This part
   decides between the three starting paths (L(unaligned_lt_16),
   L(pagecross), L(short_path)) and, interleaved with that test, stores
   r26-r31 below r1.  No stack frame is allocated here; one is created only
   on the path that calls MEMSET.  */
63 .machine power8
64#ifdef MEMSET_is_local
65ENTRY_TOCLESS (FUNC_NAME, 4)
66#else
67ENTRY (FUNC_NAME, 4)
68#endif
69 CALL_MCOUNT 3
70
71 /* Check if the [src]+15 will cross a 4K page by checking if the bit
72 indicating the page size changes. Basically:
73
74 uint64_t srcin = (uint64_t)src;
75 uint64_t ob = srcin & 4096UL;
76 uint64_t nb = (srcin+15UL) & 4096UL;
77 if (ob ^ nb)
78 goto pagecross; */
79
/* NOTE(review): the instruction below adds 16, not 15 as written in the
   pseudo-code above - confirm which one is intended.  r10 (src + 16) is
   not written again before L(align_to_16b), which reuses it.  The two
   rlwinm instructions isolate the 4096 bit of src (r9) and of src + 16
   (r8).  */
80 addi r10,r4,16
81 rlwinm r9,r4,0,19,19
82
83 /* Save some non-volatile registers on the stack. */
84 std r26,-48(r1)
85 std r27,-40(r1)
86
87 rlwinm r8,r10,0,19,19
88
89 std r28,-32(r1)
90 std r29,-24(r1)
91
92 cmpld cr7,r9,r8
93
94 std r30,-16(r1)
95 std r31,-8(r1)
96
97 /* Update CFI. */
98 cfi_offset(r26, -48)
99 cfi_offset(r27, -40)
100 cfi_offset(r28, -32)
101 cfi_offset(r29, -24)
102 cfi_offset(r30, -16)
103 cfi_offset(r31, -8)
104
/* Same 4096 bit in src and src + 16: take the common path.  */
105 beq cr7,L(unaligned_lt_16)
/* The bit differs.  r9 = src & 7 and r8 = 8 - r9; both values are used
   again at L(pagecross).  If n is larger than r8 go to L(pagecross),
   otherwise fall through into the byte-wise L(short_path).  */
106 rldicl r9,r4,0,61
107 subfic r8,r9,8
108 cmpld cr7,r5,r8
109 bgt cr7,L(pagecross)
110
111 /* At this point there are 1 to 15 bytes to check and write. Since it could
112 be either from first unaligned 16 bytes access or from bulk copy, the code
113 uses an unrolled byte read/write instead of trying to analyze the cmpb
114 results. */
/* Register use in this byte-wise path, as visible in the code below:
     r3 - dest as passed in; only overwritten here by the USE_AS_STPNCPY
          exits.
     r4 - current source pointer.
     r5 - number of bytes still to be written.
     r9 - current destination pointer.
   L(short_path) starts at the beginning of dest (r9 = r3),
   L(short_path_1) expects r9 to be set already and first tests r5 for
   zero, L(short_path_2) skips that test and copies a byte right away.  */
115L(short_path):
116 mr r9,r3
117L(short_path_1):
118 /* Return if there are no more bytes to be written. */
119 cmpdi cr7,r5,0
120 beq cr7,L(short_path_loop_end_1)
121L(short_path_2):
122 /* Copy one char from src (r4) and write it to dest (r9). If it is the
123 end-of-string, start the null padding. Continue, otherwise. */
124 lbz r10,0(r4)
125 cmpdi cr7,r10,0
126 stb r10,0(r9)
127 beq cr7,L(zero_pad_start_1)
128 /* If there are no more bytes to be written, return. */
/* r8 = r9 + 1 and r6 = r5 - 1 are prepared here for
   L(zero_pad_start_prepare_1), which copies them into r9 and r5.  */
129 cmpdi cr0,r5,1
130 addi r8,r9,1
131 addi r6,r5,-1
132 beq cr0,L(short_path_loop_end_0)
133 /* Copy another char from src (r4) to dest (r9). Check again if it is
134 the end-of-string. If so, start the null padding. */
135 lbz r10,1(r4)
136 cmpdi cr7,r10,0
137 stb r10,1(r9)
138 beq cr7,L(zero_pad_start_prepare_1)
139 /* Eagerly decrement r5 by 3, which is the number of bytes already
140 written, plus one write that will be performed later on. */
141 addi r10,r5,-3
142 b L(short_path_loop_1)
143
144 .align 4
145L(short_path_loop):
146 /* At this point, the induction variable, r5, as well as the pointers
147 to dest and src (r9 and r4, respectively) have been updated.
148
149 Note: The registers r7 and r10 are induction variables derived from
150 r5. They are used to determine if the total number of writes has
151 been reached at every other write.
152
153 Copy one char from src (r4) and write it to dest (r9). If it is the
154 end-of-string, start the null padding. Continue, otherwise. */
155 lbz r8,0(r4)
156 addi r7,r10,-2
157 cmpdi cr5,r8,0
158 stb r8,0(r9)
159 beq cr5,L(zero_pad_start_1)
/* cr7 was set by the cmpdi on r10 in L(short_path_loop_1): equal means
   the byte just stored was the last one requested.  */
160 beq cr7,L(short_path_loop_end_0)
161 /* Copy another char from src (r4) to dest (r9). Check again if it is
162 the end-of-string. If so, start the null padding. */
163 lbz r8,1(r4)
164 cmpdi cr7,r8,0
165 stb r8,1(r9)
166 beq cr7,L(zero_pad_start)
167 mr r10,r7
168L(short_path_loop_1):
169 /* This block is reached after two chars have been already written to
170 dest. Nevertheless, r5 (the induction variable), r9 (the pointer to
171 dest), and r4 (the pointer to src) have not yet been updated.
172
173 At this point:
174 r5 holds the count of bytes yet to be written plus 2.
175 r9 points to the last two chars that were already written to dest.
176 r4 points to the last two chars that were already copied from src.
177
178 The algorithm continues by decrementing r5, the induction variable,
179 so that it reflects the last two writes. The pointers to dest (r9)
180 and to src (r4) are incremented by two, for the same reason.
181
182 Note: Register r10 is another induction variable, derived from r5,
183 which determines if the total number of writes has been reached. */
/* r6 = r9 + 1 is the destination pointer L(zero_pad_start) will use if
   the second byte of the next pair turns out to be the terminator.  */
184 addic. r5,r5,-2
185 addi r9,r9,2
186 cmpdi cr7,r10,0 /* Eagerly check if the next write is the last. */
187 addi r4,r4,2
188 addi r6,r9,1
189 bne cr0,L(short_path_loop) /* Check if the total number of writes
190 has been reached at every other
191 write. */
/* Falling through means r5 reached zero after a pair of bytes; r9 has
   already been advanced past both of them, so stpncpy returns r9.  */
192#ifdef USE_AS_STPNCPY
193 mr r3,r9
194 b L(short_path_loop_end)
195#endif
196
/* Reached right after a byte was stored at 0(r9) and the count ran out:
   for stpncpy the return value is the address following that byte.  */
197L(short_path_loop_end_0):
198#ifdef USE_AS_STPNCPY
199 addi r3,r9,1
200 b L(short_path_loop_end)
201#endif
/* Reached from L(short_path_1) with r5 == 0: nothing was stored at r9, so
   stpncpy returns r9 itself.  */
202L(short_path_loop_end_1):
203#ifdef USE_AS_STPNCPY
204 mr r3,r9
205#endif
/* Common return without a stack frame: reload r26-r31 from the slots
   written at function entry.  In the strncpy build r3 has not been
   modified on this path and still holds dest.  */
206L(short_path_loop_end):
207 /* Restore non-volatile registers. */
208 ld r26,-48(r1)
209 ld r27,-40(r1)
210 ld r28,-32(r1)
211 ld r29,-24(r1)
212 ld r30,-16(r1)
213 ld r31,-8(r1)
214 blr
215
216 /* This code pads the remainder of dest with NULL bytes. The algorithm
217 calculates the remaining size and calls memset. */
218 .align 4
/* L(zero_pad_start) is the entry used by L(short_path_loop) when the
   terminator was the byte stored at 1(r9): the count comes from the
   loop's derived counter r10 and the pointer from r6 (set to r9 + 1 in
   L(short_path_loop_1)).  */
219L(zero_pad_start):
220 mr r5,r10
221 mr r9,r6
222L(zero_pad_start_1):
223 /* At this point:
224 - r5 holds the number of bytes that still have to be written to
225 dest.
226 - r9 points to the position, in dest, where the first null byte
227 will be written.
228 The above statements are true both when control reaches this label
229 from a branch or when falling through the previous lines. */
/* r30 was stored to the stack at function entry, so it is free to carry
   dest across the call; it is reloaded from that slot before returning.  */
230#ifndef USE_AS_STPNCPY
231 mr r30,r3 /* Save the return value of strncpy. */
232#endif
233 /* Prepare the call to memset. */
/* r5 already holds the third argument (the length).  */
234 mr r3,r9 /* Pointer to the area to be zero-filled. */
235 li r4,0 /* Byte to be written (zero). */
236
237 /* We delayed the creation of the stack frame, as well as the saving of
238 the link register, because only at this point, we are sure that
239 doing so is actually needed. */
240
241 /* Save the link register. */
242 mflr r0
243 std r0,16(r1)
244
245 /* Create the stack frame. */
246 stdu r1,-FRAMESIZE(r1)
247 cfi_adjust_cfa_offset(FRAMESIZE)
248 cfi_offset(lr, 16)
249
250 bl MEMSET
251#ifndef MEMSET_is_local
252 nop
253#endif
254
/* Reload the saved link register value; r1 is still lowered by FRAMESIZE,
   hence the FRAMESIZE+16 offset for the slot written at 16(r1) above.  */
255 ld r0,FRAMESIZE+16(r1)
256
257#ifndef USE_AS_STPNCPY
258 mr r3,r30 /* Restore the return value of strncpy, i.e.:
259 dest. For stpncpy, the return value is the
260 same as return value of memset. */
261#endif
262
263 /* Restore non-volatile registers and return. */
/* These are the same slots as the entry-time stores at -48(r1)..-8(r1),
   now addressed relative to the lowered r1.  */
264 ld r26,FRAMESIZE-48(r1)
265 ld r27,FRAMESIZE-40(r1)
266 ld r28,FRAMESIZE-32(r1)
267 ld r29,FRAMESIZE-24(r1)
268 ld r30,FRAMESIZE-16(r1)
269 ld r31,FRAMESIZE-8(r1)
270 /* Restore the stack frame. */
271 addi r1,r1,FRAMESIZE
272 cfi_adjust_cfa_offset(-FRAMESIZE)
273 /* Restore the link register. */
274 mtlr r0
275 cfi_restore(lr)
276 blr
277
278 /* The common case where [src]+16 will not cross a 4K page boundary.
279 In this case the code quickly checks the first 16 bytes using doubleword
280 reads/compares and updates the destination if neither the total size is
281 reached nor a null byte is found in the source. */
282 .align 4
283L(unaligned_lt_16):
/* With n <= 7 go straight to the byte-wise path.  */
284 cmpldi cr7,r5,7
285 ble cr7,L(short_path)
/* Load the first source doubleword and compare each of its bytes with
   zero (r8 = 0 is the cmpb comparand).  A nonzero cmpb result means a
   null byte is present: redo these bytes one at a time from the start of
   dest.  */
286 ld r7,0(r4)
287 li r8,0
288 cmpb r8,r7,r8
289 cmpdi cr7,r8,0
290 bne cr7,L(short_path_prepare_2)
/* No null byte: store the doubleword.  r6 = n - 8, r9 = dest + 8 and
   r7 = src + 8 describe what is left; L(short_path_prepare_1_1) and
   L(short_path_prepare_2_1) move r6/r7 into r5/r4.  */
291 addi r6,r5,-8
292 std r7,0(r3)
293 addi r9,r3,8
294 cmpldi cr7,r6,7
295 addi r7,r4,8
296 ble cr7,L(short_path_prepare_1_1)
/* Second doubleword.  r4 is reused to hold the loaded data; r7 still has
   src + 8.  r8 is zero here because the branch at the previous cmpb was
   not taken.  */
297 ld r4,8(r4)
298 cmpb r8,r4,r8
299 cmpdi cr7,r8,0
300 bne cr7,L(short_path_prepare_2_1)
/* 16 bytes copied: r29 = dest + 16, r5 = n - 16.  */
301 std r4,8(r3)
302 addi r29,r3,16
303 addi r5,r5,-16
304 /* Neither the null byte was found or total length was reached,
305 align to 16 bytes and issue a bulk copy/compare. */
306 b L(align_to_16b)
307
308 /* In the case of 4k page boundary cross, the algorithm first align
309 the address to a doubleword, calculate a mask based on alignment
310 to ignore the bytes and continue using doubleword. */
311 .align 4
/* On entry (set by the function entry code): r9 = src & 7,
   r8 = 8 - (src & 7), r10 = src + 16, and n > r8.
   NOTE(review): the rldicr below keeps bits 0..59, i.e. it clears the low
   four address bits, the same operation L(align_to_16b) uses to align to
   16 bytes, while the comment on it and the padding in r9 refer to an
   8-byte boundary - confirm this is intended.  */
312L(pagecross):
313 rldicr r11,r4,0,59 /* Align the address to 8 bytes boundary. */
314 li r6,-1 /* MASK = 0xffffffffffffffffUL. */
315 sldi r9,r9,3 /* Calculate padding. */
316 ld r7,0(r11) /* Load doubleword from memory. */
317#ifdef __LITTLE_ENDIAN__
318 sld r9,r6,r9 /* MASK = MASK << padding. */
319#else
320 srd r9,r6,r9 /* MASK = MASK >> padding. */
321#endif
322 orc r9,r7,r9 /* Mask bits that are not part of the
323 string. */
324 li r7,0
325 cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
326 cmpdi cr7,r9,0
327 bne cr7,L(short_path_prepare_2)
/* r8 becomes n - (8 - (src & 7)).  Every bail-out below goes to
   L(short_path_prepare_2), which restarts the byte-wise copy from the
   beginning of dest with r4 and r5 untouched.  */
328 subf r8,r8,r5 /* Adjust total length. */
329 cmpldi cr7,r8,8 /* Check if length was reached. */
330 ble cr7,L(short_path_prepare_2)
331
332 /* For next checks we have aligned address, so we check for more
333 three doublewords to make sure we can read 16 unaligned bytes
334 to start the bulk copy with 16 aligned addresses. */
/* In each cmpb below r9 is zero (the preceding bne was not taken), so it
   serves as the all-zero comparand.  */
335 ld r7,8(r11)
336 cmpb r9,r7,r9
337 cmpdi cr7,r9,0
338 bne cr7,L(short_path_prepare_2)
339 addi r7,r8,-8
340 cmpldi cr7,r7,8
341 ble cr7,L(short_path_prepare_2)
342 ld r7,16(r11)
343 cmpb r9,r7,r9
344 cmpdi cr7,r9,0
345 bne cr7,L(short_path_prepare_2)
346 addi r8,r8,-16
347 cmpldi cr7,r8,8
348 ble cr7,L(short_path_prepare_2)
349 ld r8,24(r11)
350 cmpb r9,r8,r9
351 cmpdi cr7,r9,0
352 bne cr7,L(short_path_prepare_2)
353
354 /* No null byte found in the 32 bytes read and length not reached,
355 read source again using unaligned loads and store them. */
/* Copies 16 bytes from src to dest and leaves r29 = dest + 16 and
   r5 = n - 16, the same state L(unaligned_lt_16) hands to
   L(align_to_16b); execution falls through into that label.  */
356 ld r9,0(r4)
357 addi r29,r3,16
358 addi r5,r5,-16
359 std r9,0(r3)
360 ld r9,8(r4)
361 std r9,8(r3)
362
363 /* Align source to 16 bytes and adjust destination and size. */
364L(align_to_16b):
/* r10 still holds src + 16 from the function entry code.
     r9  = (src + 16) & 15
     r28 = (src + 16) with the low four bits cleared (aligned source)
     r12 = r5 + r9  (count, measured from r28)
     r29 = r29 - r9 (destination position matching r28)
   Source, destination and count are all moved back by the same r9 bytes,
   so up to 15 bytes that were already copied are copied again.  */
365 rldicl r9,r10,0,60
366 rldicr r28,r10,0,59
367 add r12,r5,r9
368 subf r29,r9,r29
369
370 /* The bulk read/compare/copy loads two doublewords, compare and merge
371 in a single register for speed. This is an attempt to speed up the
372 null-checking process for bigger strings. */
373
374 cmpldi cr7,r12,15
375 ble cr7,L(short_path_prepare_1_2)
376
377 /* Main loop for large sizes, unrolled 2 times to get better use of
378 pipeline. */
/* First 16 aligned bytes are handled before entering the loop.  The bare
   28 in the two loads is presumably the same register as r28 (r28 is the
   name used for this pointer everywhere else) - TODO confirm.  */
379 ld r8,0(28)
380 ld r10,8(28)
381 li r9,0
382 cmpb r7,r8,r9
383 cmpb r9,r10,r9
384 or. r6,r9,r7
385 bne cr0,L(short_path_prepare_2_3)
/* No null byte: store the 16 bytes and set up the short-path view
   r5 = r12 - 16, r4 = r28 + 16, r9 = r29 + 16.  */
386 addi r5,r12,-16
387 addi r4,r28,16
388 std r8,0(r29)
389 std r10,8(r29)
390 cmpldi cr7,r5,15
391 addi r9,r29,16
392 ble cr7,L(short_path_1)
/* Loop state:
     r11 - source pointer, advanced by 32 in L(loop_16b).
     r6  - destination pointer, advanced by 32 in L(loop_16b).
     r12 - byte count, decremented by 32 in L(loop_16b).
     r30 - constant zero, the cmpb comparand in L(loop_16b).
     r4/r9/r5 - source/destination/count kept ready for the byte-wise
           path; L(loop_start) updates them.
     r26 = 48 - r4 and r27 = 48 - r9 are fixed offsets: in L(loop_start)
           adding r26 and then r28 to r4 (r27 and r29 to r9) gives a net
           advance of 32, because r4 = r28 + 16 and r9 = r29 + 16 at this
           point.  */
393 mr r11,r28
394 mr r6,r29
395 li r30,0
396 subfic r26,r4,48
397 subfic r27,r9,48
398
399 b L(loop_16b)
400
401 .align 4
/* First half of a 32-byte step: check and copy the 16 bytes at 0(r11).
   r7 is zero here (the or. in L(loop_16b) produced zero, otherwise its
   bne would have been taken), so it is the cmpb comparand.  */
402L(loop_start):
403 ld r31,0(r11)
404 ld r10,8(r11)
405 cmpb r0,r31,r7
406 cmpb r8,r10,r7
407 or. r7,r0,r8
408 addi r5,r5,-32
409 cmpldi cr7,r5,15
410 add r4,r4,r26
411 add r9,r9,r27
/* On a null byte the stub reloads r5/r4/r9 from r12/r11/r6, so the
   partially updated r4 and r9 do not matter on that branch.  */
412 bne cr0,L(short_path_prepare_2_2)
413 add r4,r28,r4
414 std r31,0(r6)
415 add r9,r29,r9
416 std r10,8(r6)
417 ble cr7,L(short_path_1)
418
/* Second half: check and copy the 16 bytes at 16(r11), then advance r11
   and r6 by 32.  On a null byte, r4/r9/r5 are used as they stand by
   L(short_path_2).  */
419L(loop_16b):
420 ld r10,16(r11)
421 ld r0,24(r11)
422 cmpb r8,r10,r30
423 cmpb r7,r0,r30
424 or. r7,r8,r7
425 addi r12,r12,-32
426 cmpldi cr7,r12,15
427 addi r11,r11,32
428 bne cr0,L(short_path_2)
429 std r10,16(r6)
430 addi r6,r6,32
431 std r0,-8(r6)
432 bgt cr7,L(loop_start)
433
/* 15 or fewer bytes left (r12 <= 15): hand count, source and destination
   to the byte-wise path.  */
434 mr r5,r12
435 mr r4,r11
436 mr r9,r6
437 b L(short_path_1)
438
/* Glue stubs that load the byte-wise path's registers (r5 = count,
   r4 = source, r9 = destination) from wherever the calling code kept them.
   The ..._1_x stubs enter at L(short_path_1), which first tests r5 for
   zero; the ..._2_x stubs enter at L(short_path_2), which copies a byte
   without that test.  */
439 .align 4
/* From L(unaligned_lt_16) after the first doubleword was stored; r9 was
   already set there.  */
440L(short_path_prepare_1_1):
441 mr r5,r6
442 mr r4,r7
443 b L(short_path_1)
/* From L(align_to_16b) when r12 <= 15.  */
444L(short_path_prepare_1_2):
445 mr r5,r12
446 mr r4,r28
447 mr r9,r29
448 b L(short_path_1)
/* Restart from the beginning of dest; r4 and r5 are used as they are.  */
449L(short_path_prepare_2):
450 mr r9,r3
451 b L(short_path_2)
/* From L(unaligned_lt_16) when the second doubleword has a null byte;
   r9 was already set there.  */
452L(short_path_prepare_2_1):
453 mr r5,r6
454 mr r4,r7
455 b L(short_path_2)
/* From L(loop_start) when a null byte was found.  */
456L(short_path_prepare_2_2):
457 mr r5,r12
458 mr r4,r11
459 mr r9,r6
460 b L(short_path_2)
/* From the first aligned 16-byte check after L(align_to_16b).  */
461L(short_path_prepare_2_3):
462 mr r5,r12
463 mr r4,r28
464 mr r9,r29
465 b L(short_path_2)
/* From L(short_path_2) when the second byte stored was the terminator:
   r6 = r5 - 1 and r8 = r9 + 1 were computed there.  */
466L(zero_pad_start_prepare_1):
467 mr r5,r6
468 mr r9,r8
469 b L(zero_pad_start_1)
470END (FUNC_NAME)
471
/* Emitted only for the strncpy build (USE_AS_STPNCPY not defined).  The
   comment at the MEMSET definition in this file describes
   libc_hidden_builtin_def as creating the internal GLIBC symbol.  */
472#ifndef USE_AS_STPNCPY
473libc_hidden_builtin_def (strncpy)
474#endif
475

source code of glibc/sysdeps/powerpc/powerpc64/power8/strncpy.S