/* Optimized memset implementation for PowerPC64/POWER8.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */

#ifndef MEMSET
# define MEMSET memset
#endif
	.machine power8
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

L(_memset):
	cmpldi	cr7,r5,31
	neg	r0,r3
	mr	r10,r3

	insrdi	r4,r4,8,48
	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
	ble	cr7,L(write_LT_32)

	andi.	r11,r10,15	/* Check alignment of DST.  */
	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
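	/* r4 now holds the set byte replicated into all eight byte lanes,
	   e.g. c = 0x2a gives r4 = 0x2a2a2a2a2a2a2a2a.  */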

	beq	L(big_aligned)

	mtocrf	0x01,r0
	clrldi	r0,r0,60
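	/* r0 = (-DST) & 0xf, the number of bytes needed to reach a 16-byte
	   boundary; its low bits, copied into cr7 by the mtocrf above,
	   select the 1-, 2-, 4- and 8-byte stores below.  */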

	/* Get DST aligned to 16 bytes.  */
1:	bf	31,2f
	stb	r4,0(r10)
	addi	r10,r10,1

2:	bf	30,4f
	sth	r4,0(r10)
	addi	r10,r10,2

4:	bf	29,8f
	stw	r4,0(r10)
	addi	r10,r10,4

8:	bf	28,16f
	std	r4,0(r10)
	addi	r10,r10,8

16:	subf	r5,r0,r5

	.align	4
L(big_aligned):
	/* For sizes larger than 255 there are two possible paths:
	   - if the constant is '0', zero full cache lines with dcbz
	   - otherwise use vector instructions.  */
	cmpldi	cr5,r5,255
	dcbtst	0,r10
	cmpldi	cr6,r4,0
	crand	27,26,21
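	/* CR bit 27 (cr6.so) := cr6.eq AND cr5.gt, i.e. the constant is 0
	   and the length is greater than 255 bytes.  */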
	bt	27,L(huge_dcbz)
	bge	cr5,L(huge_vector)

	/* Size between 32 and 255 bytes with a constant different from 0:
	   use doubleword stores to achieve best throughput.  */
	srdi	r8,r5,5
	clrldi	r11,r5,59
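	/* r8 = remaining length / 32 (iterations of the 32-byte loop below),
	   r11 = remaining length % 32 (tail bytes).  */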
	cmpldi	cr6,r11,0
	cmpdi	r8,0
	beq	L(tail_bytes)
	mtctr	r8

	/* Main aligned write loop, writes 32 bytes at a time.  */
	.align	4
L(big_loop):
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdz	L(tail_bytes)

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdnz	L(big_loop)

	b	L(tail_bytes)

	/* Write remaining 1~31 bytes.  */
	.align	4
L(tail_bytes):
	beqlr	cr6

	srdi	r7,r11,4
	clrldi	r8,r11,60
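	/* r7 = tail / 16, r8 = tail % 16; their low bits, copied into cr7
	   by the mtocrf instructions below, select the remaining stores.  */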
	mtocrf	0x01,r7

	.align	4
	bf	31,8f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
8:	mtocrf	0x1,r8
	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Size larger than 255 bytes with a constant different from 0: use
	   vector instructions to achieve best throughput.  */
L(huge_vector):
	/* Replicate the set byte to a quadword in a VMX register.  */
	mtvsrd	v1,r4
	xxpermdi 32,v0,v1,0
	vspltb	v2,v0,15
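	/* r4 (the byte already replicated to a doubleword) is transferred
	   into the vector unit and vspltb splats that byte across all 16
	   bytes of v2.  */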

	/* Main aligned write loop: 128 bytes at a time.  */
	li	r6,16
	li	r7,32
	li	r8,48
	mtocrf	0x02,r5
	srdi	r12,r5,7
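	/* r12 = remaining length / 128, the iteration count for the
	   128-byte loop; the mtocrf instructions here and in L(aligned_tail)
	   copy the low bits of the length into cr6/cr7 for the tail bf
	   tests.  */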
	cmpdi	r12,0
	beq	L(aligned_tail)
	mtctr	r12
	b	L(aligned_128loop)

	.align	4
L(aligned_128loop):
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	bdnz	L(aligned_128loop)

	/* Write remaining 1~127 bytes.  */
L(aligned_tail):
	mtocrf	0x01,r5
	bf	25,32f
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64

32:	bf	26,16f
	stvx	v2,0,r10
	stvx	v2,r10,r6
	addi	r10,r10,32

16:	bf	27,8f
	stvx	v2,0,r10
	addi	r10,r10,16

8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	/* Copies 4~7 bytes.  */
4:	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	/* Return original DST pointer.  */
	blr

	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
	   Before using dcbz though, we need to get the destination 128-byte
	   aligned.  */
	.align	4
L(huge_dcbz):
	andi.	r11,r10,127
	neg	r0,r10
	beq	L(huge_dcbz_aligned)

	clrldi	r0,r0,57
	subf	r5,r0,r5
	srdi	r0,r0,3
	mtocrf	0x01,r0
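	/* r0 = bytes needed to reach a 128-byte boundary; r0 / 8 is the
	   number of doubleword stores required, and its low bits (via cr7)
	   select the 64/32/16/8-byte steps below.  */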

	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
8:	bf	28,4f

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	29,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	30,1f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
1:	bf	31,L(huge_dcbz_aligned)
	std	r4,0(r10)
	addi	r10,r10,8

L(huge_dcbz_aligned):
	/* Setup dcbz unroll offsets and count numbers.  */
	srdi	r8,r5,9
	clrldi	r11,r5,55
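	/* r8 = remaining length / 512 (iterations of the dcbz loop),
	   r11 = remaining length % 512.  */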
	cmpldi	cr6,r11,0
	li	r9,128
	cmpdi	r8,0
	beq	L(huge_tail)
	li	r7,256
	li	r6,384
	mtctr	r8

	.align	4
L(huge_loop):
	/* Sets 512 bytes to zero in each iteration; the loop unrolling shows
	   a throughput boost for large sizes (2048 bytes or higher).  */
	dcbz	0,r10
	dcbz	r9,r10
	dcbz	r7,r10
	dcbz	r6,r10
	addi	r10,r10,512
	bdnz	L(huge_loop)

	beqlr	cr6

L(huge_tail):
	srdi	r6,r11,8
	srdi	r7,r11,4
	clrldi	r8,r11,4
	cmpldi	cr6,r8,0
	mtocrf	0x01,r6
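	/* r6 = tail / 256 and r7 = tail / 16; cr6 is EQ when no tail bytes
	   remain.  */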

	beq	cr6,L(tail)

	/* We have 1~511 bytes remaining.  */
	.align	4
32:	bf	31,16f
	dcbz	0,r10
	dcbz	r9,r10
	addi	r10,r10,256

	.align	4
16:	mtocrf	0x01,r7
	bf	28,8f
	dcbz	0,r10
	addi	r10,r10,128

	.align	4
8:	bf	29,4f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	30,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	31,L(tail)
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16
	.align	4

	/* Remaining 1~15 bytes.  */
L(tail):
	mtocrf	0x01,r8

	.align	4
8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
	   by just unrolling all operations.  */
	.align	4
L(write_LT_32):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
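	/* The 8/4/2/1 bits of the length, copied into cr7, drive the bf
	   tests in the tail code below.  */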
	ble	cr6,L(write_LE_8)

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	cmpldi	cr1,r5,16
	beq	L(write_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,r0
	subf	r5,r0,r5

2:	bf	30,1f
	/* Use stb instead of sth because it doesn't generate
	   alignment interrupts on cache-inhibited storage.  */
	stb	r4,0(r10)
	stb	r4,1(r10)
	addi	r10,r10,2

1:	bf	31,L(end_4bytes_alignment)
	stb	r4,0(r10)
	addi	r10,r10,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(write_LT_32_aligned):
	blt	cr1,8f

	stw	r4,0(r10)
	stw	r4,4(r10)
	stw	r4,8(r10)
	stw	r4,12(r10)
	addi	r10,r10,16

8:	bf	28,L(tail4)
	stw	r4,0(r10)
	stw	r4,4(r10)
	addi	r10,r10,8

	.align	4
	/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	sth	r4,0(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(write_LE_8):
	bne	cr6,L(LE7_tail4)
	/* If input is word aligned, use stw, else use stb.  */
	andi.	r0,r10,3
	bne	L(8_unalign)

	stw	r4,0(r10)
	stw	r4,4(r10)
	blr

	/* Unaligned input and size is 8.  */
	.align	4
L(8_unalign):
	andi.	r0,r10,1
	beq	L(8_hwalign)
	stb	r4,0(r10)
	sth	r4,1(r10)
	sth	r4,3(r10)
	sth	r4,5(r10)
	stb	r4,7(r10)
	blr

	/* Halfword aligned input and size is 8.  */
	.align	4
L(8_hwalign):
	sth	r4,0(r10)
	sth	r4,2(r10)
	sth	r4,4(r10)
	sth	r4,6(r10)
	blr

	.align	4
	/* Copies 4~7 bytes.  */
L(LE7_tail4):
	/* Use stb instead of sth because it doesn't generate
	   alignment interrupts on cache-inhibited storage.  */
	bf	29,L(LE7_tail2)
	stb	r4,0(r10)
	stb	r4,1(r10)
	stb	r4,2(r10)
	stb	r4,3(r10)
	bf	30,L(LE7_tail5)
	stb	r4,4(r10)
	stb	r4,5(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(LE7_tail2):
	bf	30,1f
	stb	r4,0(r10)
	stb	r4,1(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(LE7_tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)
