1 | /* Optimized memset implementation for PowerPC64/POWER8. |
2 | Copyright (C) 2014-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5])); |
22 | Returns 's'. */ |
23 | |
#ifndef MEMSET
# define MEMSET memset
#endif
	/* POWER8 memset: r3 = DST, r4 = fill byte, r5 = length.
	   DST is copied to r10 so r3 can be returned unchanged.  */
	.machine power8
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

L(_memset):
	cmpldi	cr7,r5,31	/* Short sets (<= 31 bytes) take the fully
				   unrolled path below.  */
	neg	r0,r3		/* r0 = -DST; its low bits are the distance
				   to the next 16-byte boundary.  */
	mr	r10,r3

	insrdi	r4,r4,8,48	/* Replicate byte to halfword.  */
	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
	ble	cr7,L(write_LT_32)

	andi.	r11,r10,15	/* Check alignment of DST.  */
	insrdi	r4,r4,32,0	/* Replicate word to double word.  */

	beq	L(big_aligned)

	mtocrf	0x01,r0		/* CR7 bits 28..31 select the 8/4/2/1-byte
				   steps needed to reach 16-byte alignment.  */
	clrldi	r0,r0,60	/* r0 = number of alignment bytes (0..15).  */

	/* Get DST aligned to 16 bytes.  */
1:	bf	31,2f
	stb	r4,0(r10)
	addi	r10,r10,1

2:	bf	30,4f
	sth	r4,0(r10)
	addi	r10,r10,2

4:	bf	29,8f
	stw	r4,0(r10)
	addi	r10,r10,4

8:	bf	28,16f
	std	r4,0(r10)
	addi	r10,r10,8

16:	subf	r5,r0,r5	/* Account for the alignment bytes written.  */
67 | .align 4 |
68 | L(big_aligned): |
69 | /* For sizes larger than 255 two possible paths: |
70 | - if constant is '0', zero full cache lines with dcbz |
71 | - otherwise uses vector instructions. */ |
72 | cmpldi cr5,r5,255 |
73 | dcbtst 0,r10 |
74 | cmpldi cr6,r4,0 |
75 | crand 27,26,21 |
76 | bt 27,L(huge_dcbz) |
77 | bge cr5,L(huge_vector) |
78 | |
79 | |
80 | /* Size between 32 and 255 bytes with constant different than 0, use |
81 | doubleword store instruction to achieve best throughput. */ |
82 | srdi r8,r5,5 |
83 | clrldi r11,r5,59 |
84 | cmpldi cr6,r11,0 |
85 | cmpdi r8,0 |
86 | beq L(tail_bytes) |
87 | mtctr r8 |
88 | |
89 | /* Main aligned write loop, writes 32-bytes at a time. */ |
90 | .align 4 |
91 | L(big_loop): |
92 | std r4,0(r10) |
93 | std r4,8(r10) |
94 | std r4,16(r10) |
95 | std r4,24(r10) |
96 | addi r10,r10,32 |
97 | bdz L(tail_bytes) |
98 | |
99 | std r4,0(r10) |
100 | std r4,8(r10) |
101 | std r4,16(r10) |
102 | std r4,24(r10) |
103 | addi r10,10,32 |
104 | bdnz L(big_loop) |
105 | |
106 | b L(tail_bytes) |
107 | |
108 | /* Write remaining 1~31 bytes. */ |
109 | .align 4 |
110 | L(tail_bytes): |
111 | beqlr cr6 |
112 | |
113 | srdi r7,r11,4 |
114 | clrldi r8,r11,60 |
115 | mtocrf 0x01,r7 |
116 | |
117 | .align 4 |
118 | bf 31,8f |
119 | std r4,0(r10) |
120 | std r4,8(r10) |
121 | addi r10,r10,16 |
122 | |
123 | .align 4 |
124 | 8: mtocrf 0x1,r8 |
125 | bf 28,4f |
126 | std r4,0(r10) |
127 | addi r10,r10,8 |
128 | |
129 | .align 4 |
130 | 4: bf 29,2f |
131 | stw 4,0(10) |
132 | addi 10,10,4 |
133 | |
134 | .align 4 |
135 | 2: bf 30,1f |
136 | sth 4,0(10) |
137 | addi 10,10,2 |
138 | |
139 | .align 4 |
140 | 1: bflr 31 |
141 | stb 4,0(10) |
142 | blr |
143 | |
144 | /* Size larger than 255 bytes with constant different than 0, use |
145 | vector instruction to achieve best throughput. */ |
146 | L(huge_vector): |
147 | /* Replicate set byte to quadword in VMX register. */ |
148 | mtvsrd v1,r4 |
149 | xxpermdi 32,v0,v1,0 |
150 | vspltb v2,v0,15 |
151 | |
152 | /* Main aligned write loop: 128 bytes at a time. */ |
153 | li r6,16 |
154 | li r7,32 |
155 | li r8,48 |
156 | mtocrf 0x02,r5 |
157 | srdi r12,r5,7 |
158 | cmpdi r12,0 |
159 | beq L(aligned_tail) |
160 | mtctr r12 |
161 | b L(aligned_128loop) |
162 | |
163 | .align 4 |
164 | L(aligned_128loop): |
165 | stvx v2,0,r10 |
166 | stvx v2,r10,r6 |
167 | stvx v2,r10,r7 |
168 | stvx v2,r10,r8 |
169 | addi r10,r10,64 |
170 | stvx v2,0,r10 |
171 | stvx v2,r10,r6 |
172 | stvx v2,r10,r7 |
173 | stvx v2,r10,r8 |
174 | addi r10,r10,64 |
175 | bdnz L(aligned_128loop) |
176 | |
177 | /* Write remaining 1~127 bytes. */ |
178 | L(aligned_tail): |
179 | mtocrf 0x01,r5 |
180 | bf 25,32f |
181 | stvx v2,0,r10 |
182 | stvx v2,r10,r6 |
183 | stvx v2,r10,r7 |
184 | stvx v2,r10,r8 |
185 | addi r10,r10,64 |
186 | |
187 | 32: bf 26,16f |
188 | stvx v2,0,r10 |
189 | stvx v2,r10,r6 |
190 | addi r10,r10,32 |
191 | |
192 | 16: bf 27,8f |
193 | stvx v2,0,r10 |
194 | addi r10,r10,16 |
195 | |
196 | 8: bf 28,4f |
197 | std r4,0(r10) |
198 | addi r10,r10,8 |
199 | |
200 | /* Copies 4~7 bytes. */ |
201 | 4: bf 29,L(tail2) |
202 | stw r4,0(r10) |
203 | bf 30,L(tail5) |
204 | sth r4,4(r10) |
205 | bflr 31 |
206 | stb r4,6(r10) |
207 | /* Return original DST pointer. */ |
208 | blr |
209 | |
210 | /* Special case when value is 0 and we have a long length to deal |
211 | with. Use dcbz to zero out a full cacheline of 128 bytes at a time. |
212 | Before using dcbz though, we need to get the destination 128-byte |
213 | aligned. */ |
214 | .align 4 |
215 | L(huge_dcbz): |
216 | andi. r11,r10,127 |
217 | neg r0,r10 |
218 | beq L(huge_dcbz_aligned) |
219 | |
220 | clrldi r0,r0,57 |
221 | subf r5,r0,r5 |
222 | srdi r0,r0,3 |
223 | mtocrf 0x01,r0 |
224 | |
225 | /* Write 1~128 bytes until DST is aligned to 128 bytes. */ |
226 | 8: bf 28,4f |
227 | |
228 | std r4,0(r10) |
229 | std r4,8(r10) |
230 | std r4,16(r10) |
231 | std r4,24(r10) |
232 | std r4,32(r10) |
233 | std r4,40(r10) |
234 | std r4,48(r10) |
235 | std r4,56(r10) |
236 | addi r10,r10,64 |
237 | |
238 | .align 4 |
239 | 4: bf 29,2f |
240 | std r4,0(r10) |
241 | std r4,8(r10) |
242 | std r4,16(r10) |
243 | std r4,24(r10) |
244 | addi r10,r10,32 |
245 | |
246 | .align 4 |
247 | 2: bf 30,1f |
248 | std r4,0(r10) |
249 | std r4,8(r10) |
250 | addi r10,r10,16 |
251 | |
252 | .align 4 |
253 | 1: bf 31,L(huge_dcbz_aligned) |
254 | std r4,0(r10) |
255 | addi r10,r10,8 |
256 | |
L(huge_dcbz_aligned):
	/* Setup dcbz unroll offsets and count numbers.  */
	srdi	r8,r5,9		/* Number of 512-byte blocks.  */
	clrldi	r11,r5,55	/* Residual bytes (mod 512).  */
	cmpldi	cr6,r11,0
	li	r9,128		/* Cache-line-sized dcbz offsets.  */
	cmpdi	r8,0
	beq	L(huge_tail)
	li	r7,256
	li	r6,384
	mtctr	r8

	.align	4
L(huge_loop):
	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
	   a throughput boost for large sizes (2048 bytes or higher).  */
	dcbz	0,r10
	dcbz	r9,r10
	dcbz	r7,r10
	dcbz	r6,r10
	addi	r10,r10,512
	bdnz	L(huge_loop)

	beqlr	cr6		/* Return if no residual bytes.  */

L(huge_tail):
	srdi	r6,r11,8	/* Bit 0: a 256-byte chunk remains.  */
	srdi	r7,r11,4	/* 16-byte chunk count (0..31).  */
	clrldi	r8,r11,4	/* Full residual (zero only when nothing
				   at all remains).  */
	cmpldi	cr6,r8,0
	mtocrf	0x01,r6

	beq	cr6,L(tail)	/* No residual: L(tail) sees all-zero CR
				   bits and falls straight to the return.  */

	/* We have 1~511 bytes remaining.  */
	.align	4
32:	bf	31,16f		/* 256-byte chunk: two cache-line dcbz.  */
	dcbz	0,r10
	dcbz	r9,r10
	addi	r10,r10,256

	.align	4
16:	mtocrf	0x01,r7		/* CR7 bits = 128/64/32/16-byte flags.  */
	bf	28,8f
	dcbz	0,r10
	addi	r10,r10,128

	.align	4
8:	bf	29,4f		/* 64-byte chunk via doubleword stores.  */
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	30,2f		/* 32-byte chunk.  */
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	31,L(tail)	/* 16-byte chunk.  */
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16
	.align	4
331 | /* Remaining 1~15 bytes. */ |
332 | L(tail): |
333 | mtocrf 0x01,r8 |
334 | |
335 | .align |
336 | 8: bf 28,4f |
337 | std r4,0(r10) |
338 | addi r10,r10,8 |
339 | |
340 | .align 4 |
341 | 4: bf 29,2f |
342 | stw r4,0(r10) |
343 | addi r10,r10,4 |
344 | |
345 | .align 4 |
346 | 2: bf 30,1f |
347 | sth r4,0(r10) |
348 | addi r10,r10,2 |
349 | |
350 | .align 4 |
351 | 1: bflr 31 |
352 | stb r4,0(r10) |
353 | blr |
354 | |
355 | /* Handle short copies of 0~31 bytes. Best throughput is achieved |
356 | by just unrolling all operations. */ |
357 | .align 4 |
358 | L(write_LT_32): |
359 | cmpldi cr6,5,8 |
360 | mtocrf 0x01,r5 |
361 | ble cr6,L(write_LE_8) |
362 | |
363 | /* At least 9 bytes to go. */ |
364 | neg r8,r4 |
365 | andi. r0,r8,3 |
366 | cmpldi cr1,r5,16 |
367 | beq L(write_LT_32_aligned) |
368 | |
369 | /* Force 4-byte alignment for SRC. */ |
370 | mtocrf 0x01,r0 |
371 | subf r5,r0,r5 |
372 | |
373 | 2: bf 30,1f |
374 | /* Use stb instead of sth because it doesn't generate |
375 | alignment interrupts on cache-inhibited storage. */ |
376 | stb r4,0(r10) |
377 | stb r4,1(r10) |
378 | addi r10,r10,2 |
379 | |
380 | 1: bf 31,L(end_4bytes_alignment) |
381 | stb r4,0(r10) |
382 | addi r10,r10,1 |
383 | |
384 | .align 4 |
385 | L(end_4bytes_alignment): |
386 | cmpldi cr1,r5,16 |
387 | mtocrf 0x01,r5 |
388 | |
389 | L(write_LT_32_aligned): |
390 | blt cr1,8f |
391 | |
392 | stw r4,0(r10) |
393 | stw r4,4(r10) |
394 | stw r4,8(r10) |
395 | stw r4,12(r10) |
396 | addi r10,r10,16 |
397 | |
398 | 8: bf 28,L(tail4) |
399 | stw r4,0(r10) |
400 | stw r4,4(r10) |
401 | addi r10,r10,8 |
402 | |
403 | .align 4 |
404 | /* Copies 4~7 bytes. */ |
405 | L(tail4): |
406 | bf 29,L(tail2) |
407 | stw r4,0(r10) |
408 | bf 30,L(tail5) |
409 | sth r4,4(r10) |
410 | bflr 31 |
411 | stb r4,6(r10) |
412 | blr |
413 | |
414 | .align 4 |
415 | /* Copies 2~3 bytes. */ |
416 | L(tail2): |
417 | bf 30,1f |
418 | sth r4,0(r10) |
419 | bflr 31 |
420 | stb r4,2(r10) |
421 | blr |
422 | |
423 | .align 4 |
424 | L(tail5): |
425 | bflr 31 |
426 | stb r4,4(r10) |
427 | blr |
428 | |
429 | .align 4 |
430 | 1: bflr 31 |
431 | stb r4,0(r10) |
432 | blr |
433 | |
434 | /* Handles copies of 0~8 bytes. */ |
435 | .align 4 |
436 | L(write_LE_8): |
437 | bne cr6,L(LE7_tail4) |
438 | /* If input is word aligned, use stw, else use stb. */ |
439 | andi. r0,r10,3 |
440 | bne L(8_unalign) |
441 | |
442 | stw r4,0(r10) |
443 | stw r4,4(r10) |
444 | blr |
445 | |
446 | /* Unaligned input and size is 8. */ |
447 | .align 4 |
448 | L(8_unalign): |
449 | andi. r0,r10,1 |
450 | beq L(8_hwalign) |
451 | stb r4,0(r10) |
452 | sth r4,1(r10) |
453 | sth r4,3(r10) |
454 | sth r4,5(r10) |
455 | stb r4,7(r10) |
456 | blr |
457 | |
458 | /* Halfword aligned input and size is 8. */ |
459 | .align 4 |
460 | L(8_hwalign): |
461 | sth r4,0(r10) |
462 | sth r4,2(r10) |
463 | sth r4,4(r10) |
464 | sth r4,6(r10) |
465 | blr |
466 | |
467 | .align 4 |
468 | /* Copies 4~7 bytes. */ |
469 | L(LE7_tail4): |
470 | /* Use stb instead of sth because it doesn't generate |
471 | alignment interrupts on cache-inhibited storage. */ |
472 | bf 29,L(LE7_tail2) |
473 | stb r4,0(r10) |
474 | stb r4,1(r10) |
475 | stb r4,2(r10) |
476 | stb r4,3(r10) |
477 | bf 30,L(LE7_tail5) |
478 | stb r4,4(r10) |
479 | stb r4,5(r10) |
480 | bflr 31 |
481 | stb r4,6(r10) |
482 | blr |
483 | |
484 | .align 4 |
485 | /* Copies 2~3 bytes. */ |
486 | L(LE7_tail2): |
487 | bf 30,1f |
488 | stb r4,0(r10) |
489 | stb r4,1(r10) |
490 | bflr 31 |
491 | stb r4,2(r10) |
492 | blr |
493 | |
494 | .align 4 |
495 | L(LE7_tail5): |
496 | bflr 31 |
497 | stb r4,4(r10) |
498 | blr |
499 | |
500 | .align 4 |
501 | 1: bflr 31 |
502 | stb r4,0(r10) |
503 | blr |
504 | |
505 | END_GEN_TB (MEMSET,TB_TOCLESS) |
506 | libc_hidden_builtin_def (memset) |
507 | |