1 | /* Optimized memset implementation for PowerPC64/POWER7. |
2 | Copyright (C) 2010-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]); |
22 | Returns 's'. r10 is used as the working pointer; no instruction visible in this file writes r3, so 's' is still in r3 at each return. */ |
23 | |
24 | #ifndef MEMSET |
25 | # define MEMSET memset |
26 | #endif |
27 | .machine power7 |
28 | ENTRY_TOCLESS (MEMSET, 5) /* Entry macro; not defined in this file (presumably from <sysdep.h>). */ |
29 | CALL_MCOUNT 3 |
30 | |
31 | L(_memset): |
32 | cmpldi cr7,5,31 /* cr7: n vs 31, tested below to select L(medium). */ |
33 | cmpldi cr6,5,8 /* cr6: n vs 8, tested below to select L(small) and again inside L(small). */ |
34 | mr 10,3 /* r10 = working destination pointer. */ |
35 | |
36 | /* Replicate byte to word: low 8 bits of r4 -> low 16 bits -> low 32 bits. */ |
37 | insrdi 4,4,8,48 |
38 | insrdi 4,4,16,32 |
39 | ble cr6,L(small) /* If length <= 8, use short store code. */ |
40 | |
41 | neg 0,3 /* r0 = -s; its low bits give the distance to the next aligned address. */ |
42 | ble cr7,L(medium) /* If length < 32, use medium store code. */ |
43 | |
44 | andi. 11,10,7 /* Check alignment of DST (sets cr0 for the beq below). */ |
45 | insrdi 4,4,32,0 /* Replicate word to double word. */ |
46 | |
47 | mr 12,5 /* Keep the original length in r12; the code after L(huge_loop) recomputes the remainder from it. */ |
48 | beq L(big_aligned) /* DST already 8-byte aligned. */ |
49 | |
50 | clrldi 0,0,61 /* r0 = (-s) & 7 = bytes needed to reach 8-byte alignment. */ |
51 | mtocrf 0x01,0 /* Low 4 bits of r0 -> cr7; CR bits 29/30/31 flag 4/2/1 bytes. */ |
52 | subf 5,0,5 /* n -= alignment bytes. */ |
53 | |
54 | /* Get DST aligned to 8 bytes: store 1, then 2, then 4 bytes as flagged in cr7. */ |
55 | 1: bf 31,2f |
56 | |
57 | stb 4,0(10) |
58 | addi 10,10,1 |
59 | 2: bf 30,4f |
60 | |
61 | sth 4,0(10) |
62 | addi 10,10,2 |
63 | 4: bf 29,L(big_aligned) |
64 | |
65 | stw 4,0(10) |
66 | addi 10,10,4 |
67 | |
68 | .align 4 |
69 | L(big_aligned): /* DST (r10) is 8-byte aligned, r4 holds the byte replicated to 64 bits, r5 = bytes left. */ |
70 | |
71 | cmpldi cr5,5,255 /* cr5 gt: more than 255 bytes left. */ |
72 | li 0,32 /* NOTE(review): r0 does not appear to be read before it is rewritten at L(tail_bytes) or L(huge); confirm whether this load is still needed. */ |
73 | dcbtst 0,10 /* Touch-for-store cache hint at r10 (the RA operand 0 means no base register, not r0). */ |
74 | cmpldi cr6,4,0 /* cr6 eq: the fill value is 0. */ |
75 | srdi 9,5,3 /* Number of full doublewords remaining. */ |
76 | crand 27,26,21 /* CR bit 27 = (cr6 eq) AND (cr5 gt): value is 0 and length > 255. */ |
77 | mtocrf 0x01,9 /* Low 4 bits of the doubleword count -> cr7; bits 30/31 flag 2/1 doublewords. */ |
78 | bt 27,L(huge) |
79 | |
80 | /* From this point on, either the value isn't 0 or the length is <= 255, |
81 | so dcbz is not used. At least 25 bytes remain (n was >= 32 before at most 7 alignment bytes were stored). */ |
82 | |
83 | srdi 8,5,5 /* r8 = number of 32-byte blocks. */ |
84 | clrldi 11,5,61 /* r11 = n & 7 = tail bytes after the doublewords. */ |
85 | cmpldi cr6,11,0 /* cr6 eq: no tail bytes; tested at L(tail_bytes). */ |
86 | cmpldi cr1,9,4 /* cr1 lt: fewer than 4 doublewords, i.e. no 32-byte block. */ |
87 | mtctr 8 /* CTR = 32-byte block count for L(big_loop). */ |
88 | |
89 | /* Store 1~3 doublewords (count mod 4) so that what remains for the main loop |
90 | is a whole number of 32-byte blocks. */ |
91 | |
92 | bf 30,1f |
93 | |
94 | std 4,0(10) |
95 | std 4,8(10) |
96 | addi 10,10,16 |
97 | bf 31,L(big_loop) /* Exactly 2 leading doublewords; the total is >= 3, so CTR is non-zero here. */ |
98 | |
99 | std 4,0(10) |
100 | addi 10,10,8 |
101 | mr 12,10 /* L(tail_bytes) stores through r12. */ |
102 | blt cr1,L(tail_bytes) /* Only 3 doublewords in total: skip the main loop. */ |
103 | b L(big_loop) |
104 | |
105 | .align 4 |
106 | 1: /* Store 1 doubleword. */ |
107 | bf 31,L(big_loop) |
108 | |
109 | std 4,0(10) |
110 | addi 10,10,8 |
111 | |
112 | /* Main aligned store loop. Stores 32-bytes at a time and |
113 | ping-pong through r10 and r12 to avoid AGEN delays. On exit to L(tail_bytes), r12 points just past the last byte stored. */ |
114 | .align 4 |
115 | L(big_loop): |
116 | addi 12,10,32 /* r12 = address of the next 32-byte block. */ |
117 | std 4,0(10) |
118 | std 4,8(10) |
119 | std 4,16(10) |
120 | std 4,24(10) |
121 | bdz L(tail_bytes) /* CTR exhausted after the r10 block; r12 already points past it. */ |
122 | |
123 | addi 10,10,64 /* r10 = address of the block after the r12 one. */ |
124 | std 4,0(12) |
125 | std 4,8(12) |
126 | std 4,16(12) |
127 | std 4,24(12) |
128 | bdnz L(big_loop) |
129 | |
130 | mr 12,10 /* CTR exhausted after the r12 block; r10 points past it. */ |
131 | b L(tail_bytes) |
132 | |
133 | .align 4 |
134 | L(tail_bytes): /* Store the final 0-7 bytes through r12, which points just past the last doubleword stored. */ |
135 | |
136 | /* Check for tail bytes: cr6 eq was set in L(big_aligned) from (n & 7) == 0, in which case we are done. */ |
137 | beqlr cr6 |
138 | |
139 | clrldi 0,5,61 /* r0 = n & 7. */ |
140 | mtocrf 0x01,0 /* Low 4 bits of r0 -> cr7; CR bits 29/30/31 flag 4/2/1 bytes. */ |
141 | |
142 | /* At this point we have a tail of 1-7 bytes and we know that the |
143 | destination is doubleword-aligned. */ |
144 | 4: /* Store 4 bytes. */ |
145 | bf 29,2f |
146 | |
147 | stw 4,0(12) |
148 | addi 12,12,4 |
149 | 2: /* Store 2 bytes. */ |
150 | bf 30,1f |
151 | |
152 | sth 4,0(12) |
153 | addi 12,12,2 |
154 | 1: /* Store 1 byte. */ |
155 | bflr 31 |
156 | |
157 | stb 4,0(12) |
158 | blr |
159 | |
160 | /* Special case when value is 0 and we have a long length to deal |
161 | with (more than 255 bytes, per the test in L(big_aligned)). Use dcbz to zero out 128-bytes at a time. Before using |
162 | dcbz though, we need to get the destination 128-bytes aligned. On entry r10 is 8-byte aligned and r4 is 0. */ |
163 | .align 4 |
164 | L(huge): |
165 | andi. 11,10,127 /* cr0 eq: DST already 128-byte aligned. */ |
166 | neg 0,10 |
167 | beq L(huge_aligned) |
168 | |
169 | clrldi 0,0,57 /* r0 = (-r10) & 127 = bytes needed to reach 128-byte alignment. */ |
170 | subf 5,0,5 /* n -= alignment bytes. */ |
171 | srdi 0,0,3 /* Convert to a doubleword count (r10 is already 8-byte aligned). */ |
172 | mtocrf 0x01,0 /* Low 4 bits of the count -> cr7; CR bits 28/29/30/31 flag 8/4/2/1 doublewords. */ |
173 | |
174 | /* Get DST aligned to 128 bytes. */ |
175 | 8: bf 28,4f |
176 | |
177 | std 4,0(10) |
178 | std 4,8(10) |
179 | std 4,16(10) |
180 | std 4,24(10) |
181 | std 4,32(10) |
182 | std 4,40(10) |
183 | std 4,48(10) |
184 | std 4,56(10) |
185 | addi 10,10,64 |
186 | .align 4 |
187 | 4: bf 29,2f |
188 | |
189 | std 4,0(10) |
190 | std 4,8(10) |
191 | std 4,16(10) |
192 | std 4,24(10) |
193 | addi 10,10,32 |
194 | .align 4 |
195 | 2: bf 30,1f |
196 | |
197 | std 4,0(10) |
198 | std 4,8(10) |
199 | addi 10,10,16 |
200 | .align 4 |
201 | 1: bf 31,L(huge_aligned) |
202 | |
203 | std 4,0(10) |
204 | addi 10,10,8 |
205 | |
206 | |
207 | L(huge_aligned): /* r10 is 128-byte aligned, r5 = bytes left; more than 128 remain (n > 255 on entry to L(huge), at most 120 used for alignment). */ |
208 | srdi 8,5,7 /* r8 = number of 128-byte blocks. */ |
209 | clrldi 11,5,57 /* r11 = n & 127 = bytes left after the dcbz loop. */ |
210 | cmpldi cr6,11,0 /* cr6 eq: nothing left after the loop. */ |
211 | mtctr 8 |
212 | |
213 | .align 4 |
214 | L(huge_loop): |
215 | dcbz 0,10 /* Zero the cache block at r10 (128 bytes per iteration, matching the addi below). */ |
216 | addi 10,10,128 |
217 | bdnz L(huge_loop) |
218 | |
219 | /* Check how many bytes are still left. */ |
220 | beqlr cr6 |
221 | |
222 | subf 9,3,10 /* r9 = r10 - s = bytes already set. */ |
223 | subf 5,9,12 /* r5 = original length (saved in r12 at entry) - bytes already set. */ |
224 | srdi 8,5,3 /* r8 = full doublewords remaining. */ |
225 | cmpldi cr6,8,0 |
226 | mtocrf 0x01,8 /* Low 4 bits of the count -> cr7; CR bits 28/29/30/31 flag 8/4/2/1 doublewords. */ |
227 | |
228 | /* We have a tail of 1~127 bytes. Store up to 15 doublewords for |
229 | speed. We'll handle the resulting tail bytes later. */ |
230 | beq cr6,L(tail) /* No full doubleword left. */ |
231 | |
232 | 8: bf 28,4f |
233 | |
234 | std 4,0(10) |
235 | std 4,8(10) |
236 | std 4,16(10) |
237 | std 4,24(10) |
238 | std 4,32(10) |
239 | std 4,40(10) |
240 | std 4,48(10) |
241 | std 4,56(10) |
242 | addi 10,10,64 |
243 | .align 4 |
244 | 4: bf 29,2f |
245 | |
246 | std 4,0(10) |
247 | std 4,8(10) |
248 | std 4,16(10) |
249 | std 4,24(10) |
250 | addi 10,10,32 |
251 | .align 4 |
252 | 2: bf 30,1f |
253 | |
254 | std 4,0(10) |
255 | std 4,8(10) |
256 | addi 10,10,16 |
257 | .align 4 |
258 | 1: bf 31,L(tail) |
259 | |
260 | std 4,0(10) |
261 | addi 10,10,8 |
262 | |
263 | /* Handle the rest of the tail bytes here: up to 7 bytes, stored as 4, 2 and 1. */ |
264 | L(tail): |
265 | mtocrf 0x01,5 /* Low 4 bits of the remaining length -> cr7; CR bits 29/30/31 flag 4/2/1 bytes. */ |
266 | |
267 | .align 4 |
268 | 4: bf 29,2f |
269 | |
270 | stw 4,0(10) |
271 | addi 10,10,4 |
272 | .align 4 |
273 | 2: bf 30,1f |
274 | |
275 | sth 4,0(10) |
276 | addi 10,10,2 |
277 | .align 4 |
278 | 1: bflr 31 |
279 | |
280 | stb 4,0(10) |
281 | blr |
282 | |
283 | /* Expanded tree to store 0-7 tail bytes without pointer increments. Reached from L(small) with the low 4 bits of the length in cr7: CR bits 29/30/31 flag 4/2/1 bytes, and each leaf uses a fixed offset from r10. */ |
284 | .align 4 |
285 | L(copy_tail): |
286 | bf 29,L(FXX) |
287 | |
288 | stw 4,0(10) |
289 | bf 30,L(TFX) |
290 | |
291 | sth 4,4(10) |
292 | bflr 31 |
293 | |
294 | stb 4,6(10) |
295 | blr |
296 | |
297 | .align 4 |
298 | L(FXX): bf 30,L(FFX) /* Bit 29 false: no word stored, so offsets start at 0. */ |
299 | |
300 | sth 4,0(10) |
301 | bflr 31 |
302 | |
303 | stb 4,2(10) |
304 | blr |
305 | |
306 | .align 4 |
307 | L(TFX): bflr 31 /* Bit 29 true, bit 30 false: a word was stored, the last byte goes at offset 4. */ |
308 | |
309 | stb 4,4(10) |
310 | blr |
311 | |
312 | .align 4 |
313 | L(FFX): bflr 31 /* Bits 29 and 30 false: at most one byte, at offset 0. */ |
314 | |
315 | stb 4,0(10) |
316 | blr |
317 | |
318 | /* Handle lengths of 9~31 bytes. r4 holds the byte replicated to 32 bits only, so this path stores words at most. On entry r0 = -s (set just before the branch here). */ |
319 | .align 4 |
320 | L(medium): |
321 | /* At least 9 bytes to go. */ |
322 | andi. 11,10,3 /* cr0 eq: DST already 4-byte aligned. */ |
323 | clrldi 0,0,62 /* r0 = (-s) & 3 = bytes needed to reach 4-byte alignment. */ |
324 | beq L(medium_aligned) |
325 | |
326 | /* Force 4-bytes alignment for DST. */ |
327 | mtocrf 0x01,0 /* Low 4 bits of r0 -> cr7; CR bits 30/31 flag 2/1 bytes. */ |
328 | subf 5,0,5 /* n -= alignment bytes. */ |
329 | 1: /* Store 1 byte. */ |
330 | bf 31,2f |
331 | |
332 | stb 4,0(10) |
333 | addi 10,10,1 |
334 | 2: /* Store 2 bytes. */ |
335 | bf 30,L(medium_aligned) |
336 | |
337 | sth 4,0(10) |
338 | addi 10,10,2 |
339 | |
340 | .align 4 |
341 | L(medium_aligned): |
342 | /* At least 6 bytes to go, and DST is word-aligned. */ |
343 | cmpldi cr1,5,16 /* cr1 lt: fewer than 16 bytes left. */ |
344 | mtocrf 0x01,5 /* Low 4 bits of the length -> cr7; CR bits 28/29/30/31 flag 8/4/2/1 bytes. */ |
345 | blt cr1,8f |
346 | |
347 | /* Store 16 bytes. */ |
348 | stw 4,0(10) |
349 | stw 4,4(10) |
350 | stw 4,8(10) |
351 | stw 4,12(10) |
352 | addi 10,10,16 |
353 | 8: /* Store 8 bytes. */ |
354 | bf 28,4f |
355 | |
356 | stw 4,0(10) |
357 | stw 4,4(10) |
358 | addi 10,10,8 |
359 | 4: /* Store 4 bytes. */ |
360 | bf 29,2f |
361 | |
362 | stw 4,0(10) |
363 | addi 10,10,4 |
364 | 2: /* Store 2 bytes. */ |
365 | bf 30,1f |
366 | |
367 | sth 4,0(10) |
368 | addi 10,10,2 |
369 | 1: /* Store 1 byte. */ |
370 | bflr 31 |
371 | |
372 | stb 4,0(10) |
373 | blr |
374 | |
375 | /* Handles lengths of 0~8 bytes. Exactly 8 bytes is stored as two words here; anything shorter goes to L(copy_tail). */ |
376 | .align 4 |
377 | L(small): |
378 | mtocrf 0x01,5 /* Low 4 bits of the length -> cr7, consumed by L(copy_tail). */ |
379 | bne cr6,L(copy_tail) /* cr6 still holds the n vs 8 compare from entry: not equal means n < 8 here. */ |
380 | |
381 | stw 4,0(10) |
382 | stw 4,4(10) |
383 | blr |
384 | |
385 | END_GEN_TB (MEMSET,TB_TOCLESS) /* End-of-function macro; not defined in this file (presumably from <sysdep.h>). */ |
386 | libc_hidden_builtin_def (memset) |
387 | |