1 | /* Optimized 64-bit memset implementation for POWER6. |
2 | Copyright (C) 1997-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]); |
22 | Returns 's'. |
23 | |
24 | The memset is done in several sizes: byte (8 bits), halfword, word (32 bits), |
25 | doubleword, 32-byte sector (256 bits) and 128-byte cache line. There is a |
26 | special case for setting cache lines to 0, to take advantage of the dcbz instruction. */ |
27 | |
28 | #ifndef MEMSET /* The symbol name defaults to memset unless MEMSET is predefined. */ |
29 | # define MEMSET memset |
30 | #endif |
31 | .machine power6 |
32 | ENTRY_TOCLESS (MEMSET, 7) /* NOTE(review): the 7 is presumably a log2 alignment (128 bytes) -- confirm in sysdep.h. */ |
33 | CALL_MCOUNT 3 /* NOTE(review): presumably the profiling hook, passed the argument count -- confirm. */ |
34 | |
35 | #define rTMP r0 /* Scratch: target of the andi. boundary tests; its value is never read. */ |
36 | #define rRTN r3 /* Return value; same register as rMEMP0, which the instructions below never write. */ |
37 | #define rMEMP0 r3 /* Original value of 1st arg. */ |
38 | #define rCHR r4 /* Char to set in each byte. */ |
39 | #define rLEN r5 /* Length of region to set. */ |
40 | #define rMEMP r6 /* Address at which we are storing. */ |
41 | #define rALIGN r7 /* Number of bytes we are setting now (when aligning). */ |
42 | #define rMEMP2 r8 /* Backward store pointer while aligning to 32 bytes; later holds 128 as the dcbz index. */ |
43 | #define rMEMP3 r9 /* Alt mem pointer. */ |
44 | L(_memset): |
45 | /* Take care of case for size <= 8. */ |
46 | cmpldi cr1, rLEN, 8 |
47 | andi. rALIGN, rMEMP0, 7 /* rALIGN = s & 7; CR0[EQ] is set if s is doubleword aligned. */ |
48 | mr rMEMP, rMEMP0 |
49 | ble cr1, L(small) |
50 | |
51 | /* Align to doubleword boundary. */ |
52 | cmpldi cr5, rLEN, 31 /* cr5 (original length vs 31) is consumed at L(aligned). */ |
53 | insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */ |
54 | beq+ L(aligned2) /* CR0 from the andi. above: already doubleword aligned. */ |
55 | mtcrf 0x01, rMEMP0 /* CR7 = low 4 bits of s: bit 29 = s&4, bit 30 = s&2, bit 31 = s&1. */ |
56 | subfic rALIGN, rALIGN, 8 /* Bytes needed to reach the next doubleword boundary. */ |
57 | cror 28,30,31 /* Detect odd word aligned: bit 28 = (s&2) OR (s&1), set when s is not word aligned. */ |
58 | add rMEMP, rMEMP, rALIGN /* rMEMP = first doubleword boundary above s. */ |
59 | sub rLEN, rLEN, rALIGN |
60 | insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */ |
61 | bt 29, L(g4) /* s&4 set: s lies in the odd word of its doubleword. */ |
62 | /* Process the even word of doubleword (s&4 clear, so s&3 is 1, 2 or 3). */ |
63 | bf+ 31, L(g2) /* s&3 == 2: no leading byte to store. */ |
64 | stb rCHR, 0(rMEMP0) |
65 | bt 30, L(g4x) /* s&3 == 3: after that byte only the odd word remains. */ |
66 | L(g2): |
67 | sth rCHR, -6(rMEMP) /* Halfword at offset 2 of the doubleword. */ |
68 | L(g4x): |
69 | stw rCHR, -4(rMEMP) /* Odd word: the last 4 bytes before the boundary. */ |
70 | b L(aligned) |
71 | /* Process the odd word of doubleword. */ |
72 | L(g4): |
73 | bf 28, L(g4x) /* If false, word aligned on odd word. */ |
74 | bf+ 31, L(g0) /* s&7 == 6: no leading byte to store. */ |
75 | stb rCHR, 0(rMEMP0) |
76 | bt 30, L(aligned) /* s&7 == 7: that byte completed the doubleword. */ |
77 | L(g0): |
78 | sth rCHR, -2(rMEMP) /* Last halfword before the boundary; execution falls into L(aligned2). */ |
79 | |
80 | /* Handle the case of size <= 31. */ |
81 | L(aligned2): |
82 | insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */ |
83 | L(aligned): |
84 | mtcrf 0x01, rLEN /* CR7 = low 4 bits of the remaining length, tested in L(medium). */ |
85 | ble cr5, L(medium) /* cr5 was set from the length before any alignment bytes were subtracted. */ |
86 | /* Align to 32-byte boundary. */ |
87 | andi. rALIGN, rMEMP, 0x18 /* CR0[EQ] is set if rMEMP is already 32-byte aligned. */ |
88 | subfic rALIGN, rALIGN, 0x20 /* Bytes needed to reach the 32-byte boundary. */ |
89 | insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */ |
90 | beq L(caligned) |
91 | mtcrf 0x01, rALIGN /* CR7 bit 28 = rALIGN & 8; this overwrites the length bits loaded above. */ |
92 | add rMEMP, rMEMP, rALIGN |
93 | sub rLEN, rLEN, rALIGN |
94 | cmplwi cr1, rALIGN, 0x10 |
95 | mr rMEMP2, rMEMP /* The stores below walk backward from the new boundary. */ |
96 | bf 28, L(a1) |
97 | stdu rCHR, -8(rMEMP2) /* One doubleword when rALIGN is 8 or 24. */ |
98 | L(a1): blt cr1, L(a2) |
99 | std rCHR, -8(rMEMP2) /* Two more doublewords when rALIGN is 16 or 24. */ |
100 | stdu rCHR, -16(rMEMP2) |
101 | L(a2): |
102 | |
103 | /* Now aligned to a 32 byte boundary. */ |
104 | .align 4 |
105 | L(caligned): |
106 | cmpldi cr1, rCHR, 0 /* rCHR holds the fill byte replicated across all 64 bits by now. */ |
107 | clrrdi. rALIGN, rLEN, 5 /* rALIGN = rLEN rounded down to a multiple of 32; CR0[EQ] is set if rLEN < 32. */ |
108 | mtcrf 0x01, rLEN /* Reload CR7 with the low 4 bits of rLEN for L(medium); the alignment code may have overwritten it. */ |
109 | beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */ |
110 | beq L(medium) /* We may not actually get to do a full line. */ |
111 | .align 4 |
112 | /* Storing a non-zero "c" value. We are aligned at a sector (32-byte) |
113 | boundary but may not be at cache line (128-byte) boundary. */ |
114 | L(nzloopstart): |
115 | /* memset in 32-byte chunks until we get to a cache line boundary. |
116 | If fewer than 128 bytes remain, use the cacheAligned1 code to finish |
117 | the tail instead. */ |
118 | cmpldi cr1,rLEN,128 |
119 | |
120 | andi. rTMP,rMEMP,127 /* CR0[EQ] is set if rMEMP is on a 128-byte boundary. */ |
121 | blt cr1,L(cacheAligned1) |
122 | addi rMEMP3,rMEMP,32 |
123 | beq L(nzCacheAligned) |
124 | addi rLEN,rLEN,-32 /* First 32-byte chunk. */ |
125 | std rCHR,0(rMEMP) |
126 | std rCHR,8(rMEMP) |
127 | std rCHR,16(rMEMP) |
128 | addi rMEMP,rMEMP,32 |
129 | andi. rTMP,rMEMP3,127 /* Boundary test for the position after this chunk. */ |
130 | std rCHR,-8(rMEMP3) /* Fourth doubleword of the chunk; rMEMP3 is the chunk's end. */ |
131 | |
132 | beq L(nzCacheAligned) |
133 | addi rLEN,rLEN,-32 /* Second 32-byte chunk. */ |
134 | std rCHR,0(rMEMP3) |
135 | addi rMEMP,rMEMP,32 |
136 | std rCHR,8(rMEMP3) |
137 | andi. rTMP,rMEMP,127 /* Boundary test for the position after this chunk. */ |
138 | std rCHR,16(rMEMP3) |
139 | std rCHR,24(rMEMP3) |
140 | |
141 | beq L(nzCacheAligned) |
142 | addi rLEN,rLEN,-32 /* Third chunk: starting 32-byte aligned and failing three tests means it ends on a 128-byte boundary. */ |
143 | std rCHR,32(rMEMP3) |
144 | addi rMEMP,rMEMP,32 |
145 | cmpldi cr1,rLEN,128 |
146 | std rCHR,40(rMEMP3) |
147 | cmpldi cr6,rLEN,256 /* NOTE(review): cr6 and rMEMP2 do not appear to be read on the non-zero path. */ |
148 | li rMEMP2,128 |
149 | std rCHR,48(rMEMP3) |
150 | std rCHR,56(rMEMP3) |
151 | blt cr1,L(cacheAligned1) |
152 | b L(nzCacheAligned128) |
153 | |
154 | /* Now we are aligned to the cache line and can use dcbtst. */ |
155 | .align 4 |
156 | L(nzCacheAligned): |
157 | cmpldi cr1,rLEN,128 |
158 | blt cr1,L(cacheAligned1) |
159 | b L(nzCacheAligned128) |
160 | .align 5 |
161 | L(nzCacheAligned128): /* Stores 128 bytes per iteration; entered with rLEN >= 128. */ |
162 | cmpldi cr1,rLEN,256 /* Tested before rLEN drops: loop again only if 128 or more bytes will remain. */ |
163 | addi rMEMP3,rMEMP,64 |
164 | std rCHR,0(rMEMP) |
165 | std rCHR,8(rMEMP) |
166 | std rCHR,16(rMEMP) |
167 | std rCHR,24(rMEMP) |
168 | std rCHR,32(rMEMP) |
169 | std rCHR,40(rMEMP) |
170 | std rCHR,48(rMEMP) |
171 | std rCHR,56(rMEMP) |
172 | addi rMEMP,rMEMP3,64 /* rMEMP advances by 128 in total. */ |
173 | addi rLEN,rLEN,-128 |
174 | std rCHR,0(rMEMP3) |
175 | std rCHR,8(rMEMP3) |
176 | std rCHR,16(rMEMP3) |
177 | std rCHR,24(rMEMP3) |
178 | std rCHR,32(rMEMP3) |
179 | std rCHR,40(rMEMP3) |
180 | std rCHR,48(rMEMP3) |
181 | std rCHR,56(rMEMP3) |
182 | bge cr1,L(nzCacheAligned128) |
183 | dcbtst 0,rMEMP /* Touch-for-store hint on the line that holds the remaining tail. */ |
184 | b L(cacheAligned1) |
185 | .align 5 |
186 | /* Storing a zero "c" value. We are aligned at a sector (32-byte) |
187 | boundary but may not be at cache line (128-byte) boundary. If the |
188 | remaining length spans a full cache line we can use the Data cache |
189 | block zero instruction. */ |
190 | L(zloopstart): |
191 | /* memset in 32-byte chunks until we get to a cache line boundary. |
192 | If fewer than 128 bytes remain, use the cacheAligned1 code to finish |
193 | the tail instead. */ |
194 | cmpldi cr1,rLEN,128 |
195 | beq L(medium) /* CR0 still comes from the clrrdi. in L(caligned): fewer than 32 bytes remain. */ |
196 | L(getCacheAligned): |
197 | andi. rTMP,rMEMP,127 /* CR0[EQ] is set if rMEMP is on a 128-byte boundary. */ |
198 | nop /* NOTE(review): presumably padding for instruction grouping -- confirm. */ |
199 | blt cr1,L(cacheAligned1) |
200 | addi rMEMP3,rMEMP,32 |
201 | beq L(cacheAligned) |
202 | addi rLEN,rLEN,-32 /* First 32-byte chunk. */ |
203 | std rCHR,0(rMEMP) |
204 | std rCHR,8(rMEMP) |
205 | std rCHR,16(rMEMP) |
206 | addi rMEMP,rMEMP,32 |
207 | andi. rTMP,rMEMP3,127 /* Boundary test for the position after this chunk. */ |
208 | std rCHR,-8(rMEMP3) /* Fourth doubleword of the chunk; rMEMP3 is the chunk's end. */ |
209 | L(getCacheAligned2): |
210 | beq L(cacheAligned) |
211 | addi rLEN,rLEN,-32 /* Second 32-byte chunk. */ |
212 | std rCHR,0(rMEMP3) |
213 | std rCHR,8(rMEMP3) |
214 | addi rMEMP,rMEMP,32 |
215 | andi. rTMP,rMEMP,127 /* Boundary test for the position after this chunk. */ |
216 | std rCHR,16(rMEMP3) |
217 | std rCHR,24(rMEMP3) |
218 | L(getCacheAligned3): |
219 | beq L(cacheAligned) |
220 | addi rLEN,rLEN,-32 /* Third chunk: it necessarily ends on a 128-byte boundary, so no further test is made. */ |
221 | std rCHR,32(rMEMP3) |
222 | addi rMEMP,rMEMP,32 |
223 | cmpldi cr1,rLEN,128 |
224 | std rCHR,40(rMEMP3) |
225 | cmpldi cr6,rLEN,256 /* cr6 and rMEMP2 (= 128) are consumed by L(cacheAlignedx). */ |
226 | li rMEMP2,128 |
227 | std rCHR,48(rMEMP3) |
228 | std rCHR,56(rMEMP3) |
229 | blt cr1,L(cacheAligned1) |
230 | blt cr6,L(cacheAligned128) /* 128 <= rLEN < 256: exactly one full line can be zeroed. */ |
231 | b L(cacheAlignedx) |
232 | |
233 | /* Now we are aligned to the cache line and can use dcbz. */ |
234 | .align 5 |
235 | L(cacheAligned): |
236 | cmpldi cr1,rLEN,128 |
237 | cmpldi cr6,rLEN,256 |
238 | blt cr1,L(cacheAligned1) |
239 | li rMEMP2,128 /* Index register, so that dcbz rMEMP2,rMEMP addresses rMEMP + 128. */ |
240 | L(cacheAlignedx): /* Expects cr6 = rLEN vs 256 and rMEMP2 = 128. */ |
241 | cmpldi cr5,rLEN,640 |
242 | blt cr6,L(cacheAligned128) |
243 | bgt cr5,L(cacheAligned512) |
244 | cmpldi cr6,rLEN,512 /* Here 256 <= rLEN <= 640; cr6 and cr1 compare the length before the subtraction below. */ |
245 | dcbz 0,rMEMP |
246 | cmpldi cr1,rLEN,384 |
247 | dcbz rMEMP2,rMEMP |
248 | addi rMEMP,rMEMP,256 |
249 | addi rLEN,rLEN,-256 |
250 | blt cr1,L(cacheAligned1) /* Fewer than 128 bytes remain. */ |
251 | blt cr6,L(cacheAligned128) /* 128..255 bytes remain. */ |
252 | b L(cacheAligned256) /* 256 or more bytes remain. */ |
253 | .align 5 |
254 | /* A simple loop for the longer (>640 bytes) lengths. This form limits |
255 | branch mispredictions to exactly 1, at loop exit. */ |
256 | L(cacheAligned512): |
257 | cmpldi cr1,rLEN,128 |
258 | blt cr1,L(cacheAligned1) /* Loop exit: fewer than 128 bytes remain. */ |
259 | dcbz 0,rMEMP /* Zero one 128-byte line per iteration. */ |
260 | addi rLEN,rLEN,-128 |
261 | addi rMEMP,rMEMP,128 |
262 | b L(cacheAligned512) |
263 | .align 5 |
264 | L(cacheAligned256): /* Zeroes two lines per iteration; entered with rLEN >= 256 and rMEMP2 = 128. */ |
265 | |
266 | cmpldi cr6,rLEN,512 /* Both compares see rLEN before the subtraction below. */ |
267 | |
268 | dcbz 0,rMEMP |
269 | cmpldi cr1,rLEN,384 |
270 | dcbz rMEMP2,rMEMP |
271 | addi rMEMP,rMEMP,256 |
272 | addi rLEN,rLEN,-256 |
273 | |
274 | bge cr6,L(cacheAligned256) /* 256 or more bytes remain. */ |
275 | |
276 | blt cr1,L(cacheAligned1) /* Fewer than 128 bytes remain. */ |
277 | .align 4 |
278 | L(cacheAligned128): /* Zeroes exactly one line, then falls into the tail code. */ |
279 | dcbz 0,rMEMP |
280 | addi rMEMP,rMEMP,128 |
281 | addi rLEN,rLEN,-128 |
282 | nop |
283 | L(cacheAligned1): /* Fewer than 128 bytes remain: store up to three 32-byte chunks. */ |
284 | cmpldi cr1,rLEN,32 |
285 | blt cr1,L(handletail32) |
286 | addi rMEMP3,rMEMP,32 |
287 | addi rLEN,rLEN,-32 |
288 | std rCHR,0(rMEMP) |
289 | std rCHR,8(rMEMP) |
290 | std rCHR,16(rMEMP) |
291 | addi rMEMP,rMEMP,32 |
292 | cmpldi cr1,rLEN,32 |
293 | std rCHR,-8(rMEMP3) /* Fourth doubleword of the first chunk; rMEMP3 is the chunk's end. */ |
294 | L(cacheAligned2): |
295 | blt cr1,L(handletail32) |
296 | addi rLEN,rLEN,-32 |
297 | std rCHR,0(rMEMP3) |
298 | std rCHR,8(rMEMP3) |
299 | addi rMEMP,rMEMP,32 |
300 | cmpldi cr1,rLEN,32 |
301 | std rCHR,16(rMEMP3) |
302 | std rCHR,24(rMEMP3) |
303 | nop |
304 | L(cacheAligned3): |
305 | blt cr1,L(handletail32) |
306 | addi rMEMP,rMEMP,32 |
307 | addi rLEN,rLEN,-32 |
308 | std rCHR,32(rMEMP3) |
309 | std rCHR,40(rMEMP3) |
310 | std rCHR,48(rMEMP3) |
311 | std rCHR,56(rMEMP3) |
312 | |
313 | /* We are here because the length or remainder (rLEN) is less than the |
314 | cache line/sector size and does not justify aggressive loop unrolling. |
315 | So set up the preconditions for L(medium) and go there. */ |
316 | .align 3 |
317 | L(handletail32): |
318 | cmpldi cr1,rLEN,0 |
319 | beqlr cr1 /* Nothing left to store: return. */ |
320 | b L(medium) /* CR7 still holds the low 4 bits of rLEN; only multiples of 32 were subtracted since it was loaded. */ |
321 | |
322 | .align 5 |
323 | L(small): |
324 | /* Memset of 8 bytes or less. */ |
325 | cmpldi cr6, rLEN, 4 |
326 | cmpldi cr5, rLEN, 1 |
327 | ble cr6,L(le4) |
328 | subi rLEN, rLEN, 4 /* 5..8 bytes: store four now, leaving 1..4. */ |
329 | stb rCHR,0(rMEMP) /* stb writes the low byte of rCHR, so no replication is needed on this path. */ |
330 | stb rCHR,1(rMEMP) |
331 | stb rCHR,2(rMEMP) |
332 | stb rCHR,3(rMEMP) |
333 | addi rMEMP,rMEMP, 4 |
334 | cmpldi cr5, rLEN, 1 /* Refresh cr5 for the reduced length. */ |
335 | L(le4): /* 0..4 bytes left; cr5 = rLEN vs 1. */ |
336 | cmpldi cr1, rLEN, 3 |
337 | bltlr cr5 /* rLEN == 0. */ |
338 | stb rCHR, 0(rMEMP) |
339 | beqlr cr5 /* rLEN == 1. */ |
340 | stb rCHR, 1(rMEMP) |
341 | bltlr cr1 /* rLEN == 2. */ |
342 | stb rCHR, 2(rMEMP) |
343 | beqlr cr1 /* rLEN == 3. */ |
344 | stb rCHR, 3(rMEMP) |
345 | blr |
346 | |
347 | /* Memset of 0-31 bytes. Expects CR7 = low 4 bits of rLEN (from mtcrf 0x01, rLEN) and rCHR replicated at least to a word. */ |
348 | .align 5 |
349 | L(medium): |
350 | insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */ |
351 | cmpldi cr1, rLEN, 16 |
352 | L(medium_tail2): |
353 | add rMEMP, rMEMP, rLEN /* rMEMP = end of the region; every store below walks backward from it. */ |
354 | L(medium_tail): |
355 | bt- 31, L(medium_31t) /* CR7 bit 31: rLEN & 1. */ |
356 | bt- 30, L(medium_30t) /* CR7 bit 30: rLEN & 2. */ |
357 | L(medium_30f): |
358 | bt 29, L(medium_29t) /* CR7 bit 29: rLEN & 4. */ |
359 | L(medium_29f): |
360 | bge cr1, L(medium_27t) /* rLEN >= 16: two doublewords. */ |
361 | bflr 28 /* Return unless rLEN & 8. */ |
362 | std rCHR, -8(rMEMP) |
363 | blr |
364 | |
365 | L(medium_31t): |
366 | stbu rCHR, -1(rMEMP) /* One byte; the update form moves rMEMP down. */ |
367 | bf- 30, L(medium_30f) |
368 | L(medium_30t): |
369 | sthu rCHR, -2(rMEMP) /* One halfword. */ |
370 | bf- 29, L(medium_29f) |
371 | L(medium_29t): |
372 | stwu rCHR, -4(rMEMP) /* One word. */ |
373 | blt cr1, L(medium_27f) |
374 | L(medium_27t): |
375 | std rCHR, -8(rMEMP) /* Sixteen bytes as two doublewords. */ |
376 | stdu rCHR, -16(rMEMP) |
377 | L(medium_27f): |
378 | bflr 28 /* Return unless rLEN & 8. */ |
379 | L(medium_28t): |
380 | std rCHR, -8(rMEMP) /* Final doubleword. */ |
381 | blr |
382 | END_GEN_TB (MEMSET,TB_TOCLESS) |
383 | libc_hidden_builtin_def (memset) |
384 | |