/* Optimized memset implementation for PowerPC.
   Copyright (C) 1997-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <rtld-global-offsets.h>
/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done in four sizes: byte (8 bits), word (32 bits),
   32-byte blocks (256 bits) and whole cache lines (32, 64, or 128 bytes).
   There is a special case for setting whole cache lines to 0, which
   takes advantage of the dcbz instruction.  */
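
/* For orientation, a rough C sketch of the overall strategy used below.
   This is an illustrative stand-in only (the helper name is made up and
   nothing from it is emitted); in particular the dcbz cache-line zeroing
   special case has no portable C equivalent, so it appears as ordinary
   word stores, and the 32-byte alignment step is omitted.

     #include <stddef.h>
     #include <stdint.h>

     static void *memset_sketch (void *s, int c, size_t n)
     {
       unsigned char *p = s;
       uint32_t w = (unsigned char) c;
       w |= w << 8;                   // replicate byte to halfword
       w |= w << 16;                  // replicate halfword to word
       while (n > 0 && ((uintptr_t) p & 3) != 0)   // reach a word boundary
         { *p++ = (unsigned char) c; n--; }
       while (n >= 32)                // 32-byte blocks (dcbz path when c == 0)
         {
           uint32_t *q = (uint32_t *) p;
           q[0] = q[1] = q[2] = q[3] = w;
           q[4] = q[5] = q[6] = q[7] = w;
           p += 32; n -= 32;
         }
       while (n-- > 0)                // 0-31 byte tail
         *p++ = (unsigned char) c;
       return s;
     }  */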

        .section ".text"
EALIGN (memset, 5, 1)

#define rTMP    r0
#define rRTN    r3      /* initial value of 1st argument */
#define rMEMP0  r3      /* original value of 1st arg */
#define rCHR    r4      /* char to set in each byte */
#define rLEN    r5      /* length of region to set */
#define rMEMP   r6      /* address at which we are storing */
#define rALIGN  r7      /* number of bytes we are setting now (when aligning) */
#define rMEMP2  r8

#define rPOS32  r7      /* constant +32 for clearing with dcbz */
#define rNEG64  r8      /* constant -64 for clearing with dcbz */
#define rNEG32  r9      /* constant -32 for clearing with dcbz */

#define rGOT    r9      /* Address of the Global Offset Table.  */
#define rCLS    r8      /* Cache line size loaded from rtld_global_ro.  */
#define rCLM    r9      /* Cache line size mask to check for cache alignment.  */
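
/* Note: rPOS32/rNEG64/rNEG32 and rGOT/rCLS/rCLM above alias r7-r9, which
   also hold rALIGN and rMEMP2 early on.  Each alias is only written once
   the earlier value in that register is no longer needed, so the re-use
   is harmless.  */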

/* take care of case for size <= 4  */
        cmplwi  cr1, rLEN, 4
        andi.   rALIGN, rMEMP0, 3
        mr      rMEMP, rMEMP0
        ble-    cr1, L(small)
/* align to word boundary  */
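/* The alignment step below uses the low bits of the start address, copied
   into CR7 by mtcrf 0x01: if bit 31 is set the address is odd and one byte
   is stored first; unless bit 30 shows that byte was all that was needed,
   a halfword store at rMEMP-2 finishes the 1-3 byte prefix.  rALIGN holds
   the number of prefix bytes.  */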
        cmplwi  cr5, rLEN, 31
        rlwimi  rCHR, rCHR, 8, 16, 23   /* replicate the byte to the halfword */
        beq+    L(aligned)              /* 8th instruction from .align */
        mtcrf   0x01, rMEMP0
        subfic  rALIGN, rALIGN, 4
        add     rMEMP, rMEMP, rALIGN
        sub     rLEN, rLEN, rALIGN
        bf+     31, L(g0)
        stb     rCHR, 0(rMEMP0)
        bt      30, L(aligned)
L(g0):  sth     rCHR, -2(rMEMP)         /* 16th instruction from .align */
/* take care of case for size <= 31  */
L(aligned):
        mtcrf   0x01, rLEN
        rlwimi  rCHR, rCHR, 16, 0, 15   /* replicate the halfword to the word */
        ble     cr5, L(medium)
/* align to cache line boundary...  */
        andi.   rALIGN, rMEMP, 0x1C
        subfic  rALIGN, rALIGN, 0x20
        beq     L(caligned)
        mtcrf   0x01, rALIGN
        add     rMEMP, rMEMP, rALIGN
        sub     rLEN, rLEN, rALIGN
        cmplwi  cr1, rALIGN, 0x10
        mr      rMEMP2, rMEMP
        bf      28, L(a1)
        stw     rCHR, -4(rMEMP2)
        stwu    rCHR, -8(rMEMP2)
L(a1):  blt     cr1, L(a2)
        stw     rCHR, -4(rMEMP2)        /* 32nd instruction from .align */
        stw     rCHR, -8(rMEMP2)
        stw     rCHR, -12(rMEMP2)
        stwu    rCHR, -16(rMEMP2)
L(a2):  bf      29, L(caligned)
        stw     rCHR, -4(rMEMP2)
/* now aligned to a cache line.  */
L(caligned):
        cmplwi  cr1, rCHR, 0
        clrrwi. rALIGN, rLEN, 5
        mtcrf   0x01, rLEN              /* 40th instruction from .align */

/* Check if we can use the special case for clearing memory using dcbz.
   This requires that we know the correct cache line size for this
   processor.  Getting the cache line size may require establishing GOT
   addressability, so branch out of line to set this up.  */
        beq     cr1, L(checklinesize)

/* Store blocks of 32-bytes (256-bits) starting on a 32-byte boundary.
   Can't assume that rCHR is zero or that the cache line size is either
   32-bytes or even known.  */
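/* Note on the loop below: rMEMP is first advanced past all of the 32-byte
   blocks, and each iteration then fills one block with eight word stores,
   working from the highest block back down (stwu -32 steps the pointer).
   dcbtst touches the block 64 bytes below the current pointer, i.e. the
   next block to be written, one iteration ahead.  */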
L(nondcbz):
        srwi    rTMP, rALIGN, 5
        mtctr   rTMP
        beq     L(medium)               /* we may not actually get to do a full line */
        clrlwi. rLEN, rLEN, 27
        add     rMEMP, rMEMP, rALIGN
        li      rNEG64, -0x40
        bdz     L(cloopdone)            /* 48th instruction from .align */

/* We can't use dcbz here as we don't know the cache line size.  We can
   use "data cache block touch for store", which is safe.  */
L(c3):  dcbtst  rNEG64, rMEMP
        stw     rCHR, -4(rMEMP)
        stw     rCHR, -8(rMEMP)
        stw     rCHR, -12(rMEMP)
        stw     rCHR, -16(rMEMP)
        nop                             /* let 601 fetch last 4 instructions of loop */
        stw     rCHR, -20(rMEMP)
        stw     rCHR, -24(rMEMP)        /* 56th instruction from .align */
        nop                             /* let 601 fetch first 8 instructions of loop */
        stw     rCHR, -28(rMEMP)
        stwu    rCHR, -32(rMEMP)
        bdnz    L(c3)
L(cloopdone):
        stw     rCHR, -4(rMEMP)
        stw     rCHR, -8(rMEMP)
        stw     rCHR, -12(rMEMP)
        stw     rCHR, -16(rMEMP)        /* 64th instruction from .align */
        stw     rCHR, -20(rMEMP)
        cmplwi  cr1, rLEN, 16
        stw     rCHR, -24(rMEMP)
        stw     rCHR, -28(rMEMP)
        stwu    rCHR, -32(rMEMP)
        beqlr
        add     rMEMP, rMEMP, rALIGN
        b       L(medium_tail2)         /* 72nd instruction from .align */

        .align 5
        nop
/* Clear cache lines of memory in 128-byte chunks.
   This code is optimized for processors with 32-byte cache lines.
   It is further optimized for the 601 processor, which requires
   some care in how the code is aligned in the i-cache.  */
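/* On entry rALIGN is the 32-byte-aligned byte count.  mtcrf 0x02 moves its
   '32' and '64' bits into CR6, so the two dcbz groups before L(zloop) clear
   the 32-, 64- or 96-byte remainder that is left over once the count is
   split into the 128-byte chunks counted down by CTR.  */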
L(zloopstart):
        clrlwi  rLEN, rLEN, 27
        mtcrf   0x02, rALIGN
        srwi.   rTMP, rALIGN, 7
        mtctr   rTMP
        li      rPOS32, 0x20
        li      rNEG64, -0x40
        cmplwi  cr1, rLEN, 16           /* 8 */
        bf      26, L(z0)
        dcbz    0, rMEMP
        addi    rMEMP, rMEMP, 0x20
L(z0):  li      rNEG32, -0x20
        bf      25, L(z1)
        dcbz    0, rMEMP
        dcbz    rPOS32, rMEMP
        addi    rMEMP, rMEMP, 0x40      /* 16 */
L(z1):  cmplwi  cr5, rLEN, 0
        beq     L(medium)
L(zloop):
        dcbz    0, rMEMP
        dcbz    rPOS32, rMEMP
        addi    rMEMP, rMEMP, 0x80
        dcbz    rNEG64, rMEMP
        dcbz    rNEG32, rMEMP
        bdnz    L(zloop)
        beqlr   cr5
        b       L(medium_tail2)

        .align 5
L(small):
/* Memset of 4 bytes or less.  */
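/* cr5 compares rLEN with 1 and cr1 with 3, so the conditional returns
   below let exactly rLEN (0-4) of the byte stores execute.  */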
        cmplwi  cr5, rLEN, 1
        cmplwi  cr1, rLEN, 3
        bltlr   cr5
        stb     rCHR, 0(rMEMP)
        beqlr   cr5
        nop
        stb     rCHR, 1(rMEMP)
        bltlr   cr1
        stb     rCHR, 2(rMEMP)
        beqlr   cr1
        nop
        stb     rCHR, 3(rMEMP)
        blr

/* Memset of 0-31 bytes.  */
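/* The tail is stored back to front, starting from rMEMP + rLEN.  CR bits
   31/30/29/28 (loaded from rLEN by an earlier mtcrf 0x01) select the 1-,
   2-, 4- and 8-byte pieces, and cr1 (rLEN >= 16, set on each entry path)
   selects the 16-byte piece.  */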
        .align 5
L(medium):
        cmplwi  cr1, rLEN, 16
L(medium_tail2):
        add     rMEMP, rMEMP, rLEN
L(medium_tail):
        bt-     31, L(medium_31t)
        bt-     30, L(medium_30t)
L(medium_30f):
        bt-     29, L(medium_29t)
L(medium_29f):
        bge-    cr1, L(medium_27t)
        bflr-   28
        stw     rCHR, -4(rMEMP)         /* 8th instruction from .align */
        stw     rCHR, -8(rMEMP)
        blr

L(medium_31t):
        stbu    rCHR, -1(rMEMP)
        bf-     30, L(medium_30f)
L(medium_30t):
        sthu    rCHR, -2(rMEMP)
        bf-     29, L(medium_29f)
L(medium_29t):
        stwu    rCHR, -4(rMEMP)
        blt-    cr1, L(medium_27f)      /* 16th instruction from .align */
L(medium_27t):
        stw     rCHR, -4(rMEMP)
        stw     rCHR, -8(rMEMP)
        stw     rCHR, -12(rMEMP)
        stwu    rCHR, -16(rMEMP)
L(medium_27f):
        bflr-   28
L(medium_28t):
        stw     rCHR, -4(rMEMP)
        stw     rCHR, -8(rMEMP)
        blr

L(checklinesize):
/* If the remaining length is less than 32 bytes then don't bother getting
   the cache line size.  */
        beq     L(medium)
#ifdef PIC
        mflr    rTMP
/* Establishes GOT addressability so we can load the cache line size
   from rtld_global_ro.  This value was set from the aux vector during
   startup.  */
        SETUP_GOT_ACCESS(rGOT,got_label)
        addis   rGOT,rGOT,_GLOBAL_OFFSET_TABLE_-got_label@ha
        addi    rGOT,rGOT,_GLOBAL_OFFSET_TABLE_-got_label@l
        mtlr    rTMP
#endif
/* Load rtld_global_ro._dl_cache_line_size.  */
        __GLRO(rCLS, rGOT, _dl_cache_line_size,
               RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)
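/* (This is the assembly counterpart of reading GLRO(dl_cache_line_size)
   in C; glibc fills that field in from the AT_DCACHEBSIZE aux-vector
   entry during startup.)  */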

/* If the cache line size was not set then go to L(nondcbz), which is
   safe for any cache line size.  */
        cmplwi  cr1,rCLS,0
        beq     cr1,L(nondcbz)

/* If the cache line size is 32 bytes then go to L(zloopstart),
   which is coded specifically for 32-byte lines (and 601).  */
        cmplwi  cr1,rCLS,32
        beq     cr1,L(zloopstart)

/* Now we know the cache line size and it is not 32 bytes.  However
   we may not yet be aligned to the cache line and may have a partial
   line to fill.  Touch it first to fetch the cache line.  */
        dcbtst  0,rMEMP

        addi    rCLM,rCLS,-1
L(getCacheAligned):
        cmplwi  cr1,rLEN,32
        and.    rTMP,rCLM,rMEMP
        blt     cr1,L(handletail32)
        beq     L(cacheAligned)
/* We are not aligned to the start of a cache line yet.  Store 32 bytes
   of data and test again.  */
        addi    rMEMP,rMEMP,32
        addi    rLEN,rLEN,-32
        stw     rCHR,-32(rMEMP)
        stw     rCHR,-28(rMEMP)
        stw     rCHR,-24(rMEMP)
        stw     rCHR,-20(rMEMP)
        stw     rCHR,-16(rMEMP)
        stw     rCHR,-12(rMEMP)
        stw     rCHR,-8(rMEMP)
        stw     rCHR,-4(rMEMP)
        b       L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  */
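/* For example, with a 128-byte line and 300 bytes left, two dcbz
   iterations clear 256 bytes and L(handletail32) is entered with
   rLEN = 44.  */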
L(cacheAligned):
        cmplw   cr1,rLEN,rCLS
        blt     cr1,L(handletail32)
        dcbz    0,rMEMP
        subf    rLEN,rCLS,rLEN
        add     rMEMP,rMEMP,rCLS
        b       L(cacheAligned)

/* We are here because the cache line size was set but was not 32 bytes,
   and the remainder (rLEN) is now less than the actual cache line size.
   Set up the preconditions for L(nondcbz) and go there to store the
   remaining bytes.  */
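/* Those preconditions are: rALIGN = rLEN rounded down to a multiple of 32,
   and CR0 set by the record form of clrrwi so that the beq in L(nondcbz)
   can skip straight to L(medium) when no full 32-byte block is left.  */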
L(handletail32):
        clrrwi. rALIGN, rLEN, 5
        b       L(nondcbz)

END (memset)
libc_hidden_builtin_def (memset)