1 | /* Optimized memset implementation for PowerPC64. |
2 | Copyright (C) 1997-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | #include <rtld-global-offsets.h> |
21 | |
22 | .section ".toc" ,"aw" /* Data placed in the TOC section for use by the code below. */ |
23 | __GLRO_DEF(dl_cache_line_size) /* NOTE(review): macro is not defined in this file; presumably emits what __GLRO needs to read dl_cache_line_size in L(zloopstart) - confirm in sysdep.h. */ |
24 | |
25 | .section ".text" /* Everything from here on is code. */ |
26 | .align 2 |
27 | |
28 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]); |
29 | Returns 's'. |
30 | |
31 | The memset is done in three sizes: byte (8 bits), word (32 bits), |
32 | 32-byte block (256 bits). There is a special case for setting memory |
33 | to 0, which clears whole cache lines with the dcbz instruction when the cache line size is known. */ |
34 | |
35 | #ifndef MEMSET /* Entry-point name; only defaulted here if nothing defined it earlier. */ |
36 | # define MEMSET memset |
37 | #endif |
38 | |
39 | ENTRY (MEMSET, 5) |
40 | CALL_MCOUNT 3 |
41 | /* Register roles. rMEMP2, rNEG64 and rCLS all name r8; their uses below do not overlap. */ |
42 | #define rTMP r0 /* Scratch. */ |
43 | #define rRTN r3 /* Return value: the 1st argument, which no instruction below modifies. */ |
44 | #define rMEMP0 r3 /* Original value of 1st arg. */ |
45 | #define rCHR r4 /* Char to set in each byte; widened in place with insrdi. */ |
46 | #define rLEN r5 /* Length of region to set. */ |
47 | #define rMEMP r6 /* Address at which we are storing. */ |
48 | #define rALIGN r7 /* Number of bytes we are setting now (when aligning). */ |
49 | #define rMEMP2 r8 /* Store pointer used only while aligning to 32 bytes. */ |
50 | |
51 | #define rNEG64 r8 /* Constant -64: dcbtst index used in the L(nondcbz) loop. */ |
52 | #define rCLS r8 /* Cache line size read with __GLRO (dl_cache_line_size). */ |
53 | #define rCLM r9 /* Cache line size mask to check for cache alignment. */ |
54 | L(_memset): |
55 | /* Take care of case for size <= 8. */ |
56 | cmpldi cr1, rLEN, 8 |
57 | andi. rALIGN, rMEMP0, 7 /* cr0.eq: start is already doubleword aligned (tested by beq+ below). */ |
58 | mr rMEMP, rMEMP0 |
59 | ble- cr1, L(small) |
60 | |
61 | /* Align to doubleword boundary. */ |
62 | cmpldi cr5, rLEN, 31 /* cr5 is consumed at L(aligned); it reflects the length before alignment. */ |
63 | insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */ |
64 | beq+ L(aligned2) |
65 | mtcrf 0x01, rMEMP0 /* cr7 (CR bits 28-31) = low 4 bits of the start address. */ |
66 | subfic rALIGN, rALIGN, 8 /* rALIGN = 8 - (address & 7): bytes up to the boundary. */ |
67 | cror 28,30,31 /* Detect odd word aligned: bit 28 = address is not word aligned. */ |
68 | add rMEMP, rMEMP, rALIGN /* rMEMP = first doubleword-aligned address. */ |
69 | sub rLEN, rLEN, rALIGN |
70 | insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */ |
71 | bt 29, L(g4) /* Address bit 2 set: start lies in the odd word. */ |
72 | /* Process the even word of doubleword. */ |
73 | bf+ 31, L(g2) |
74 | stb rCHR, 0(rMEMP0) /* Odd start address: one byte first. */ |
75 | bt 30, L(g4x) |
76 | L(g2): |
77 | sth rCHR, -6(rMEMP) /* Second halfword of the even word. */ |
78 | L(g4x): |
79 | stw rCHR, -4(rMEMP) /* The whole odd word. */ |
80 | b L(aligned) |
81 | /* Process the odd word of doubleword. */ |
82 | L(g4): |
83 | bf 28, L(g4x) /* If false, word aligned on odd word. */ |
84 | bf+ 31, L(g0) |
85 | stb rCHR, 0(rMEMP0) /* Odd start address: one byte first. */ |
86 | bt 30, L(aligned) |
87 | L(g0): |
88 | sth rCHR, -2(rMEMP) /* Last halfword of the doubleword; falls through to L(aligned2). */ |
89 | |
90 | /* Handle the case of size <= 31 (as compared into cr5 before the doubleword alignment). */ |
91 | L(aligned2): |
92 | insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */ |
93 | L(aligned): |
94 | mtcrf 0x01, rLEN /* cr7 = low 4 bits of the remaining length, tested in L(medium_tail). */ |
95 | ble cr5, L(medium) |
96 | /* Align to 32-byte boundary. */ |
97 | andi. rALIGN, rMEMP, 0x18 /* cr0.eq: already 32-byte aligned (tested by beq below). */ |
98 | subfic rALIGN, rALIGN, 0x20 /* rALIGN = 32 - (rMEMP & 0x18): 8, 16 or 24 when not aligned. */ |
99 | insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */ |
100 | beq L(caligned) |
101 | mtcrf 0x01, rALIGN /* cr7 bit 28 = rALIGN & 8. */ |
102 | add rMEMP, rMEMP, rALIGN |
103 | sub rLEN, rLEN, rALIGN |
104 | cmplwi cr1, rALIGN, 0x10 |
105 | mr rMEMP2, rMEMP /* Fill downwards from the 32-byte boundary. */ |
106 | bf 28, L(a1) |
107 | stdu rCHR, -8(rMEMP2) |
108 | L(a1): blt cr1, L(a2) /* Fewer than 16 bytes were needed: done. */ |
109 | std rCHR, -8(rMEMP2) |
110 | stdu rCHR, -16(rMEMP2) |
111 | L(a2): |
112 | |
113 | /* Now aligned to a 32 byte boundary. */ |
114 | L(caligned): |
115 | cmpldi cr1, rCHR, 0 /* rCHR holds the byte in all 8 positions here, so zero means the fill byte is 0. */ |
116 | clrrdi. rALIGN, rLEN, 5 /* rALIGN = rLEN rounded down to a multiple of 32; cr0.eq if no full block. */ |
117 | mtcrf 0x01, rLEN /* cr7 = low 4 bits of the remaining length, tested in L(medium_tail). */ |
118 | beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */ |
119 | L(nondcbz): |
120 | srdi rTMP, rALIGN, 5 |
121 | mtctr rTMP /* CTR = number of 32-byte blocks. */ |
122 | beq L(medium) /* We may not actually get to do a full line. */ |
123 | clrldi. rLEN, rLEN, 59 /* rLEN = rLEN & 31 (tail bytes); cr0.eq if no tail, tested by beqlr below. */ |
124 | add rMEMP, rMEMP, rALIGN /* Point past the last full block; blocks are filled from the top down. */ |
125 | li rNEG64, -0x40 |
126 | bdz L(cloopdone) /* Exactly one block: skip the loop. */ |
127 | |
128 | L(c3): dcbtst rNEG64, rMEMP /* Touch for store the address 64 bytes below rMEMP. */ |
129 | std rCHR, -8(rMEMP) |
130 | std rCHR, -16(rMEMP) |
131 | std rCHR, -24(rMEMP) |
132 | stdu rCHR, -32(rMEMP) |
133 | bdnz L(c3) |
134 | L(cloopdone): |
135 | std rCHR, -8(rMEMP) |
136 | std rCHR, -16(rMEMP) |
137 | cmpldi cr1, rLEN, 16 /* cr1 is tested in L(medium_tail) (16 or more tail bytes). */ |
138 | std rCHR, -24(rMEMP) |
139 | stdu rCHR, -32(rMEMP) |
140 | beqlr /* No tail bytes: return. */ |
141 | add rMEMP, rMEMP, rALIGN /* The stdu stores walked rMEMP back to the first block; move past the blocks again. */ |
142 | b L(medium_tail2) |
143 | |
144 | .align 5 |
145 | /* Clear memory one cache line at a time using dcbz. */ |
146 | L(zloopstart): |
147 | /* If the remaining length is less than 32 bytes, don't bother getting |
148 | the cache line size. */ |
149 | beq L(medium) |
150 | /* Read the cache line size. */ |
151 | __GLRO (rCLS, dl_cache_line_size, |
152 | RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET) |
153 | |
154 | /* If the cache line size was not set just go to L(nondcbz) which is |
155 | safe for any cache line size. */ |
156 | cmpldi cr1,rCLS,0 |
157 | beq cr1,L(nondcbz) |
158 | |
159 | |
160 | /* Now we know the cache line size (it is non-zero), but |
161 | we may not yet be aligned to the cache line. May have a partial |
162 | line to fill, so touch it 1st. */ |
163 | dcbt 0,rMEMP |
164 | addi rCLM,rCLS,-1 /* Mask = size - 1; assumes the size is a power of two - TODO confirm. */ |
165 | L(getCacheAligned): |
166 | cmpldi cr1,rLEN,32 |
167 | and. rTMP,rCLM,rMEMP /* cr0.eq: rMEMP is cache line aligned. */ |
168 | blt cr1,L(handletail32) |
169 | beq L(cacheAligned) |
170 | addi rMEMP,rMEMP,32 /* Not yet aligned: store one 32-byte block and retry. */ |
171 | addi rLEN,rLEN,-32 |
172 | std rCHR,-32(rMEMP) |
173 | std rCHR,-24(rMEMP) |
174 | std rCHR,-16(rMEMP) |
175 | std rCHR,-8(rMEMP) |
176 | b L(getCacheAligned) |
177 | |
178 | /* Now we are aligned to the cache line and can use dcbz. */ |
179 | L(cacheAligned): |
180 | cmpld cr1,rLEN,rCLS |
181 | blt cr1,L(handletail32) /* Less than a full cache line left. */ |
182 | dcbz 0,rMEMP |
183 | subf rLEN,rCLS,rLEN /* rLEN -= cache line size. */ |
184 | add rMEMP,rMEMP,rCLS |
185 | b L(cacheAligned) |
186 | |
187 | /* We are here because fewer than 32 bytes remained before reaching cache line |
188 | alignment, or the remainder (rLEN) is less than the actual cache line size. |
189 | So set up the preconditions for L(nondcbz) and go there. */ |
190 | L(handletail32): |
191 | clrrwi. rALIGN, rLEN, 5 /* rALIGN and cr0 as L(nondcbz) expects; cr7 was set at L(caligned) and still matches rLEN assuming the cache line size is a multiple of 16. */ |
192 | b L(nondcbz) |
193 | |
194 | .align 5 |
195 | L(small): |
196 | /* Memset of 8 bytes or less. */ |
197 | cmpldi cr6, rLEN, 4 |
198 | cmpldi cr5, rLEN, 1 |
199 | ble cr6,L(le4) |
200 | subi rLEN, rLEN, 4 /* 5 to 8 bytes: store 4 bytes, then handle the remaining 1 to 4. */ |
201 | stb rCHR,0(rMEMP) |
202 | stb rCHR,1(rMEMP) |
203 | stb rCHR,2(rMEMP) |
204 | stb rCHR,3(rMEMP) |
205 | addi rMEMP,rMEMP, 4 |
206 | cmpldi cr5, rLEN, 1 |
207 | L(le4): |
208 | cmpldi cr1, rLEN, 3 |
209 | bltlr cr5 /* rLEN == 0. */ |
210 | stb rCHR, 0(rMEMP) |
211 | beqlr cr5 /* rLEN == 1. */ |
212 | stb rCHR, 1(rMEMP) |
213 | bltlr cr1 /* rLEN == 2. */ |
214 | stb rCHR, 2(rMEMP) |
215 | beqlr cr1 /* rLEN == 3. */ |
216 | stb rCHR, 3(rMEMP) |
217 | blr |
218 | |
219 | /* Memset of 0-31 bytes. On entry cr7 holds the low 4 bits of rLEN; stores run from the end of the region downwards. */ |
220 | .align 5 |
221 | L(medium): |
222 | insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */ |
223 | cmpldi cr1, rLEN, 16 |
224 | L(medium_tail2): |
225 | add rMEMP, rMEMP, rLEN /* rMEMP = one past the last byte to set. */ |
226 | L(medium_tail): |
227 | bt- 31, L(medium_31t) /* Length bit 0: one byte. */ |
228 | bt- 30, L(medium_30t) /* Length bit 1: one halfword. */ |
229 | L(medium_30f): |
230 | bt- 29, L(medium_29t) /* Length bit 2: one word. */ |
231 | L(medium_29f): |
232 | bge- cr1, L(medium_27t) /* rLEN >= 16: two doublewords. */ |
233 | bflr- 28 /* Return unless length bit 3 (8 bytes) is set. */ |
234 | std rCHR, -8(rMEMP) |
235 | blr |
236 | |
237 | L(medium_31t): |
238 | stbu rCHR, -1(rMEMP) |
239 | bf- 30, L(medium_30f) |
240 | L(medium_30t): |
241 | sthu rCHR, -2(rMEMP) |
242 | bf- 29, L(medium_29f) |
243 | L(medium_29t): |
244 | stwu rCHR, -4(rMEMP) |
245 | blt- cr1, L(medium_27f) |
246 | L(medium_27t): |
247 | std rCHR, -8(rMEMP) |
248 | stdu rCHR, -16(rMEMP) |
249 | L(medium_27f): |
250 | bflr- 28 /* Return unless length bit 3 (8 bytes) is set. */ |
251 | L(medium_28t): |
252 | std rCHR, -8(rMEMP) |
253 | blr |
254 | END_GEN_TB (MEMSET,TB_TOCLESS) |
255 | libc_hidden_builtin_def (memset) |
256 | |