1/* Optimized memset implementation for PowerPC64.
2 Copyright (C) 1997-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include <rtld-global-offsets.h>
21
/* TOC data section.  __GLRO_DEF is a macro defined outside this file; it
   is expanded here for dl_cache_line_size, the value that the dcbz path
   of the function below later reads through __GLRO.
   NOTE(review): the exact expansion is not visible here.  */
22 .section ".toc","aw"
23__GLRO_DEF(dl_cache_line_size)
24
/* Code section for the function that follows.  */
25 .section ".text"
26 .align 2
27
28/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   Head and tail bytes are written with byte, halfword, word and
   doubleword stores; the bulk of the region is written in 32-byte
   blocks.  There is a special case for setting memory to 0, to take
   advantage of the dcbz instruction on whole cache lines.  */
34
/* Use memset as the entry-point name unless MEMSET has already been
   defined before this point.  */
35#ifndef MEMSET
36# define MEMSET memset
37#endif
38
/* MEMSET (s = r3, c = r4, n = r5): fill n bytes at s with the low byte
   of c.  No instruction below writes r3, so it still holds s on return.
   Registers written by the code visible here: r0, r4-r9, CTR, and CR
   fields cr0, cr1, cr5, cr6 and cr7.  r1 is not touched.

   Strategy:
     n <= 8         byte stores only (L(small)).
     n >  8         store 1-7 bytes to reach a doubleword boundary.
     n <= 31        finish in L(medium) (doubleword/word/halfword/byte).
     n >  31        store doublewords up to a 32-byte boundary, then fill
                    whole 32-byte blocks; when the fill value is 0 and a
                    non-zero cache line size is available, whole cache
                    lines are cleared with dcbz instead.
     what is left   (< 32 bytes) is finished in L(medium).  */
39ENTRY (MEMSET, 5)
/* NOTE(review): CALL_MCOUNT is defined outside this file (presumably the
   profiling hook); confirm it leaves r3-r5 intact.  */
40 CALL_MCOUNT 3
41
/* Register roles.  rRTN and rMEMP0 both name r3 (rRTN is not referenced
   by the code below).  rMEMP2, rNEG64 and rCLS all name r8; each is used
   in a different phase, so their live ranges do not overlap.  */
42#define rTMP r0 /* Scratch: 32-byte block count for CTR, cache-alignment test. */
43#define rRTN r3 /* Initial value of 1st argument. */
44#define rMEMP0 r3 /* Original value of 1st arg. */
45#define rCHR r4 /* Char to set in each byte. */
46#define rLEN r5 /* Length of region to set. */
47#define rMEMP r6 /* Address at which we are storing. */
48#define rALIGN r7 /* Number of bytes we are setting now (when aligning). */
49#define rMEMP2 r8 /* Store pointer used while aligning to 32 bytes. */
50
51#define rNEG64 r8 /* Constant -64: dcbtst offset in the 32-byte store loop. */
52#define rCLS r8 /* Cache line size read through __GLRO. */
53#define rCLM r9 /* Cache line size mask to check for cache alignment. */
54L(_memset):
55/* Take care of case for size <= 8. */
56 cmpldi cr1, rLEN, 8
57 andi. rALIGN, rMEMP0, 7 /* cr0.eq when s is already doubleword aligned. */
58 mr rMEMP, rMEMP0
59 ble- cr1, L(small)
60
61/* Align to doubleword boundary.  rLEN >= 9 here, so the 1-7 bytes stored
   below all lie inside the region. */
62 cmpldi cr5, rLEN, 31 /* Result is consumed at L(aligned). */
63 insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */
64 beq+ L(aligned2)
/* mtcrf 0x01 copies the low four bits of the address into cr7:
   CR bit 28 = 8s bit, 29 = 4s bit, 30 = 2s bit, 31 = 1s bit. */
65 mtcrf 0x01, rMEMP0
66 subfic rALIGN, rALIGN, 8 /* rALIGN = bytes up to the next doubleword. */
67 cror 28,30,31 /* Detect odd word aligned: bit 28 = bit 30 | bit 31. */
68 add rMEMP, rMEMP, rALIGN /* rMEMP = first doubleword boundary above s. */
69 sub rLEN, rLEN, rALIGN
70 insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
71 bt 29, L(g4) /* 4s bit set: s lies in the odd word. */
72/* Process the even word of doubleword.  (s & 7) is 1, 2 or 3 here:
   1 -> byte, halfword, word;  2 -> halfword, word;  3 -> byte, word. */
73 bf+ 31, L(g2)
74 stb rCHR, 0(rMEMP0)
75 bt 30, L(g4x)
76L(g2):
77 sth rCHR, -6(rMEMP)
78L(g4x):
79 stw rCHR, -4(rMEMP)
80 b L(aligned)
81/* Process the odd word of doubleword.  (s & 7) is 4, 5, 6 or 7 here:
   4 -> word;  5 -> byte, halfword;  6 -> halfword;  7 -> byte. */
82L(g4):
83 bf 28, L(g4x) /* If false, word aligned on odd word. */
84 bf+ 31, L(g0)
85 stb rCHR, 0(rMEMP0)
86 bt 30, L(aligned)
87L(g0):
88 sth rCHR, -2(rMEMP)
/* Falls through into L(aligned2); repeating the insrdi there is harmless
   because rCHR already holds the replicated word. */
89
90/* Handle the case of size <= 31.  cr5 still holds the comparison of the
   original length with 31, made before any alignment bytes were
   subtracted from rLEN. */
91L(aligned2):
92 insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
93L(aligned):
94 mtcrf 0x01, rLEN /* cr7 = low four bits of rLEN, for L(medium). */
95 ble cr5, L(medium)
96/* Align to 32-byte boundary.  rMEMP is doubleword aligned, so the
   distance is 8, 16 or 24 bytes unless it is already aligned. */
97 andi. rALIGN, rMEMP, 0x18 /* cr0.eq when already 32-byte aligned. */
98 subfic rALIGN, rALIGN, 0x20
99 insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */
100 beq L(caligned)
101 mtcrf 0x01, rALIGN /* CR bit 28 = 8s bit of rALIGN. */
102 add rMEMP, rMEMP, rALIGN
103 sub rLEN, rLEN, rALIGN
104 cmplwi cr1, rALIGN, 0x10
105 mr rMEMP2, rMEMP /* Store downwards from the 32-byte boundary. */
106 bf 28, L(a1)
107 stdu rCHR, -8(rMEMP2) /* One doublewordword when rALIGN is 8 or 24. */
108L(a1): blt cr1, L(a2)
109 std rCHR, -8(rMEMP2) /* Two more doublewords when rALIGN >= 16. */
110 stdu rCHR, -16(rMEMP2)
111L(a2):
112
113/* Now aligned to a 32 byte boundary. */
114L(caligned):
115 cmpldi cr1, rCHR, 0 /* Fill value is zero? */
116 clrrdi. rALIGN, rLEN, 5 /* rALIGN = bytes in whole 32-byte blocks. */
117 mtcrf 0x01, rLEN /* cr7 = low four bits of rLEN for the tail. */
118 beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */
/* Preconditions for L(nondcbz): rALIGN = rLEN with the low five bits
   cleared, cr0 reflects rALIGN == 0, cr7 = low four bits of rLEN. */
119L(nondcbz):
120 srdi rTMP, rALIGN, 5
121 mtctr rTMP /* CTR = number of 32-byte blocks. */
122 beq L(medium) /* We may not actually get to do a full line. */
123 clrldi. rLEN, rLEN, 59 /* rLEN = tail (< 32); cr0.eq when no tail. */
124 add rMEMP, rMEMP, rALIGN /* Point past the last block; stores run downwards. */
125 li rNEG64, -0x40
126 bdz L(cloopdone) /* Exactly one block: skip the prefetching loop. */
127
/* Each pass stores the 32 bytes below rMEMP and moves rMEMP down by 32,
   after touching (for store) the address 64 bytes below rMEMP.  The
   final block is stored at L(cloopdone) without the dcbtst. */
128L(c3): dcbtst rNEG64, rMEMP
129 std rCHR, -8(rMEMP)
130 std rCHR, -16(rMEMP)
131 std rCHR, -24(rMEMP)
132 stdu rCHR, -32(rMEMP)
133 bdnz L(c3)
134L(cloopdone):
135 std rCHR, -8(rMEMP)
136 std rCHR, -16(rMEMP)
137 cmpldi cr1, rLEN, 16 /* cr1 is consumed by the L(medium) tail code. */
138 std rCHR, -24(rMEMP)
139 stdu rCHR, -32(rMEMP)
140 beqlr /* cr0 from the clrldi. above: no tail, done. */
141 add rMEMP, rMEMP, rALIGN /* rMEMP is back at the first block; skip past them. */
142 b L(medium_tail2)
143
144 .align 5
145/* Clear memory one cache line (rCLS bytes) at a time with dcbz. */
146L(zloopstart):
147/* If the remaining length is less than 32 bytes, don't bother getting
   the cache line size.  (cr0 comes from the clrrdi. at L(caligned).) */
149 beq L(medium)
150 /* Read the cache line size.  NOTE(review): __GLRO is defined outside
    this file; confirm it preserves cr0, cr7, rLEN, rMEMP and rALIGN. */
151 __GLRO (rCLS, dl_cache_line_size,
152 RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)
153
154/* If the cache line size was not set just go to L(nondcbz) which is
   safe for any cache line size. */
156 cmpldi cr1,rCLS,0
157 beq cr1,L(nondcbz)
158
159
160/* Now we know the cache line size is non-zero, but
   we may not yet be aligned to the cache line. May have a partial
   line to fill, so touch it 1st.
   NOTE(review): the rCLS - 1 mask assumes the cache line size is a
   power of two -- confirm. */
163 dcbt 0,rMEMP
164 addi rCLM,rCLS,-1
/* Store 32-byte blocks of rCHR (zero on this path) going upwards until
   rMEMP is cache-line aligned or fewer than 32 bytes remain. */
165L(getCacheAligned):
166 cmpldi cr1,rLEN,32
167 and. rTMP,rCLM,rMEMP
168 blt cr1,L(handletail32)
169 beq L(cacheAligned)
170 addi rMEMP,rMEMP,32
171 addi rLEN,rLEN,-32
172 std rCHR,-32(rMEMP)
173 std rCHR,-24(rMEMP)
174 std rCHR,-16(rMEMP)
175 std rCHR,-8(rMEMP)
176 b L(getCacheAligned)
177
178/* Now we are aligned to the cache line and can use dcbz.  Loop while at
   least one whole cache line (rCLS bytes) remains. */
179L(cacheAligned):
180 cmpld cr1,rLEN,rCLS
181 blt cr1,L(handletail32)
182 dcbz 0,rMEMP
183 subf rLEN,rCLS,rLEN
184 add rMEMP,rMEMP,rCLS
185 b L(cacheAligned)
186
187/* We are here because the cache line size was set and the remainder
   (rLEN) is less than 32 bytes or less than the cache line size.
   So set up the preconditions for L(nondcbz) and go there.  cr7 was
   loaded from rLEN at L(caligned); rLEN has since only been reduced by
   32 or by rCLS at a time (NOTE(review): the low four bits stay valid
   provided rCLS is a multiple of 16 -- confirm). */
190L(handletail32):
191 clrrwi. rALIGN, rLEN, 5
192 b L(nondcbz)
193
194 .align 5
195L(small):
196/* Memset of 8 bytes or less.  For 5-8 bytes, four bytes are stored first
   and the remaining 1-4 are handled by L(le4). */
197 cmpldi cr6, rLEN, 4
198 cmpldi cr5, rLEN, 1
199 ble cr6,L(le4)
200 subi rLEN, rLEN, 4
201 stb rCHR,0(rMEMP)
202 stb rCHR,1(rMEMP)
203 stb rCHR,2(rMEMP)
204 stb rCHR,3(rMEMP)
205 addi rMEMP,rMEMP, 4
206 cmpldi cr5, rLEN, 1 /* Redo the compare for the reduced length. */
/* 0-4 bytes left: cr5 compares rLEN with 1, cr1 compares it with 3. */
207L(le4):
208 cmpldi cr1, rLEN, 3
209 bltlr cr5 /* 0 bytes. */
210 stb rCHR, 0(rMEMP)
211 beqlr cr5 /* 1 byte. */
212 stb rCHR, 1(rMEMP)
213 bltlr cr1 /* 2 bytes. */
214 stb rCHR, 2(rMEMP)
215 beqlr cr1 /* 3 bytes. */
216 stb rCHR, 3(rMEMP)
217 blr
218
219/* Memset of 0-31 bytes.  On entry cr7 holds the low four bits of rLEN
   (CR bit 31 = 1s bit, 30 = 2s, 29 = 4s, 28 = 8s).  rMEMP is moved to the
   end of the region and the pieces are stored downwards: byte, halfword,
   word, then 16 bytes when rLEN >= 16 (cr1), then a final doubleword when
   the 8s bit is set. */
220 .align 5
221L(medium):
222 insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */
223 cmpldi cr1, rLEN, 16
/* Entry from L(cloopdone): rCHR and cr1 are already set up. */
224L(medium_tail2):
225 add rMEMP, rMEMP, rLEN
226L(medium_tail):
227 bt- 31, L(medium_31t)
228 bt- 30, L(medium_30t)
229L(medium_30f):
230 bt- 29, L(medium_29t)
231L(medium_29f):
232 bge- cr1, L(medium_27t)
233 bflr- 28
234 std rCHR, -8(rMEMP)
235 blr
236
237L(medium_31t):
238 stbu rCHR, -1(rMEMP)
239 bf- 30, L(medium_30f)
240L(medium_30t):
241 sthu rCHR, -2(rMEMP)
242 bf- 29, L(medium_29f)
243L(medium_29t):
244 stwu rCHR, -4(rMEMP)
245 blt- cr1, L(medium_27f)
246L(medium_27t):
247 std rCHR, -8(rMEMP)
248 stdu rCHR, -16(rMEMP)
249L(medium_27f):
250 bflr- 28
251L(medium_28t):
252 std rCHR, -8(rMEMP)
253 blr
/* NOTE(review): END_GEN_TB and libc_hidden_builtin_def are macros defined
   outside this file; their expansions are not visible here. */
254END_GEN_TB (MEMSET,TB_TOCLESS)
255libc_hidden_builtin_def (memset)
256

source code of glibc/sysdeps/powerpc/powerpc64/memset.S