/* Optimized memset implementation for 32-bit PowerPC (POWER4).
   Copyright (C) 1997-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   Strategy:
     - n <= 4: at most four byte stores (L(small)).
     - Otherwise the fill byte is replicated across a 32-bit word and
       the pointer is moved up to a word boundary with a byte and/or a
       halfword store.
     - If the requested n is <= 31 the rest is written by L(medium).
     - Otherwise the pointer is moved up to a 32-byte boundary with word
       stores and the bulk is written 32 bytes per loop iteration.
     - If the fill word is 0, whole 128-byte cache lines (1024 bits) are
       cleared with dcbz once the pointer is 128-byte aligned.
     - A tail of fewer than 32 bytes is written by L(medium).

   Condition register usage:
     - "mtcrf 0x01, rX" copies the low four bits of rX into cr7, so CR
       bits 28, 29, 30 and 31 are the 8, 4, 2 and 1 bits of rX.  The
       bt/bf/bflr instructions below test those bits.
     - cr0, cr1 and cr5 carry comparison results between sections; the
       comments at each section say which ones are live.

   r3 is never written, so it still holds 's' at every return.  */

	.machine power4
EALIGN (memset, 5, 0)
	CALL_MCOUNT

#define rTMP	r0
#define rRTN	r3	/* Return value; same register as rMEMP0 (name not used below).  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning). */
#define rMEMP2	r8	/* Store cursor while aligning to 32 bytes.  */

/* rMEMP2, rNEG64 and rCLS all name r8; each one is written before it is
   read in its own section, so they never overlap.  */
#define rNEG64	r8	/* Constant -64, the dcbtst offset in the store loop.  */
#define rCLS	r8	/* Cache line size (known to be 128).  */
#define rCLM	r9	/* Cache line size mask to check for cache alignment
			   (name not used below).  */
L(_memset):
/* Take care of case for size <= 4.  */
	cmplwi	cr1, rLEN, 4
	andi.	rALIGN, rMEMP0, 3	/* cr0.eq: already word aligned.  */
	mr	rMEMP, rMEMP0
	ble-	cr1, L(small)

/* Align to word boundary.  cr5 records how the requested length compares
   with 31; it is consumed at L(aligned).  */
	cmplwi	cr5, rLEN, 31
	insrwi	rCHR, rCHR, 8, 16	/* Replicate byte to halfword.  */
	beq+	L(aligned)
	mtcrf	0x01, rMEMP0		/* cr7 = low four address bits.  */
	subfic	rALIGN, rALIGN, 4	/* 1..3 bytes up to the boundary.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	bf+	31, L(g0)		/* Even address: a halfword is enough.  */
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)		/* Address was 3 mod 4: byte was enough.  */
L(g0):
	sth	rCHR, -2(rMEMP)

/* Word aligned.  A requested length <= 31 (cr5) goes to L(medium).  */
L(aligned):
	mtcrf	0x01, rLEN		/* cr7 = low four bits of remaining length.  */
	insrwi	rCHR, rCHR, 16, 0	/* Replicate halfword to word.  */
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  rALIGN is a multiple of 4 in 4..28 when the
   branch to L(caligned) is not taken.  */
	andi.	rALIGN, rMEMP, 0x1C
	subfic	rALIGN, rALIGN, 0x20	/* Does not touch cr0; beq tests andi.  */
	beq	L(caligned)
	mtcrf	0x01, rALIGN		/* cr7 bit 28: 8 bytes, bit 29: 4 bytes.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10	/* cr1: 16 bytes or more to fill.  */
	mr	rMEMP2, rMEMP		/* Fill backwards from the boundary.  */
	bf	28, L(a1)
	stw	rCHR, -4(rMEMP2)
	stwu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)
	stw	rCHR, -4(rMEMP2)
	stw	rCHR, -8(rMEMP2)
	stw	rCHR, -12(rMEMP2)
	stwu	rCHR, -16(rMEMP2)
L(a2):	bf	29, L(caligned)
	stw	rCHR, -4(rMEMP2)

/* Now aligned to a 32 byte boundary.  */
L(caligned):
	cmplwi	cr1, rCHR, 0		/* Zero fill?  */
	clrrwi.	rALIGN, rLEN, 5		/* Bytes in whole 32-byte blocks; sets cr0.  */
	mtcrf	0x01, rLEN		/* cr7 = low four bits of rLEN for the tail.  */
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
/* On entry: rALIGN = rLEN with the low five bits cleared, cr0 set from
   that value, cr7 = low four bits of rLEN.  */
L(nondcbz):
	srwi	rTMP, rALIGN, 5		/* Number of 32-byte blocks.  */
	mtctr	rTMP
	beq	L(medium)	/* We may not actually get to do a full line.  */
	clrlwi.	rLEN, rLEN, 27		/* Tail bytes (0..31); sets cr0.  */
	add	rMEMP, rMEMP, rALIGN	/* End of the blocks; stores run downwards.  */
	li	rNEG64, -0x40
	bdz	L(cloopdone)		/* Exactly one block: skip the loop.  */

	.align 4
/* All blocks except the last one, 32 bytes per iteration.  */
L(c3):	dcbtst	rNEG64, rMEMP
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stw	rCHR, -16(rMEMP)
	stw	rCHR, -20(rMEMP)
	stw	rCHR, -24(rMEMP)
	stw	rCHR, -28(rMEMP)
	stwu	rCHR, -32(rMEMP)
	bdnz	L(c3)
/* Last block.  cr1 is prepared here for L(medium_tail2).  */
L(cloopdone):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stw	rCHR, -16(rMEMP)
	cmplwi	cr1, rLEN, 16
	stw	rCHR, -20(rMEMP)
	stw	rCHR, -24(rMEMP)
	stw	rCHR, -28(rMEMP)
	stwu	rCHR, -32(rMEMP)
	beqlr				/* No tail (cr0 from clrlwi. above).  */
	add	rMEMP, rMEMP, rALIGN	/* Back to the end of the blocks.  */
	b	L(medium_tail2)

	.align 5
/* Clear lines of memory in 128-byte chunks.  */
L(zloopstart):
/* If the remaining length is less than 32 bytes (cr0 from clrrwi. at
   L(caligned)), skip the dcbz setup.  */
	beq	L(medium)
	li	rCLS,128	/* cache line size is 128 */
	dcbt	0,rMEMP
/* Clear 32 bytes at a time with word stores (rCHR is 0 here) until rMEMP
   is 128-byte aligned or fewer than 32 bytes remain.  */
L(getCacheAligned):
	cmplwi	cr1,rLEN,32
	andi.	rTMP,rMEMP,127
	blt	cr1,L(handletail32)
	beq	L(cacheAligned)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	stw	rCHR,-32(rMEMP)
	stw	rCHR,-28(rMEMP)
	stw	rCHR,-24(rMEMP)
	stw	rCHR,-20(rMEMP)
	stw	rCHR,-16(rMEMP)
	stw	rCHR,-12(rMEMP)
	stw	rCHR,-8(rMEMP)
	stw	rCHR,-4(rMEMP)
	b	L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  */
	.align 4
L(cacheAligned):
	cmplw	cr1,rLEN,rCLS
	blt	cr1,L(handletail32)
	dcbz	0,rMEMP
	subf	rLEN,rCLS,rLEN
	add	rMEMP,rMEMP,rCLS
	b	L(cacheAligned)

/* We are here because the cache line size was set and the remainder
   (rLEN) is less than the actual cache line size.
   So set up the preconditions for L(nondcbz) and go there.  rLEN has only
   been reduced by multiples of 32 since L(caligned), so cr7 still holds
   its low four bits.  */
L(handletail32):
	clrrwi.	rALIGN, rLEN, 5
	b	L(nondcbz)

	.align 5
L(small):
/* Memset of 4 bytes or less.  */
	cmplwi	cr5, rLEN, 1
	cmplwi	cr1, rLEN, 3
	bltlr	cr5			/* n == 0.  */
	stb	rCHR, 0(rMEMP)
	beqlr	cr5			/* n == 1.  */
	stb	rCHR, 1(rMEMP)
	bltlr	cr1			/* n == 2.  */
	stb	rCHR, 2(rMEMP)
	beqlr	cr1			/* n == 3.  */
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  On entry rMEMP is the first byte to set, rLEN
   the byte count and cr7 its low four bits.  The bytes are written
   backwards from the end: 1, 2 and 4 bytes as selected by cr7 bits 31,
   30 and 29, then 16 bytes if cr1 says rLEN >= 16, then 8 bytes if cr7
   bit 28 is set.  */
	.align 5
L(medium):
	cmplwi	cr1, rLEN, 16
/* Entry with cr1 already set by the caller.  */
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt-	29, L(medium_29t)
L(medium_29f):
	bge-	cr1, L(medium_27t)
	bflr-	28
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt-	cr1, L(medium_27f)
L(medium_27t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stwu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr-	28
L(medium_28t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	blr
END (memset)
libc_hidden_builtin_def (memset)

/* Source: glibc/sysdeps/powerpc/powerpc32/power4/memset.S  */