1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | /* |
3 | * Copyright (c) 2011, The Linux Foundation. All rights reserved. |
4 | */ |
5 | |
6 | |
7 | /* HEXAGON assembly optimized memset */ |
8 | /* Replaces the standard library function memset */ |
9 | |
10 | |
11 | .macro HEXAGON_OPT_FUNC_BEGIN sym /* open global function "sym" in .text */ |
12 | .text |
13 | .global \sym /* make the symbol visible to the linker */ |
14 | .type \sym, @function /* mark it as a function symbol */ |
15 | .balign 16 /* 16-byte boundary for the entry point */ |
16 | \sym: |
17 | .endm |
18 | |
19 | .macro HEXAGON_OPT_FUNC_FINISH sym /* close function "sym" */ |
20 | .size \sym, .-\sym /* symbol size = bytes emitted since the entry label */ |
21 | .endm |
22 | |
23 | /* FUNCTION: memset (v2 version)
 *
 * Assembled only when __HEXAGON_ARCH__ < 3.
 *
 * Register use, as read from the code below:
 *   r0    destination pointer; never written, so it is still the return value
 *   r1    fill value; vsplatb copies its low byte into every byte of r4
 *   r2    byte count; r3:2 is handled as a 64-bit remaining count (r3 = 0)
 *   r4    replicated fill byte; r5:4 is the doubleword pattern
 *   r7:6  64-bit size of the store just issued, subtracted from r3:2 (r7 = 0)
 *   r8    running store pointer
 *   r9    8 - (r0 & 7); bits 0..2 select the 1/2/4-byte head stores that
 *         bring r8 to 8-byte alignment
 *   r10   number of doubleword stores
 *
 * Flow: counts below 8 use a plain byte loop.  Otherwise head stores align
 * r8, a hardware loop writes doublewords, and 4/2/1-byte tail stores finish.
 *
 * Reading note: the conditional jumps test p0/p1 without ".new", i.e. the
 * value left by an earlier packet, while the same packet already computes
 * the predicate for the following step.  Likewise a compare on r2 that sits
 * in the same packet as the subtract sees the count from before the subtract.
 */ |
24 | #if __HEXAGON_ARCH__ < 3 |
25 | HEXAGON_OPT_FUNC_BEGIN memset |
26 | { |
27 | r6 = #8 |
28 | r7 = extractu(r0, #3 , #0) /* low three bits of the destination address */ |
29 | p0 = cmp.eq(r2, #0) /* p0: nothing to set */ |
30 | p1 = cmp.gtu(r2, #7) /* p1: 8 bytes or more, use the aligned path */ |
31 | } |
32 | { |
33 | r4 = vsplatb(r1) /* fill byte replicated across r4 */ |
34 | r8 = r0 /* leave r0 intact for return val */ |
35 | r9 = sub(r6, r7) /* bytes until double alignment */ |
36 | if p0 jumpr r31 /* count == 0, so return */ |
37 | } |
38 | { |
39 | r3 = #0 /* high word of the 64-bit count r3:2 */ |
40 | r7 = #0 /* high word of the 64-bit step r7:6 */ |
41 | p0 = tstbit(r9, #0) /* p0: a 1-byte head store is needed */ |
42 | if p1 jump 2f /* skip byte loop */ |
43 | } |
44 | |
45 | /* less than 8 bytes to set, so just set a byte at a time and return */ |
46 | |
47 | loop0(1f, r2) /* byte loop, r2 iterations */ |
48 | .falign |
49 | 1: /* byte loop */ |
50 | { |
51 | memb(r8++#1) = r4 |
52 | }:endloop0 |
53 | jumpr r31 |
54 | .falign |
55 | 2: /* skip byte loop */ |
56 | { |
57 | r6 = #1 /* step for the byte store below */ |
58 | p0 = tstbit(r9, #1) /* p0 for the next step: a 2-byte head store is needed */ |
59 | p1 = cmp.eq(r2, #1) /* p1: the byte store below would consume the whole count */ |
60 | if !p0 jump 3f /* skip initial byte store (tests bit 0 of r9 from the earlier packet) */ |
61 | } |
62 | { |
63 | memb(r8++#1) = r4 |
64 | r3:2 = sub(r3:2, r7:6) /* count -= 1 */ |
65 | if p1 jumpr r31 |
66 | } |
67 | .falign |
68 | 3: /* skip initial byte store */ |
69 | { |
70 | r6 = #2 /* step for the halfword store below */ |
71 | p0 = tstbit(r9, #2) /* p0 for the next step: a 4-byte head store is needed */ |
72 | p1 = cmp.eq(r2, #2) /* p1: the halfword store below would consume the whole count */ |
73 | if !p0 jump 4f /* skip initial half store (tests bit 1 of r9) */ |
74 | } |
75 | { |
76 | memh(r8++#2) = r4 |
77 | r3:2 = sub(r3:2, r7:6) /* count -= 2 */ |
78 | if p1 jumpr r31 |
79 | } |
80 | .falign |
81 | 4: /* skip initial half store */ |
82 | { |
83 | r6 = #4 /* step for the word store below */ |
84 | p0 = cmp.gtu(r2, #7) /* p0 if the word store is skipped: a doubleword still fits */ |
85 | p1 = cmp.eq(r2, #4) /* p1: the word store below would consume the whole count */ |
86 | if !p0 jump 5f /* skip initial word store (tests bit 2 of r9) */ |
87 | } |
88 | { |
89 | memw(r8++#4) = r4 |
90 | r3:2 = sub(r3:2, r7:6) /* count -= 4 */ |
91 | p0 = cmp.gtu(r2, #11) /* pre-subtract count > 11, i.e. more than 7 left after this store */ |
92 | if p1 jumpr r31 |
93 | } |
94 | .falign |
95 | 5: /* skip initial word store */ |
96 | { |
97 | r10 = lsr(r2, #3) /* number of whole doublewords remaining */ |
98 | p1 = cmp.eq(r3, #1) /* NOTE(review): r3 is the zeroed high word, so this presumably just leaves p1 false for the skip path - confirm */ |
99 | if !p0 jump 7f /* skip double loop */ |
100 | } |
101 | { |
102 | r5 = r4 /* r5:4 = 8-byte fill pattern */ |
103 | r6 = #8 /* step for the doubleword stores */ |
104 | loop0(6f, r10) /* double loop */ |
105 | } |
106 | |
107 | /* set bytes a double word at a time */ |
108 | |
109 | .falign |
110 | 6: /* double loop */ |
111 | { |
112 | memd(r8++#8) = r5:4 |
113 | r3:2 = sub(r3:2, r7:6) /* count -= 8 */ |
114 | p1 = cmp.eq(r2, #8) /* pre-subtract count == 8: this store used up the last bytes */ |
115 | }:endloop0 |
116 | .falign |
117 | 7: /* skip double loop */ |
118 | { |
119 | p0 = tstbit(r2, #2) /* p0: a 4-byte tail store is needed */ |
120 | if p1 jumpr r31 /* return when the double loop consumed everything */ |
121 | } |
122 | { |
123 | r6 = #4 /* step for the word store below */ |
124 | p0 = tstbit(r2, #1) /* p0 for the next step: a 2-byte tail store is needed */ |
125 | p1 = cmp.eq(r2, #4) /* p1: the word store below finishes the job */ |
126 | if !p0 jump 8f /* skip final word store (tests bit 2 of r2) */ |
127 | } |
128 | { |
129 | memw(r8++#4) = r4 |
130 | r3:2 = sub(r3:2, r7:6) /* count -= 4 */ |
131 | if p1 jumpr r31 |
132 | } |
133 | .falign |
134 | 8: /* skip final word store */ |
135 | { |
136 | p1 = cmp.eq(r2, #2) /* p1: the halfword store below finishes the job */ |
137 | if !p0 jump 9f /* skip final half store (tests bit 1 of r2) */ |
138 | } |
139 | { |
140 | memh(r8++#2) = r4 |
141 | if p1 jumpr r31 |
142 | } |
143 | .falign |
144 | 9: /* skip final half store */ |
145 | { |
146 | memb(r8++#1) = r4 /* final odd byte */ |
147 | jumpr r31 |
148 | } |
149 | HEXAGON_OPT_FUNC_FINISH memset |
150 | #endif |
151 | |
152 | |
153 | /* FUNCTION: memset (v3 and higher version)
 *
 * Assembled only when __HEXAGON_ARCH__ >= 3.
 *
 * Register use, as read from the code below:
 *   r0    destination pointer; never written, so it is still the return value
 *   r1    fill value; byte stores write it directly, wider stores use r7 / r5:4
 *   r2    remaining byte count
 *   r3    scratch: byte-loop pointer, (r6 & 31) alignment test, 32-byte block count
 *   r6    running store pointer
 *   r7    fill byte replicated by vsplatb; r5:4 = combine(r7,r7) is the
 *         doubleword pattern
 *   r8    doubleword loop count
 *
 * Flow: counts of 1..8 use a byte loop.  Larger counts store 1/2/4 bytes as
 * needed to reach 8-byte alignment.  When more than 127 bytes remain, up to
 * three doublewords bring r6 to a 32-byte boundary and the bulk is handled in
 * 32-byte blocks with dczeroa (alone when r1 == 0, followed by four doubleword
 * stores otherwise).  A doubleword loop and 4/2/1-byte tail stores finish.
 *
 * Reading note: a compare on r2 placed in the same packet as the add that
 * decrements it sees the count from before the decrement.
 */ |
154 | #if __HEXAGON_ARCH__ >= 3 |
155 | HEXAGON_OPT_FUNC_BEGIN memset |
156 | { |
157 | r7=vsplatb(r1) /* fill byte replicated across r7 */ |
158 | r6 = r0 /* working pointer; r0 is kept for the return value */ |
159 | if (r2==#0) jump:nt .L1 /* count == 0: return */ |
160 | } |
161 | { |
162 | r5:4=combine(r7,r7) /* 8-byte fill pattern */ |
163 | p0 = cmp.gtu(r2,#8) |
164 | if (p0.new) jump:nt .L3 /* more than 8 bytes: aligned path */ |
165 | } |
166 | { |
167 | r3 = r0 /* pointer for the small byte loop */ |
168 | loop0(.L47,r2) /* 1..8 iterations */ |
169 | } |
170 | .falign |
171 | .L47: |
172 | { |
173 | memb(r3++#1) = r1 |
174 | }:endloop0 /* start=.L47 */ |
175 | jumpr r31 |
176 | .L3: /* head: fix address bit 0 */ |
177 | { |
178 | p0 = tstbit(r0,#0) |
179 | if (!p0.new) jump:nt .L8 /* address already even */ |
180 | p1 = cmp.eq(r2, #1) /* p1: the byte store below would consume the whole count */ |
181 | } |
182 | { |
183 | r6 = add(r0, #1) |
184 | r2 = add(r2,#-1) |
185 | memb(r0) = r1 /* one byte at the original (odd) address */ |
186 | if (p1) jump .L1 |
187 | } |
188 | .L8: /* head: fix address bit 1 */ |
189 | { |
190 | p0 = tstbit(r6,#1) |
191 | if (!p0.new) jump:nt .L10 /* already 4-byte aligned */ |
192 | } |
193 | { |
194 | r2 = add(r2,#-2) |
195 | memh(r6++#2) = r7 |
196 | p0 = cmp.eq(r2, #2) /* pre-decrement count == 2: nothing left after this store */ |
197 | if (p0.new) jump:nt .L1 |
198 | } |
199 | .L10: /* head: fix address bit 2 */ |
200 | { |
201 | p0 = tstbit(r6,#2) |
202 | if (!p0.new) jump:nt .L12 /* already 8-byte aligned */ |
203 | } |
204 | { |
205 | r2 = add(r2,#-4) |
206 | memw(r6++#4) = r7 |
207 | p0 = cmp.eq(r2, #4) /* pre-decrement count == 4: nothing left after this store */ |
208 | if (p0.new) jump:nt .L1 |
209 | } |
210 | .L12: /* r6 is 8-byte aligned from here on */ |
211 | { |
212 | p0 = cmp.gtu(r2,#127) |
213 | if (!p0.new) jump:nt .L14 /* 127 bytes or fewer: go straight to the doubleword loop */ |
214 | } |
215 | r3 = and(r6,#31) /* up to three doubleword stores until r6 is 32-byte aligned */ |
216 | if (r3==#0) jump:nt .L17 |
217 | { |
218 | memd(r6++#8) = r5:4 |
219 | r2 = add(r2,#-8) |
220 | } |
221 | r3 = and(r6,#31) |
222 | if (r3==#0) jump:nt .L17 |
223 | { |
224 | memd(r6++#8) = r5:4 |
225 | r2 = add(r2,#-8) |
226 | } |
227 | r3 = and(r6,#31) |
228 | if (r3==#0) jump:nt .L17 |
229 | { |
230 | memd(r6++#8) = r5:4 |
231 | r2 = add(r2,#-8) |
232 | } |
233 | .L17: /* r6 is 32-byte aligned here */ |
234 | { |
235 | r3 = lsr(r2,#5) /* number of whole 32-byte blocks remaining */ |
236 | if (r1!=#0) jump:nt .L18 /* tests all of r1, not only its low byte; non-zero takes the store path */ |
237 | } |
238 | { |
239 | r8 = r3 /* NOTE(review): r8 is overwritten at .L14 before any read - looks like leftover code */ |
240 | r3 = r6 /* NOTE(review): this r3 does not appear to be read again - confirm */ |
241 | loop0(.L46,r3) /* count is expected to be the block count held in r3 before this packet */ |
242 | } |
243 | .falign |
244 | .L46: /* zero fill: one dczeroa per 32-byte step */ |
245 | { |
246 | dczeroa(r6) |
247 | r6 = add(r6,#32) |
248 | r2 = add(r2,#-32) |
249 | }:endloop0 /* start=.L46 */ |
250 | .L14: /* doubleword loop for what is left */ |
251 | { |
252 | p0 = cmp.gtu(r2,#7) |
253 | if (!p0.new) jump:nt .L28 /* fewer than 8 bytes: tail only */ |
254 | r8 = lsr(r2,#3) /* number of whole doublewords */ |
255 | } |
256 | loop0(.L44,r8) |
257 | .falign |
258 | .L44: |
259 | { |
260 | memd(r6++#8) = r5:4 |
261 | r2 = add(r2,#-8) |
262 | }:endloop0 /* start=.L44 */ |
263 | .L28: /* tail: 4 bytes if bit 2 of the count is set */ |
264 | { |
265 | p0 = tstbit(r2,#2) |
266 | if (!p0.new) jump:nt .L33 |
267 | } |
268 | { |
269 | r2 = add(r2,#-4) |
270 | memw(r6++#4) = r7 |
271 | } |
272 | .L33: /* tail: 2 bytes if bit 1 of the count is set */ |
273 | { |
274 | p0 = tstbit(r2,#1) |
275 | if (!p0.new) jump:nt .L35 |
276 | } |
277 | { |
278 | r2 = add(r2,#-2) |
279 | memh(r6++#2) = r7 |
280 | } |
281 | .L35: /* tail: last byte if exactly one remains */ |
282 | p0 = cmp.eq(r2,#1) |
283 | if (p0) memb(r6) = r1 |
284 | .L1: /* common return; r0 still holds the destination pointer */ |
285 | jumpr r31 |
286 | .L18: /* non-zero fill: 32-byte blocks, r3 iterations */ |
287 | loop0(.L45,r3) |
288 | .falign |
289 | .L45: |
290 | dczeroa(r6) /* issued before the four stores to the same 32 bytes; presumably avoids fetching the line - confirm against the ISA manual */ |
291 | { |
292 | memd(r6++#8) = r5:4 |
293 | r2 = add(r2,#-32) /* whole block accounted for once per iteration */ |
294 | } |
295 | memd(r6++#8) = r5:4 |
296 | memd(r6++#8) = r5:4 |
297 | { |
298 | memd(r6++#8) = r5:4 |
299 | }:endloop0 /* start=.L45 */ |
300 | jump .L14 /* finish with the doubleword loop and tail */ |
301 | HEXAGON_OPT_FUNC_FINISH memset |
302 | #endif |
303 | |