/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input : Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform a (possibly unaligned) copy of a block of memory
 *	between memory locations, with the transfer size given in bytes
 */
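
/*
 * Rough C sketch of the ascending strategy used below.  This is an
 * illustration only (the helper name is made up), and it glosses over
 * a misaligned source, which the real code handles with the
 * shift-and-merge loops further down; the assembly also unrolls the
 * word loop eight times into 32-byte blocks:
 *
 *	void *fast_memcpy_sketch(void *dst, const void *src,
 *				 unsigned long c)
 *	{
 *		char *d = dst;
 *		const char *s = src;
 *
 *		while (c >= 4 && ((unsigned long)d & 3)) {
 *			*d++ = *s++;
 *			c--;
 *		}
 *		for (; c >= 4; c -= 4) {
 *			*(unsigned int *)d = *(const unsigned int *)s;
 *			d += 4;
 *			s += 4;
 *		}
 *		while (c--)
 *			*d++ = *s++;
 *		return dst;
 *	}
 */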

#include <linux/linkage.h>
	.text
	.globl	memcpy
	.type  memcpy, @function
	.ent	memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

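/*
 * Only the destination was forced to word alignment above, so every
 * store from here on is an aligned swi.  A source that is still
 * misaligned is handled entirely on the load side: the unaligned
 * block and word loops below issue only aligned loads and splice
 * words together with shifts.
 */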
a_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	lwi	r9, r6, 0	/* t1 = *(s + 0) */
	lwi	r10, r6, 4	/* t2 = *(s + 4) */
	lwi	r11, r6, 8	/* t3 = *(s + 8) */
	lwi	r12, r6, 12	/* t4 = *(s + 12) */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	swi	r10, r5, 4	/* *(d + 4) = t2 */
	swi	r11, r5, 8	/* *(d + 8) = t3 */
	swi	r12, r5, 12	/* *(d + 12) = t4 */
	lwi	r9, r6, 16	/* t1 = *(s + 16) */
	lwi	r10, r6, 20	/* t2 = *(s + 20) */
	lwi	r11, r6, 24	/* t3 = *(s + 24) */
	lwi	r12, r6, 28	/* t4 = *(s + 28) */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	swi	r10, r5, 20	/* *(d + 20) = t2 */
	swi	r11, r5, 24	/* *(d + 24) = t3 */
	swi	r12, r5, 28	/* *(d + 28) = t4 */
	addi	r6, r6, 32	/* s = s + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

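/*
 * The unaligned block loops below never load from a misaligned
 * address.  They read whole words from the word-aligned address
 * as = s & ~3, keep the trailing bytes of the previous load in a
 * hold register h, and splice each destination word together with
 * shifts.  For a source k bytes past a word boundary (k = 1, 2, 3),
 * the per-word step is, as a C sketch only (d and as are word
 * pointers, n counts bytes):
 *
 *	h = *as++ << (8 * k);
 *	while (n) {
 *		v = *as++;
 *		*d++ = h | (v >> (32 - 8 * k));
 *		h = v << (8 * k);
 *		n -= 4;
 *	}
 *
 * The real loops unroll this eight deep as a_bu1/a_bu2/a_bu3 with
 * the k-dependent shifts hard coded.
 */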
a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */

a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

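/*
 * At most 31 bytes remain at this point.  Whole words are copied one
 * at a time below, using the same hold-and-merge scheme when the
 * source is misaligned, and the final 0~3 bytes are finished byte by
 * byte in a_xfer_end_loop.
 */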
a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s + offset) */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done	/* while (c) */
	lbui	r9, r6, 0	/* t1 = *s */
	addi	r6, r6, 1	/* s++ */
	sbi	r9, r5, 0	/* *d = t1 */
	addi	r7, r7, -1	/* c-- */
	brid	a_xfer_end_loop	/* loop */
	addi	r5, r5, 1	/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.size memcpy, . - memcpy
.end memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.type  memmove, @function
	.ent	memmove

memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4, fast_memcpy_ascending
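
/*
 * The cmpu above leaves s - d in r4 with the sign bit reflecting the
 * unsigned compare, so the ascending path is taken whenever the
 * source does not precede the destination; a forward copy then
 * cannot clobber unread source bytes even if the regions overlap.
 * Otherwise the copy runs descending from the end.  A hedged C
 * sketch, with hypothetical helper names mirroring the labels:
 *
 *	void *memmove_sketch(void *d, const void *s, unsigned long c)
 *	{
 *		if ((unsigned long)s >= (unsigned long)d)
 *			return fast_memcpy_ascending_c(d, s, c);
 *		return fast_memcpy_descending_c(d, s, c);
 *	}
 */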

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer final 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	addi	r6, r6, -32	/* s = s - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r9, r6, 28	/* t1 = *(s + 28) */
	lwi	r10, r6, 24	/* t2 = *(s + 24) */
	lwi	r11, r6, 20	/* t3 = *(s + 20) */
	lwi	r12, r6, 16	/* t4 = *(s + 16) */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	swi	r10, r5, 24	/* *(d + 24) = t2 */
	swi	r11, r5, 20	/* *(d + 20) = t3 */
	swi	r12, r5, 16	/* *(d + 16) = t4 */
	lwi	r9, r6, 12	/* t1 = *(s + 12) */
	lwi	r10, r6, 8	/* t2 = *(s + 8) */
	lwi	r11, r6, 4	/* t3 = *(s + 4) */
	lwi	r12, r6, 0	/* t4 = *(s + 0) */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	swi	r10, r5, 8	/* *(d + 8) = t2 */
	swi	r11, r5, 4	/* *(d + 4) = t3 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0	/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_block_u2		/* t1 was 2 => 2 byte offset */

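/*
 * Mirror image of the ascending merge loops: the copy walks downward,
 * the hold register keeps the leading bytes of the word loaded on the
 * previous (higher-addressed) iteration, and the shift directions are
 * reversed.  For a source k bytes past a word boundary, each step is
 * roughly (a C sketch only; d and as are word pointers):
 *
 *	v = *--as;
 *	*--d = (v << (8 * k)) | h;
 *	h = v >> (32 - 8 * k);
 *
 * with the k-dependent shifts hard coded in the unrolled
 * d_bu1/d_bu2/d_bu3 loops below.
 */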
d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

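/*
 * As in the ascending path, at most 31 bytes remain here: whole words
 * are copied below (merged through the hold register when the source
 * is misaligned), then the remaining 0~3 bytes are finished byte by
 * byte in d_xfer_end_loop.
 */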
d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r9, r6, r4	/* t1 = *(s + n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4	/* *(d + n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9, d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	beqi	r7, d_done	/* while (c) */
	addi	r6, r6, -1	/* s-- */
	lbui	r9, r6, 0	/* t1 = *s */
	addi	r5, r5, -1	/* d-- */
	sbi	r9, r5, 0	/* *d = t1 */
	brid	d_xfer_end_loop	/* loop */
	addi	r7, r7, -1	/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size memmove, . - memmove
.end memmove