1/* Optimized memcpy implementation for PowerPC64.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
21/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
22 Returns 'dst'.
23
24 Memcpy handles short copies (< 32-bytes) using a binary move blocks
25 (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
26 with the appropriate combination of byte and halfword load/stores.
27 There is minimal effort to optimize the alignment of short moves.
28 The 64-bit implementations of POWER3 and POWER4 do a reasonable job
29 of handling unaligned load/stores that do not cross 32-byte boundaries.
30
31 Longer moves (>= 32-bytes) justify the effort to get at least the
32 destination doubleword (8-byte) aligned. Further optimization is
33 possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.
35
36 For POWER6 unaligned loads will take a 20+ cycle hiccup for any
37 L1 cache miss that crosses a 32- or 128-byte boundary. Store
38 is more forgiving and does not take a hiccup until page or
39 segment boundaries. So we require doubleword alignment for
40 the source but may take a risk and only require word alignment
41 for the destination. */
42
#ifndef MEMCPY
# define MEMCPY memcpy
#endif
	.machine	"power6"
/* void *memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns the original dst in r3.  */
ENTRY_TOCLESS (MEMCPY, 7)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31	/* cr1: short (<= 31 byte) move?  */
	neg	0,3		/* r0 = -dst; its low 3 bits = bytes to next DW.  */
	std	3,-16(1)	/* Stash dst below SP; reloaded for the return value.  */
	std	31,-8(1)	/* Stash r31; only the unaligned path (.L6) uses it.  */
	andi.	11,3,7		/* check alignment of dst.  */
	clrldi	0,0,61		/* Number of bytes until the 1st doubleword of dst.  */
	clrldi	10,4,61		/* check alignment of src.  */
	cmpldi	cr6,5,8
	ble-	cr1,.L2		/* If move < 32 bytes use short move code.  */
	mtcrf	0x01,0		/* CR7 bits 29/30/31 = word/half/byte fix-up flags.  */
	cmpld	cr6,10,11	/* cr6: are src and dst equally misaligned?  */
	srdi	9,5,3		/* Number of full double words remaining.  */
	beq	.L0		/* dst already doubleword aligned.  */

	subf	5,0,5		/* Discount the alignment fix-up bytes from len.  */
    /* Move 0-7 bytes as needed to get the destination doubleword aligned.
       Duplicate some code to maximize fall-through and minimize agen delays.  */
1:	bf	31,2f
	lbz	6,0(4)
	stb	6,0(3)
	bf	30,5f
	lhz	6,1(4)
	sth	6,1(3)
	bf	29,0f
	lwz	6,3(4)
	stw	6,3(3)
	b	0f
5:
	bf	29,0f
	lwz	6,1(4)
	stw	6,1(3)
	b	0f

2:	bf	30,4f
	lhz	6,0(4)
	sth	6,0(3)
	bf	29,0f
	lwz	6,2(4)
	stw	6,2(3)
	b	0f

4:	bf	29,0f
	lwz	6,0(4)
	stw	6,0(3)
0:
/* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
	add	4,4,0
	add	3,3,0

	clrldi	10,4,61	/* check alignment of src again.  */
	srdi	9,5,3	/* Number of full double words remaining.  */

    /* Copy doublewords from source to destination, assuming the
       destination is aligned on a doubleword boundary.

       At this point we know there are at least 25 bytes left (32-7) to copy.
       The next step is to determine if the source is also doubleword aligned.
       If not branch to the unaligned move code at .L6, which uses
       a load, shift, store strategy.

       Otherwise source and destination are doubleword aligned, and we can
       use the optimized doubleword copy loop.  */
	.align	4
.L0:
	clrldi	11,5,61		/* r11 = len & 7, the 0-7 byte tail.  */
	andi.	0,5,0x78	/* r0 = len & 0x78, DWs beyond full 128B blocks.  */
	srdi	12,5,7	/* Number of 128-byte blocks to move.  */
	cmpldi	cr1,11,0	/* If the tail is 0 bytes  */
	bne-	cr6,.L6	/* If source is not DW aligned.  */

    /* Move doublewords where destination and source are DW aligned.
       Use a unrolled loop to copy 16 doublewords (128-bytes) per iteration.
       If the copy is not an exact multiple of 128 bytes, 1-15
       doublewords are copied as needed to set up the main loop.  After
       the main loop exits there may be a tail of 1-7 bytes.  These byte
       are copied a word/halfword/byte at a time as needed to preserve
       alignment.

       For POWER6 the L1 is store-through and the L2 is store-in.  The
       L2 is clocked at half CPU clock so we can store 16 bytes every
       other cycle.  POWER6 also has a load/store bypass so we can do
       load, load, store, store every 2 cycles.

       The following code is sensitive to cache line alignment.  Do not
       make any change without first making sure they don't result in
       splitting ld/std pairs across a cache line.  */

	mtcrf	0x02,5		/* CR6: bits 25/26/27/28 = 64/32/16/8-byte flags.  */
	mtcrf	0x01,5		/* CR7: bits 29/30/31 = 4/2/1-byte tail flags.  */
	cmpldi	cr5,12,1	/* cr5: more than one 128-byte block?  */
	beq	L(das_loop)	/* len is an exact multiple of 128, no set-up.  */

	/* Pre-loop: peel 8..120 bytes so the main loop sees whole 128-byte
	   blocks.  r10/r11 shadow dst/src so later ld/std address generation
	   does not stall on the addi pointer updates.  */
	bf	25,4f		/* Skip if no 64-byte chunk.  */
	.align	3
	ld	6,0(4)
	ld	7,8(4)
	mr	11,4
	mr	10,3
	std	6,0(3)
	std	7,8(3)
	ld	6,16(4)
	ld	7,24(4)
	std	6,16(3)
	std	7,24(3)
	ld	6,0+32(4)
	ld	7,8+32(4)
	addi	4,4,64
	addi	3,3,64
	std	6,0+32(10)
	std	7,8+32(10)
	ld	6,16+32(11)
	ld	7,24+32(11)
	std	6,16+32(10)
	std	7,24+32(10)
4:
	mr	10,3
	bf	26,2f		/* Skip if no 32-byte chunk.  */
	ld	6,0(4)
	ld	7,8(4)
	mr	11,4
	nop
	std	6,0(3)
	std	7,8(3)
	ld	6,16(4)
	ld	7,24(4)
	addi	4,4,32
	std	6,16(3)
	std	7,24(3)
	addi	3,3,32
6:
	nop
	bf	27,5f		/* Skip if no 16-byte chunk.  */
	ld	6,0+32(11)
	ld	7,8+32(11)
	addi	4,4,16
	addi	3,3,16
	std	6,0+32(10)
	std	7,8+32(10)
	bf	28,L(das_loop_s)	/* Skip if no 8-byte chunk.  */
	ld	0,16+32(11)
	addi	4,4,8
	addi	3,3,8
	std	0,16+32(10)
	blt	cr5,L(das_tail)		/* No full 128-byte block left.  */
	b	L(das_loop)
	.align	3
5:
	nop
	bf	28,L(das_loop_s)
	ld	6,32(11)
	addi	4,4,8
	addi	3,3,8
	std	6,32(10)
	blt	cr5,L(das_tail)
	b	L(das_loop)
	.align	3
2:
	mr	11,4
	bf	27,1f
	ld	6,0(4)
	ld	7,8(4)
	addi	4,4,16
	addi	3,3,16
	std	6,0(10)
	std	7,8(10)
	bf	28,L(das_loop_s)
	ld	0,16(11)
	addi	4,11,24
	addi	3,10,24
	std	0,16(10)
	blt	cr5,L(das_tail)
	b	L(das_loop)
	.align	3
1:
	nop
	bf	28,L(das_loop_s)
	ld	6,0(4)
	addi	4,4,8
	addi	3,3,8
	std	6,0(10)
L(das_loop_s):
	nop
	blt	cr5,L(das_tail)		/* Fewer than 2 blocks: tail only.  */
	.align	4
/* Main aligned copy: 128 bytes (16 DWs) per iteration.  The first block
   is peeled here so the ctr-driven copy (das_loop2) can be entered with
   ctr = blocks - 1.  r10/r11 shadow dst/src across the mid-loop pointer
   updates to keep address generation off the critical path.  */
L(das_loop):
	ld	6,0(4)
	ld	7,8(4)
	mr	10,3
	mr	11,4
	std	6,0(3)
	std	7,8(3)
	addi	12,12,-1	/* One block consumed here.  */
	nop
	ld	8,16(4)
	ld	0,24(4)
	std	8,16(3)
	std	0,24(3)

	ld	6,0+32(4)
	ld	7,8+32(4)
	std	6,0+32(3)
	std	7,8+32(3)
	ld	8,16+32(4)
	ld	0,24+32(4)
	std	8,16+32(3)
	std	0,24+32(3)

	ld	6,0+64(11)
	ld	7,8+64(11)
	std	6,0+64(10)
	std	7,8+64(10)
	ld	8,16+64(11)
	ld	0,24+64(11)
	std	8,16+64(10)
	std	0,24+64(10)

	ld	6,0+96(11)
	ld	7,8+96(11)
	addi	4,4,128
	addi	3,3,128
	std	6,0+96(10)
	std	7,8+96(10)
	ld	8,16+96(11)
	ld	0,24+96(11)
	std	8,16+96(10)
	std	0,24+96(10)
	ble	cr5,L(das_loop_e)	/* Only one block: done with DWs.  */

	mtctr	12		/* Remaining blocks drive the counted loop.  */
	.align	4
L(das_loop2):
	ld	6,0(4)
	ld	7,8(4)
	mr	10,3
	mr	11,4
	std	6,0(3)
	std	7,8(3)
	ld	8,16(4)
	ld	0,24(4)
	std	8,16(3)
	std	0,24(3)

	ld	6,0+32(4)
	ld	7,8+32(4)
	std	6,0+32(3)
	std	7,8+32(3)
	ld	8,16+32(4)
	ld	0,24+32(4)
	std	8,16+32(3)
	std	0,24+32(3)

	ld	6,0+64(11)
	ld	7,8+64(11)
	std	6,0+64(10)
	std	7,8+64(10)
	ld	8,16+64(11)
	ld	0,24+64(11)
	std	8,16+64(10)
	std	0,24+64(10)

	ld	6,0+96(11)
	ld	7,8+96(11)
	addi	4,4,128
	addi	3,3,128
	std	6,0+96(10)
	std	7,8+96(10)
	ld	8,16+96(11)
	ld	0,24+96(11)
	std	8,16+96(10)
	std	0,24+96(10)
	bdnz	L(das_loop2)
L(das_loop_e):
/* Check of a 1-7 byte tail, return if none.  */
	bne	cr1,L(das_tail2)
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
L(das_tail):
	beq	cr1,0f		/* No tail: fall to the shared return.  */

L(das_tail2):
/* At this point we have a tail of 0-7 bytes and we know that the
   destination is double word aligned.  CR7 bits 29/30/31 (from len)
   select the 4/2/1-byte moves.  */
4:	bf	29,2f
	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	6,4(4)
	sth	6,4(3)
	bf	31,0f
	lbz	6,6(4)
	stb	6,6(3)
	b	0f
5:	bf	31,0f
	lbz	6,4(4)
	stb	6,4(3)
	b	0f

2:	bf	30,1f
	lhz	6,0(4)
	sth	6,0(3)
	bf	31,0f
	lbz	6,2(4)
	stb	6,2(3)
	b	0f

1:	bf	31,0f
	lbz	6,0(4)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
364
/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32- 128-byte,
   and 4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 128-byte
   boundaries.  Since only loads are sensitive to the 32-/128-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use double word load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small (~20 cycle) delay.  */
	.align	4
.L2:
	mtcrf	0x01,5		/* CR7 bits from len for the 4/2/1-byte tests.  */
	neg	8,4		/* Low 2 bits of r8 = bytes to word-align src.  */
	clrrdi	11,4,2		/* r11 = src rounded down to a word boundary.  */
	andi.	0,8,3
	ble	cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmpldi	cr1,5,16
	mr	10,5		/* r10 = remaining length.  */
	mr	12,4		/* r12 = running source pointer.  */
	cmpldi	cr6,0,2		/* cr6: 1, 2 or 3 fix-up bytes?  */
	beq	L(dus_tail)	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  The aligned word
   load from r11 covers all of the leading unaligned bytes.  */
	lwz	6,0(11)
	subf	10,0,5		/* Discount the fix-up bytes from the length.  */
	add	12,4,0		/* Advance src past them (now word aligned).  */
	blt	cr6,5f		/* 1 byte to move.  */
	srdi	7,6,16
	bgt	cr6,3f		/* 3 bytes to move.  */
#ifdef __LITTLE_ENDIAN__
	sth	7,0(3)
#else
	sth	6,0(3)
#endif
	b	7f
	.align	4
3:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,24
	stb	6,0(3)
	sth	7,1(3)
#else
	stb	7,0(3)
	sth	6,1(3)
#endif
	b	7f
	.align	4
5:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,8
#endif
	stb	6,0(3)
7:
	cmpldi	cr1,10,16	/* cr1: 16 or more bytes still to move?  */
	add	3,3,0		/* Advance dst past the fix-up bytes.  */
	mtcrf	0x01,10		/* Refresh CR7 from the reduced length.  */
	.align	4
L(dus_tail):
/* At least 6 bytes left and the source is word aligned.  This allows
   some speculative loads up front.  */
/* We need to special case the fall-through because the biggest delays
   are due to address computation not being ready in time for the
   AGEN.  */
	lwz	6,0(12)		/* Speculative: always at least 4 bytes left.  */
	lwz	7,4(12)
	blt	cr1,L(dus_tail8)	/* Fewer than 16 bytes remain.  */
	cmpldi	cr0,10,24
L(dus_tail16): /* Move 16 bytes.  */
	stw	6,0(3)
	stw	7,4(3)
	lwz	6,8(12)
	lwz	7,12(12)
	stw	6,8(3)
	stw	7,12(3)
/* Move 8 bytes more.  */
	bf	28,L(dus_tail16p8)
	cmpldi	cr1,10,28
	lwz	6,16(12)
	lwz	7,20(12)
	stw	6,16(3)
	stw	7,20(3)
/* Move 4 bytes more.  */
	bf	29,L(dus_tail16p4)
	lwz	6,24(12)
	stw	6,24(3)
	addi	12,12,28
	addi	3,3,28
	bgt	cr1,L(dus_tail2)
	/* exactly 28 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p8):  /* less than 8 bytes left.  */
	beq	cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
	cmpldi	cr1,10,20
	bf	29,L(dus_tail16p2)
/* Move 4 bytes more.  */
	lwz	6,16(12)
	stw	6,16(3)
	addi	12,12,20
	addi	3,3,20
	bgt	cr1,L(dus_tail2)
	/* exactly 20 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p4):  /* less than 4 bytes left.  */
	addi	12,12,24
	addi	3,3,24
	bgt	cr0,L(dus_tail2)
	/* exactly 24 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
	addi	12,12,16
	addi	3,3,16
	b	L(dus_tail2)

	.align	4
L(dus_tail8):  /* Move 8 bytes.  */
/* r6, r7 already loaded speculatively.  */
	cmpldi	cr1,10,8
	cmpldi	cr0,10,12
	bf	28,L(dus_tail4)
	.align	2
	stw	6,0(3)
	stw	7,4(3)
/* Move 4 bytes more.  */
	bf	29,L(dus_tail8p4)
	lwz	6,8(12)
	stw	6,8(3)
	addi	12,12,12
	addi	3,3,12
	bgt	cr0,L(dus_tail2)
	/* exactly 12 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail8p4):  /* less than 4 bytes left.  */
	addi	12,12,8
	addi	3,3,8
	bgt	cr1,L(dus_tail2)
	/* exactly 8 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr

	.align	4
L(dus_tail4):  /* Move 4 bytes.  */
/* r6 already loaded speculatively.  If we are here we know there is
   more than 4 bytes left.  So there is no need to test.  */
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
L(dus_tail2):  /* Move 2-3 bytes.  */
	bf	30,L(dus_tail1)
	lhz	6,0(12)
	sth	6,0(3)
	bf	31,L(dus_tailX)
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	blr
L(dus_tail1):  /* Move 1 byte.  */
	bf	31,L(dus_tailX)
	lbz	6,0(12)
	stb	6,0(3)
L(dus_tailX):
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
548
/* Special case to copy 0-8 bytes.  */
	.align	4
.LE8:
	mr	12,4
	bne	cr6,L(dus_4)	/* len != 8: use the bit-test ladder below.  */
/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
   cycle delay.  This case should be rare and any attempt to avoid this
   would take most of 20 cycles any way.  */
	ld	6,0(4)
	std	6,0(3)
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_4):
	/* 0-7 bytes: CR7 bits 29/30/31 (from len) select 4/2/1 bytes.  */
	bf	29,L(dus_tail2)
	lwz	6,0(4)
	stw	6,0(3)
	bf	30,L(dus_5)
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,L(dus_0)
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	blr
	.align	4
L(dus_5):
	bf	31,L(dus_0)
	lbz	6,4(4)
	stb	6,4(3)
L(dus_0):
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
584
	.align	4
.L6:
	cfi_offset(31,-8)
	mr	12,4		/* r12 = original (unadjusted) src.  */
	mr	31,5		/* r31 = remaining length (survives the loops).  */
  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.
     The shift count is fixed per entry point (du1..du7 = src misalignment
     of 1..7 bytes), so the branch tree below dispatches on r10.  */
	addi	11,9,-1	/* loop DW count is one less than total */
	subf	5,10,12	/* Move source addr to previous full double word.  */
	cmpldi	cr5, 10, 2
	cmpldi	cr0, 10, 4
	mr	4,3		/* r4 is the running dst inside the du loops.  */
	srdi	8,11,2	/* calculate the 32 byte loop count */
	ld	6,0(5)	/* pre load 1st full doubleword.  */
	mtcrf	0x01,11		/* CR7 bits 30/31: 2-DW / odd-DW set-up cases.  */
	cmpldi	cr6,9,4		/* cr6: total DWs vs 4 (loop-bypass tests).  */
	mtctr	8
	ld	7,8(5)	/* pre load 2nd full doubleword.  */
	bge	cr0, L(du4_do)
	blt	cr5, L(du1_do)
	beq	cr5, L(du2_do)
	b	L(du3_do)
608
	.align	4
/* Source is 1 byte past a DW boundary: each stored DW merges the two
   preloaded aligned DWs r6/r7 with an 8-bit shift.  r5 = aligned src,
   r4 = dst, ctr = 32-byte iterations.  */
L(du1_do):
	bf	30,L(du1_1dw)

	/* there are at least two DWs to copy */
	/* FIXME: can combine last shift and "or" into "rldimi" */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 8
	sldi	8,6, 64-8
#else
	sldi	0,7, 8
	srdi	8,6, 64-8
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du1_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du1_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du1_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du1_loop)
	.align	4
L(du1_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du1_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du1_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 8
	sldi	8,6, 64-8
#else
	sldi	0,7, 8
	srdi	8,6, 64-8
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 8
	sldi	8,6, 64-8
#else
	sldi	0,7, 8
	srdi	8,6, 64-8
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du1_loop)
	.align	4
L(du1_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
731
	.align	4
/* Source is 2 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 16-bit shift.  */
L(du2_do):
	bf	30,L(du2_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 16
	sldi	8,6, 64-16
#else
	sldi	0,7, 16
	srdi	8,6, 64-16
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du2_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du2_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du2_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du2_loop)
	.align	4
L(du2_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du2_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du2_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 16
	sldi	8,6, 64-16
#else
	sldi	0,7, 16
	srdi	8,6, 64-16
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 16
	sldi	8,6, 64-16
#else
	sldi	0,7, 16
	srdi	8,6, 64-16
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du2_loop)
	.align	4
L(du2_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
853
	.align	4
/* Source is 3 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 24-bit shift.  */
L(du3_do):
	bf	30,L(du3_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 24
	sldi	8,6, 64-24
#else
	sldi	0,7, 24
	srdi	8,6, 64-24
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du3_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du3_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du3_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du3_loop)
	.align	4
L(du3_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du3_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du3_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 24
	sldi	8,6, 64-24
#else
	sldi	0,7, 24
	srdi	8,6, 64-24
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 24
	sldi	8,6, 64-24
#else
	sldi	0,7, 24
	srdi	8,6, 64-24
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du3_loop)
	.align	4
L(du3_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
975
	.align	4
/* Misalignment >= 4: secondary dispatch on r10 (4..7), then the
   4-byte-shift copier itself at L(du4_dox).  Same structure as
   L(du1_*) with a 32-bit shift.  */
L(du4_do):
	cmpldi	cr5, 10, 6
	beq	cr0, L(du4_dox)		/* cr0 was r10 ? 4.  */
	blt	cr5, L(du5_do)
	beq	cr5, L(du6_do)
	b	L(du7_do)
L(du4_dox):
	bf	30,L(du4_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 32
	sldi	8,6, 64-32
#else
	sldi	0,7, 32
	srdi	8,6, 64-32
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du4_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du4_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du4_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du4_loop)
	.align	4
L(du4_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du4_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du4_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 32
	sldi	8,6, 64-32
#else
	sldi	0,7, 32
	srdi	8,6, 64-32
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 32
	sldi	8,6, 64-32
#else
	sldi	0,7, 32
	srdi	8,6, 64-32
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du4_loop)
	.align	4
L(du4_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1103
	.align	4
/* Source is 5 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 40-bit shift.  */
L(du5_do):
	bf	30,L(du5_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 40
	sldi	8,6, 64-40
#else
	sldi	0,7, 40
	srdi	8,6, 64-40
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du5_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du5_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du5_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du5_loop)
	.align	4
L(du5_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du5_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du5_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 40
	sldi	8,6, 64-40
#else
	sldi	0,7, 40
	srdi	8,6, 64-40
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 40
	sldi	8,6, 64-40
#else
	sldi	0,7, 40
	srdi	8,6, 64-40
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du5_loop)
	.align	4
L(du5_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1225
	.align	4
/* Source is 6 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 48-bit shift.  */
L(du6_do):
	bf	30,L(du6_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 48
	sldi	8,6, 64-48
#else
	sldi	0,7, 48
	srdi	8,6, 64-48
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du6_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du6_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du6_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du6_loop)
	.align	4
L(du6_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du6_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du6_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 48
	sldi	8,6, 64-48
#else
	sldi	0,7, 48
	srdi	8,6, 64-48
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 48
	sldi	8,6, 64-48
#else
	sldi	0,7, 48
	srdi	8,6, 64-48
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du6_loop)
	.align	4
L(du6_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1347
	.align	4
/* Source is 7 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 56-bit shift.  */
L(du7_do):
	bf	30,L(du7_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 56
	sldi	8,6, 64-56
#else
	sldi	0,7, 56
	srdi	8,6, 64-56
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du7_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du7_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du7_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du7_loop)
	.align	4
L(du7_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du7_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du7_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 56
	sldi	8,6, 64-56
#else
	sldi	0,7, 56
	srdi	8,6, 64-56
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 56
	sldi	8,6, 64-56
#else
	sldi	0,7, 56
	srdi	8,6, 64-56
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du7_loop)
	.align	4
L(du7_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1469
	.align	4
L(du_done):
	rldicr	0,31,0,60	/* r0 = len & ~7: bytes moved by the DW loops.  */
	mtcrf	0x01,31		/* CR7 bits 29/30/31 = 4/2/1-byte tail flags.  */
	beq	cr1,0f	/* If the tail is 0 bytes we are done!  */

	add	3,3,0		/* Advance dst past the copied doublewords.  */
	add	12,12,0		/* Advance the original (unadjusted) src too.  */
/* At this point we have a tail of 0-7 bytes and we know that the
   destination is double word aligned.  */
4:	bf	29,2f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	bf	30,1f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	ld	31,-8(1)	/* Restore the saved r31.  */
	ld	3,-16(1)
	blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
1500

/* source code of glibc/sysdeps/powerpc/powerpc64/power6/memcpy.S */