1 | /* Optimized memmove implementation for PowerPC64/POWER7. |
2 | Copyright (C) 2014-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | |
/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This implementation checks whether 'dest' overlaps with 'src'.  If it
   does not, an optimized forward memcpy is used (similar to the POWER7
   memcpy, embedded here to save some cycles).
   If the source and destination overlap, an optimized backwards copy is
   used instead.  */
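
/* For illustration only: the dispatch below is equivalent to this C
   sketch (not part of the build; copy_fwd/copy_bwd are hypothetical
   names for the two code paths in this file).  Because the pointer
   difference and the comparison are unsigned, a single compare covers
   both the no-overlap case and the DEST-below-SRC case, where a
   forward copy is also safe:

     void *memmove_sketch (void *dest, const void *src, size_t len)
     {
       if ((uintptr_t) dest - (uintptr_t) src >= len)
         return copy_fwd (dest, src, len);  // forward copy is safe
       return copy_bwd (dest, src, len);    // copy from the end down
     }
*/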
29 | |
30 | #ifndef MEMMOVE |
31 | # define MEMMOVE memmove |
32 | #endif |
33 | .machine power7 |
34 | ENTRY_TOCLESS (MEMMOVE, 5) |
35 | CALL_MCOUNT 3 |
36 | |
37 | L(_memmove): |
38 | subf r9,r4,r3 |
39 | cmpld cr7,r9,r5 |
40 | blt cr7,L(memmove_bwd) |
41 | |
42 | cmpldi cr1,r5,31 |
43 | neg 0,3 |
44 | ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move |
45 | code. */ |
46 | |
47 | andi. 10,3,15 |
48 | clrldi 11,4,60 |
49 | cmpld cr6,10,11 /* SRC and DST alignments match? */ |
50 | |
51 | mr r11,3 |
52 | bne cr6,L(copy_GE_32_unaligned) |
53 | beq L(aligned_copy) |
54 | |
55 | mtocrf 0x01,0 |
56 | clrldi 0,0,60 |
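
/* For illustration only (C sketch, not part of the build; dst/src are
   byte-pointer mirrors of r11/r4): r0 now holds (-DST) & 0xF, the
   number of bytes up to the next 16-byte boundary, and mtocrf has
   copied its low four bits into CR7, so the bf 31/30/29/28 branches
   below skip or take the 1-, 2-, 4- and 8-byte steps:

     size_t pad = (-(uintptr_t) dst) & 15;         // mirrors r0
     if (pad & 1) { *dst++ = *src++; }
     if (pad & 2) { memcpy (dst, src, 2); dst += 2; src += 2; }
     if (pad & 4) { memcpy (dst, src, 4); dst += 4; src += 4; }
     if (pad & 8) { memcpy (dst, src, 8); dst += 8; src += 8; }
     len -= pad;                                   // subf at label 16
*/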
57 | |
	/* Get the DST and SRC aligned to 16 bytes.  */
59 | 1: |
60 | bf 31,2f |
61 | lbz 6,0(r4) |
62 | addi r4,r4,1 |
63 | stb 6,0(r11) |
64 | addi r11,r11,1 |
65 | 2: |
66 | bf 30,4f |
67 | lhz 6,0(r4) |
68 | addi r4,r4,2 |
69 | sth 6,0(r11) |
70 | addi r11,r11,2 |
71 | 4: |
72 | bf 29,8f |
73 | lwz 6,0(r4) |
74 | addi r4,r4,4 |
75 | stw 6,0(r11) |
76 | addi r11,r11,4 |
77 | 8: |
78 | bf 28,16f |
79 | ld 6,0(r4) |
80 | addi r4,r4,8 |
81 | std 6,0(r11) |
82 | addi r11,r11,8 |
83 | 16: |
84 | subf r5,0,r5 |
85 | |
86 | /* Main aligned copy loop. Copies 128 bytes at a time. */ |
87 | L(aligned_copy): |
88 | li 6,16 |
89 | li 7,32 |
90 | li 8,48 |
91 | mtocrf 0x02,r5 |
92 | srdi 12,r5,7 |
93 | cmpdi 12,0 |
94 | beq L(aligned_tail) |
95 | lvx 6,0,r4 |
96 | lvx 7,r4,6 |
97 | mtctr 12 |
98 | b L(aligned_128loop) |
99 | |
100 | .align 4 |
101 | L(aligned_128head): |
	/* For the 2nd and subsequent iterations of this loop.  */
103 | lvx 6,0,r4 |
104 | lvx 7,r4,6 |
105 | L(aligned_128loop): |
106 | lvx 8,r4,7 |
107 | lvx 9,r4,8 |
108 | stvx 6,0,r11 |
109 | addi r4,r4,64 |
110 | stvx 7,r11,6 |
111 | stvx 8,r11,7 |
112 | stvx 9,r11,8 |
113 | lvx 6,0,r4 |
114 | lvx 7,r4,6 |
115 | addi r11,r11,64 |
116 | lvx 8,r4,7 |
117 | lvx 9,r4,8 |
118 | addi r4,r4,64 |
119 | stvx 6,0,r11 |
120 | stvx 7,r11,6 |
121 | stvx 8,r11,7 |
122 | stvx 9,r11,8 |
123 | addi r11,r11,64 |
124 | bdnz L(aligned_128head) |
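
/* The 128-byte loop above is software-pipelined: the loads for the
   next iteration are issued at L(aligned_128head) before the stores
   of the current one, and r6/r7/r8 hold the constant indexes 16/32/48
   so lvx/stvx can address four quadwords per group with no extra
   pointer updates.  Rough C sketch (not part of the build; 'vec'
   stands for a 16-byte vector type such as vector unsigned char):

     vec *vs = (vec *) src, *vd = (vec *) dst;
     for (size_t i = len >> 7; i > 0; i--)    // 128 bytes/iteration
       {
         vec a = vs[0], b = vs[1], c = vs[2], d = vs[3];
         vd[0] = a; vd[1] = b; vd[2] = c; vd[3] = d;
         vec e = vs[4], f = vs[5], g = vs[6], h = vs[7];
         vd[4] = e; vd[5] = f; vd[6] = g; vd[7] = h;
         vs += 8; vd += 8;
       }

   The tail below tests CR bits set from len: bit 25 selects a 64-byte
   chunk, bit 26 a 32-byte chunk, bit 27 a 16-byte chunk, and bits
   28..31 the final 8/4/2/1 bytes.  */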
125 | |
126 | L(aligned_tail): |
127 | mtocrf 0x01,r5 |
128 | bf 25,32f |
129 | lvx 6,0,r4 |
130 | lvx 7,r4,6 |
131 | lvx 8,r4,7 |
132 | lvx 9,r4,8 |
133 | addi r4,r4,64 |
134 | stvx 6,0,r11 |
135 | stvx 7,r11,6 |
136 | stvx 8,r11,7 |
137 | stvx 9,r11,8 |
138 | addi r11,r11,64 |
139 | 32: |
140 | bf 26,16f |
141 | lvx 6,0,r4 |
142 | lvx 7,r4,6 |
143 | addi r4,r4,32 |
144 | stvx 6,0,r11 |
145 | stvx 7,r11,6 |
146 | addi r11,r11,32 |
147 | 16: |
148 | bf 27,8f |
149 | lvx 6,0,r4 |
150 | addi r4,r4,16 |
151 | stvx 6,0,r11 |
152 | addi r11,r11,16 |
153 | 8: |
154 | bf 28,4f |
155 | ld 6,0(r4) |
156 | addi r4,r4,8 |
157 | std 6,0(r11) |
158 | addi r11,r11,8 |
159 | 4: /* Copies 4~7 bytes. */ |
160 | bf 29,L(tail2) |
161 | lwz 6,0(r4) |
162 | stw 6,0(r11) |
163 | bf 30,L(tail5) |
164 | lhz 7,4(r4) |
165 | sth 7,4(r11) |
166 | bflr 31 |
167 | lbz 8,6(r4) |
168 | stb 8,6(r11) |
169 | /* Return original DST pointer. */ |
170 | blr |
171 | |
172 | /* Handle copies of 0~31 bytes. */ |
173 | .align 4 |
174 | L(copy_LT_32): |
175 | mr r11,3 |
176 | cmpldi cr6,r5,8 |
177 | mtocrf 0x01,r5 |
178 | ble cr6,L(copy_LE_8) |
179 | |
180 | /* At least 9 bytes to go. */ |
181 | neg 8,4 |
182 | andi. 0,8,3 |
183 | cmpldi cr1,r5,16 |
184 | beq L(copy_LT_32_aligned) |
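
/* The 9~31-byte path stays in GPRs: SRC is forced to 4-byte alignment
   below, then the copy proceeds in 4-byte words (DST may remain
   misaligned; the word stores tolerate that).  Rough C sketch,
   illustration only:

     size_t pad = (-(uintptr_t) src) & 3;   // 0..3 bytes, len >= 9
     for (size_t i = 0; i < pad; i++)
       *dst++ = *src++;
     len -= pad;
     // ... then 16/8/4/2/1-byte steps selected by the bits of len.
*/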
185 | |
186 | /* Force 4-byte alignment for SRC. */ |
187 | mtocrf 0x01,0 |
188 | subf r5,0,r5 |
189 | 2: |
190 | bf 30,1f |
191 | lhz 6,0(r4) |
192 | addi r4,r4,2 |
193 | sth 6,0(r11) |
194 | addi r11,r11,2 |
195 | 1: |
196 | bf 31,L(end_4bytes_alignment) |
197 | lbz 6,0(r4) |
198 | addi r4,r4,1 |
199 | stb 6,0(r11) |
200 | addi r11,r11,1 |
201 | |
202 | .align 4 |
203 | L(end_4bytes_alignment): |
204 | cmpldi cr1,r5,16 |
205 | mtocrf 0x01,r5 |
206 | |
207 | L(copy_LT_32_aligned): |
208 | /* At least 6 bytes to go, and SRC is word-aligned. */ |
209 | blt cr1,8f |
210 | |
211 | /* Copy 16 bytes. */ |
212 | lwz 6,0(r4) |
213 | lwz 7,4(r4) |
214 | stw 6,0(r11) |
215 | lwz 8,8(r4) |
216 | stw 7,4(r11) |
217 | lwz 6,12(r4) |
218 | addi r4,r4,16 |
219 | stw 8,8(r11) |
220 | stw 6,12(r11) |
221 | addi r11,r11,16 |
222 | 8: /* Copy 8 bytes. */ |
223 | bf 28,L(tail4) |
224 | lwz 6,0(r4) |
225 | lwz 7,4(r4) |
226 | addi r4,r4,8 |
227 | stw 6,0(r11) |
228 | stw 7,4(r11) |
229 | addi r11,r11,8 |
230 | |
231 | .align 4 |
232 | /* Copies 4~7 bytes. */ |
233 | L(tail4): |
234 | bf 29,L(tail2) |
235 | lwz 6,0(r4) |
236 | stw 6,0(r11) |
237 | bf 30,L(tail5) |
238 | lhz 7,4(r4) |
239 | sth 7,4(r11) |
240 | bflr 31 |
241 | lbz 8,6(r4) |
242 | stb 8,6(r11) |
243 | /* Return original DST pointer. */ |
244 | blr |
245 | |
246 | .align 4 |
247 | /* Copies 2~3 bytes. */ |
248 | L(tail2): |
249 | bf 30,1f |
250 | lhz 6,0(r4) |
251 | sth 6,0(r11) |
252 | bflr 31 |
253 | lbz 7,2(r4) |
254 | stb 7,2(r11) |
255 | blr |
256 | |
257 | .align 4 |
258 | L(tail5): |
259 | bflr 31 |
260 | lbz 6,4(r4) |
261 | stb 6,4(r11) |
262 | blr |
263 | |
264 | .align 4 |
265 | 1: |
266 | bflr 31 |
267 | lbz 6,0(r4) |
268 | stb 6,0(r11) |
269 | /* Return original DST pointer. */ |
270 | blr |
271 | |
272 | /* Handles copies of 0~8 bytes. */ |
273 | .align 4 |
274 | L(copy_LE_8): |
275 | bne cr6,L(tail4) |
276 | |
277 | /* Though we could've used ld/std here, they are still |
278 | slow for unaligned cases. */ |
279 | |
280 | lwz 6,0(r4) |
281 | lwz 7,4(r4) |
282 | stw 6,0(r11) |
283 | stw 7,4(r11) |
284 | blr |
285 | |
286 | |
287 | /* Handle copies of 32+ bytes where DST is aligned (to quadword) but |
288 | SRC is not. Use aligned quadword loads from SRC, shifted to realign |
289 | the data, allowing for aligned DST stores. */ |
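
/* For illustration only (Altivec intrinsics sketch, not part of the
   build): lvx ignores the low four address bits, so the loop reads
   the aligned quadwords enclosing the unaligned source data, and
   vperm extracts the 16 wanted bytes using a control vector built by
   lvsl (lvsr with swapped vperm operands on little-endian).
   Big-endian operand order shown:

     vector unsigned char shift = vec_lvsl (0, src);
     vector unsigned char prev  = vec_ld (0, src);   // aligned load
     while (n >= 16)
       {
         vector unsigned char next = vec_ld (16, src);
         vec_st (vec_perm (prev, next, shift), 0, dst);
         prev = next;
         src += 16; dst += 16; n -= 16;
       }
*/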
290 | .align 4 |
291 | L(copy_GE_32_unaligned): |
	clrldi	0,0,60	      /* Number of bytes until DST (r11) is quadword
				 aligned.  */
293 | srdi 9,r5,4 /* Number of full quadwords remaining. */ |
294 | |
295 | beq L(copy_GE_32_unaligned_cont) |
296 | |
	/* DST is not quadword aligned; get it aligned.  */
298 | |
299 | mtocrf 0x01,0 |
300 | subf r5,0,r5 |
301 | |
	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
304 | 1: |
305 | bf 31,2f |
306 | lbz 6,0(r4) |
307 | addi r4,r4,1 |
308 | stb 6,0(r11) |
309 | addi r11,r11,1 |
310 | 2: |
311 | bf 30,4f |
312 | lhz 6,0(r4) |
313 | addi r4,r4,2 |
314 | sth 6,0(r11) |
315 | addi r11,r11,2 |
316 | 4: |
317 | bf 29,8f |
318 | lwz 6,0(r4) |
319 | addi r4,r4,4 |
320 | stw 6,0(r11) |
321 | addi r11,r11,4 |
322 | 8: |
323 | bf 28,0f |
324 | ld 6,0(r4) |
325 | addi r4,r4,8 |
326 | std 6,0(r11) |
327 | addi r11,r11,8 |
328 | 0: |
329 | srdi 9,r5,4 /* Number of full quadwords remaining. */ |
330 | |
	/* The proper alignment is present; it is OK to copy the bytes now.  */
332 | L(copy_GE_32_unaligned_cont): |
333 | |
	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	10,r5,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,10,0
	srdi	8,r5,5	      /* Set up the loop counter.  */
340 | mtocrf 0x01,9 |
341 | cmpldi cr6,9,1 |
342 | #ifdef __LITTLE_ENDIAN__ |
343 | lvsr 5,0,r4 |
344 | #else |
345 | lvsl 5,0,r4 |
346 | #endif |
347 | lvx 3,0,r4 |
348 | li 0,0 |
349 | bf 31,L(setup_unaligned_loop) |
350 | |
	/* The number of remaining quadwords is odd: copy 16 bytes first so
	   the loop below can move 32 bytes per iteration.  */
352 | lvx 4,r4,6 |
353 | #ifdef __LITTLE_ENDIAN__ |
354 | vperm 6,4,3,5 |
355 | #else |
356 | vperm 6,3,4,5 |
357 | #endif |
358 | addi r4,r4,16 |
359 | stvx 6,0,r11 |
360 | addi r11,r11,16 |
361 | vor 3,4,4 |
362 | clrrdi 0,r4,60 |
363 | |
364 | L(setup_unaligned_loop): |
365 | mtctr 8 |
366 | ble cr6,L(end_unaligned_loop) |
367 | |
368 | /* Copy 32 bytes at a time using vector instructions. */ |
369 | .align 4 |
370 | L(unaligned_loop): |
371 | |
	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is still faster than using
	   unaligned vector loads.  */
376 | |
377 | lvx 4,r4,6 |
378 | #ifdef __LITTLE_ENDIAN__ |
379 | vperm 6,4,3,5 |
380 | #else |
381 | vperm 6,3,4,5 |
382 | #endif |
383 | lvx 3,r4,7 |
384 | #ifdef __LITTLE_ENDIAN__ |
385 | vperm 10,3,4,5 |
386 | #else |
387 | vperm 10,4,3,5 |
388 | #endif |
389 | addi r4,r4,32 |
390 | stvx 6,0,r11 |
391 | stvx 10,r11,6 |
392 | addi r11,r11,32 |
393 | bdnz L(unaligned_loop) |
394 | |
395 | clrrdi 0,r4,60 |
396 | |
397 | .align 4 |
398 | L(end_unaligned_loop): |
399 | |
400 | /* Check for tail bytes. */ |
401 | mtocrf 0x01,r5 |
402 | beqlr cr1 |
403 | |
404 | add r4,r4,0 |
405 | |
406 | /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ |
407 | /* Copy 8 bytes. */ |
408 | bf 28,4f |
409 | lwz 6,0(r4) |
410 | lwz 7,4(r4) |
411 | addi r4,r4,8 |
412 | stw 6,0(r11) |
413 | stw 7,4(r11) |
414 | addi r11,r11,8 |
415 | 4: /* Copy 4~7 bytes. */ |
416 | bf 29,L(tail2) |
417 | lwz 6,0(r4) |
418 | stw 6,0(r11) |
419 | bf 30,L(tail5) |
420 | lhz 7,4(r4) |
421 | sth 7,4(r11) |
422 | bflr 31 |
423 | lbz 8,6(r4) |
424 | stb 8,6(r11) |
425 | /* Return original DST pointer. */ |
426 | blr |
427 | |
	/* Start of the backward copy implementation: the algorithm first
	   checks whether src and dest share the same alignment.  If they do,
	   both are aligned to 16 bytes and the copy is done with VSX
	   instructions.
	   If they do not, dest is aligned to 16 bytes and VMX (Altivec)
	   instructions are used to read two 16-byte chunks at a time; the
	   bytes read are shifted/permuted and stored aligned to dest.  */
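
/* A C sketch of the pointer setup used here (illustration only): both
   pointers are moved one past the end and the copy walks down, so the
   overlapping bytes at the top of SRC are read before they are
   overwritten:

     unsigned char *d = (unsigned char *) dest + len;
     const unsigned char *s = (const unsigned char *) src + len;
     while (len--)
       *--d = *--s;   // real code moves 1/2/4/8/16 bytes at a time
*/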
434 | L(memmove_bwd): |
435 | cmpldi cr1,r5,31 |
436 | /* Copy is done backwards: update the pointers and check alignment. */ |
437 | add r11,r3,r5 |
438 | add r4,r4,r5 |
439 | mr r0,r11 |
440 | ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move |
441 | code. */ |
442 | |
	andi.	r10,r11,15	/* Check if r11 is aligned to 16 bytes.  */
	clrldi	r9,r4,60	/* Check if r4 is aligned to 16 bytes.  */
445 | cmpld cr6,r10,r9 /* SRC and DST alignments match? */ |
446 | |
447 | bne cr6,L(copy_GE_32_unaligned_bwd) |
448 | beq L(aligned_copy_bwd) |
449 | |
450 | mtocrf 0x01,r0 |
451 | clrldi r0,r0,60 |
452 | |
453 | /* Get the DST and SRC aligned to 16 bytes. */ |
454 | 1: |
455 | bf 31,2f |
456 | lbz r6,-1(r4) |
457 | subi r4,r4,1 |
458 | stb r6,-1(r11) |
459 | subi r11,r11,1 |
460 | 2: |
461 | bf 30,4f |
462 | lhz r6,-2(r4) |
463 | subi r4,r4,2 |
464 | sth r6,-2(r11) |
465 | subi r11,r11,2 |
466 | 4: |
467 | bf 29,8f |
468 | lwz r6,-4(r4) |
469 | subi r4,r4,4 |
470 | stw r6,-4(r11) |
471 | subi r11,r11,4 |
472 | 8: |
473 | bf 28,16f |
474 | ld r6,-8(r4) |
475 | subi r4,r4,8 |
476 | std r6,-8(r11) |
477 | subi r11,r11,8 |
478 | 16: |
	subf	r5,r0,r5
480 | |
481 | /* Main aligned copy loop. Copies 128 bytes at a time. */ |
482 | L(aligned_copy_bwd): |
483 | li r6,-16 |
484 | li r7,-32 |
485 | li r8,-48 |
486 | li r9,-64 |
487 | mtocrf 0x02,r5 |
488 | srdi r12,r5,7 |
489 | cmpdi r12,0 |
490 | beq L(aligned_tail_bwd) |
491 | lvx v6,r4,r6 |
492 | lvx v7,r4,r7 |
	mtctr	r12
494 | b L(aligned_128loop_bwd) |
495 | |
496 | .align 4 |
497 | L(aligned_128head_bwd): |
	/* For the 2nd and subsequent iterations of this loop.  */
499 | lvx v6,r4,r6 |
500 | lvx v7,r4,r7 |
501 | L(aligned_128loop_bwd): |
502 | lvx v8,r4,r8 |
503 | lvx v9,r4,r9 |
504 | stvx v6,r11,r6 |
505 | subi r4,r4,64 |
506 | stvx v7,r11,r7 |
507 | stvx v8,r11,r8 |
508 | stvx v9,r11,r9 |
509 | lvx v6,r4,r6 |
	lvx	v7,r4,r7
511 | subi r11,r11,64 |
512 | lvx v8,r4,r8 |
513 | lvx v9,r4,r9 |
514 | subi r4,r4,64 |
515 | stvx v6,r11,r6 |
516 | stvx v7,r11,r7 |
517 | stvx v8,r11,r8 |
518 | stvx v9,r11,r9 |
519 | subi r11,r11,64 |
520 | bdnz L(aligned_128head_bwd) |
521 | |
522 | L(aligned_tail_bwd): |
523 | mtocrf 0x01,r5 |
524 | bf 25,32f |
525 | lvx v6,r4,r6 |
526 | lvx v7,r4,r7 |
527 | lvx v8,r4,r8 |
528 | lvx v9,r4,r9 |
529 | subi r4,r4,64 |
530 | stvx v6,r11,r6 |
531 | stvx v7,r11,r7 |
532 | stvx v8,r11,r8 |
533 | stvx v9,r11,r9 |
534 | subi r11,r11,64 |
535 | 32: |
536 | bf 26,16f |
537 | lvx v6,r4,r6 |
538 | lvx v7,r4,r7 |
539 | subi r4,r4,32 |
540 | stvx v6,r11,r6 |
541 | stvx v7,r11,r7 |
542 | subi r11,r11,32 |
543 | 16: |
544 | bf 27,8f |
545 | lvx v6,r4,r6 |
546 | subi r4,r4,16 |
547 | stvx v6,r11,r6 |
548 | subi r11,r11,16 |
549 | 8: |
550 | bf 28,4f |
551 | ld r6,-8(r4) |
552 | subi r4,r4,8 |
553 | std r6,-8(r11) |
554 | subi r11,r11,8 |
555 | 4: /* Copies 4~7 bytes. */ |
556 | bf 29,L(tail2_bwd) |
557 | lwz r6,-4(r4) |
558 | stw r6,-4(r11) |
559 | bf 30,L(tail5_bwd) |
560 | lhz r7,-6(r4) |
561 | sth r7,-6(r11) |
562 | bflr 31 |
563 | lbz r8,-7(r4) |
564 | stb r8,-7(r11) |
565 | /* Return original DST pointer. */ |
566 | blr |
567 | |
568 | /* Handle copies of 0~31 bytes. */ |
569 | .align 4 |
570 | L(copy_LT_32_bwd): |
571 | cmpldi cr6,r5,8 |
572 | mtocrf 0x01,r5 |
573 | ble cr6,L(copy_LE_8_bwd) |
574 | |
575 | /* At least 9 bytes to go. */ |
576 | neg r8,r4 |
577 | andi. r0,r8,3 |
578 | cmpldi cr1,r5,16 |
579 | beq L(copy_LT_32_aligned_bwd) |
580 | |
581 | /* Force 4-byte alignment for SRC. */ |
	mtocrf	0x01,r0
	subf	r5,r0,r5
584 | 2: |
585 | bf 30,1f |
586 | lhz r6,-2(r4) |
587 | subi r4,r4,2 |
588 | sth r6,-2(r11) |
589 | subi r11,r11,2 |
590 | 1: |
591 | bf 31,L(end_4bytes_alignment_bwd) |
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
595 | subi r11,r11,1 |
596 | |
597 | .align 4 |
598 | L(end_4bytes_alignment_bwd): |
599 | cmpldi cr1,r5,16 |
600 | mtocrf 0x01,r5 |
601 | |
602 | L(copy_LT_32_aligned_bwd): |
603 | /* At least 6 bytes to go, and SRC is word-aligned. */ |
604 | blt cr1,8f |
605 | |
606 | /* Copy 16 bytes. */ |
607 | lwz r6,-4(r4) |
608 | lwz r7,-8(r4) |
609 | stw r6,-4(r11) |
610 | lwz r8,-12(r4) |
611 | stw r7,-8(r11) |
612 | lwz r6,-16(r4) |
613 | subi r4,r4,16 |
614 | stw r8,-12(r11) |
615 | stw r6,-16(r11) |
616 | subi r11,r11,16 |
617 | 8: /* Copy 8 bytes. */ |
618 | bf 28,L(tail4_bwd) |
619 | lwz r6,-4(r4) |
620 | lwz r7,-8(r4) |
621 | subi r4,r4,8 |
622 | stw r6,-4(r11) |
623 | stw r7,-8(r11) |
624 | subi r11,r11,8 |
625 | |
626 | .align 4 |
627 | /* Copies 4~7 bytes. */ |
628 | L(tail4_bwd): |
629 | bf 29,L(tail2_bwd) |
	lwz	r6,-4(r4)
	stw	r6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	r7,-6(r4)
	sth	r7,-6(r11)
	bflr	31
	lbz	r8,-7(r4)
	stb	r8,-7(r11)
638 | /* Return original DST pointer. */ |
639 | blr |
640 | |
641 | .align 4 |
642 | /* Copies 2~3 bytes. */ |
643 | L(tail2_bwd): |
644 | bf 30,1f |
	lhz	r6,-2(r4)
	sth	r6,-2(r11)
	bflr	31
	lbz	r7,-3(r4)
	stb	r7,-3(r11)
650 | blr |
651 | |
652 | .align 4 |
653 | L(tail5_bwd): |
654 | bflr 31 |
	lbz	r6,-5(r4)
	stb	r6,-5(r11)
657 | blr |
658 | |
659 | .align 4 |
660 | 1: |
661 | bflr 31 |
	lbz	r6,-1(r4)
	stb	r6,-1(r11)
664 | /* Return original DST pointer. */ |
665 | blr |
666 | |
667 | |
668 | /* Handles copies of 0~8 bytes. */ |
669 | .align 4 |
670 | L(copy_LE_8_bwd): |
671 | bne cr6,L(tail4_bwd) |
672 | |
673 | /* Though we could've used ld/std here, they are still |
674 | slow for unaligned cases. */ |
	lwz	r6,-8(r4)
	lwz	r7,-4(r4)
	stw	r6,-8(r11)
	stw	r7,-4(r11)
679 | blr |
680 | |
681 | |
682 | /* Handle copies of 32+ bytes where DST is aligned (to quadword) but |
683 | SRC is not. Use aligned quadword loads from SRC, shifted to realign |
684 | the data, allowing for aligned DST stores. */ |
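
/* Same realignment technique as the forward L(copy_GE_32_unaligned)
   path, but walking down: the indexes in r6/r7 are negative (-16,
   -32), lvx reads the quadwords below r4, and the vperm operand order
   is swapped relative to the forward loop because the previously
   loaded quadword is now the higher-addressed one.  Sketch of one
   step (big-endian operand order, illustration only):

     vector unsigned char next = vec_ld (-16, src);  // quadword below
     vec_st (vec_perm (next, prev, shift), -16, dst);
     prev = next;  src -= 16;  dst -= 16;
*/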
685 | .align 4 |
686 | L(copy_GE_32_unaligned_bwd): |
	andi.	r10,r11,15	/* Check if DST is aligned to 16 bytes.  */
688 | srdi r9,r5,4 /* Number of full quadwords remaining. */ |
689 | |
690 | beq L(copy_GE_32_unaligned_cont_bwd) |
691 | |
	/* DST is not quadword aligned: r10 (DST & 0xF) holds the number of
	   bytes above the previous quadword boundary.  */
694 | mtocrf 0x01,r10 |
695 | subf r5,r10,r5 |
696 | |
	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
699 | 1: |
700 | bf 31,2f |
701 | lbz r6,-1(r4) |
702 | subi r4,r4,1 |
703 | stb r6,-1(r11) |
704 | subi r11,r11,1 |
705 | 2: |
706 | bf 30,4f |
707 | lhz r6,-2(r4) |
708 | subi r4,r4,2 |
709 | sth r6,-2(r11) |
710 | subi r11,r11,2 |
711 | 4: |
712 | bf 29,8f |
713 | lwz r6,-4(r4) |
714 | subi r4,r4,4 |
715 | stw r6,-4(r11) |
716 | subi r11,r11,4 |
717 | 8: |
718 | bf 28,0f |
719 | ld r6,-8(r4) |
720 | subi r4,r4,8 |
721 | std r6,-8(r11) |
722 | subi r11,r11,8 |
723 | 0: |
724 | srdi r9,r5,4 /* Number of full quadwords remaining. */ |
725 | |
	/* The proper alignment is present; it is OK to copy the bytes now.  */
727 | L(copy_GE_32_unaligned_cont_bwd): |
728 | |
	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	r10,r5,60
	li	r6,-16	      /* Index for 16-byte offsets.  */
	li	r7,-32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,r10,0
	srdi	r8,r5,5	      /* Set up the loop counter.  */
	mtocrf	0x01,r9
736 | cmpldi cr6,r9,1 |
737 | #ifdef __LITTLE_ENDIAN__ |
738 | lvsr v5,r0,r4 |
739 | #else |
740 | lvsl v5,r0,r4 |
741 | #endif |
742 | lvx v3,0,r4 |
743 | li r0,0 |
744 | bf 31,L(setup_unaligned_loop_bwd) |
745 | |
	/* The number of remaining quadwords is odd: copy 16 bytes first so
	   the loop below can move 32 bytes per iteration.  */
747 | lvx v4,r4,r6 |
748 | #ifdef __LITTLE_ENDIAN__ |
749 | vperm v6,v3,v4,v5 |
750 | #else |
751 | vperm v6,v4,v3,v5 |
752 | #endif |
753 | subi r4,r4,16 |
754 | stvx v6,r11,r6 |
755 | subi r11,r11,16 |
756 | vor v3,v4,v4 |
757 | clrrdi r0,r4,60 |
758 | |
759 | L(setup_unaligned_loop_bwd): |
760 | mtctr r8 |
761 | ble cr6,L(end_unaligned_loop_bwd) |
762 | |
763 | /* Copy 32 bytes at a time using vector instructions. */ |
764 | .align 4 |
765 | L(unaligned_loop_bwd): |
766 | |
	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is still faster than using
	   unaligned vector loads.  */
771 | |
772 | lvx v4,r4,r6 |
773 | #ifdef __LITTLE_ENDIAN__ |
774 | vperm v6,v3,v4,v5 |
775 | #else |
776 | vperm v6,v4,v3,v5 |
777 | #endif |
778 | lvx v3,r4,r7 |
779 | #ifdef __LITTLE_ENDIAN__ |
780 | vperm v10,v4,v3,v5 |
781 | #else |
782 | vperm v10,v3,v4,v5 |
783 | #endif |
784 | subi r4,r4,32 |
785 | stvx v6,r11,r6 |
786 | stvx v10,r11,r7 |
787 | subi r11,r11,32 |
788 | bdnz L(unaligned_loop_bwd) |
789 | |
790 | clrrdi r0,r4,60 |
791 | |
792 | .align 4 |
793 | L(end_unaligned_loop_bwd): |
794 | |
795 | /* Check for tail bytes. */ |
796 | mtocrf 0x01,r5 |
797 | beqlr cr1 |
798 | |
799 | add r4,r4,0 |
800 | |
801 | /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ |
802 | /* Copy 8 bytes. */ |
803 | bf 28,4f |
804 | lwz r6,-4(r4) |
805 | lwz r7,-8(r4) |
806 | subi r4,r4,8 |
807 | stw r6,-4(r11) |
808 | stw r7,-8(r11) |
809 | subi r11,r11,8 |
810 | 4: /* Copy 4~7 bytes. */ |
811 | bf 29,L(tail2_bwd) |
812 | lwz r6,-4(r4) |
813 | stw r6,-4(r11) |
814 | bf 30,L(tail5_bwd) |
815 | lhz r7,-6(r4) |
816 | sth r7,-6(r11) |
817 | bflr 31 |
818 | lbz r8,-7(r4) |
819 | stb r8,-7(r11) |
820 | /* Return original DST pointer. */ |
821 | blr |
822 | END_GEN_TB (MEMMOVE, TB_TOCLESS) |
823 | libc_hidden_builtin_def (memmove) |
824 | |