/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 6        /* number of cache lines to prefetch ahead of SRC  */
#define ZERO_AHEAD 4            /* number of cache lines to zero ahead of DST  */

/* memcpy routine optimized for CELL-BE-PPC v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit.
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With a 3.2 GHz clock rate the latency to the 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks.
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency.
 * For best performance, instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The code below is loop unrolled for the CELL cache line size of 128 bytes.
 */
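
/* For illustration only (not part of the build): the copy strategy above
   corresponds roughly to the C sketch below.  The helper name
   copy_cachelines, the CACHE_LINE and PREFETCH_AHEAD_LINES constants and
   the use of __builtin_prefetch are assumptions chosen to mirror
   PREFETCH_AHEAD and the 128-byte line size; it assumes 8-byte aligned
   pointers and a whole number of cache lines.  The real kernel below
   additionally uses dcbz to establish destination lines without reading
   them from memory.

     #include <stddef.h>

     #define CACHE_LINE 128
     #define PREFETCH_AHEAD_LINES 6

     static void
     copy_cachelines (unsigned char *dst, const unsigned char *src,
                      size_t lines)
     {
       for (size_t i = 0; i < lines; i++)
         {
           // Touch a source line PREFETCH_AHEAD_LINES ahead so it is
           // already in cache when the copy loop reaches it.
           __builtin_prefetch (src + PREFETCH_AHEAD_LINES * CACHE_LINE, 0, 0);

           // Copy one 128-byte line in 8-byte words.
           const unsigned long long *s = (const unsigned long long *) src;
           unsigned long long *d = (unsigned long long *) dst;
           for (int j = 0; j < CACHE_LINE / 8; j++)
             d[j] = s[j];

           src += CACHE_LINE;
           dst += CACHE_LINE;
         }
     }
*/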

        .align 7

ENTRY_TOCLESS (MEMCPY, 5)
        CALL_MCOUNT 3

        dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
        cmpldi  cr1,r5,16       /* is size < 16 ?  */
        mr      r6,r3
        blt+    cr1,.Lshortcopy

.Lbigcopy:
        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
        clrldi  r8,r8,64-4      /* align to 16-byte boundary  */
        sub     r7,r4,r3
        cmpldi  cr0,r8,0
        beq+    .Ldst_aligned

.Ldst_unaligned:
        mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
        subf    r5,r8,r5

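        /* Copy 1, 2, 4 and 8 bytes as selected by the low four bits of the
           alignment count now held in cr7, until DST is 16-byte aligned.
           r7 = SRC - DST, so the indexed loads below address the source
           while only DST (r6) is advanced.  */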
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
        addi    r6,r6,1
1:      bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:      bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:      bf      cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6

.Ldst_aligned:

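        /* DST is now 16-byte aligned.  Compute r7 = bytes up to the next
           128-byte cache line boundary and pre-bias SRC and DST by -8 for
           the ldu/stdu update forms used below; if fewer than 128 bytes
           remain, take the short tail path.  */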
        cmpdi   cr5,r5,128-1

        neg     r7,r6
        addi    r6,r6,-8        /* prepare for stdu  */
        addi    r4,r4,-8        /* prepare for ldu  */

        clrldi  r7,r7,64-7      /* align to cacheline boundary  */
        ble+    cr5,.Llessthancacheline

        cmpldi  cr6,r7,0
        subf    r5,r7,r5
        srdi    r7,r7,4         /* divide size by 16  */
        srdi    r10,r5,7        /* number of cache lines to copy  */

        cmpldi  r10,0
        li      r11,0           /* number of cache lines to copy with prefetch  */
        beq     .Lnocacheprefetch

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance  */
        ble     .Llessthanmaxprefetch

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
        mtctr   r10

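        /* Touch the first r10 source cache lines (at most PREFETCH_AHEAD)
           before starting the copy; r12 is left holding the steady-state
           prefetch distance used by the main loop below.  */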
.LprefetchSRC:
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    .LprefetchSRC

.Lnocacheprefetch:
        mtctr   r7
        cmpldi  cr1,r5,128
        clrldi  r5,r5,64-7
        beq     cr6,.Lcachelinealigned

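        /* Copy 16 bytes per iteration (ctr = r7 chunks, computed above)
           until DST reaches the next 128-byte cache line boundary.  */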
.Laligntocacheline:
        ld      r9,0x08(r4)
        ldu     r7,0x10(r4)
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    .Laligntocacheline


.Lcachelinealigned:             /* copy whole cache lines  */

        blt-    cr1,.Llessthancacheline /* size < 128  */

.Louterloop:
        cmpdi   r11,0
        mtctr   r11
        beq-    .Lendloop

        li      r11,128*ZERO_AHEAD+8    /* DCBZ dist  */

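        /* r11 now holds the dcbz distance: zeroing the destination line
           ZERO_AHEAD lines ahead establishes it in the cache without
           reading its old contents from memory.  */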
        .align  4
        /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
.Lloop:                         /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)    /* 4 register stride copy is optimal  */
        ld      r8, 0x18(r4)    /* to hide 1st level cache latency.  */
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ldu     r0, 0x80(r4)
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    .Lloop

.Lendloop:
        cmpdi   r10,0
        sldi    r10,r10,2       /* adjust from 128 to 32 byte stride  */
        beq-    .Lendloop2
        mtctr   r10

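        /* Copy the last r10 cache lines, which have already been
           prefetched, in 32-byte strides (r10 was converted from a cache
           line count to a 32-byte chunk count above); no further
           dcbt/dcbz is needed here.  */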
.Lloop2:                        /* Copy aligned body  */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ldu     r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        stdu    r0, 0x20(r6)

        bdnz    .Lloop2
.Lendloop2:

.Llessthancacheline:            /* less than a cache line left to do?  */
        cmpldi  cr0,r5,16
        srdi    r7,r5,4         /* divide size by 16  */
        blt-    .Ldo_lt16
        mtctr   r7

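        /* Copy the remaining full 16-byte chunks; anything under 16 bytes
           is handled by the tail code below.  */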
.Lcopy_remaining:
        ld      r8,0x08(r4)
        ldu     r7,0x10(r4)
        std     r8,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    .Lcopy_remaining

.Ldo_lt16:                      /* less than 16 ?  */
        cmpldi  cr0,r5,0        /* copy remaining bytes (0-15)  */
        beqlr+                  /* no rest to copy  */
        addi    r4,r4,8
        addi    r6,r6,8

.Lshortcopy:                    /* SIMPLE COPY to handle size <= 15 bytes  */
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
1:      blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)