/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define PREFETCH_AHEAD 6	/* number of cache lines to prefetch ahead of SRC */
#define ZERO_AHEAD 4		/* number of cache lines to zero ahead of DST */
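
/* With the 128-byte CELL cache line these give a prefetch distance of
   768 bytes ahead of the source and a dcbz distance of 512 bytes ahead
   of the destination, roughly what is needed to cover the >400 clock
   memory latency noted below.  */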

/* memcpy routine optimized for CELL-BE-PPC v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With a 3.2 GHz clock rate the latency to the 2nd level cache is >36
 * clocks, and the latency to memory is >400 clocks.
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency.
 * For best performance, instruction forms ending in "." like "andi."
 * should be avoided, as they are implemented in microcode on CELL.
 * The code below is unrolled for the CELL cache line size of 128 bytes.
 */
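
/* In rough C terms, the main copy loop below behaves like this sketch.
   It is illustrative only: dcbt_hint and dcbz_claim are hypothetical
   stand-ins for the dcbt/dcbz instructions (declarations and
   <stddef.h> omitted), head/tail alignment handling is elided, the
   real code tolerates an unaligned source, and it stops issuing
   dcbt/dcbz short of the end of the buffer:

	static char *
	copy_lines (char *d, const char *s, size_t lines)
	{
	  // d is 128-byte aligned here; one cache line per iteration.
	  while (lines--)
	    {
	      dcbt_hint (s + 128 * PREFETCH_AHEAD); // touch source ahead
	      dcbz_claim (d + 128 * ZERO_AHEAD);    // own dest line, no read
	      for (int i = 0; i < 16; i++)          // 16 x 8-byte FP moves
		((double *) d)[i] = ((const double *) s)[i];
	      d += 128;
	      s += 128;
	    }
	  return d;
	}
*/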

	.align 7

EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	dcbt	0,r4		/* Prefetch ONE SRC cacheline */
	cmplwi	cr1,r5,16	/* is size < 16 ? */
	mr	r6,r3
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry */
	clrlwi	r8,r8,32-4	/* align to 16-byte boundary */
	sub	r7,r4,r3
	cmplwi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7 */
	subf	r5,r8,r5

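	/* mtcrf placed the low 4 bits of r8 into cr7: bit 3 selects a
	   1-byte copy, bit 2 a 2-byte copy, bit 1 a 4-byte copy and
	   bit 0 an 8-byte copy, so the taken steps advance r6 by
	   exactly r8 bytes, up to the next 16-byte boundary.  */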
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 bytes */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 bytes */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 bytes */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6

.Ldst_aligned:

	cmpwi	cr5,r5,128-1	/* is size <= 127 ? */

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stfdu */
	addi	r4,r4,-8	/* prepare for lfdu */

	clrlwi	r7,r7,32-7	/* bytes to cache line boundary */
	ble+	cr5,.Llessthancacheline

	cmplwi	cr6,r7,0
	subf	r5,r7,r5
	srwi	r7,r7,4		/* 16-byte blocks to cache line bdry */
	srwi	r10,r5,7	/* number of cache lines to copy */

	cmplwi	r10,0
	li	r11,0		/* number of cache lines to copy with prefetch */
	beq	.Lnocacheprefetch

	cmplwi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance; +8 undoes the lfdu bias */
	ble	.Llessthanmaxprefetch

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD
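	/* r11 = cache lines to copy in the prefetching main loop; the
	   remaining lines in r10 (at most PREFETCH_AHEAD) are copied
	   later by .Lloop2, since their source is touched by the startup
	   prefetch loop below and a dcbz issued that far ahead could
	   land past the end of the destination.  */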

.Llessthanmaxprefetch:
	mtctr	r10

.LprefetchSRC:
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC

.Lnocacheprefetch:
	mtctr	r7
	cmplwi	cr1,r5,128
	clrlwi	r5,r5,32-7	/* bytes remaining within the last line */
	beq	cr6,.Lcachelinealigned

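/* Copy 16 bytes per iteration until the destination reaches the next
   128-byte boundary; CTR holds r7, the number of 16-byte steps.  */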
.Laligntocacheline:
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy whole cache lines */

	blt-	cr1,.Llessthancacheline	/* size < 128 */

.Louterloop:
	cmpwi	r11,0
	mtctr	r11
	beq-	.Lendloop

	li	r11,128*ZERO_AHEAD+8	/* dcbz distance */
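	/* dcbz establishes and zeroes each destination line in the cache
	   without first reading it from memory, so store-only lines cost
	   no read bandwidth.  The +8 here (and in r12) undoes the -8
	   bias applied to r6/r4 for the update-form stfdu/lfdu.  */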

	.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline */
.Lloop:				/* Copy aligned body */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead */
	lfd	fp9,0x08(r4)
	dcbz	r11,r6
	lfd	fp10,0x10(r4)	/* 4 register stride copy is optimal */
	lfd	fp11,0x18(r4)	/* to hide 1st level cache latency. */
	lfd	fp12,0x20(r4)
	stfd	fp9,0x08(r6)
	stfd	fp10,0x10(r6)
	stfd	fp11,0x18(r6)
	stfd	fp12,0x20(r6)
	lfd	fp9,0x28(r4)
	lfd	fp10,0x30(r4)
	lfd	fp11,0x38(r4)
	lfd	fp12,0x40(r4)
	stfd	fp9,0x28(r6)
	stfd	fp10,0x30(r6)
	stfd	fp11,0x38(r6)
	stfd	fp12,0x40(r6)
	lfd	fp9,0x48(r4)
	lfd	fp10,0x50(r4)
	lfd	fp11,0x58(r4)
	lfd	fp12,0x60(r4)
	stfd	fp9,0x48(r6)
	stfd	fp10,0x50(r6)
	stfd	fp11,0x58(r6)
	stfd	fp12,0x60(r6)
	lfd	fp9,0x68(r4)
	lfd	fp10,0x70(r4)
	lfd	fp11,0x78(r4)
	lfdu	fp12,0x80(r4)
	stfd	fp9,0x68(r6)
	stfd	fp10,0x70(r6)
	stfd	fp11,0x78(r6)
	stfdu	fp12,0x80(r6)

	bdnz	.Lloop

.Lendloop:
	cmpwi	r10,0
	slwi	r10,r10,2	/* adjust from 128 to 32-byte stride */
	beq-	.Lendloop2
	mtctr	r10

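/* These trailing lines were already prefetched by the startup loop, so
   copy them 32 bytes per iteration, with no dcbt or dcbz.  */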
.Lloop2:			/* Copy aligned body */
	lfd	fp9,0x08(r4)
	lfd	fp10,0x10(r4)
	lfd	fp11,0x18(r4)
	lfdu	fp12,0x20(r4)
	stfd	fp9,0x08(r6)
	stfd	fp10,0x10(r6)
	stfd	fp11,0x18(r6)
	stfdu	fp12,0x20(r6)

	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than a cache line to do ? */
	cmplwi	cr0,r5,16
	srwi	r7,r5,4		/* divide size by 16 */
	blt-	.Ldo_lt16
	mtctr	r7

.Lcopy_remaining:
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ? */
	cmplwi	cr0,r5,0	/* copy remaining bytes (0-15) */
	beqlr+			/* no rest to copy */
	addi	r4,r4,8
	addi	r6,r6,8
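	/* The +8 undoes the update-form bias on r4/r6 so .Lshortcopy
	   below works on true addresses.  */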

.Lshortcopy:			/* SIMPLE COPY to handle size <= 15 bytes */
	mtcrf	0x01,r5
	sub	r7,r4,r6
	bf-	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 bytes */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 bytes */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 bytes */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte */
	stb	r0,0(r6)
1:	blr

END (memcpy)
libc_hidden_builtin_def (memcpy)